- podman-etcd: prevent learner from starting before cluster is ready

Resolves: RHEL-131181
2025-11-27 10:21:59 +01:00 · 2025-11-27 10:21:59 +01:00 · 963c977ce4
commit 963c977ce4
parent 0e9fc0284c
2 changed files with 115 additions and 1 deletions
--- a/RHEL-131181-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch
+++ b/RHEL-131181-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch
@ -0,0 +1,107 @@
+From 5cc74acd67c294da36b3f40e44842a82aa7d0957 Mon Sep 17 00:00:00 2001
+From: Carlo Lobrano <c.lobrano@gmail.com>
+Date: Wed, 26 Nov 2025 11:43:25 +0100
+Subject: [PATCH] OCPEDGE-2213: podman-etcd: fix to prevent learner from
+ starting before cluster is ready (#2098)
+
+* OCPEDGE-2213: fix(podman-etcd): prevent learner from starting before cluster is ready
+
+Clear stale learner_node attribute during stop and on restart when no
+active resources exist, ensuring learner always waits for peer
+availability.
+
+* fix: podman-etcd should cleanup standalone/learner attributes when promotion succeeds
+
+* fix: remove misleading endpoint IP from log
+---
+ heartbeat/podman-etcd | 33 +++++++++++++++++++--------------
+ 1 file changed, 19 insertions(+), 14 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index b1f52cd5c..3e3f1d60e 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -880,7 +880,7 @@ add_member_as_learner()
+ 	local endpoint_url=$(ip_url $(attribute_node_ip get))
+ 	local peer_url=$(ip_url $member_ip)
+ 
+-	ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
+	ocf_log info "add $member_name ($member_ip) to the member list as learner"
+ 	out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
+ 	rc=$?
+ 	if [ $rc -ne 0 ]; then
+@@ -1032,7 +1032,7 @@ promote_learner_member()
+ 	if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then
+ 		# promotion is expected to fail if the peer is not yet up-to-date
+ 		ocf_log info "could not promote member $learner_member_id_hex, error code: $?"
+-		return $OCF_SUCCESS
+		return $OCF_ERR_GENERIC
+ 	fi
+ 	ocf_log info "successfully promoted member '$learner_member_id_hex'"
+ 	return $OCF_SUCCESS
+@@ -1063,19 +1063,19 @@ reconcile_member_state()
+ 	fi
+ 
+ 	if [ -n "$learner_member_id" ]; then
+-		promote_learner_member "$learner_member_id"
+-		return $?
+-	fi
+-
+-	if [ -z "$learner_member_id" ]; then
+-		if ! clear_standalone_node; then
+-			ocf_log error "could not clear standalone_node attribute, error code: $?"
+-			return $OCF_ERR_GENERIC
+-		fi
+-		if ! attribute_learner_node clear; then
+-			ocf_log error "could not clear learner_node attribute, error code: $?"
+		if ! promote_learner_member "$learner_member_id"; then
+ 			return $OCF_ERR_GENERIC
+ 		fi
+		# promotion succeeded: continue to clear standalone_node and learner_node
+	fi
+
+	if ! clear_standalone_node; then
+		ocf_log error "could not clear standalone_node attribute, error code: $?"
+		return $OCF_ERR_GENERIC
+	fi
+	if ! attribute_learner_node clear; then
+		ocf_log error "could not clear learner_node attribute, error code: $?"
+		return $OCF_ERR_GENERIC
+ 	fi
+ 
+ 	return $OCF_SUCCESS
+@@ -1258,6 +1258,7 @@ manage_peer_membership()
+ 			set_standalone_node
+ 		else
+ 			ocf_log debug "$name is in the members list by IP: $ip"
+			# Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
+ 			reconcile_member_state "$member_list_json"
+ 		fi
+ 	done
+@@ -1369,7 +1370,7 @@ container_health_check()
+ 	# Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
+ 	local time_since_heartbeat
+ 	time_since_heartbeat=$(get_time_since_last_heartbeat)
+-	ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+	ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)"
+ 
+ 	# Check if peer has set force_new_cluster for recovery
+ 	local fnc_holders
+@@ -1795,6 +1796,9 @@ podman_start()
+ 				fi
+ 				;;
+ 			0)
+				# No active resources: clear any stale learner_node attribute from previous failed session
+				ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)"
+				attribute_learner_node clear
+ 				# count how many agents are starting now
+ 				local start_resources_count
+ 				start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
+@@ -2090,6 +2094,7 @@ podman_stop()
+ 		ocf_log err "could not delete container health check state file"
+ 	fi
+ 
+	attribute_learner_node clear
+ 	attribute_node_revision update
+ 	attribute_node_cluster_id update
+ 
--- a/resource-agents.spec
+++ b/resource-agents.spec
@ -45,7 +45,7 @@
 Name:		resource-agents
 Summary:	Open Source HA Reusable Cluster Resource Scripts
 Version:	4.16.0
-Release:	44%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release:	45%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License:	GPL-2.0-or-later AND LGPL-2.1-or-later
 URL:		https://github.com/ClusterLabs/resource-agents
 Source0:	%{upstream_prefix}-%{upstream_version}.tar.gz
@ -109,6 +109,7 @@ Patch56:	RHEL-112443-2-nginx-restore-selinux-context-for-pid-file-during-validat
 Patch57:	RHEL-130576-1-podman-etcd-prevent-last-active-member-from-leaving.patch
 Patch58:	RHEL-130576-2-podman-etcd-remove-test-code.patch
 Patch59:	RHEL-126083-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch
+Patch60:	RHEL-131181-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch

 # bundled ha-cloud-support libs
 Patch500:	ha-cloud-support-aliyun.patch
@ -339,6 +340,7 @@ exit 1
 %patch -p1 -P 57
 %patch -p1 -P 58
 %patch -p1 -P 59
+%patch -p1 -P 60

 # bundled ha-cloud-support libs
 %patch -p1 -P 500
@ -671,6 +673,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm

 %changelog
+* Thu Nov 27 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-45
+- podman-etcd: prevent learner from starting before cluster is ready
+
+  Resolves: RHEL-131181
+
 * Mon Nov 24 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-44
 - podman-etcd: add container crash detection with coordinated recovery