- podman-etcd: fix port 2380 binding race

- podman-etcd: fix machine deletion deadlock
- podman-etcd: fix learner start deadlock

Resolves: RHEL-177850, RHEL-177840, RHEL-177845
2026-05-20 09:18:16 +02:00 · 2026-05-20 09:18:16 +02:00 · afa76dbdac
commit afa76dbdac
parent 44aa4ccf08
4 changed files with 523 additions and 1 deletion
--- a/RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch
+++ b/RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch
@ -0,0 +1,283 @@
+From 9ba19a62543de4d7365fc711b908a2759f811af9 Mon Sep 17 00:00:00 2001
+From: Vincenzo Mauro <vmauro@redhat.com>
+Date: Tue, 5 May 2026 14:24:43 +0200
+Subject: [PATCH 1/4] fix: fixed etcd learner deadlock
+
+---
+ heartbeat/podman-etcd | 79 +++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 73 insertions(+), 6 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 4c9bbd4fa..5bb3b2897 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -50,6 +50,7 @@ OCF_RESKEY_oom_default="-997"
+ OCF_RESKEY_config_location_default="/var/lib/etcd"
+ OCF_RESKEY_backup_location_default="/var/lib/etcd"
+ OCF_RESKEY_max_backup_snapshots_default="3"
+OCF_RESKEY_kubeconfig_default="/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig"
+ 
+ : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
+ : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
+@@ -63,6 +64,7 @@ OCF_RESKEY_max_backup_snapshots_default="3"
+ : ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
+ : ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
+ : ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}
+: ${OCF_RESKEY_kubeconfig=${OCF_RESKEY_kubeconfig_default}}
+ 
+ 
+ #######################################################################
+@@ -288,6 +290,16 @@ Set max_backup_snapshots=0 to disable backups.
+ <content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
+ </parameter>
+ 
+<parameter name="kubeconfig" required="0" unique="0">
+<longdesc lang="en">
+Path to a kubeconfig file for querying Machine API objects. Used to detect
+whether a peer node's Machine is being deleted, preventing the resource agent
+from re-adding it as an etcd learner during Machine deletion flows.
+</longdesc>
+<shortdesc lang="en">Kubeconfig for Machine API queries</shortdesc>
+<content type="string" default="${OCF_RESKEY_kubeconfig_default}"/>
+</parameter>
+
+ </parameters>
+ 
+ <actions>
+@@ -1505,6 +1517,34 @@ detect_cluster_leadership_loss()
+ }
+ 
+ 
+# Checks whether the Machine object for a given node is being deleted.
+# Returns 0 (true) if the Machine has a deletionTimestamp set, 1 (false) otherwise.
+# Fails open: returns 1 on API errors to preserve current learner-addition behavior.
+is_peer_machine_deleting()
+{
+	local node_name="$1"
+	local out
+	local deletion_ts
+
+	out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines \
+		-n openshift-machine-api -o json 2>&1)
+	if [ $? -ne 0 ]; then
+		ocf_log warn "could not query Machine API for node $node_name (fail-open): $out"
+		return 1
+	fi
+
+	# Select the Machine object for the given node and extract its deletionTimestamp if present
+	deletion_ts=$(printf "%s" "$out" | jq -r --arg name "$node_name" \
+		'.items[] | select(.status.nodeRef.name == $name) | .metadata.deletionTimestamp // empty')
+
+	if [ -n "$deletion_ts" ]; then
+		ocf_log info "Machine for node $node_name is being deleted (deletionTimestamp: $deletion_ts)"
+		return 0
+	fi
+
+	return 1
+}
+
+ # Manages etcd peer membership by detecting and handling missing or rejoining peers
+ # Adds missing peers as learners and reconciles member states when peers rejoin
+ # Args: $1 - member list JSON from etcdctl
+@@ -1542,9 +1582,21 @@ manage_peer_membership()
+ 	# NOTE: voting members have a "name" field but no "isLearner" field,
+ 	# while learner members have "isLearner": true (boolean) but no "name" field, so we search for peerURLs matching.
+ 	peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
+	# During Machine deletion, CEO's MachineDeletionHooksController
+	# keeps the EtcdQuorumOperator preDrain hook as long as the peer IP appears in the etcd
+	# member list (learners included). If we add or keep a learner for a peer whose Machine
+	# is being deleted, CEO never clears the hook, MAO never drains, and the Machine hangs
+	# in Deleting. Two safeguards cover the race:
+	#   A (below): peer is not yet in the member list — skip adding it as a learner if machine is deleting
+	#   B (learner exists): a prior monitor cycle added the learner before the Machine
+	#     deletion started — remove it so CEO can clear the hook.
+ 	if [ -z "$peer_member_id" ]; then
+ 		ocf_log info "$peer_member_name is not in the members list"
+-		add_member_as_learner "$peer_member_name" "$peer_member_ip"
+		if ! is_peer_machine_deleting "$peer_member_name"; then
+			add_member_as_learner "$peer_member_name" "$peer_member_ip"
+		else
+			ocf_log info "peer Machine is being deleted, skipping learner addition for $peer_member_name"
+		fi
+ 		set_standalone_node
+ 		return
+ 	fi
+@@ -1552,10 +1604,21 @@ manage_peer_membership()
+ 	# Ensure learner_node attribute is always set when we have a learner member
+ 	local learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
+ 	local current_learner_node=$(attribute_learner_node get)
+-	if [ -n "$learner_member_id" ] && [ -z "$current_learner_node" ]; then
+-		ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating"
+-		attribute_learner_node update "$peer_member_name"
+-		return
+
+	if [ -n "$learner_member_id" ]; then
+		# Clean up a learner added before the Machine deletion started
+		if is_peer_machine_deleting "$peer_member_name"; then
+			ocf_log info "peer Machine is being deleted, removing learner $peer_member_name from member list"
+			remove_etcd_member_by_ip "$peer_member_ip"
+			attribute_learner_node clear
+			set_standalone_node
+			return
+		fi
+		if [ -z "$current_learner_node" ]; then
+			ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating"
+			attribute_learner_node update "$peer_member_name"
+			return
+		fi
+ 	fi
+ 
+ 	ocf_log debug "$peer_member_name is in the members list by IP: $peer_member_ip"
+@@ -2312,7 +2375,11 @@ podman_start()
+ 				peer_node_name="$(get_peer_node_name)"
+ 				peer_node_ip="$(attribute_node_ip_peer)"
+ 				if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then
+-					add_member_as_learner "$peer_node_name" "$peer_node_ip"
+					if is_peer_machine_deleting "$peer_node_name"; then
+						ocf_log info "peer Machine is being deleted, skipping learner addition for $peer_node_name"
+					else
+						add_member_as_learner "$peer_node_name" "$peer_node_ip"
+					fi
+ 					set_standalone_node
+ 				else
+ 					ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})"
+
+From 56d9754311ab0595dea1c47e26eca85bbcfb049c Mon Sep 17 00:00:00 2001
+From: Vincenzo Mauro <vmauro@redhat.com>
+Date: Wed, 6 May 2026 15:15:27 +0200
+Subject: [PATCH 2/4] fix: added support for both MAPI and CAPI
+
+---
+ heartbeat/podman-etcd | 42 +++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 35 insertions(+), 7 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 5bb3b2897..ad9804c1d 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -295,6 +295,7 @@ Set max_backup_snapshots=0 to disable backups.
+ Path to a kubeconfig file for querying Machine API objects. Used to detect
+ whether a peer node's Machine is being deleted, preventing the resource agent
+ from re-adding it as an etcd learner during Machine deletion flows.
+Supports both MAPI (machine.openshift.io) and CAPI (cluster.x-k8s.io) Machine resources.
+ </longdesc>
+ <shortdesc lang="en">Kubeconfig for Machine API queries</shortdesc>
+ <content type="string" default="${OCF_RESKEY_kubeconfig_default}"/>
+@@ -1525,15 +1526,39 @@ is_peer_machine_deleting()
+ 	local node_name="$1"
+ 	local out
+ 	local deletion_ts
+	local oc_rc
+	local item_count
+ 
+-	out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines \
+	# Try MAPI first (machine.openshift.io), fall back to CAPI (cluster.x-k8s.io)
+	out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \
+ 		-n openshift-machine-api -o json 2>&1)
+-	if [ $? -ne 0 ]; then
+-		ocf_log warn "could not query Machine API for node $node_name (fail-open): $out"
+-		return 1
+	oc_rc=$?
+
+	if [ $oc_rc -eq 0 ]; then
+		item_count=$(printf "%s" "$out" | jq '.items | length' 2>/dev/null)
+	fi
+
+	# MAPI CRD missing, namespace absent, or no Machine objects — try CAPI
+	if [ $oc_rc -ne 0 ] || [ "${item_count:-0}" -eq 0 ]; then
+		ocf_log info "MAPI returned no machines, trying CAPI for node $node_name"
+		out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \
+			-n openshift-cluster-api -o json 2>&1)
+		if [ $? -ne 0 ]; then
+			ocf_log warn "could not query Machine API (MAPI or CAPI) for node $node_name (fail-open): $out"
+			return 1
+		fi
+ 	fi
+ 
+ 	# Select the Machine object for the given node and extract its deletionTimestamp if present
+	local machine_count
+	machine_count=$(printf "%s" "$out" | jq -r --arg name "$node_name" \
+		'[.items[] | select(.status.nodeRef.name == $name)] | length' 2>/dev/null)
+
+	if [ "$machine_count" = "0" ] || [ -z "$machine_count" ]; then
+		ocf_log warn "No Machine object found for node $node_name (fail-open): nodeRef may not be populated yet"
+		return 1
+	fi
+
+ 	deletion_ts=$(printf "%s" "$out" | jq -r --arg name "$node_name" \
+ 		'.items[] | select(.status.nodeRef.name == $name) | .metadata.deletionTimestamp // empty')
+ 
+@@ -1609,9 +1634,12 @@ manage_peer_membership()
+ 		# Clean up a learner added before the Machine deletion started
+ 		if is_peer_machine_deleting "$peer_member_name"; then
+ 			ocf_log info "peer Machine is being deleted, removing learner $peer_member_name from member list"
+-			remove_etcd_member_by_ip "$peer_member_ip"
+-			attribute_learner_node clear
+-			set_standalone_node
+			if remove_etcd_member_by_ip "$peer_member_ip"; then
+				attribute_learner_node clear
+				set_standalone_node
+			else
+				ocf_log err "failed to remove learner for deleting Machine $peer_member_name; will retry next monitor cycle"
+			fi
+ 			return
+ 		fi
+ 		if [ -z "$current_learner_node" ]; then
+
+From beab70c7acd4f6ccc33c4dbcb3d72f94fc560812 Mon Sep 17 00:00:00 2001
+From: Vincenzo Mauro <vmauro@redhat.com>
+Date: Tue, 12 May 2026 09:54:33 +0200
+Subject: [PATCH 3/4] reduced timeout to 5 and fixed MCAPI return code
+
+---
+ heartbeat/podman-etcd | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index ad9804c1d..e022869a8 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -1530,7 +1530,7 @@ is_peer_machine_deleting()
+ 	local item_count
+ 
+ 	# Try MAPI first (machine.openshift.io), fall back to CAPI (cluster.x-k8s.io)
+-	out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \
+	out=$(timeout 5 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \
+ 		-n openshift-machine-api -o json 2>&1)
+ 	oc_rc=$?
+ 
+@@ -1607,6 +1607,7 @@ manage_peer_membership()
+ 	# NOTE: voting members have a "name" field but no "isLearner" field,
+ 	# while learner members have "isLearner": true (boolean) but no "name" field, so we search for peerURLs matching.
+ 	peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
+	
+ 	# During Machine deletion, CEO's MachineDeletionHooksController
+ 	# keeps the EtcdQuorumOperator preDrain hook as long as the peer IP appears in the etcd
+ 	# member list (learners included). If we add or keep a learner for a peer whose Machine
+
+From 2b06ed31bda015543a365e02e1bc5a47b3fa0439 Mon Sep 17 00:00:00 2001
+From: Vincenzo Mauro <vmauro@redhat.com>
+Date: Tue, 12 May 2026 10:30:48 +0200
+Subject: [PATCH 4/4] fixed return code for CAPI
+
+---
+ heartbeat/podman-etcd | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index e022869a8..2dbaf9991 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -1541,9 +1541,10 @@ is_peer_machine_deleting()
+ 	# MAPI CRD missing, namespace absent, or no Machine objects — try CAPI
+ 	if [ $oc_rc -ne 0 ] || [ "${item_count:-0}" -eq 0 ]; then
+ 		ocf_log info "MAPI returned no machines, trying CAPI for node $node_name"
+-		out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \
+		out=$(timeout 5 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \
+ 			-n openshift-cluster-api -o json 2>&1)
+-		if [ $? -ne 0 ]; then
+		oc_rc=$?
+		if [ $oc_rc -ne 0 ]; then
+ 			ocf_log warn "could not query Machine API (MAPI or CAPI) for node $node_name (fail-open): $out"
+ 			return 1
+ 		fi
--- a/RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch
+++ b/RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch
@ -0,0 +1,130 @@
+From db041869f4b8612e44561f4ba4a46ed09d18e24e Mon Sep 17 00:00:00 2001
+From: Vincenzo Mauro <vmauro@redhat.com>
+Date: Thu, 7 May 2026 18:14:04 +0200
+Subject: [PATCH 1/4] fixed OCPBUGS-83333
+
+---
+ heartbeat/podman-etcd | 22 +++++++++++++++++++++-
+ 1 file changed, 21 insertions(+), 1 deletion(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 4c9bbd4fa..d96c055e3 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -2519,7 +2519,27 @@ podman_validate()
+ 
+ podman_notify()
+ {
+-	ocf_log info "notify: type=${OCF_RESKEY_CRM_meta_notify_type}, operation=${OCF_RESKEY_CRM_meta_notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
+	local notify_type="${OCF_RESKEY_CRM_meta_notify_type}"
+	local notify_operation="${OCF_RESKEY_CRM_meta_notify_operation}"
+
+	ocf_log info "notify: type=${notify_type}, operation=${notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
+
+	# Pacemaker serializes operations per resource per node. The start sequence
+	# with notifications is:
+	#   pre-notify(start) on peer → start on joiner → post-notify(start) on peer
+	# Between pre-notify and post-notify, the peer's recurring monitor is
+	# queued — Pacemaker won't overlap operations for the same resource on the
+	# same node. The monitor path (check_peer → manage_peer_membership →
+	# add_member_as_learner) is the primary way a running peer adds the
+	# starting node to the etcd member list. Without handling it here, the
+	# starting node's podman_start poll loop (waiting for learner_node attribute)
+	# deadlocks: start waits for learner_node, monitor waits for start to finish.
+	# pre_notify_start fires before the start action, giving us the window to
+	# add the learner so the joiner's poll loop finds it immediately.
+	if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then
+		ocf_log info "pre_notify_start: running peer membership check for starting node"
+		check_peer
+	fi
+ }
+ 
+ # TODO :
+
+From d1c817108276ee3019a20164fca0646985d99cde Mon Sep 17 00:00:00 2001
+From: Vincenzo Mauro <vmauro@redhat.com>
+Date: Fri, 8 May 2026 10:38:12 +0200
+Subject: [PATCH 2/4] Updated deadlock comment
+
+---
+ heartbeat/podman-etcd | 18 ++++++------------
+ 1 file changed, 6 insertions(+), 12 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index d96c055e3..21a5e01e1 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -2524,18 +2524,12 @@ podman_notify()
+ 
+ 	ocf_log info "notify: type=${notify_type}, operation=${notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
+ 
+-	# Pacemaker serializes operations per resource per node. The start sequence
+-	# with notifications is:
+-	#   pre-notify(start) on peer → start on joiner → post-notify(start) on peer
+-	# Between pre-notify and post-notify, the peer's recurring monitor is
+-	# queued — Pacemaker won't overlap operations for the same resource on the
+-	# same node. The monitor path (check_peer → manage_peer_membership →
+-	# add_member_as_learner) is the primary way a running peer adds the
+-	# starting node to the etcd member list. Without handling it here, the
+-	# starting node's podman_start poll loop (waiting for learner_node attribute)
+-	# deadlocks: start waits for learner_node, monitor waits for start to finish.
+-	# pre_notify_start fires before the start action, giving us the window to
+-	# add the learner so the joiner's poll loop finds it immediately.
+	# Pacemaker suppresses the peer's monitor during an
+	# active start/notify cycle. Since monitor is the only path that calls
+	# add_member_as_learner (outside force_new_cluster), the joiner's
+	# podman_start poll loop deadlocks; it waits for learner_node, but
+	# no monitor runs to set it. pre_notify_start fires before start,
+	# so we add the learner here to break the deadlock
+ 	if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then
+ 		ocf_log info "pre_notify_start: running peer membership check for starting node"
+ 		check_peer
+
+From 0fafab701878ce4b8c7413610e41dce3e69447aa Mon Sep 17 00:00:00 2001
+From: Vincenzo Mauro <vmauro@redhat.com>
+Date: Tue, 12 May 2026 10:24:26 +0200
+Subject: [PATCH 3/4] improved logging
+
+---
+ heartbeat/podman-etcd | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 21a5e01e1..41ce84ff1 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -2531,7 +2531,7 @@ podman_notify()
+ 	# no monitor runs to set it. pre_notify_start fires before start,
+ 	# so we add the learner here to break the deadlock
+ 	if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then
+-		ocf_log info "pre_notify_start: running peer membership check for starting node"
+		ocf_log info "pre_notify_start: running peer membership check for ${OCF_RESKEY_CRM_meta_notify_start_uname}"
+ 		check_peer
+ 	fi
+ }
+@@ -2616,4 +2616,4 @@ validate-all)	podman_validate;;
+ esac
+ rc=$?
+ ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+-exit $rc
+exc
+
+From bdce9048b4fc2c38255d36e73a1f73a7d72b7471 Mon Sep 17 00:00:00 2001
+From: Vincenzo Mauro <vmauro@redhat.com>
+Date: Tue, 12 May 2026 10:46:49 +0200
+Subject: [PATCH 4/4] fixed typo
+
+---
+ heartbeat/podman-etcd | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 41ce84ff1..740e2edb4 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -2616,4 +2616,4 @@ validate-all)	podman_validate;;
+ esac
+ rc=$?
+ ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+-exc
+exit $rc
--- a/RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch
+++ b/RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch
@ -0,0 +1,96 @@
+From 42dfef941ed80d6073022141fa1cad513e8dae4f Mon Sep 17 00:00:00 2001
+From: Pablo Fontanilla <pfontani@redhat.com>
+Date: Wed, 22 Apr 2026 12:54:58 +0200
+Subject: [PATCH 1/2] fix(podman-etcd): use -ge 1 in
+ etcd_pod_container_exists()
+
+PR #2112 added -a to crictl ps to include exited containers, but
+did not update the count check from -eq 1 to -ge 1. During install,
+etcd container crashes create exited containers that inflate the
+count past 1, causing the guard to report 'pod not found' despite
+the pod running.
+
+Fixes: OCPBUGS-83742
+
+Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
+---
+ heartbeat/podman-etcd | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 4c9bbd4fa..52b2a1386 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -932,7 +932,7 @@ etcd_pod_container_exists() {
+ 	local count_matches
+ 	# Check whether the etcd pod exists on the same node (including stopped/exited containers)
+ 	count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps -a --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l)
+-	if [ "$count_matches" -eq 1 ]; then
+	if [ "$count_matches" -ge 1 ]; then
+ 		# etcd pod found
+ 		return 0
+ 	fi
+
+From 30d20f6b99ae9898bf801c0a5e690b81fc928faa Mon Sep 17 00:00:00 2001
+From: Pablo Fontanilla <pfontani@redhat.com>
+Date: Wed, 22 Apr 2026 12:57:46 +0200
+Subject: [PATCH 2/2] fix(podman-etcd): wait for etcd ports before starting
+ container
+
+During the static-pod to podman-etcd transition, the old etcd process
+may still hold ports 2379/2380 when the RA tries to start its container.
+This causes 'bind: address already in use' errors and eventual fallback
+to standalone mode.
+
+Add a 60-second wait loop (modeled on CEO's pod.gotpl.yaml port check)
+that blocks until the ports are free before calling podman run/start.
+
+Fixes: OCPBUGS-83742
+
+Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
+---
+ heartbeat/podman-etcd | 24 ++++++++++++++++++++++++
+ 1 file changed, 24 insertions(+)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 52b2a1386..9a960914b 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -940,6 +940,25 @@ etcd_pod_container_exists() {
+ 	return 1
+ }
+ 
+wait_for_etcd_ports_release() {
+	local timeout=${1:-60}
+	local elapsed=0
+	if [ -z "$(ss -Htan '( sport = 2379 or sport = 2380 )')" ]; then
+		return 0
+	fi
+	ocf_log info "waiting for etcd ports 2379/2380 to be released (timeout: ${timeout}s)"
+	while [ -n "$(ss -Htan '( sport = 2379 or sport = 2380 )')" ]; do
+		if [ "$elapsed" -ge "$timeout" ]; then
+			ocf_log err "etcd ports still in use after ${timeout}s"
+			return 1
+		fi
+		sleep 1
+		elapsed=$((elapsed + 1))
+	done
+	ocf_log info "etcd ports released after ${elapsed}s"
+	return 0
+}
+
+ attribute_node_cluster_id()
+ {
+ 	local action="$1"
+@@ -2267,6 +2286,11 @@ podman_start()
+ 		ocf_log notice "Pull image not required, ${OCF_RESKEY_image}"
+ 	fi
+ 
+	if ! wait_for_etcd_ports_release 60; then
+		ocf_exit_reason "etcd ports 2379/2380 still bound — cannot start container"
+		return $OCF_ERR_GENERIC
+	fi
+
+ 	if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then
+ 		ocf_log info "starting existing container $CONTAINER."
+ 		ocf_run podman start "$CONTAINER"
--- a/resource-agents.spec
+++ b/resource-agents.spec
@ -45,7 +45,7 @@
 Name:		resource-agents
 Summary:	Open Source HA Reusable Cluster Resource Scripts
 Version:	4.10.0
-Release:	116%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release:	117%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License:	GPLv2+ and LGPLv2+
 URL:		https://github.com/ClusterLabs/resource-agents
 Source0:	%{upstream_prefix}-%{upstream_version}.tar.gz
@ -209,6 +209,9 @@ Patch156:	RHEL-148198-2-db2-do-not-use-db2stop-to-avoid-divergence-in-the-log.pa
 Patch157:	RHEL-160863-1-Filesystem-do-not-return-CONFIGURED-during-monitor-action.patch
 Patch158:	RHEL-160863-2-Filesystem-always-return-OCF_ERR_GENERIC-when-another-device-is-mounted-on-mountpoint.patch
 Patch159:	RHEL-150850-pgsql-use-monitor_user-for-monitor-calls-and-use-pgpass-when-monitor_password-is-not-specified.patch
+Patch160:	RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch
+Patch161:	RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch
+Patch162:	RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch

 # bundled ha-cloud-support libs
 Patch500:	ha-cloud-support-aliyun.patch
@ -513,6 +516,9 @@ exit 1
 %patch -p1 -P 157
 %patch -p1 -P 158
 %patch -p1 -P 159
+%patch -p1 -P 160
+%patch -p1 -P 161
+%patch -p1 -P 162

 # bundled ha-cloud-support libs
 %patch -p1 -P 500
@ -847,6 +853,13 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm

 %changelog
+* Wed May 20 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-117
+- podman-etcd: fix port 2380 binding race
+- podman-etcd: fix machine deletion deadlock
+- podman-etcd: fix learner start deadlock
+
+  Resolves: RHEL-177850, RHEL-177840, RHEL-177845
+
 * Tue May 19 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-116
 - pgsql: use monitor_user for monitor-calls and use .pgpass when
  monitor_password is not specified