From afa76dbdac344f2c0a44119b317dfe681714ea26 Mon Sep 17 00:00:00 2001 From: Oyvind Albrigtsen Date: Wed, 20 May 2026 09:18:16 +0200 Subject: [PATCH] - podman-etcd: fix port 2380 binding race - podman-etcd: fix machine deletion deadlock - podman-etcd: fix learner start deadlock Resolves: RHEL-177850, RHEL-177840, RHEL-177845 --- ...n-etcd-fix-machine-deletion-deadlock.patch | 283 ++++++++++++++++++ ...dman-etcd-fix-learner-start-deadlock.patch | 130 ++++++++ ...dman-etcd-fix-port-2380-binding-race.patch | 96 ++++++ resource-agents.spec | 15 +- 4 files changed, 523 insertions(+), 1 deletion(-) create mode 100644 RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch create mode 100644 RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch create mode 100644 RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch diff --git a/RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch b/RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch new file mode 100644 index 0000000..b1a8fcb --- /dev/null +++ b/RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch @@ -0,0 +1,283 @@ +From 9ba19a62543de4d7365fc711b908a2759f811af9 Mon Sep 17 00:00:00 2001 +From: Vincenzo Mauro +Date: Tue, 5 May 2026 14:24:43 +0200 +Subject: [PATCH 1/4] fix: fixed etcd learner deadlock + +--- + heartbeat/podman-etcd | 79 +++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 73 insertions(+), 6 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 4c9bbd4fa..5bb3b2897 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -50,6 +50,7 @@ OCF_RESKEY_oom_default="-997" + OCF_RESKEY_config_location_default="/var/lib/etcd" + OCF_RESKEY_backup_location_default="/var/lib/etcd" + OCF_RESKEY_max_backup_snapshots_default="3" ++OCF_RESKEY_kubeconfig_default="/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig" + + : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}} + : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}} +@@ -63,6 +64,7 @@ OCF_RESKEY_max_backup_snapshots_default="3" + : ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}} + : ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}} + : ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}} ++: ${OCF_RESKEY_kubeconfig=${OCF_RESKEY_kubeconfig_default}} + + + ####################################################################### +@@ -288,6 +290,16 @@ Set max_backup_snapshots=0 to disable backups. + + + ++ ++ ++Path to a kubeconfig file for querying Machine API objects. Used to detect ++whether a peer node's Machine is being deleted, preventing the resource agent ++from re-adding it as an etcd learner during Machine deletion flows. ++ ++Kubeconfig for Machine API queries ++ ++ ++ + + + +@@ -1505,6 +1517,34 @@ detect_cluster_leadership_loss() + } + + ++# Checks whether the Machine object for a given node is being deleted. ++# Returns 0 (true) if the Machine has a deletionTimestamp set, 1 (false) otherwise. ++# Fails open: returns 1 on API errors to preserve current learner-addition behavior. ++is_peer_machine_deleting() ++{ ++ local node_name="$1" ++ local out ++ local deletion_ts ++ ++ out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines \ ++ -n openshift-machine-api -o json 2>&1) ++ if [ $? -ne 0 ]; then ++ ocf_log warn "could not query Machine API for node $node_name (fail-open): $out" ++ return 1 ++ fi ++ ++ # Select the Machine object for the given node and extract its deletionTimestamp if present ++ deletion_ts=$(printf "%s" "$out" | jq -r --arg name "$node_name" \ ++ '.items[] | select(.status.nodeRef.name == $name) | .metadata.deletionTimestamp // empty') ++ ++ if [ -n "$deletion_ts" ]; then ++ ocf_log info "Machine for node $node_name is being deleted (deletionTimestamp: $deletion_ts)" ++ return 0 ++ fi ++ ++ return 1 ++} ++ + # Manages etcd peer membership by detecting and handling missing or rejoining peers + # Adds missing peers as learners and reconciles member states when peers rejoin + # Args: $1 - member list JSON from etcdctl +@@ -1542,9 +1582,21 @@ manage_peer_membership() + # NOTE: voting members have a "name" field but no "isLearner" field, + # while learner members have "isLearner": true (boolean) but no "name" field, so we search for peerURLs matching. + peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID") ++ # During Machine deletion, CEO's MachineDeletionHooksController ++ # keeps the EtcdQuorumOperator preDrain hook as long as the peer IP appears in the etcd ++ # member list (learners included). If we add or keep a learner for a peer whose Machine ++ # is being deleted, CEO never clears the hook, MAO never drains, and the Machine hangs ++ # in Deleting. Two safeguards cover the race: ++ # A (below): peer is not yet in the member list — skip adding it as a learner if machine is deleting ++ # B (learner exists): a prior monitor cycle added the learner before the Machine ++ # deletion started — remove it so CEO can clear the hook. + if [ -z "$peer_member_id" ]; then + ocf_log info "$peer_member_name is not in the members list" +- add_member_as_learner "$peer_member_name" "$peer_member_ip" ++ if ! is_peer_machine_deleting "$peer_member_name"; then ++ add_member_as_learner "$peer_member_name" "$peer_member_ip" ++ else ++ ocf_log info "peer Machine is being deleted, skipping learner addition for $peer_member_name" ++ fi + set_standalone_node + return + fi +@@ -1552,10 +1604,21 @@ manage_peer_membership() + # Ensure learner_node attribute is always set when we have a learner member + local learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID") + local current_learner_node=$(attribute_learner_node get) +- if [ -n "$learner_member_id" ] && [ -z "$current_learner_node" ]; then +- ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating" +- attribute_learner_node update "$peer_member_name" +- return ++ ++ if [ -n "$learner_member_id" ]; then ++ # Clean up a learner added before the Machine deletion started ++ if is_peer_machine_deleting "$peer_member_name"; then ++ ocf_log info "peer Machine is being deleted, removing learner $peer_member_name from member list" ++ remove_etcd_member_by_ip "$peer_member_ip" ++ attribute_learner_node clear ++ set_standalone_node ++ return ++ fi ++ if [ -z "$current_learner_node" ]; then ++ ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating" ++ attribute_learner_node update "$peer_member_name" ++ return ++ fi + fi + + ocf_log debug "$peer_member_name is in the members list by IP: $peer_member_ip" +@@ -2312,7 +2375,11 @@ podman_start() + peer_node_name="$(get_peer_node_name)" + peer_node_ip="$(attribute_node_ip_peer)" + if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then +- add_member_as_learner "$peer_node_name" "$peer_node_ip" ++ if is_peer_machine_deleting "$peer_node_name"; then ++ ocf_log info "peer Machine is being deleted, skipping learner addition for $peer_node_name" ++ else ++ add_member_as_learner "$peer_node_name" "$peer_node_ip" ++ fi + set_standalone_node + else + ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})" + +From 56d9754311ab0595dea1c47e26eca85bbcfb049c Mon Sep 17 00:00:00 2001 +From: Vincenzo Mauro +Date: Wed, 6 May 2026 15:15:27 +0200 +Subject: [PATCH 2/4] fix: added support for both MAPI and CAPI + +--- + heartbeat/podman-etcd | 42 +++++++++++++++++++++++++++++++++++------- + 1 file changed, 35 insertions(+), 7 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 5bb3b2897..ad9804c1d 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -295,6 +295,7 @@ Set max_backup_snapshots=0 to disable backups. + Path to a kubeconfig file for querying Machine API objects. Used to detect + whether a peer node's Machine is being deleted, preventing the resource agent + from re-adding it as an etcd learner during Machine deletion flows. ++Supports both MAPI (machine.openshift.io) and CAPI (cluster.x-k8s.io) Machine resources. + + Kubeconfig for Machine API queries + +@@ -1525,15 +1526,39 @@ is_peer_machine_deleting() + local node_name="$1" + local out + local deletion_ts ++ local oc_rc ++ local item_count + +- out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines \ ++ # Try MAPI first (machine.openshift.io), fall back to CAPI (cluster.x-k8s.io) ++ out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \ + -n openshift-machine-api -o json 2>&1) +- if [ $? -ne 0 ]; then +- ocf_log warn "could not query Machine API for node $node_name (fail-open): $out" +- return 1 ++ oc_rc=$? ++ ++ if [ $oc_rc -eq 0 ]; then ++ item_count=$(printf "%s" "$out" | jq '.items | length' 2>/dev/null) ++ fi ++ ++ # MAPI CRD missing, namespace absent, or no Machine objects — try CAPI ++ if [ $oc_rc -ne 0 ] || [ "${item_count:-0}" -eq 0 ]; then ++ ocf_log info "MAPI returned no machines, trying CAPI for node $node_name" ++ out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \ ++ -n openshift-cluster-api -o json 2>&1) ++ if [ $? -ne 0 ]; then ++ ocf_log warn "could not query Machine API (MAPI or CAPI) for node $node_name (fail-open): $out" ++ return 1 ++ fi + fi + + # Select the Machine object for the given node and extract its deletionTimestamp if present ++ local machine_count ++ machine_count=$(printf "%s" "$out" | jq -r --arg name "$node_name" \ ++ '[.items[] | select(.status.nodeRef.name == $name)] | length' 2>/dev/null) ++ ++ if [ "$machine_count" = "0" ] || [ -z "$machine_count" ]; then ++ ocf_log warn "No Machine object found for node $node_name (fail-open): nodeRef may not be populated yet" ++ return 1 ++ fi ++ + deletion_ts=$(printf "%s" "$out" | jq -r --arg name "$node_name" \ + '.items[] | select(.status.nodeRef.name == $name) | .metadata.deletionTimestamp // empty') + +@@ -1609,9 +1634,12 @@ manage_peer_membership() + # Clean up a learner added before the Machine deletion started + if is_peer_machine_deleting "$peer_member_name"; then + ocf_log info "peer Machine is being deleted, removing learner $peer_member_name from member list" +- remove_etcd_member_by_ip "$peer_member_ip" +- attribute_learner_node clear +- set_standalone_node ++ if remove_etcd_member_by_ip "$peer_member_ip"; then ++ attribute_learner_node clear ++ set_standalone_node ++ else ++ ocf_log err "failed to remove learner for deleting Machine $peer_member_name; will retry next monitor cycle" ++ fi + return + fi + if [ -z "$current_learner_node" ]; then + +From beab70c7acd4f6ccc33c4dbcb3d72f94fc560812 Mon Sep 17 00:00:00 2001 +From: Vincenzo Mauro +Date: Tue, 12 May 2026 09:54:33 +0200 +Subject: [PATCH 3/4] reduced timeout to 5 and fixed MCAPI return code + +--- + heartbeat/podman-etcd | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index ad9804c1d..e022869a8 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1530,7 +1530,7 @@ is_peer_machine_deleting() + local item_count + + # Try MAPI first (machine.openshift.io), fall back to CAPI (cluster.x-k8s.io) +- out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \ ++ out=$(timeout 5 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \ + -n openshift-machine-api -o json 2>&1) + oc_rc=$? + +@@ -1607,6 +1607,7 @@ manage_peer_membership() + # NOTE: voting members have a "name" field but no "isLearner" field, + # while learner members have "isLearner": true (boolean) but no "name" field, so we search for peerURLs matching. + peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID") ++ + # During Machine deletion, CEO's MachineDeletionHooksController + # keeps the EtcdQuorumOperator preDrain hook as long as the peer IP appears in the etcd + # member list (learners included). If we add or keep a learner for a peer whose Machine + +From 2b06ed31bda015543a365e02e1bc5a47b3fa0439 Mon Sep 17 00:00:00 2001 +From: Vincenzo Mauro +Date: Tue, 12 May 2026 10:30:48 +0200 +Subject: [PATCH 4/4] fixed return code for CAPI + +--- + heartbeat/podman-etcd | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index e022869a8..2dbaf9991 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1541,9 +1541,10 @@ is_peer_machine_deleting() + # MAPI CRD missing, namespace absent, or no Machine objects — try CAPI + if [ $oc_rc -ne 0 ] || [ "${item_count:-0}" -eq 0 ]; then + ocf_log info "MAPI returned no machines, trying CAPI for node $node_name" +- out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \ ++ out=$(timeout 5 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \ + -n openshift-cluster-api -o json 2>&1) +- if [ $? -ne 0 ]; then ++ oc_rc=$? ++ if [ $oc_rc -ne 0 ]; then + ocf_log warn "could not query Machine API (MAPI or CAPI) for node $node_name (fail-open): $out" + return 1 + fi diff --git a/RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch b/RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch new file mode 100644 index 0000000..7b6cd6f --- /dev/null +++ b/RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch @@ -0,0 +1,130 @@ +From db041869f4b8612e44561f4ba4a46ed09d18e24e Mon Sep 17 00:00:00 2001 +From: Vincenzo Mauro +Date: Thu, 7 May 2026 18:14:04 +0200 +Subject: [PATCH 1/4] fixed OCPBUGS-83333 + +--- + heartbeat/podman-etcd | 22 +++++++++++++++++++++- + 1 file changed, 21 insertions(+), 1 deletion(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 4c9bbd4fa..d96c055e3 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -2519,7 +2519,27 @@ podman_validate() + + podman_notify() + { +- ocf_log info "notify: type=${OCF_RESKEY_CRM_meta_notify_type}, operation=${OCF_RESKEY_CRM_meta_notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }" ++ local notify_type="${OCF_RESKEY_CRM_meta_notify_type}" ++ local notify_operation="${OCF_RESKEY_CRM_meta_notify_operation}" ++ ++ ocf_log info "notify: type=${notify_type}, operation=${notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }" ++ ++ # Pacemaker serializes operations per resource per node. The start sequence ++ # with notifications is: ++ # pre-notify(start) on peer → start on joiner → post-notify(start) on peer ++ # Between pre-notify and post-notify, the peer's recurring monitor is ++ # queued — Pacemaker won't overlap operations for the same resource on the ++ # same node. The monitor path (check_peer → manage_peer_membership → ++ # add_member_as_learner) is the primary way a running peer adds the ++ # starting node to the etcd member list. Without handling it here, the ++ # starting node's podman_start poll loop (waiting for learner_node attribute) ++ # deadlocks: start waits for learner_node, monitor waits for start to finish. ++ # pre_notify_start fires before the start action, giving us the window to ++ # add the learner so the joiner's poll loop finds it immediately. ++ if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then ++ ocf_log info "pre_notify_start: running peer membership check for starting node" ++ check_peer ++ fi + } + + # TODO : + +From d1c817108276ee3019a20164fca0646985d99cde Mon Sep 17 00:00:00 2001 +From: Vincenzo Mauro +Date: Fri, 8 May 2026 10:38:12 +0200 +Subject: [PATCH 2/4] Updated deadlock comment + +--- + heartbeat/podman-etcd | 18 ++++++------------ + 1 file changed, 6 insertions(+), 12 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index d96c055e3..21a5e01e1 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -2524,18 +2524,12 @@ podman_notify() + + ocf_log info "notify: type=${notify_type}, operation=${notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }" + +- # Pacemaker serializes operations per resource per node. The start sequence +- # with notifications is: +- # pre-notify(start) on peer → start on joiner → post-notify(start) on peer +- # Between pre-notify and post-notify, the peer's recurring monitor is +- # queued — Pacemaker won't overlap operations for the same resource on the +- # same node. The monitor path (check_peer → manage_peer_membership → +- # add_member_as_learner) is the primary way a running peer adds the +- # starting node to the etcd member list. Without handling it here, the +- # starting node's podman_start poll loop (waiting for learner_node attribute) +- # deadlocks: start waits for learner_node, monitor waits for start to finish. +- # pre_notify_start fires before the start action, giving us the window to +- # add the learner so the joiner's poll loop finds it immediately. ++ # Pacemaker suppresses the peer's monitor during an ++ # active start/notify cycle. Since monitor is the only path that calls ++ # add_member_as_learner (outside force_new_cluster), the joiner's ++ # podman_start poll loop deadlocks; it waits for learner_node, but ++ # no monitor runs to set it. pre_notify_start fires before start, ++ # so we add the learner here to break the deadlock + if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then + ocf_log info "pre_notify_start: running peer membership check for starting node" + check_peer + +From 0fafab701878ce4b8c7413610e41dce3e69447aa Mon Sep 17 00:00:00 2001 +From: Vincenzo Mauro +Date: Tue, 12 May 2026 10:24:26 +0200 +Subject: [PATCH 3/4] improved logging + +--- + heartbeat/podman-etcd | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 21a5e01e1..41ce84ff1 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -2531,7 +2531,7 @@ podman_notify() + # no monitor runs to set it. pre_notify_start fires before start, + # so we add the learner here to break the deadlock + if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then +- ocf_log info "pre_notify_start: running peer membership check for starting node" ++ ocf_log info "pre_notify_start: running peer membership check for ${OCF_RESKEY_CRM_meta_notify_start_uname}" + check_peer + fi + } +@@ -2616,4 +2616,4 @@ validate-all) podman_validate;; + esac + rc=$? + ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +-exit $rc ++exc + +From bdce9048b4fc2c38255d36e73a1f73a7d72b7471 Mon Sep 17 00:00:00 2001 +From: Vincenzo Mauro +Date: Tue, 12 May 2026 10:46:49 +0200 +Subject: [PATCH 4/4] fixed typo + +--- + heartbeat/podman-etcd | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 41ce84ff1..740e2edb4 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -2616,4 +2616,4 @@ validate-all) podman_validate;; + esac + rc=$? + ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +-exc ++exit $rc diff --git a/RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch b/RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch new file mode 100644 index 0000000..e7b3d29 --- /dev/null +++ b/RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch @@ -0,0 +1,96 @@ +From 42dfef941ed80d6073022141fa1cad513e8dae4f Mon Sep 17 00:00:00 2001 +From: Pablo Fontanilla +Date: Wed, 22 Apr 2026 12:54:58 +0200 +Subject: [PATCH 1/2] fix(podman-etcd): use -ge 1 in + etcd_pod_container_exists() + +PR #2112 added -a to crictl ps to include exited containers, but +did not update the count check from -eq 1 to -ge 1. During install, +etcd container crashes create exited containers that inflate the +count past 1, causing the guard to report 'pod not found' despite +the pod running. + +Fixes: OCPBUGS-83742 + +Co-Authored-By: Claude Opus 4.6 +--- + heartbeat/podman-etcd | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 4c9bbd4fa..52b2a1386 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -932,7 +932,7 @@ etcd_pod_container_exists() { + local count_matches + # Check whether the etcd pod exists on the same node (including stopped/exited containers) + count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps -a --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l) +- if [ "$count_matches" -eq 1 ]; then ++ if [ "$count_matches" -ge 1 ]; then + # etcd pod found + return 0 + fi + +From 30d20f6b99ae9898bf801c0a5e690b81fc928faa Mon Sep 17 00:00:00 2001 +From: Pablo Fontanilla +Date: Wed, 22 Apr 2026 12:57:46 +0200 +Subject: [PATCH 2/2] fix(podman-etcd): wait for etcd ports before starting + container + +During the static-pod to podman-etcd transition, the old etcd process +may still hold ports 2379/2380 when the RA tries to start its container. +This causes 'bind: address already in use' errors and eventual fallback +to standalone mode. + +Add a 60-second wait loop (modeled on CEO's pod.gotpl.yaml port check) +that blocks until the ports are free before calling podman run/start. + +Fixes: OCPBUGS-83742 + +Co-Authored-By: Claude Opus 4.6 +--- + heartbeat/podman-etcd | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 52b2a1386..9a960914b 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -940,6 +940,25 @@ etcd_pod_container_exists() { + return 1 + } + ++wait_for_etcd_ports_release() { ++ local timeout=${1:-60} ++ local elapsed=0 ++ if [ -z "$(ss -Htan '( sport = 2379 or sport = 2380 )')" ]; then ++ return 0 ++ fi ++ ocf_log info "waiting for etcd ports 2379/2380 to be released (timeout: ${timeout}s)" ++ while [ -n "$(ss -Htan '( sport = 2379 or sport = 2380 )')" ]; do ++ if [ "$elapsed" -ge "$timeout" ]; then ++ ocf_log err "etcd ports still in use after ${timeout}s" ++ return 1 ++ fi ++ sleep 1 ++ elapsed=$((elapsed + 1)) ++ done ++ ocf_log info "etcd ports released after ${elapsed}s" ++ return 0 ++} ++ + attribute_node_cluster_id() + { + local action="$1" +@@ -2267,6 +2286,11 @@ podman_start() + ocf_log notice "Pull image not required, ${OCF_RESKEY_image}" + fi + ++ if ! wait_for_etcd_ports_release 60; then ++ ocf_exit_reason "etcd ports 2379/2380 still bound — cannot start container" ++ return $OCF_ERR_GENERIC ++ fi ++ + if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then + ocf_log info "starting existing container $CONTAINER." + ocf_run podman start "$CONTAINER" diff --git a/resource-agents.spec b/resource-agents.spec index cfb0d3b..6eebfb1 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.10.0 -Release: 116%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 117%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -209,6 +209,9 @@ Patch156: RHEL-148198-2-db2-do-not-use-db2stop-to-avoid-divergence-in-the-log.pa Patch157: RHEL-160863-1-Filesystem-do-not-return-CONFIGURED-during-monitor-action.patch Patch158: RHEL-160863-2-Filesystem-always-return-OCF_ERR_GENERIC-when-another-device-is-mounted-on-mountpoint.patch Patch159: RHEL-150850-pgsql-use-monitor_user-for-monitor-calls-and-use-pgpass-when-monitor_password-is-not-specified.patch +Patch160: RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch +Patch161: RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch +Patch162: RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aliyun.patch @@ -513,6 +516,9 @@ exit 1 %patch -p1 -P 157 %patch -p1 -P 158 %patch -p1 -P 159 +%patch -p1 -P 160 +%patch -p1 -P 161 +%patch -p1 -P 162 # bundled ha-cloud-support libs %patch -p1 -P 500 @@ -847,6 +853,13 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Wed May 20 2026 Oyvind Albrigtsen - 4.10.0-117 +- podman-etcd: fix port 2380 binding race +- podman-etcd: fix machine deletion deadlock +- podman-etcd: fix learner start deadlock + + Resolves: RHEL-177850, RHEL-177840, RHEL-177845 + * Tue May 19 2026 Oyvind Albrigtsen - 4.10.0-116 - pgsql: use monitor_user for monitor-calls and use .pgpass when monitor_password is not specified