- podman-etcd: fix port 2380 binding race

- podman-etcd: fix machine deletion deadlock
- podman-etcd: fix learner start deadlock

  Resolves: RHEL-177850, RHEL-177840, RHEL-177845
This commit is contained in:
Oyvind Albrigtsen 2026-05-20 09:18:16 +02:00
parent 44aa4ccf08
commit afa76dbdac
4 changed files with 523 additions and 1 deletions

View File

@@ -0,0 +1,283 @@
From 9ba19a62543de4d7365fc711b908a2759f811af9 Mon Sep 17 00:00:00 2001
From: Vincenzo Mauro <vmauro@redhat.com>
Date: Tue, 5 May 2026 14:24:43 +0200
Subject: [PATCH 1/4] fix: fixed etcd learner deadlock
---
heartbeat/podman-etcd | 79 +++++++++++++++++++++++++++++++++++++++----
1 file changed, 73 insertions(+), 6 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 4c9bbd4fa..5bb3b2897 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -50,6 +50,7 @@ OCF_RESKEY_oom_default="-997"
OCF_RESKEY_config_location_default="/var/lib/etcd"
OCF_RESKEY_backup_location_default="/var/lib/etcd"
OCF_RESKEY_max_backup_snapshots_default="3"
+OCF_RESKEY_kubeconfig_default="/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig"
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
@@ -63,6 +64,7 @@ OCF_RESKEY_max_backup_snapshots_default="3"
: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}
+: ${OCF_RESKEY_kubeconfig=${OCF_RESKEY_kubeconfig_default}}
#######################################################################
@@ -288,6 +290,16 @@ Set max_backup_snapshots=0 to disable backups.
<content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
</parameter>
+<parameter name="kubeconfig" required="0" unique="0">
+<longdesc lang="en">
+Path to a kubeconfig file for querying Machine API objects. Used to detect
+whether a peer node's Machine is being deleted, preventing the resource agent
+from re-adding it as an etcd learner during Machine deletion flows.
+</longdesc>
+<shortdesc lang="en">Kubeconfig for Machine API queries</shortdesc>
+<content type="string" default="${OCF_RESKEY_kubeconfig_default}"/>
+</parameter>
+
</parameters>
<actions>
@@ -1505,6 +1517,34 @@ detect_cluster_leadership_loss()
}
+# Checks whether the Machine object for a given node is being deleted.
+# Returns 0 (true) if the Machine has a deletionTimestamp set, 1 (false) otherwise.
+# Fails open: returns 1 on API errors to preserve current learner-addition behavior.
+is_peer_machine_deleting()
+{
+ local node_name="$1"
+ local out
+ local deletion_ts
+
+ out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines \
+ -n openshift-machine-api -o json 2>&1)
+ if [ $? -ne 0 ]; then
+ ocf_log warn "could not query Machine API for node $node_name (fail-open): $out"
+ return 1
+ fi
+
+ # Select the Machine object for the given node and extract its deletionTimestamp if present
+ deletion_ts=$(printf "%s" "$out" | jq -r --arg name "$node_name" \
+ '.items[] | select(.status.nodeRef.name == $name) | .metadata.deletionTimestamp // empty')
+
+ if [ -n "$deletion_ts" ]; then
+ ocf_log info "Machine for node $node_name is being deleted (deletionTimestamp: $deletion_ts)"
+ return 0
+ fi
+
+ return 1
+}
+
# Manages etcd peer membership by detecting and handling missing or rejoining peers
# Adds missing peers as learners and reconciles member states when peers rejoin
# Args: $1 - member list JSON from etcdctl
@@ -1542,9 +1582,21 @@ manage_peer_membership()
# NOTE: voting members have a "name" field but no "isLearner" field,
# while learner members have "isLearner": true (boolean) but no "name" field, so we search for peerURLs matching.
peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
+ # During Machine deletion, CEO's MachineDeletionHooksController
+ # keeps the EtcdQuorumOperator preDrain hook as long as the peer IP appears in the etcd
+ # member list (learners included). If we add or keep a learner for a peer whose Machine
+ # is being deleted, CEO never clears the hook, MAO never drains, and the Machine hangs
+ # in Deleting. Two safeguards cover the race:
+ # A (below): peer is not yet in the member list — skip adding it as a learner if machine is deleting
+ # B (learner exists): a prior monitor cycle added the learner before the Machine
+ # deletion started — remove it so CEO can clear the hook.
if [ -z "$peer_member_id" ]; then
ocf_log info "$peer_member_name is not in the members list"
- add_member_as_learner "$peer_member_name" "$peer_member_ip"
+ if ! is_peer_machine_deleting "$peer_member_name"; then
+ add_member_as_learner "$peer_member_name" "$peer_member_ip"
+ else
+ ocf_log info "peer Machine is being deleted, skipping learner addition for $peer_member_name"
+ fi
set_standalone_node
return
fi
@@ -1552,10 +1604,21 @@ manage_peer_membership()
# Ensure learner_node attribute is always set when we have a learner member
local learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
local current_learner_node=$(attribute_learner_node get)
- if [ -n "$learner_member_id" ] && [ -z "$current_learner_node" ]; then
- ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating"
- attribute_learner_node update "$peer_member_name"
- return
+
+ if [ -n "$learner_member_id" ]; then
+ # Clean up a learner added before the Machine deletion started
+ if is_peer_machine_deleting "$peer_member_name"; then
+ ocf_log info "peer Machine is being deleted, removing learner $peer_member_name from member list"
+ remove_etcd_member_by_ip "$peer_member_ip"
+ attribute_learner_node clear
+ set_standalone_node
+ return
+ fi
+ if [ -z "$current_learner_node" ]; then
+ ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating"
+ attribute_learner_node update "$peer_member_name"
+ return
+ fi
fi
ocf_log debug "$peer_member_name is in the members list by IP: $peer_member_ip"
@@ -2312,7 +2375,11 @@ podman_start()
peer_node_name="$(get_peer_node_name)"
peer_node_ip="$(attribute_node_ip_peer)"
if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then
- add_member_as_learner "$peer_node_name" "$peer_node_ip"
+ if is_peer_machine_deleting "$peer_node_name"; then
+ ocf_log info "peer Machine is being deleted, skipping learner addition for $peer_node_name"
+ else
+ add_member_as_learner "$peer_node_name" "$peer_node_ip"
+ fi
set_standalone_node
else
ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})"
From 56d9754311ab0595dea1c47e26eca85bbcfb049c Mon Sep 17 00:00:00 2001
From: Vincenzo Mauro <vmauro@redhat.com>
Date: Wed, 6 May 2026 15:15:27 +0200
Subject: [PATCH 2/4] fix: added support for both MAPI and CAPI
---
heartbeat/podman-etcd | 42 +++++++++++++++++++++++++++++++++++-------
1 file changed, 35 insertions(+), 7 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 5bb3b2897..ad9804c1d 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -295,6 +295,7 @@ Set max_backup_snapshots=0 to disable backups.
Path to a kubeconfig file for querying Machine API objects. Used to detect
whether a peer node's Machine is being deleted, preventing the resource agent
from re-adding it as an etcd learner during Machine deletion flows.
+Supports both MAPI (machine.openshift.io) and CAPI (cluster.x-k8s.io) Machine resources.
</longdesc>
<shortdesc lang="en">Kubeconfig for Machine API queries</shortdesc>
<content type="string" default="${OCF_RESKEY_kubeconfig_default}"/>
@@ -1525,15 +1526,39 @@ is_peer_machine_deleting()
local node_name="$1"
local out
local deletion_ts
+ local oc_rc
+ local item_count
- out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines \
+ # Try MAPI first (machine.openshift.io), fall back to CAPI (cluster.x-k8s.io)
+ out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \
-n openshift-machine-api -o json 2>&1)
- if [ $? -ne 0 ]; then
- ocf_log warn "could not query Machine API for node $node_name (fail-open): $out"
- return 1
+ oc_rc=$?
+
+ if [ $oc_rc -eq 0 ]; then
+ item_count=$(printf "%s" "$out" | jq '.items | length' 2>/dev/null)
+ fi
+
+ # MAPI CRD missing, namespace absent, or no Machine objects — try CAPI
+ if [ $oc_rc -ne 0 ] || [ "${item_count:-0}" -eq 0 ]; then
+ ocf_log info "MAPI returned no machines, trying CAPI for node $node_name"
+ out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \
+ -n openshift-cluster-api -o json 2>&1)
+ if [ $? -ne 0 ]; then
+ ocf_log warn "could not query Machine API (MAPI or CAPI) for node $node_name (fail-open): $out"
+ return 1
+ fi
fi
# Select the Machine object for the given node and extract its deletionTimestamp if present
+ local machine_count
+ machine_count=$(printf "%s" "$out" | jq -r --arg name "$node_name" \
+ '[.items[] | select(.status.nodeRef.name == $name)] | length' 2>/dev/null)
+
+ if [ "$machine_count" = "0" ] || [ -z "$machine_count" ]; then
+ ocf_log warn "No Machine object found for node $node_name (fail-open): nodeRef may not be populated yet"
+ return 1
+ fi
+
deletion_ts=$(printf "%s" "$out" | jq -r --arg name "$node_name" \
'.items[] | select(.status.nodeRef.name == $name) | .metadata.deletionTimestamp // empty')
@@ -1609,9 +1634,12 @@ manage_peer_membership()
# Clean up a learner added before the Machine deletion started
if is_peer_machine_deleting "$peer_member_name"; then
ocf_log info "peer Machine is being deleted, removing learner $peer_member_name from member list"
- remove_etcd_member_by_ip "$peer_member_ip"
- attribute_learner_node clear
- set_standalone_node
+ if remove_etcd_member_by_ip "$peer_member_ip"; then
+ attribute_learner_node clear
+ set_standalone_node
+ else
+ ocf_log err "failed to remove learner for deleting Machine $peer_member_name; will retry next monitor cycle"
+ fi
return
fi
if [ -z "$current_learner_node" ]; then
From beab70c7acd4f6ccc33c4dbcb3d72f94fc560812 Mon Sep 17 00:00:00 2001
From: Vincenzo Mauro <vmauro@redhat.com>
Date: Tue, 12 May 2026 09:54:33 +0200
Subject: [PATCH 3/4] reduced timeout to 5 and fixed MCAPI return code
---
heartbeat/podman-etcd | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index ad9804c1d..e022869a8 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1530,7 +1530,7 @@ is_peer_machine_deleting()
local item_count
# Try MAPI first (machine.openshift.io), fall back to CAPI (cluster.x-k8s.io)
- out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \
+ out=$(timeout 5 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \
-n openshift-machine-api -o json 2>&1)
oc_rc=$?
@@ -1607,6 +1607,7 @@ manage_peer_membership()
# NOTE: voting members have a "name" field but no "isLearner" field,
# while learner members have "isLearner": true (boolean) but no "name" field, so we search for peerURLs matching.
peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
+
# During Machine deletion, CEO's MachineDeletionHooksController
# keeps the EtcdQuorumOperator preDrain hook as long as the peer IP appears in the etcd
# member list (learners included). If we add or keep a learner for a peer whose Machine
From 2b06ed31bda015543a365e02e1bc5a47b3fa0439 Mon Sep 17 00:00:00 2001
From: Vincenzo Mauro <vmauro@redhat.com>
Date: Tue, 12 May 2026 10:30:48 +0200
Subject: [PATCH 4/4] fixed return code for CAPI
---
heartbeat/podman-etcd | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index e022869a8..2dbaf9991 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1541,9 +1541,10 @@ is_peer_machine_deleting()
# MAPI CRD missing, namespace absent, or no Machine objects — try CAPI
if [ $oc_rc -ne 0 ] || [ "${item_count:-0}" -eq 0 ]; then
ocf_log info "MAPI returned no machines, trying CAPI for node $node_name"
- out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \
+ out=$(timeout 5 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \
-n openshift-cluster-api -o json 2>&1)
- if [ $? -ne 0 ]; then
+ oc_rc=$?
+ if [ $oc_rc -ne 0 ]; then
ocf_log warn "could not query Machine API (MAPI or CAPI) for node $node_name (fail-open): $out"
return 1
fi

View File

@@ -0,0 +1,130 @@
From db041869f4b8612e44561f4ba4a46ed09d18e24e Mon Sep 17 00:00:00 2001
From: Vincenzo Mauro <vmauro@redhat.com>
Date: Thu, 7 May 2026 18:14:04 +0200
Subject: [PATCH 1/4] fixed OCPBUGS-83333
---
heartbeat/podman-etcd | 22 +++++++++++++++++++++-
1 file changed, 21 insertions(+), 1 deletion(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 4c9bbd4fa..d96c055e3 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -2519,7 +2519,27 @@ podman_validate()
podman_notify()
{
- ocf_log info "notify: type=${OCF_RESKEY_CRM_meta_notify_type}, operation=${OCF_RESKEY_CRM_meta_notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
+ local notify_type="${OCF_RESKEY_CRM_meta_notify_type}"
+ local notify_operation="${OCF_RESKEY_CRM_meta_notify_operation}"
+
+ ocf_log info "notify: type=${notify_type}, operation=${notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
+
+ # Pacemaker serializes operations per resource per node. The start sequence
+ # with notifications is:
+ # pre-notify(start) on peer → start on joiner → post-notify(start) on peer
+ # Between pre-notify and post-notify, the peer's recurring monitor is
+ # queued — Pacemaker won't overlap operations for the same resource on the
+ # same node. The monitor path (check_peer → manage_peer_membership →
+ # add_member_as_learner) is the primary way a running peer adds the
+ # starting node to the etcd member list. Without handling it here, the
+ # starting node's podman_start poll loop (waiting for learner_node attribute)
+ # deadlocks: start waits for learner_node, monitor waits for start to finish.
+ # pre_notify_start fires before the start action, giving us the window to
+ # add the learner so the joiner's poll loop finds it immediately.
+ if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then
+ ocf_log info "pre_notify_start: running peer membership check for starting node"
+ check_peer
+ fi
}
# TODO :
From d1c817108276ee3019a20164fca0646985d99cde Mon Sep 17 00:00:00 2001
From: Vincenzo Mauro <vmauro@redhat.com>
Date: Fri, 8 May 2026 10:38:12 +0200
Subject: [PATCH 2/4] Updated deadlock comment
---
heartbeat/podman-etcd | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index d96c055e3..21a5e01e1 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -2524,18 +2524,12 @@ podman_notify()
ocf_log info "notify: type=${notify_type}, operation=${notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
- # Pacemaker serializes operations per resource per node. The start sequence
- # with notifications is:
- # pre-notify(start) on peer → start on joiner → post-notify(start) on peer
- # Between pre-notify and post-notify, the peer's recurring monitor is
- # queued — Pacemaker won't overlap operations for the same resource on the
- # same node. The monitor path (check_peer → manage_peer_membership →
- # add_member_as_learner) is the primary way a running peer adds the
- # starting node to the etcd member list. Without handling it here, the
- # starting node's podman_start poll loop (waiting for learner_node attribute)
- # deadlocks: start waits for learner_node, monitor waits for start to finish.
- # pre_notify_start fires before the start action, giving us the window to
- # add the learner so the joiner's poll loop finds it immediately.
+ # Pacemaker suppresses the peer's monitor during an
+ # active start/notify cycle. Since monitor is the only path that calls
+ # add_member_as_learner (outside force_new_cluster), the joiner's
+ # podman_start poll loop deadlocks; it waits for learner_node, but
+ # no monitor runs to set it. pre_notify_start fires before start,
+ # so we add the learner here to break the deadlock
if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then
ocf_log info "pre_notify_start: running peer membership check for starting node"
check_peer
From 0fafab701878ce4b8c7413610e41dce3e69447aa Mon Sep 17 00:00:00 2001
From: Vincenzo Mauro <vmauro@redhat.com>
Date: Tue, 12 May 2026 10:24:26 +0200
Subject: [PATCH 3/4] improved logging
---
heartbeat/podman-etcd | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 21a5e01e1..41ce84ff1 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -2531,7 +2531,7 @@ podman_notify()
# no monitor runs to set it. pre_notify_start fires before start,
# so we add the learner here to break the deadlock
if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then
- ocf_log info "pre_notify_start: running peer membership check for starting node"
+ ocf_log info "pre_notify_start: running peer membership check for ${OCF_RESKEY_CRM_meta_notify_start_uname}"
check_peer
fi
}
@@ -2616,4 +2616,4 @@ validate-all) podman_validate;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
-exit $rc
+exc
From bdce9048b4fc2c38255d36e73a1f73a7d72b7471 Mon Sep 17 00:00:00 2001
From: Vincenzo Mauro <vmauro@redhat.com>
Date: Tue, 12 May 2026 10:46:49 +0200
Subject: [PATCH 4/4] fixed typo
---
heartbeat/podman-etcd | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 41ce84ff1..740e2edb4 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -2616,4 +2616,4 @@ validate-all) podman_validate;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
-exc
+exit $rc

View File

@@ -0,0 +1,96 @@
From 42dfef941ed80d6073022141fa1cad513e8dae4f Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Wed, 22 Apr 2026 12:54:58 +0200
Subject: [PATCH 1/2] fix(podman-etcd): use -ge 1 in
etcd_pod_container_exists()
PR #2112 added -a to crictl ps to include exited containers, but
did not update the count check from -eq 1 to -ge 1. During install,
etcd container crashes create exited containers that inflate the
count past 1, causing the guard to report 'pod not found' despite
the pod running.
Fixes: OCPBUGS-83742
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
heartbeat/podman-etcd | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 4c9bbd4fa..52b2a1386 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -932,7 +932,7 @@ etcd_pod_container_exists() {
local count_matches
# Check whether the etcd pod exists on the same node (including stopped/exited containers)
count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps -a --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l)
- if [ "$count_matches" -eq 1 ]; then
+ if [ "$count_matches" -ge 1 ]; then
# etcd pod found
return 0
fi
From 30d20f6b99ae9898bf801c0a5e690b81fc928faa Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Wed, 22 Apr 2026 12:57:46 +0200
Subject: [PATCH 2/2] fix(podman-etcd): wait for etcd ports before starting
container
During the static-pod to podman-etcd transition, the old etcd process
may still hold ports 2379/2380 when the RA tries to start its container.
This causes 'bind: address already in use' errors and eventual fallback
to standalone mode.
Add a 60-second wait loop (modeled on CEO's pod.gotpl.yaml port check)
that blocks until the ports are free before calling podman run/start.
Fixes: OCPBUGS-83742
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
heartbeat/podman-etcd | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 52b2a1386..9a960914b 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -940,6 +940,25 @@ etcd_pod_container_exists() {
return 1
}
+wait_for_etcd_ports_release() {
+ local timeout=${1:-60}
+ local elapsed=0
+ if [ -z "$(ss -Htan '( sport = 2379 or sport = 2380 )')" ]; then
+ return 0
+ fi
+ ocf_log info "waiting for etcd ports 2379/2380 to be released (timeout: ${timeout}s)"
+ while [ -n "$(ss -Htan '( sport = 2379 or sport = 2380 )')" ]; do
+ if [ "$elapsed" -ge "$timeout" ]; then
+ ocf_log err "etcd ports still in use after ${timeout}s"
+ return 1
+ fi
+ sleep 1
+ elapsed=$((elapsed + 1))
+ done
+ ocf_log info "etcd ports released after ${elapsed}s"
+ return 0
+}
+
attribute_node_cluster_id()
{
local action="$1"
@@ -2267,6 +2286,11 @@ podman_start()
ocf_log notice "Pull image not required, ${OCF_RESKEY_image}"
fi
+ if ! wait_for_etcd_ports_release 60; then
+ ocf_exit_reason "etcd ports 2379/2380 still bound — cannot start container"
+ return $OCF_ERR_GENERIC
+ fi
+
if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then
ocf_log info "starting existing container $CONTAINER."
ocf_run podman start "$CONTAINER"

View File

@@ -45,7 +45,7 @@
Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.10.0
-Release: 116%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release: 117%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPLv2+ and LGPLv2+
URL: https://github.com/ClusterLabs/resource-agents
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@@ -209,6 +209,9 @@ Patch156: RHEL-148198-2-db2-do-not-use-db2stop-to-avoid-divergence-in-the-log.pa
Patch157: RHEL-160863-1-Filesystem-do-not-return-CONFIGURED-during-monitor-action.patch
Patch158: RHEL-160863-2-Filesystem-always-return-OCF_ERR_GENERIC-when-another-device-is-mounted-on-mountpoint.patch
Patch159: RHEL-150850-pgsql-use-monitor_user-for-monitor-calls-and-use-pgpass-when-monitor_password-is-not-specified.patch
+Patch160: RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch
+Patch161: RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch
+Patch162: RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch
# bundled ha-cloud-support libs
Patch500: ha-cloud-support-aliyun.patch
@@ -513,6 +516,9 @@ exit 1
%patch -p1 -P 157
%patch -p1 -P 158
%patch -p1 -P 159
+%patch -p1 -P 160
+%patch -p1 -P 161
+%patch -p1 -P 162
# bundled ha-cloud-support libs
%patch -p1 -P 500
@@ -847,6 +853,13 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog
+* Wed May 20 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-117
+- podman-etcd: fix port 2380 binding race
+- podman-etcd: fix machine deletion deadlock
+- podman-etcd: fix learner start deadlock
+  Resolves: RHEL-177850, RHEL-177840, RHEL-177845
* Tue May 19 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-116
- pgsql: use monitor_user for monitor-calls and use .pgpass when
monitor_password is not specified