- podman-etcd: fix port 2380 binding race
- podman-etcd: fix machine deletion deadlock
- podman-etcd: fix learner start deadlock

Resolves: RHEL-177850, RHEL-177840, RHEL-177845
This commit is contained in:
parent
44aa4ccf08
commit
afa76dbdac
283
RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch
Normal file
283
RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch
Normal file
@ -0,0 +1,283 @@
|
||||
From 9ba19a62543de4d7365fc711b908a2759f811af9 Mon Sep 17 00:00:00 2001
|
||||
From: Vincenzo Mauro <vmauro@redhat.com>
|
||||
Date: Tue, 5 May 2026 14:24:43 +0200
|
||||
Subject: [PATCH 1/4] fix: fixed etcd learner deadlock
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 79 +++++++++++++++++++++++++++++++++++++++----
|
||||
1 file changed, 73 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 4c9bbd4fa..5bb3b2897 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -50,6 +50,7 @@ OCF_RESKEY_oom_default="-997"
|
||||
OCF_RESKEY_config_location_default="/var/lib/etcd"
|
||||
OCF_RESKEY_backup_location_default="/var/lib/etcd"
|
||||
OCF_RESKEY_max_backup_snapshots_default="3"
|
||||
+OCF_RESKEY_kubeconfig_default="/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig"
|
||||
|
||||
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
|
||||
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
|
||||
@@ -63,6 +64,7 @@ OCF_RESKEY_max_backup_snapshots_default="3"
|
||||
: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
|
||||
: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
|
||||
: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}
|
||||
+: ${OCF_RESKEY_kubeconfig=${OCF_RESKEY_kubeconfig_default}}
|
||||
|
||||
|
||||
#######################################################################
|
||||
@@ -288,6 +290,16 @@ Set max_backup_snapshots=0 to disable backups.
|
||||
<content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
|
||||
</parameter>
|
||||
|
||||
+<parameter name="kubeconfig" required="0" unique="0">
|
||||
+<longdesc lang="en">
|
||||
+Path to a kubeconfig file for querying Machine API objects. Used to detect
|
||||
+whether a peer node's Machine is being deleted, preventing the resource agent
|
||||
+from re-adding it as an etcd learner during Machine deletion flows.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Kubeconfig for Machine API queries</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_kubeconfig_default}"/>
|
||||
+</parameter>
|
||||
+
|
||||
</parameters>
|
||||
|
||||
<actions>
|
||||
@@ -1505,6 +1517,34 @@ detect_cluster_leadership_loss()
|
||||
}
|
||||
|
||||
|
||||
+# Checks whether the Machine object for a given node is being deleted.
|
||||
+# Returns 0 (true) if the Machine has a deletionTimestamp set, 1 (false) otherwise.
|
||||
+# Fails open: returns 1 on API errors to preserve current learner-addition behavior.
|
||||
+is_peer_machine_deleting()
|
||||
+{
|
||||
+ local node_name="$1"
|
||||
+ local out
|
||||
+ local deletion_ts
|
||||
+
|
||||
+ out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines \
|
||||
+ -n openshift-machine-api -o json 2>&1)
|
||||
+ if [ $? -ne 0 ]; then
|
||||
+ ocf_log warn "could not query Machine API for node $node_name (fail-open): $out"
|
||||
+ return 1
|
||||
+ fi
|
||||
+
|
||||
+ # Select the Machine object for the given node and extract its deletionTimestamp if present
|
||||
+ deletion_ts=$(printf "%s" "$out" | jq -r --arg name "$node_name" \
|
||||
+ '.items[] | select(.status.nodeRef.name == $name) | .metadata.deletionTimestamp // empty')
|
||||
+
|
||||
+ if [ -n "$deletion_ts" ]; then
|
||||
+ ocf_log info "Machine for node $node_name is being deleted (deletionTimestamp: $deletion_ts)"
|
||||
+ return 0
|
||||
+ fi
|
||||
+
|
||||
+ return 1
|
||||
+}
|
||||
+
|
||||
# Manages etcd peer membership by detecting and handling missing or rejoining peers
|
||||
# Adds missing peers as learners and reconciles member states when peers rejoin
|
||||
# Args: $1 - member list JSON from etcdctl
|
||||
@@ -1542,9 +1582,21 @@ manage_peer_membership()
|
||||
# NOTE: voting members have a "name" field but no "isLearner" field,
|
||||
# while learner members have "isLearner": true (boolean) but no "name" field, so we search for peerURLs matching.
|
||||
peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
|
||||
+ # During Machine deletion, CEO's MachineDeletionHooksController
|
||||
+ # keeps the EtcdQuorumOperator preDrain hook as long as the peer IP appears in the etcd
|
||||
+ # member list (learners included). If we add or keep a learner for a peer whose Machine
|
||||
+ # is being deleted, CEO never clears the hook, MAO never drains, and the Machine hangs
|
||||
+ # in Deleting. Two safeguards cover the race:
|
||||
+ # A (below): peer is not yet in the member list — skip adding it as a learner if machine is deleting
|
||||
+ # B (learner exists): a prior monitor cycle added the learner before the Machine
|
||||
+ # deletion started — remove it so CEO can clear the hook.
|
||||
if [ -z "$peer_member_id" ]; then
|
||||
ocf_log info "$peer_member_name is not in the members list"
|
||||
- add_member_as_learner "$peer_member_name" "$peer_member_ip"
|
||||
+ if ! is_peer_machine_deleting "$peer_member_name"; then
|
||||
+ add_member_as_learner "$peer_member_name" "$peer_member_ip"
|
||||
+ else
|
||||
+ ocf_log info "peer Machine is being deleted, skipping learner addition for $peer_member_name"
|
||||
+ fi
|
||||
set_standalone_node
|
||||
return
|
||||
fi
|
||||
@@ -1552,10 +1604,21 @@ manage_peer_membership()
|
||||
# Ensure learner_node attribute is always set when we have a learner member
|
||||
local learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
|
||||
local current_learner_node=$(attribute_learner_node get)
|
||||
- if [ -n "$learner_member_id" ] && [ -z "$current_learner_node" ]; then
|
||||
- ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating"
|
||||
- attribute_learner_node update "$peer_member_name"
|
||||
- return
|
||||
+
|
||||
+ if [ -n "$learner_member_id" ]; then
|
||||
+ # Clean up a learner added before the Machine deletion started
|
||||
+ if is_peer_machine_deleting "$peer_member_name"; then
|
||||
+ ocf_log info "peer Machine is being deleted, removing learner $peer_member_name from member list"
|
||||
+ remove_etcd_member_by_ip "$peer_member_ip"
|
||||
+ attribute_learner_node clear
|
||||
+ set_standalone_node
|
||||
+ return
|
||||
+ fi
|
||||
+ if [ -z "$current_learner_node" ]; then
|
||||
+ ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating"
|
||||
+ attribute_learner_node update "$peer_member_name"
|
||||
+ return
|
||||
+ fi
|
||||
fi
|
||||
|
||||
ocf_log debug "$peer_member_name is in the members list by IP: $peer_member_ip"
|
||||
@@ -2312,7 +2375,11 @@ podman_start()
|
||||
peer_node_name="$(get_peer_node_name)"
|
||||
peer_node_ip="$(attribute_node_ip_peer)"
|
||||
if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then
|
||||
- add_member_as_learner "$peer_node_name" "$peer_node_ip"
|
||||
+ if is_peer_machine_deleting "$peer_node_name"; then
|
||||
+ ocf_log info "peer Machine is being deleted, skipping learner addition for $peer_node_name"
|
||||
+ else
|
||||
+ add_member_as_learner "$peer_node_name" "$peer_node_ip"
|
||||
+ fi
|
||||
set_standalone_node
|
||||
else
|
||||
ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})"
|
||||
|
||||
From 56d9754311ab0595dea1c47e26eca85bbcfb049c Mon Sep 17 00:00:00 2001
|
||||
From: Vincenzo Mauro <vmauro@redhat.com>
|
||||
Date: Wed, 6 May 2026 15:15:27 +0200
|
||||
Subject: [PATCH 2/4] fix: added support for both MAPI and CAPI
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 42 +++++++++++++++++++++++++++++++++++-------
|
||||
1 file changed, 35 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 5bb3b2897..ad9804c1d 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -295,6 +295,7 @@ Set max_backup_snapshots=0 to disable backups.
|
||||
Path to a kubeconfig file for querying Machine API objects. Used to detect
|
||||
whether a peer node's Machine is being deleted, preventing the resource agent
|
||||
from re-adding it as an etcd learner during Machine deletion flows.
|
||||
+Supports both MAPI (machine.openshift.io) and CAPI (cluster.x-k8s.io) Machine resources.
|
||||
</longdesc>
|
||||
<shortdesc lang="en">Kubeconfig for Machine API queries</shortdesc>
|
||||
<content type="string" default="${OCF_RESKEY_kubeconfig_default}"/>
|
||||
@@ -1525,15 +1526,39 @@ is_peer_machine_deleting()
|
||||
local node_name="$1"
|
||||
local out
|
||||
local deletion_ts
|
||||
+ local oc_rc
|
||||
+ local item_count
|
||||
|
||||
- out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines \
|
||||
+ # Try MAPI first (machine.openshift.io), fall back to CAPI (cluster.x-k8s.io)
|
||||
+ out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \
|
||||
-n openshift-machine-api -o json 2>&1)
|
||||
- if [ $? -ne 0 ]; then
|
||||
- ocf_log warn "could not query Machine API for node $node_name (fail-open): $out"
|
||||
- return 1
|
||||
+ oc_rc=$?
|
||||
+
|
||||
+ if [ $oc_rc -eq 0 ]; then
|
||||
+ item_count=$(printf "%s" "$out" | jq '.items | length' 2>/dev/null)
|
||||
+ fi
|
||||
+
|
||||
+ # MAPI CRD missing, namespace absent, or no Machine objects — try CAPI
|
||||
+ if [ $oc_rc -ne 0 ] || [ "${item_count:-0}" -eq 0 ]; then
|
||||
+ ocf_log info "MAPI returned no machines, trying CAPI for node $node_name"
|
||||
+ out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \
|
||||
+ -n openshift-cluster-api -o json 2>&1)
|
||||
+ if [ $? -ne 0 ]; then
|
||||
+ ocf_log warn "could not query Machine API (MAPI or CAPI) for node $node_name (fail-open): $out"
|
||||
+ return 1
|
||||
+ fi
|
||||
fi
|
||||
|
||||
# Select the Machine object for the given node and extract its deletionTimestamp if present
|
||||
+ local machine_count
|
||||
+ machine_count=$(printf "%s" "$out" | jq -r --arg name "$node_name" \
|
||||
+ '[.items[] | select(.status.nodeRef.name == $name)] | length' 2>/dev/null)
|
||||
+
|
||||
+ if [ "$machine_count" = "0" ] || [ -z "$machine_count" ]; then
|
||||
+ ocf_log warn "No Machine object found for node $node_name (fail-open): nodeRef may not be populated yet"
|
||||
+ return 1
|
||||
+ fi
|
||||
+
|
||||
deletion_ts=$(printf "%s" "$out" | jq -r --arg name "$node_name" \
|
||||
'.items[] | select(.status.nodeRef.name == $name) | .metadata.deletionTimestamp // empty')
|
||||
|
||||
@@ -1609,9 +1634,12 @@ manage_peer_membership()
|
||||
# Clean up a learner added before the Machine deletion started
|
||||
if is_peer_machine_deleting "$peer_member_name"; then
|
||||
ocf_log info "peer Machine is being deleted, removing learner $peer_member_name from member list"
|
||||
- remove_etcd_member_by_ip "$peer_member_ip"
|
||||
- attribute_learner_node clear
|
||||
- set_standalone_node
|
||||
+ if remove_etcd_member_by_ip "$peer_member_ip"; then
|
||||
+ attribute_learner_node clear
|
||||
+ set_standalone_node
|
||||
+ else
|
||||
+ ocf_log err "failed to remove learner for deleting Machine $peer_member_name; will retry next monitor cycle"
|
||||
+ fi
|
||||
return
|
||||
fi
|
||||
if [ -z "$current_learner_node" ]; then
|
||||
|
||||
From beab70c7acd4f6ccc33c4dbcb3d72f94fc560812 Mon Sep 17 00:00:00 2001
|
||||
From: Vincenzo Mauro <vmauro@redhat.com>
|
||||
Date: Tue, 12 May 2026 09:54:33 +0200
|
||||
Subject: [PATCH 3/4] reduced timeout to 5 and fixed MCAPI return code
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index ad9804c1d..e022869a8 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1530,7 +1530,7 @@ is_peer_machine_deleting()
|
||||
local item_count
|
||||
|
||||
# Try MAPI first (machine.openshift.io), fall back to CAPI (cluster.x-k8s.io)
|
||||
- out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \
|
||||
+ out=$(timeout 5 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.machine.openshift.io \
|
||||
-n openshift-machine-api -o json 2>&1)
|
||||
oc_rc=$?
|
||||
|
||||
@@ -1607,6 +1607,7 @@ manage_peer_membership()
|
||||
# NOTE: voting members have a "name" field but no "isLearner" field,
|
||||
# while learner members have "isLearner": true (boolean) but no "name" field, so we search for peerURLs matching.
|
||||
peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
|
||||
+
|
||||
# During Machine deletion, CEO's MachineDeletionHooksController
|
||||
# keeps the EtcdQuorumOperator preDrain hook as long as the peer IP appears in the etcd
|
||||
# member list (learners included). If we add or keep a learner for a peer whose Machine
|
||||
|
||||
From 2b06ed31bda015543a365e02e1bc5a47b3fa0439 Mon Sep 17 00:00:00 2001
|
||||
From: Vincenzo Mauro <vmauro@redhat.com>
|
||||
Date: Tue, 12 May 2026 10:30:48 +0200
|
||||
Subject: [PATCH 4/4] fixed return code for CAPI
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index e022869a8..2dbaf9991 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1541,9 +1541,10 @@ is_peer_machine_deleting()
|
||||
# MAPI CRD missing, namespace absent, or no Machine objects — try CAPI
|
||||
if [ $oc_rc -ne 0 ] || [ "${item_count:-0}" -eq 0 ]; then
|
||||
ocf_log info "MAPI returned no machines, trying CAPI for node $node_name"
|
||||
- out=$(timeout 10 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \
|
||||
+ out=$(timeout 5 oc --kubeconfig="$OCF_RESKEY_kubeconfig" get machines.cluster.x-k8s.io \
|
||||
-n openshift-cluster-api -o json 2>&1)
|
||||
- if [ $? -ne 0 ]; then
|
||||
+ oc_rc=$?
|
||||
+ if [ $oc_rc -ne 0 ]; then
|
||||
ocf_log warn "could not query Machine API (MAPI or CAPI) for node $node_name (fail-open): $out"
|
||||
return 1
|
||||
fi
|
||||
130
RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch
Normal file
130
RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch
Normal file
@ -0,0 +1,130 @@
|
||||
From db041869f4b8612e44561f4ba4a46ed09d18e24e Mon Sep 17 00:00:00 2001
|
||||
From: Vincenzo Mauro <vmauro@redhat.com>
|
||||
Date: Thu, 7 May 2026 18:14:04 +0200
|
||||
Subject: [PATCH 1/4] fixed OCPBUGS-83333
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 22 +++++++++++++++++++++-
|
||||
1 file changed, 21 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 4c9bbd4fa..d96c055e3 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -2519,7 +2519,27 @@ podman_validate()
|
||||
|
||||
podman_notify()
|
||||
{
|
||||
- ocf_log info "notify: type=${OCF_RESKEY_CRM_meta_notify_type}, operation=${OCF_RESKEY_CRM_meta_notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
|
||||
+ local notify_type="${OCF_RESKEY_CRM_meta_notify_type}"
|
||||
+ local notify_operation="${OCF_RESKEY_CRM_meta_notify_operation}"
|
||||
+
|
||||
+ ocf_log info "notify: type=${notify_type}, operation=${notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
|
||||
+
|
||||
+ # Pacemaker serializes operations per resource per node. The start sequence
|
||||
+ # with notifications is:
|
||||
+ # pre-notify(start) on peer → start on joiner → post-notify(start) on peer
|
||||
+ # Between pre-notify and post-notify, the peer's recurring monitor is
|
||||
+ # queued — Pacemaker won't overlap operations for the same resource on the
|
||||
+ # same node. The monitor path (check_peer → manage_peer_membership →
|
||||
+ # add_member_as_learner) is the primary way a running peer adds the
|
||||
+ # starting node to the etcd member list. Without handling it here, the
|
||||
+ # starting node's podman_start poll loop (waiting for learner_node attribute)
|
||||
+ # deadlocks: start waits for learner_node, monitor waits for start to finish.
|
||||
+ # pre_notify_start fires before the start action, giving us the window to
|
||||
+ # add the learner so the joiner's poll loop finds it immediately.
|
||||
+ if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then
|
||||
+ ocf_log info "pre_notify_start: running peer membership check for starting node"
|
||||
+ check_peer
|
||||
+ fi
|
||||
}
|
||||
|
||||
# TODO :
|
||||
|
||||
From d1c817108276ee3019a20164fca0646985d99cde Mon Sep 17 00:00:00 2001
|
||||
From: Vincenzo Mauro <vmauro@redhat.com>
|
||||
Date: Fri, 8 May 2026 10:38:12 +0200
|
||||
Subject: [PATCH 2/4] Updated deadlock comment
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 18 ++++++------------
|
||||
1 file changed, 6 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index d96c055e3..21a5e01e1 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -2524,18 +2524,12 @@ podman_notify()
|
||||
|
||||
ocf_log info "notify: type=${notify_type}, operation=${notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start =[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
|
||||
|
||||
- # Pacemaker serializes operations per resource per node. The start sequence
|
||||
- # with notifications is:
|
||||
- # pre-notify(start) on peer → start on joiner → post-notify(start) on peer
|
||||
- # Between pre-notify and post-notify, the peer's recurring monitor is
|
||||
- # queued — Pacemaker won't overlap operations for the same resource on the
|
||||
- # same node. The monitor path (check_peer → manage_peer_membership →
|
||||
- # add_member_as_learner) is the primary way a running peer adds the
|
||||
- # starting node to the etcd member list. Without handling it here, the
|
||||
- # starting node's podman_start poll loop (waiting for learner_node attribute)
|
||||
- # deadlocks: start waits for learner_node, monitor waits for start to finish.
|
||||
- # pre_notify_start fires before the start action, giving us the window to
|
||||
- # add the learner so the joiner's poll loop finds it immediately.
|
||||
+ # Pacemaker suppresses the peer's monitor during an
|
||||
+ # active start/notify cycle. Since monitor is the only path that calls
|
||||
+ # add_member_as_learner (outside force_new_cluster), the joiner's
|
||||
+ # podman_start poll loop deadlocks; it waits for learner_node, but
|
||||
+ # no monitor runs to set it. pre_notify_start fires before start,
|
||||
+ # so we add the learner here to break the deadlock
|
||||
if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then
|
||||
ocf_log info "pre_notify_start: running peer membership check for starting node"
|
||||
check_peer
|
||||
|
||||
From 0fafab701878ce4b8c7413610e41dce3e69447aa Mon Sep 17 00:00:00 2001
|
||||
From: Vincenzo Mauro <vmauro@redhat.com>
|
||||
Date: Tue, 12 May 2026 10:24:26 +0200
|
||||
Subject: [PATCH 3/4] improved logging
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 21a5e01e1..41ce84ff1 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -2531,7 +2531,7 @@ podman_notify()
|
||||
# no monitor runs to set it. pre_notify_start fires before start,
|
||||
# so we add the learner here to break the deadlock
|
||||
if [ "$notify_type" = "pre" ] && [ "$notify_operation" = "start" ]; then
|
||||
- ocf_log info "pre_notify_start: running peer membership check for starting node"
|
||||
+ ocf_log info "pre_notify_start: running peer membership check for ${OCF_RESKEY_CRM_meta_notify_start_uname}"
|
||||
check_peer
|
||||
fi
|
||||
}
|
||||
@@ -2616,4 +2616,4 @@ validate-all) podman_validate;;
|
||||
esac
|
||||
rc=$?
|
||||
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
|
||||
-exit $rc
|
||||
+exc
|
||||
|
||||
From bdce9048b4fc2c38255d36e73a1f73a7d72b7471 Mon Sep 17 00:00:00 2001
|
||||
From: Vincenzo Mauro <vmauro@redhat.com>
|
||||
Date: Tue, 12 May 2026 10:46:49 +0200
|
||||
Subject: [PATCH 4/4] fixed typo
|
||||
|
||||
---
|
||||
heartbeat/podman-etcd | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 41ce84ff1..740e2edb4 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -2616,4 +2616,4 @@ validate-all) podman_validate;;
|
||||
esac
|
||||
rc=$?
|
||||
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
|
||||
-exc
|
||||
+exit $rc
|
||||
96
RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch
Normal file
96
RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch
Normal file
@ -0,0 +1,96 @@
|
||||
From 42dfef941ed80d6073022141fa1cad513e8dae4f Mon Sep 17 00:00:00 2001
|
||||
From: Pablo Fontanilla <pfontani@redhat.com>
|
||||
Date: Wed, 22 Apr 2026 12:54:58 +0200
|
||||
Subject: [PATCH 1/2] fix(podman-etcd): use -ge 1 in
|
||||
etcd_pod_container_exists()
|
||||
|
||||
PR #2112 added -a to crictl ps to include exited containers, but
|
||||
did not update the count check from -eq 1 to -ge 1. During install,
|
||||
etcd container crashes create exited containers that inflate the
|
||||
count past 1, causing the guard to report 'pod not found' despite
|
||||
the pod running.
|
||||
|
||||
Fixes: OCPBUGS-83742
|
||||
|
||||
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
|
||||
---
|
||||
heartbeat/podman-etcd | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 4c9bbd4fa..52b2a1386 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -932,7 +932,7 @@ etcd_pod_container_exists() {
|
||||
local count_matches
|
||||
# Check whether the etcd pod exists on the same node (including stopped/exited containers)
|
||||
count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps -a --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l)
|
||||
- if [ "$count_matches" -eq 1 ]; then
|
||||
+ if [ "$count_matches" -ge 1 ]; then
|
||||
# etcd pod found
|
||||
return 0
|
||||
fi
|
||||
|
||||
From 30d20f6b99ae9898bf801c0a5e690b81fc928faa Mon Sep 17 00:00:00 2001
|
||||
From: Pablo Fontanilla <pfontani@redhat.com>
|
||||
Date: Wed, 22 Apr 2026 12:57:46 +0200
|
||||
Subject: [PATCH 2/2] fix(podman-etcd): wait for etcd ports before starting
|
||||
container
|
||||
|
||||
During the static-pod to podman-etcd transition, the old etcd process
|
||||
may still hold ports 2379/2380 when the RA tries to start its container.
|
||||
This causes 'bind: address already in use' errors and eventual fallback
|
||||
to standalone mode.
|
||||
|
||||
Add a 60-second wait loop (modeled on CEO's pod.gotpl.yaml port check)
|
||||
that blocks until the ports are free before calling podman run/start.
|
||||
|
||||
Fixes: OCPBUGS-83742
|
||||
|
||||
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
|
||||
---
|
||||
heartbeat/podman-etcd | 24 ++++++++++++++++++++++++
|
||||
1 file changed, 24 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 52b2a1386..9a960914b 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -940,6 +940,25 @@ etcd_pod_container_exists() {
|
||||
return 1
|
||||
}
|
||||
|
||||
+wait_for_etcd_ports_release() {
|
||||
+ local timeout=${1:-60}
|
||||
+ local elapsed=0
|
||||
+ if [ -z "$(ss -Htan '( sport = 2379 or sport = 2380 )')" ]; then
|
||||
+ return 0
|
||||
+ fi
|
||||
+ ocf_log info "waiting for etcd ports 2379/2380 to be released (timeout: ${timeout}s)"
|
||||
+ while [ -n "$(ss -Htan '( sport = 2379 or sport = 2380 )')" ]; do
|
||||
+ if [ "$elapsed" -ge "$timeout" ]; then
|
||||
+ ocf_log err "etcd ports still in use after ${timeout}s"
|
||||
+ return 1
|
||||
+ fi
|
||||
+ sleep 1
|
||||
+ elapsed=$((elapsed + 1))
|
||||
+ done
|
||||
+ ocf_log info "etcd ports released after ${elapsed}s"
|
||||
+ return 0
|
||||
+}
|
||||
+
|
||||
attribute_node_cluster_id()
|
||||
{
|
||||
local action="$1"
|
||||
@@ -2267,6 +2286,11 @@ podman_start()
|
||||
ocf_log notice "Pull image not required, ${OCF_RESKEY_image}"
|
||||
fi
|
||||
|
||||
+ if ! wait_for_etcd_ports_release 60; then
|
||||
+ ocf_exit_reason "etcd ports 2379/2380 still bound — cannot start container"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then
|
||||
ocf_log info "starting existing container $CONTAINER."
|
||||
ocf_run podman start "$CONTAINER"
|
||||
@ -45,7 +45,7 @@
|
||||
Name: resource-agents
|
||||
Summary: Open Source HA Reusable Cluster Resource Scripts
|
||||
Version: 4.10.0
|
||||
Release: 116%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
Release: 117%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
License: GPLv2+ and LGPLv2+
|
||||
URL: https://github.com/ClusterLabs/resource-agents
|
||||
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
|
||||
@ -209,6 +209,9 @@ Patch156: RHEL-148198-2-db2-do-not-use-db2stop-to-avoid-divergence-in-the-log.pa
|
||||
Patch157: RHEL-160863-1-Filesystem-do-not-return-CONFIGURED-during-monitor-action.patch
|
||||
Patch158: RHEL-160863-2-Filesystem-always-return-OCF_ERR_GENERIC-when-another-device-is-mounted-on-mountpoint.patch
|
||||
Patch159: RHEL-150850-pgsql-use-monitor_user-for-monitor-calls-and-use-pgpass-when-monitor_password-is-not-specified.patch
|
||||
Patch160: RHEL-177850-podman-etcd-fix-port-2380-binding-race.patch
|
||||
Patch161: RHEL-177840-podman-etcd-fix-machine-deletion-deadlock.patch
|
||||
Patch162: RHEL-177845-podman-etcd-fix-learner-start-deadlock.patch
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
Patch500: ha-cloud-support-aliyun.patch
|
||||
@ -513,6 +516,9 @@ exit 1
|
||||
%patch -p1 -P 157
|
||||
%patch -p1 -P 158
|
||||
%patch -p1 -P 159
|
||||
%patch -p1 -P 160
|
||||
%patch -p1 -P 161
|
||||
%patch -p1 -P 162
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
%patch -p1 -P 500
|
||||
@ -847,6 +853,13 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
|
||||
%changelog
|
||||
* Wed May 20 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-117
|
||||
- podman-etcd: fix port 2380 binding race
|
||||
- podman-etcd: fix machine deletion deadlock
|
||||
- podman-etcd: fix learner start deadlock
|
||||
|
||||
Resolves: RHEL-177850, RHEL-177840, RHEL-177845
|
||||
|
||||
* Tue May 19 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-116
|
||||
- pgsql: use monitor_user for monitor-calls and use .pgpass when
|
||||
monitor_password is not specified
|
||||
|
||||
Loading…
Reference in New Issue
Block a user