- podman-etcd: prevent learner from starting before cluster is ready

Resolves: RHEL-131181
This commit is contained in:
Oyvind Albrigtsen 2025-11-27 10:21:59 +01:00
parent 0e9fc0284c
commit 963c977ce4
2 changed files with 115 additions and 1 deletions

View File

@@ -0,0 +1,107 @@
From 5cc74acd67c294da36b3f40e44842a82aa7d0957 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Wed, 26 Nov 2025 11:43:25 +0100
Subject: [PATCH] OCPEDGE-2213: podman-etcd: fix to prevent learner from
starting before cluster is ready (#2098)
* OCPEDGE-2213: fix(podman-etcd): prevent learner from starting before cluster is ready
Clear stale learner_node attribute during stop and on restart when no
active resources exist, ensuring learner always waits for peer
availability.
* fix: podman-etcd should cleanup standalone/learner attributes when promotion succeeds
* fix: remove misleading endpoint IP from log
---
heartbeat/podman-etcd | 33 +++++++++++++++++++--------------
1 file changed, 19 insertions(+), 14 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index b1f52cd5c..3e3f1d60e 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -880,7 +880,7 @@ add_member_as_learner()
local endpoint_url=$(ip_url $(attribute_node_ip get))
local peer_url=$(ip_url $member_ip)
- ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
+ ocf_log info "add $member_name ($member_ip) to the member list as learner"
out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
rc=$?
if [ $rc -ne 0 ]; then
@@ -1032,7 +1032,7 @@ promote_learner_member()
if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then
# promotion is expected to fail if the peer is not yet up-to-date
ocf_log info "could not promote member $learner_member_id_hex, error code: $?"
- return $OCF_SUCCESS
+ return $OCF_ERR_GENERIC
fi
ocf_log info "successfully promoted member '$learner_member_id_hex'"
return $OCF_SUCCESS
@@ -1063,19 +1063,19 @@ reconcile_member_state()
fi
if [ -n "$learner_member_id" ]; then
- promote_learner_member "$learner_member_id"
- return $?
- fi
-
- if [ -z "$learner_member_id" ]; then
- if ! clear_standalone_node; then
- ocf_log error "could not clear standalone_node attribute, error code: $?"
- return $OCF_ERR_GENERIC
- fi
- if ! attribute_learner_node clear; then
- ocf_log error "could not clear learner_node attribute, error code: $?"
+ if ! promote_learner_member "$learner_member_id"; then
return $OCF_ERR_GENERIC
fi
+ # promotion succeeded: continue to clear standalone_node and learner_node
+ fi
+
+ if ! clear_standalone_node; then
+ ocf_log error "could not clear standalone_node attribute, error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+ if ! attribute_learner_node clear; then
+ ocf_log error "could not clear learner_node attribute, error code: $?"
+ return $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
@@ -1258,6 +1258,7 @@ manage_peer_membership()
set_standalone_node
else
ocf_log debug "$name is in the members list by IP: $ip"
+ # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
reconcile_member_state "$member_list_json"
fi
done
@@ -1369,7 +1370,7 @@ container_health_check()
# Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
local time_since_heartbeat
time_since_heartbeat=$(get_time_since_last_heartbeat)
- ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+ ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)"
# Check if peer has set force_new_cluster for recovery
local fnc_holders
@@ -1795,6 +1796,9 @@ podman_start()
fi
;;
0)
+ # No active resources: clear any stale learner_node attribute from previous failed session
+ ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)"
+ attribute_learner_node clear
# count how many agents are starting now
local start_resources_count
start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
@@ -2090,6 +2094,7 @@ podman_stop()
ocf_log err "could not delete container health check state file"
fi
+ attribute_learner_node clear
attribute_node_revision update
attribute_node_cluster_id update

View File

@@ -45,7 +45,7 @@
Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.16.0
Release: 44%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
Release: 45%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPL-2.0-or-later AND LGPL-2.1-or-later
URL: https://github.com/ClusterLabs/resource-agents
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@@ -109,6 +109,7 @@ Patch56: RHEL-112443-2-nginx-restore-selinux-context-for-pid-file-during-validat
Patch57: RHEL-130576-1-podman-etcd-prevent-last-active-member-from-leaving.patch
Patch58: RHEL-130576-2-podman-etcd-remove-test-code.patch
Patch59: RHEL-126083-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch
Patch60: RHEL-131181-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch
# bundled ha-cloud-support libs
Patch500: ha-cloud-support-aliyun.patch
@@ -339,6 +340,7 @@ exit 1
%patch -p1 -P 57
%patch -p1 -P 58
%patch -p1 -P 59
%patch -p1 -P 60
# bundled ha-cloud-support libs
%patch -p1 -P 500
@@ -671,6 +673,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog
* Thu Nov 27 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-45
- podman-etcd: prevent learner from starting before cluster is ready
Resolves: RHEL-131181
* Mon Nov 24 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-44
- podman-etcd: add container crash detection with coordinated recovery