- podman-etcd: prevent learner from starting before cluster is ready
Resolves: RHEL-131181
This commit is contained in:
parent
0e9fc0284c
commit
963c977ce4
@ -0,0 +1,107 @@
|
||||
From 5cc74acd67c294da36b3f40e44842a82aa7d0957 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Wed, 26 Nov 2025 11:43:25 +0100
|
||||
Subject: [PATCH] OCPEDGE-2213: podman-etcd: fix to prevent learner from
|
||||
starting before cluster is ready (#2098)
|
||||
|
||||
* OCPEDGE-2213: fix(podman-etcd): prevent learner from starting before cluster is ready
|
||||
|
||||
Clear stale learner_node attribute during stop and on restart when no
|
||||
active resources exist, ensuring learner always waits for peer
|
||||
availability.
|
||||
|
||||
* fix: podman-etcd should cleanup standalone/learner attributes when promotion succeeds
|
||||
|
||||
* fix: remove misleading endpoint IP from log
|
||||
---
|
||||
heartbeat/podman-etcd | 33 +++++++++++++++++++--------------
|
||||
1 file changed, 19 insertions(+), 14 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index b1f52cd5c..3e3f1d60e 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -880,7 +880,7 @@ add_member_as_learner()
|
||||
local endpoint_url=$(ip_url $(attribute_node_ip get))
|
||||
local peer_url=$(ip_url $member_ip)
|
||||
|
||||
- ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
|
||||
+ ocf_log info "add $member_name ($member_ip) to the member list as learner"
|
||||
out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
@@ -1032,7 +1032,7 @@ promote_learner_member()
|
||||
if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then
|
||||
# promotion is expected to fail if the peer is not yet up-to-date
|
||||
ocf_log info "could not promote member $learner_member_id_hex, error code: $?"
|
||||
- return $OCF_SUCCESS
|
||||
+ return $OCF_ERR_GENERIC
|
||||
fi
|
||||
ocf_log info "successfully promoted member '$learner_member_id_hex'"
|
||||
return $OCF_SUCCESS
|
||||
@@ -1063,19 +1063,19 @@ reconcile_member_state()
|
||||
fi
|
||||
|
||||
if [ -n "$learner_member_id" ]; then
|
||||
- promote_learner_member "$learner_member_id"
|
||||
- return $?
|
||||
- fi
|
||||
-
|
||||
- if [ -z "$learner_member_id" ]; then
|
||||
- if ! clear_standalone_node; then
|
||||
- ocf_log error "could not clear standalone_node attribute, error code: $?"
|
||||
- return $OCF_ERR_GENERIC
|
||||
- fi
|
||||
- if ! attribute_learner_node clear; then
|
||||
- ocf_log error "could not clear learner_node attribute, error code: $?"
|
||||
+ if ! promote_learner_member "$learner_member_id"; then
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
+ # promotion succeeded: continue to clear standalone_node and learner_node
|
||||
+ fi
|
||||
+
|
||||
+ if ! clear_standalone_node; then
|
||||
+ ocf_log error "could not clear standalone_node attribute, error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ if ! attribute_learner_node clear; then
|
||||
+ ocf_log error "could not clear learner_node attribute, error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
return $OCF_SUCCESS
|
||||
@@ -1258,6 +1258,7 @@ manage_peer_membership()
|
||||
set_standalone_node
|
||||
else
|
||||
ocf_log debug "$name is in the members list by IP: $ip"
|
||||
+ # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
|
||||
reconcile_member_state "$member_list_json"
|
||||
fi
|
||||
done
|
||||
@@ -1369,7 +1370,7 @@ container_health_check()
|
||||
# Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
|
||||
local time_since_heartbeat
|
||||
time_since_heartbeat=$(get_time_since_last_heartbeat)
|
||||
- ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
|
||||
+ ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)"
|
||||
|
||||
# Check if peer has set force_new_cluster for recovery
|
||||
local fnc_holders
|
||||
@@ -1795,6 +1796,9 @@ podman_start()
|
||||
fi
|
||||
;;
|
||||
0)
|
||||
+ # No active resources: clear any stale learner_node attribute from previous failed session
|
||||
+ ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)"
|
||||
+ attribute_learner_node clear
|
||||
# count how many agents are starting now
|
||||
local start_resources_count
|
||||
start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
|
||||
@@ -2090,6 +2094,7 @@ podman_stop()
|
||||
ocf_log err "could not delete container health check state file"
|
||||
fi
|
||||
|
||||
+ attribute_learner_node clear
|
||||
attribute_node_revision update
|
||||
attribute_node_cluster_id update
|
||||
|
||||
@ -45,7 +45,7 @@
|
||||
Name: resource-agents
|
||||
Summary: Open Source HA Reusable Cluster Resource Scripts
|
||||
Version: 4.16.0
|
||||
Release: 44%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
Release: 45%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
License: GPL-2.0-or-later AND LGPL-2.1-or-later
|
||||
URL: https://github.com/ClusterLabs/resource-agents
|
||||
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
|
||||
@ -109,6 +109,7 @@ Patch56: RHEL-112443-2-nginx-restore-selinux-context-for-pid-file-during-validat
|
||||
Patch57: RHEL-130576-1-podman-etcd-prevent-last-active-member-from-leaving.patch
|
||||
Patch58: RHEL-130576-2-podman-etcd-remove-test-code.patch
|
||||
Patch59: RHEL-126083-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch
|
||||
Patch60: RHEL-131181-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
Patch500: ha-cloud-support-aliyun.patch
|
||||
@ -339,6 +340,7 @@ exit 1
|
||||
%patch -p1 -P 57
|
||||
%patch -p1 -P 58
|
||||
%patch -p1 -P 59
|
||||
%patch -p1 -P 60
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
%patch -p1 -P 500
|
||||
@ -671,6 +673,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
|
||||
%changelog
|
||||
* Thu Nov 27 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-45
|
||||
- podman-etcd: prevent learner from starting before cluster is ready
|
||||
|
||||
Resolves: RHEL-131181
|
||||
|
||||
* Mon Nov 24 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-44
|
||||
- podman-etcd: add container crash detection with coordinated recovery
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user