From 963c977ce4fcd9f1e46dd5d46e92489abb8f1d8a Mon Sep 17 00:00:00 2001 From: Oyvind Albrigtsen Date: Thu, 27 Nov 2025 10:21:59 +0100 Subject: [PATCH] - podman-etcd: prevent learner from starting before cluster is ready Resolves: RHEL-131181 --- ...rom-starting-before-cluster-is-ready.patch | 107 ++++++++++++++++++ resource-agents.spec | 9 +- 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 RHEL-131181-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch diff --git a/RHEL-131181-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch b/RHEL-131181-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch new file mode 100644 index 0000000..191f430 --- /dev/null +++ b/RHEL-131181-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch @@ -0,0 +1,107 @@ +From 5cc74acd67c294da36b3f40e44842a82aa7d0957 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Wed, 26 Nov 2025 11:43:25 +0100 +Subject: [PATCH] OCPEDGE-2213: podman-etcd: fix to prevent learner from + starting before cluster is ready (#2098) + +* OCPEDGE-2213: fix(podman-etcd): prevent learner from starting before cluster is ready + +Clear stale learner_node attribute during stop and on restart when no +active resources exist, ensuring learner always waits for peer +availability. 
+ +* fix: podman-etcd should cleanup standalone/learner attributes when promotion succeeds + +* fix: remove misleading endpoint IP from log +--- + heartbeat/podman-etcd | 33 +++++++++++++++++++-------------- + 1 file changed, 19 insertions(+), 14 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index b1f52cd5c..3e3f1d60e 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -880,7 +880,7 @@ add_member_as_learner() + local endpoint_url=$(ip_url $(attribute_node_ip get)) + local peer_url=$(ip_url $member_ip) + +- ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner" ++ ocf_log info "add $member_name ($member_ip) to the member list as learner" + out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner) + rc=$? + if [ $rc -ne 0 ]; then +@@ -1032,7 +1032,7 @@ promote_learner_member() + if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then + # promotion is expected to fail if the peer is not yet up-to-date + ocf_log info "could not promote member $learner_member_id_hex, error code: $?" +- return $OCF_SUCCESS ++ return $OCF_ERR_GENERIC + fi + ocf_log info "successfully promoted member '$learner_member_id_hex'" + return $OCF_SUCCESS +@@ -1063,19 +1063,19 @@ reconcile_member_state() + fi + + if [ -n "$learner_member_id" ]; then +- promote_learner_member "$learner_member_id" +- return $? +- fi +- +- if [ -z "$learner_member_id" ]; then +- if ! clear_standalone_node; then +- ocf_log error "could not clear standalone_node attribute, error code: $?" +- return $OCF_ERR_GENERIC +- fi +- if ! attribute_learner_node clear; then +- ocf_log error "could not clear learner_node attribute, error code: $?" ++ if ! promote_learner_member "$learner_member_id"; then + return $OCF_ERR_GENERIC + fi ++ # promotion succeeded: continue to clear standalone_node and learner_node ++ fi ++ ++ if ! 
clear_standalone_node; then ++ ocf_log error "could not clear standalone_node attribute, error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ if ! attribute_learner_node clear; then ++ ocf_log error "could not clear learner_node attribute, error code: $?" ++ return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +@@ -1258,6 +1258,7 @@ manage_peer_membership() + set_standalone_node + else + ocf_log debug "$name is in the members list by IP: $ip" ++ # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss. + reconcile_member_state "$member_list_json" + fi + done +@@ -1369,7 +1370,7 @@ container_health_check() + # Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle + local time_since_heartbeat + time_since_heartbeat=$(get_time_since_last_heartbeat) +- ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)" ++ ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)" + + # Check if peer has set force_new_cluster for recovery + local fnc_holders +@@ -1795,6 +1796,9 @@ podman_start() + fi + ;; + 0) ++ # No active resources: clear any stale learner_node attribute from previous failed session ++ ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)" ++ attribute_learner_node clear + # count how many agents are starting now + local start_resources_count + start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w) +@@ -2090,6 +2094,7 @@ podman_stop() + ocf_log err "could not delete container health check state file" + fi + ++ attribute_learner_node clear + attribute_node_revision update + attribute_node_cluster_id update + diff --git a/resource-agents.spec b/resource-agents.spec index de8a21f..5e52dae 100644 --- a/resource-agents.spec 
+++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.16.0 -Release: 44%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 45%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPL-2.0-or-later AND LGPL-2.1-or-later URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -109,6 +109,7 @@ Patch56: RHEL-112443-2-nginx-restore-selinux-context-for-pid-file-during-validat Patch57: RHEL-130576-1-podman-etcd-prevent-last-active-member-from-leaving.patch Patch58: RHEL-130576-2-podman-etcd-remove-test-code.patch Patch59: RHEL-126083-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch +Patch60: RHEL-131181-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aliyun.patch @@ -339,6 +340,7 @@ exit 1 %patch -p1 -P 57 %patch -p1 -P 58 %patch -p1 -P 59 +%patch -p1 -P 60 # bundled ha-cloud-support libs %patch -p1 -P 500 @@ -671,6 +673,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Thu Nov 27 2025 Oyvind Albrigtsen - 4.16.0-45 +- podman-etcd: prevent learner from starting before cluster is ready + + Resolves: RHEL-131181 + * Mon Nov 24 2025 Oyvind Albrigtsen - 4.16.0-44 - podman-etcd: add container crash detection with coordinated recovery