- podman etcd: ignore learners when considering which node has higher revision

- podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery
- db2: set reintegration when promotion is successful

  Resolves: RHEL-156709, RHEL-157146, RHEL-153158
This commit is contained in:
Oyvind Albrigtsen 2026-03-19 11:33:15 +01:00
parent 283977cc84
commit eea4aa580e
4 changed files with 230 additions and 1 deletions

View File

@ -0,0 +1,46 @@
From 66885ea0227e847b571608015b150d391a6234d7 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 23 Feb 2026 13:35:58 +0100
Subject: [PATCH] db2: set reintegration when promotion is successful
---
heartbeat/db2 | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/heartbeat/db2 b/heartbeat/db2
index 82f2f82c3..4420b9989 100755
--- a/heartbeat/db2
+++ b/heartbeat/db2
@@ -955,6 +955,16 @@ db2_promote() {
PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer)
# nothing to do, only update pacemaker's view
echo MASTER > $STATE_FILE
+
+ if [ -n "$remote_host" ]; then
+ for db in $dblist
+ do
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
+ ocf_log debug "Promotion succeeded, setting $reint_attr = 1"
+ crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
+ done
+ fi
+
return $OCF_SUCCESS
;;
@@ -981,6 +991,15 @@ db2_promote() {
# update pacemaker's view
echo MASTER > $STATE_FILE
+ if [ -n "$remote_host" ]; then
+ for db in $dblist
+ do
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
+ ocf_log debug "Promotion succeeded, setting $reint_attr = 1"
+ crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
+ done
+ fi
+
return $OCF_SUCCESS
fi

View File

@ -0,0 +1,40 @@
From 5890f47bc61703130cd27d767118367f03bca95f Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Tue, 10 Mar 2026 17:26:04 +0100
Subject: [PATCH] podman-etcd: Preserve standalone voter identity during
restart
If the standalone voter restarts before the peer is added to the member
list, the learner_node attribute may not be set yet. Without checking
is_standalone, the voter incorrectly joins as a learner, causing both
nodes to become learners and creating an unrecoverable deadlock.
Check is_standalone to ensure the voter restarts in the same role it
had before the restart.
---
heartbeat/podman-etcd | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 539ad33b2..2f8aa122f 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -2002,9 +2002,16 @@ podman_start()
ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
case "$active_resources_count" in
1)
- if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
- ocf_log info "peer active but in learner mode: start normally"
+ # is_standalone may return true here due to a restart: in the previous run,
+ # this agent was the sole voter and the peer had not yet joined the member
+ # list (learner_node unset). Since standalone_node was not cleared before
+ # the restart, start normally to recover the previous cluster state.
+ if is_standalone; then
+ ocf_log info "peer active but not a voter: start normally to recover"
+ elif [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
+ ocf_log info "peer active but in learner mode: start normally to recover"
else
+ # If (A) we must join the peer's new cluster
ocf_log info "peer is active standalone: joining as learner"
JOIN_AS_LEARNER=true
fi

View File

@ -0,0 +1,130 @@
From 83d16b59a354a20bf7679af45a9acfa9f344959a Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Wed, 18 Mar 2026 11:30:31 +0100
Subject: [PATCH] OCPBUGS-78482: podman-etcd: fix "Peer URLs already exists" in
add_member_as_learner (#2136)
* podman-etcd: handle "Peer URLs already exists" in add_member_as_learner
When etcdctl member add fails with "Peer URLs already exists", the stale
member entry is removed and the add is retried.
Without this fix, add_member_as_learner returns early without setting
the learner_node attribute, causing the peer node to time out waiting
for it.
---
heartbeat/podman-etcd | 85 +++++++++++++++++++++++++++++++++++++------
1 file changed, 73 insertions(+), 12 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 2f8aa122f..860aca817 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1072,6 +1072,49 @@ attribute_node_member_id()
esac
}
+# remove an etcd member identified by its IP from the member list
+remove_etcd_member_by_ip()
+{ # $1: member IP; returns OCF_SUCCESS once etcdctl removed the member, OCF_ERR_GENERIC on any failure
+ local rc
+ local out
+ local member_ip
+ local endpoint_url
+ local member_list_json
+ local stale_member_id
+ local stale_member_id_hex
+
+ member_ip=$1 # IP used below to find the member through its peerURLs
+ member_list_json=$(get_member_list_json) # member list as JSON, parsed with jq below
+ rc=$?
+ if [ "$rc" -ne 0 ] ; then
+ ocf_log err "could not remove etcd member. Failed to get member list, error code: $rc"
+ return $OCF_ERR_GENERIC
+ fi
+
+ stale_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$member_ip\")) | any).ID") # NOTE(review): $member_ip is interpolated into a jq regex, so the match is unanchored ('.' matches any character, and 10.0.0.1 also matches 10.0.0.10); more than one hit yields several IDs, one per line - confirm member IPs cannot collide
+ if [ -z "$stale_member_id" ]; then # no member advertises a peer URL matching the IP
+ ocf_log err "could not remove etcd member. Failed to find member ID"
+ return $OCF_ERR_GENERIC
+ fi
+
+ # JSON member_id is decimal, while etcdctl command needs the hex version
+ if ! stale_member_id_hex=$(decimal_to_hex "$stale_member_id"); then
+ ocf_log err "could not remove etcd member. Failed to convert member_id '$stale_member_id' into hex format"
+ return $OCF_ERR_GENERIC
+ fi
+
+ endpoint_url=$(ip_url $(attribute_node_ip get)) # presumably this node's own address; client port 2379 is appended on the next line - TODO confirm
+ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member remove "$stale_member_id_hex" 2>&1) # stderr is merged into $out so a failure can be logged in full
+ rc=$?
+ if [ "$rc" -ne 0 ] ; then
+ ocf_log err "could not remove etcd member. etcdctl member remove command failed, error code: $rc, output: $out"
+ return $OCF_ERR_GENERIC
+ fi
+
+ ocf_log info "$out" # on success, record what etcdctl printed
+ return $OCF_SUCCESS
+}
+
add_member_as_learner()
{
local rc
@@ -1079,18 +1122,36 @@ add_member_as_learner()
local member_ip=$2
local endpoint_url=$(ip_url $(attribute_node_ip get))
local peer_url=$(ip_url $member_ip)
+ local i
+ local max_retries
+ local out
- ocf_log info "add $member_name ($member_ip) to the member list as learner"
- out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner 2>&1)
- rc=$?
- if [ $rc -ne 0 ]; then
- ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
- return $rc
- fi
- ocf_log info "$out"
+ i=0
+ max_retries=3
+ while [ "$i" -lt "$max_retries" ]; do
+ i=$((i + 1))
+ ocf_log info "adding $member_name ($member_ip) to the member list as learner (attempt $i of $max_retries)"
+ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" \
+ member add "$member_name" \
+ --peer-urls="$peer_url:2380" \
+ --learner 2>&1)
+ rc=$?
+ if [ "$rc" -eq 0 ]; then
+ ocf_log info "$out"
+ attribute_learner_node update "$member_name"
+ return $OCF_SUCCESS
+ fi
+ if echo "$out" | grep -q "Peer URLs already exists"; then
+ # etcd may still hold a stale member entry for this peer URL: remove it so the next attempt can succeed
+ ocf_log warn "could not add member: Peer URLs already exists"
+ remove_etcd_member_by_ip "$member_ip"
+ else
+ ocf_log warn "could not add member: $out"
+ fi
+ done
- attribute_learner_node update "$member_name"
- return $?
+ ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
+ return $OCF_ERR_GENERIC
}
set_force_new_cluster()
@@ -1454,8 +1515,8 @@ manage_peer_membership()
# "https://<node IP>:2379"
# ]
# }
- # NOTE that the "name" field is present in voting members only, while "isLearner"
- # field in learner members only and the value is always true (not a string) in that case.
+ # NOTE: voting members have a "name" field but no "isLearner" field,
+ # while learner members have "isLearner": true (boolean) but no "name" field, so we look the peer up by matching its peerURLs instead.
peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
if [ -z "$peer_member_id" ]; then
ocf_log info "$peer_member_name is not in the members list"

View File

@ -45,7 +45,7 @@
Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.10.0
Release: 109%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
Release: 110%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPLv2+ and LGPLv2+
URL: https://github.com/ClusterLabs/resource-agents
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@ -200,6 +200,9 @@ Patch147: RHEL-143527-powervs-move-ip-powervs-subnet-fix-error-logging.patch
Patch148: RHEL-145628-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch
Patch149: RHEL-150700-podman-etcd-set-attributes-if-they-fail-during-force-new-cluster.patch
Patch150: RHEL-116151-4-portblock-check-inverse-action.patch
Patch151: RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch
Patch152: RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch
Patch153: RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch
# bundled ha-cloud-support libs
Patch500: ha-cloud-support-aliyun.patch
@ -495,6 +498,9 @@ exit 1
%patch -p1 -P 148
%patch -p1 -P 149
%patch -p1 -P 150
%patch -p1 -P 151
%patch -p1 -P 152
%patch -p1 -P 153
# bundled ha-cloud-support libs
%patch -p1 -P 500
@ -827,6 +833,13 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog
* Thu Mar 19 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-110
- podman etcd: ignore learners when considering which node has higher revision
- podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery
- db2: set reintegration when promotion is successful
Resolves: RHEL-156709, RHEL-157146, RHEL-153158
* Fri Feb 27 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-109
- portblock: check inverse action state file for non-promotable
resources to avoid issues when doing e.g. block followed by unblock