- podman etcd: ignore learners when considering which node has higher revision

- podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery
- db2: set reintegration when promotion is successful

  Resolves: RHEL-156709, RHEL-157146, RHEL-153158
2026-03-19 11:33:15 +01:00 · 2026-03-19 11:33:15 +01:00 · eea4aa580e
commit eea4aa580e
parent 283977cc84
4 changed files with 230 additions and 1 deletions
--- a/RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch
+++ b/RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch
@ -0,0 +1,46 @@
+From 66885ea0227e847b571608015b150d391a6234d7 Mon Sep 17 00:00:00 2001
+From: Oyvind Albrigtsen <oalbrigt@redhat.com>
+Date: Mon, 23 Feb 2026 13:35:58 +0100
+Subject: [PATCH] db2: set reintegration when promotion is successful
+
+---
+ heartbeat/db2 | 19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+diff --git a/heartbeat/db2 b/heartbeat/db2
+index 82f2f82c3..4420b9989 100755
+--- a/heartbeat/db2
+++ b/heartbeat/db2
+@@ -955,6 +955,16 @@ db2_promote() {
+             PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer)
+             # nothing to do, only update pacemaker's view
+             echo MASTER > $STATE_FILE
+
+            if [ -n "$remote_host" ]; then
+                for db in $dblist
+                do
+                    reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
+                    ocf_log debug "Promotion succeeded, setting $reint_attr = 1"
+                    crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
+                done
+            fi
+
+             return $OCF_SUCCESS
+             ;;
+ 
+@@ -981,6 +991,15 @@ db2_promote() {
+             # update pacemaker's view
+             echo MASTER > $STATE_FILE
+ 
+            if [ -n "$remote_host" ]; then
+                for db in $dblist
+                do
+                    reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
+                    ocf_log debug "Promotion succeeded, setting $reint_attr = 1"
+                    crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
+                done
+            fi
+
+             return $OCF_SUCCESS
+         fi
+ 
--- a/RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch
+++ b/RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch
@ -0,0 +1,40 @@
+From 5890f47bc61703130cd27d767118367f03bca95f Mon Sep 17 00:00:00 2001
+From: Carlo Lobrano <c.lobrano@gmail.com>
+Date: Tue, 10 Mar 2026 17:26:04 +0100
+Subject: [PATCH] podman-etcd: Preserve standalone voter identity during
+ restart
+
+If the standalone voter restarts before the peer is added to the member
+list, the learner_node attribute may not be set yet. Without checking
+is_standalone, the voter incorrectly joins as a learner, causing both
+nodes to become learners and creating an unrecoverable deadlock.
+
+Check is_standalone to ensure the voter restarts in the same role it
+had before the restart.
+---
+ heartbeat/podman-etcd | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 539ad33b2..2f8aa122f 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -2002,9 +2002,16 @@ podman_start()
+ 			ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
+ 			case "$active_resources_count" in
+ 			1)
+-				if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
+-					ocf_log info "peer active but in learner mode: start normally"
+				# is_standalone may return true here due to a restart: in the previous run,
+				# this agent was the sole voter and the peer had not yet joined the member
+				# list (learner_node unset). Since standalone_node was not cleared before
+				# the restart, start normally to recover the previous cluster state.
+				if is_standalone; then
+					ocf_log info "peer active but not a voter: start normally to recover"
+				elif [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
+					ocf_log info "peer active but in learner mode: start normally to recover"
+ 				else
+					# If (A) we must join the peer's new cluster
+ 					ocf_log info "peer is active standalone: joining as learner"
+ 					JOIN_AS_LEARNER=true
+ 				fi
--- a/RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch
+++ b/RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch
@ -0,0 +1,130 @@
+From 83d16b59a354a20bf7679af45a9acfa9f344959a Mon Sep 17 00:00:00 2001
+From: Carlo Lobrano <c.lobrano@gmail.com>
+Date: Wed, 18 Mar 2026 11:30:31 +0100
+Subject: [PATCH] OCPBUGS-78482: podman-etcd: fix "Peer URLs already exists" in
+ add_member_as_learner (#2136)
+
+* podman-etcd: handle "Peer URLs already exists" in add_member_as_learner
+
+When etcdctl member add fails with "Peer URLs already exists", the stale
+member entry is removed and the add is retried.
+
+Without this fix, add_member_as_learner returns early without setting
+the learner_node attribute, causing the peer node to time out waiting
+for it.
+---
+ heartbeat/podman-etcd | 85 +++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 73 insertions(+), 12 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 2f8aa122f..860aca817 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -1072,6 +1072,49 @@ attribute_node_member_id()
+ 	esac
+ }
+ 
+# remove an etcd member identified by its IP from the member list
+remove_etcd_member_by_ip()
+{
+	local rc
+	local out
+	local member_ip
+	local endpoint_url
+	local member_list_json
+	local stale_member_id
+	local stale_member_id_hex
+
+	member_ip=$1
+	member_list_json=$(get_member_list_json)
+	rc=$?
+	if [ "$rc" -ne 0 ] ; then
+		ocf_log err "could not remove etcd member. Failed to get member list, error code: $rc"
+		return $OCF_ERR_GENERIC
+	fi
+
+	stale_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$member_ip\")) | any).ID")
+	if [ -z "$stale_member_id" ]; then
+		ocf_log err "could not remove etcd member. Failed to find member ID"
+		return $OCF_ERR_GENERIC
+	fi
+
+	# JSON member_id is decimal, while etcdctl command needs the hex version
+	if ! stale_member_id_hex=$(decimal_to_hex "$stale_member_id"); then
+		ocf_log err "could not remove etcd member. Failed to convert member_id '$stale_member_id' into hex format"
+		return $OCF_ERR_GENERIC
+	fi
+
+	endpoint_url=$(ip_url $(attribute_node_ip get))
+	out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member remove "$stale_member_id_hex" 2>&1)
+	rc=$?
+	if [ "$rc" -ne 0 ] ; then
+		ocf_log err "could not remove etcd member. etcdctl member remove command failed, error code: $rc, output: $out"
+		return $OCF_ERR_GENERIC
+	fi
+
+	ocf_log info "$out"
+	return $OCF_SUCCESS
+}
+
+ add_member_as_learner()
+ {
+ 	local rc
+@@ -1079,18 +1122,36 @@ add_member_as_learner()
+ 	local member_ip=$2
+ 	local endpoint_url=$(ip_url $(attribute_node_ip get))
+ 	local peer_url=$(ip_url $member_ip)
+	local i
+	local max_retries
+	local out
+ 
+-	ocf_log info "add $member_name ($member_ip) to the member list as learner"
+-	out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner 2>&1)
+-	rc=$?
+-	if [ $rc -ne 0 ]; then
+-		ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
+-		return $rc
+-	fi
+-	ocf_log info "$out"
+	i=0
+	max_retries=3
+	while [ "$i" -lt "$max_retries" ]; do
+		i=$((i + 1))
+		ocf_log info "adding $member_name ($member_ip) to the member list as learner (attempt $i of $max_retries)"
+		out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" \
+				member add "$member_name" \
+				--peer-urls="$peer_url:2380" \
+				--learner 2>&1)
+		rc=$?
+		if [ "$rc" -eq 0 ]; then
+			ocf_log info "$out"
+			attribute_learner_node update "$member_name"
+			return $OCF_SUCCESS
+		fi
+		if echo "$out" | grep -q "Peer URLs already exists"; then
+			# etcd data might have stale membership data
+			ocf_log warn "could not add member: Peer URLs already exists"
+			remove_etcd_member_by_ip "$member_ip"
+		else
+			ocf_log warn "could not add member: $out"
+		fi
+	done
+ 
+-	attribute_learner_node update "$member_name"
+-	return $?
+	ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
+	return $OCF_ERR_GENERIC
+ }
+ 
+ set_force_new_cluster()
+@@ -1454,8 +1515,8 @@ manage_peer_membership()
+ 	#       "https://<node IP>:2379"
+ 	#   ]
+ 	# }
+-	# NOTE that the "name" field is present in voting members only, while "isLearner"
+-	# field in learner members only and the value is always true (not a string) in that case.
+	# NOTE: voting members have a "name" field but no "isLearner" field,
+	# while learner members have "isLearner": true (boolean) but no "name" field, so we search for peerURLs matching.
+ 	peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
+ 	if [ -z "$peer_member_id" ]; then
+ 		ocf_log info "$peer_member_name is not in the members list"
--- a/resource-agents.spec
+++ b/resource-agents.spec
@ -45,7 +45,7 @@
 Name:		resource-agents
 Summary:	Open Source HA Reusable Cluster Resource Scripts
 Version:	4.10.0
-Release:	109%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release:	110%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License:	GPLv2+ and LGPLv2+
 URL:		https://github.com/ClusterLabs/resource-agents
 Source0:	%{upstream_prefix}-%{upstream_version}.tar.gz
@ -200,6 +200,9 @@ Patch147:	RHEL-143527-powervs-move-ip-powervs-subnet-fix-error-logging.patch
 Patch148:	RHEL-145628-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch
 Patch149:	RHEL-150700-podman-etcd-set-attributes-if-they-fail-during-force-new-cluster.patch
 Patch150:	RHEL-116151-4-portblock-check-inverse-action.patch
+Patch151:	RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch
+Patch152:	RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch
+Patch153:	RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch

 # bundled ha-cloud-support libs
 Patch500:	ha-cloud-support-aliyun.patch
@ -495,6 +498,9 @@ exit 1
 %patch -p1 -P 148
 %patch -p1 -P 149
 %patch -p1 -P 150
+%patch -p1 -P 151
+%patch -p1 -P 152
+%patch -p1 -P 153

 # bundled ha-cloud-support libs
 %patch -p1 -P 500
@ -827,6 +833,13 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm

 %changelog
+* Thu Mar 19 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-110
+- podman etcd: ignore learners when considering which node has higher revision
+- podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery
+- db2: set reintegration when promotion is successful
+
+  Resolves: RHEL-156709, RHEL-157146, RHEL-153158
+
 * Fri Feb 27 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-109
 - portblock: check inverse action state file for non-promotable
  resources to avoid issues when doing e.g. block followed by unblock