From eea4aa580e123fcfb89d875e150f663dca7da057 Mon Sep 17 00:00:00 2001 From: Oyvind Albrigtsen Date: Thu, 19 Mar 2026 11:33:15 +0100 Subject: [PATCH] - podman etcd: ignore learners when considering which node has higher revision - podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery - db2: set reintegration when promotion is successful Resolves: RHEL-156709, RHEL-157146, RHEL-153158 --- ...gration-when-promotion-is-successful.patch | 46 +++++++ ...ering-which-node-has-higher-revision.patch | 40 ++++++ ...ly-during-force_new_cluster-recovery.patch | 130 ++++++++++++++++++ resource-agents.spec | 15 +- 4 files changed, 230 insertions(+), 1 deletion(-) create mode 100644 RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch create mode 100644 RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch create mode 100644 RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch diff --git a/RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch b/RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch new file mode 100644 index 0000000..56bc252 --- /dev/null +++ b/RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch @@ -0,0 +1,46 @@ +From 66885ea0227e847b571608015b150d391a6234d7 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Mon, 23 Feb 2026 13:35:58 +0100 +Subject: [PATCH] db2: set reintegration when promotion is successful + +--- + heartbeat/db2 | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/heartbeat/db2 b/heartbeat/db2 +index 82f2f82c3..4420b9989 100755 +--- a/heartbeat/db2 ++++ b/heartbeat/db2 +@@ -955,6 +955,16 @@ db2_promote() { + PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer) + # nothing to do, only update pacemaker's view + echo MASTER > $STATE_FILE ++ ++ if [ -n "$remote_host" ]; then ++ for db in $dblist ++ do ++ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint" ++ ocf_log debug "Promotion succeeded, setting $reint_attr = 1" ++ crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever ++ done ++ fi ++ + return $OCF_SUCCESS + ;; + +@@ -981,6 +991,15 @@ db2_promote() { + # update pacemaker's view + echo MASTER > $STATE_FILE + ++ if [ -n "$remote_host" ]; then ++ for db in $dblist ++ do ++ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint" ++ ocf_log debug "Promotion succeeded, setting $reint_attr = 1" ++ crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever ++ done ++ fi ++ + return $OCF_SUCCESS + fi + diff --git a/RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch b/RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch new file mode 100644 index 0000000..280b14f --- /dev/null +++ b/RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch @@ -0,0 +1,40 @@ +From 5890f47bc61703130cd27d767118367f03bca95f Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Tue, 10 Mar 2026 17:26:04 +0100 +Subject: [PATCH] podman-etcd: Preserve standalone voter identity during + restart + +If the standalone voter restarts before the peer is added to the member +list, the learner_node attribute may not be set yet. Without checking +is_standalone, the voter incorrectly joins as a learner, causing both +nodes to become learners and creating an unrecoverable deadlock. + +Check is_standalone to ensure the voter restarts in the same role it +had before the restart. +--- + heartbeat/podman-etcd | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 539ad33b2..2f8aa122f 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -2002,9 +2002,16 @@ podman_start() + ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')" + case "$active_resources_count" in + 1) +- if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then +- ocf_log info "peer active but in learner mode: start normally" ++ # is_standalone may return true here due to a restart: in the previous run, ++ # this agent was the sole voter and the peer had not yet joined the member ++ # list (learner_node unset). Since standalone_node was not cleared before ++ # the restart, start normally to recover the previous cluster state. ++ if is_standalone; then ++ ocf_log info "peer active but not a voter: start normally to recover" ++ elif [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then ++ ocf_log info "peer active but in learner mode: start normally to recover" + else ++ # If (A) we must join the peer's new cluster + ocf_log info "peer is active standalone: joining as learner" + JOIN_AS_LEARNER=true + fi diff --git a/RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch b/RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch new file mode 100644 index 0000000..89b206e --- /dev/null +++ b/RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch @@ -0,0 +1,130 @@ +From 83d16b59a354a20bf7679af45a9acfa9f344959a Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Wed, 18 Mar 2026 11:30:31 +0100 +Subject: [PATCH] OCPBUGS-78482: podman-etcd: fix "Peer URLs already exists" in + add_member_as_learner (#2136) + +* podman-etcd: handle "Peer URLs already exists" in add_member_as_learner + +When etcdctl member add fails with "Peer URLs already exists", the stale +member entry is removed and the add is retried. + +Without this fix, add_member_as_learner returns early without setting +the learner_node attribute, causing the peer node to time out waiting +for it. +--- + heartbeat/podman-etcd | 85 +++++++++++++++++++++++++++++++++++++------ + 1 file changed, 73 insertions(+), 12 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 2f8aa122f..860aca817 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1072,6 +1072,49 @@ attribute_node_member_id() + esac + } + ++# remove an etcd member identified by its IP from the member list ++remove_etcd_member_by_ip() ++{ ++ local rc ++ local out ++ local member_ip ++ local endpoint_url ++ local member_list_json ++ local stale_member_id ++ local stale_member_id_hex ++ ++ member_ip=$1 ++ member_list_json=$(get_member_list_json) ++ rc=$? ++ if [ "$rc" -ne 0 ] ; then ++ ocf_log err "could not remove etcd member. Failed to get member list, error code: $rc" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ stale_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$member_ip\")) | any).ID") ++ if [ -z "$stale_member_id" ]; then ++ ocf_log err "could not remove etcd member. Failed to find member ID" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ # JSON member_id is decimal, while etcdctl command needs the hex version ++ if ! stale_member_id_hex=$(decimal_to_hex "$stale_member_id"); then ++ ocf_log err "could not remove etcd member. Failed to convert member_id '$stale_member_id' into hex format" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ endpoint_url=$(ip_url $(attribute_node_ip get)) ++ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member remove "$stale_member_id_hex" 2>&1) ++ rc=$? ++ if [ "$rc" -ne 0 ] ; then ++ ocf_log err "could not remove etcd member. etcdctl member remove command failed, error code: $rc, output: $out" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ ocf_log info "$out" ++ return $OCF_SUCCESS ++} ++ + add_member_as_learner() + { + local rc +@@ -1079,18 +1122,36 @@ add_member_as_learner() + local member_ip=$2 + local endpoint_url=$(ip_url $(attribute_node_ip get)) + local peer_url=$(ip_url $member_ip) ++ local i ++ local max_retries ++ local out + +- ocf_log info "add $member_name ($member_ip) to the member list as learner" +- out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner 2>&1) +- rc=$? +- if [ $rc -ne 0 ]; then +- ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out" +- return $rc +- fi +- ocf_log info "$out" ++ i=0 ++ max_retries=3 ++ while [ "$i" -lt "$max_retries" ]; do ++ i=$((i + 1)) ++ ocf_log info "adding $member_name ($member_ip) to the member list as learner (attempt $i of $max_retries)" ++ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" \ ++ member add "$member_name" \ ++ --peer-urls="$peer_url:2380" \ ++ --learner 2>&1) ++ rc=$? ++ if [ "$rc" -eq 0 ]; then ++ ocf_log info "$out" ++ attribute_learner_node update "$member_name" ++ return $OCF_SUCCESS ++ fi ++ if echo "$out" | grep -q "Peer URLs already exists"; then ++ # etcd data might have stale membership data ++ ocf_log warn "could not add member: Peer URLs already exists" ++ remove_etcd_member_by_ip "$member_ip" ++ else ++ ocf_log warn "could not add member: $out" ++ fi ++ done + +- attribute_learner_node update "$member_name" +- return $? ++ ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out" ++ return $OCF_ERR_GENERIC + } + + set_force_new_cluster() +@@ -1454,8 +1515,8 @@ manage_peer_membership() + # "https://:2379" + # ] + # } +- # NOTE that the "name" field is present in voting members only, while "isLearner" +- # field in learner members only and the value is always true (not a string) in that case. ++ # NOTE: voting members have a "name" field but no "isLearner" field, ++ # while learner members have "isLearner": true (boolean) but no "name" field, so we search for peerURLs matching. + peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID") + if [ -z "$peer_member_id" ]; then + ocf_log info "$peer_member_name is not in the members list" diff --git a/resource-agents.spec b/resource-agents.spec index eb36a26..8a04283 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.10.0 -Release: 109%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 110%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -200,6 +200,9 @@ Patch147: RHEL-143527-powervs-move-ip-powervs-subnet-fix-error-logging.patch Patch148: RHEL-145628-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch Patch149: RHEL-150700-podman-etcd-set-attributes-if-they-fail-during-force-new-cluster.patch Patch150: RHEL-116151-4-portblock-check-inverse-action.patch +Patch151: RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch +Patch152: RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch +Patch153: RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aliyun.patch @@ -495,6 +498,9 @@ exit 1 %patch -p1 -P 148 %patch -p1 -P 149 %patch -p1 -P 150 +%patch -p1 -P 151 +%patch -p1 -P 152 +%patch -p1 -P 153 # bundled ha-cloud-support libs %patch -p1 -P 500 @@ -827,6 +833,13 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Thu Mar 19 2026 Oyvind Albrigtsen - 4.10.0-110 +- podman etcd: ignore learners when considering which node has higher revision +- podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery +- db2: set reintegration when promotion is successful + + Resolves: RHEL-156709, RHEL-157146, RHEL-153158 + * Fri Feb 27 2026 Oyvind Albrigtsen - 4.10.0-109 - portblock: check inverse action state file for non-promotable resources to avoid issues when doing e.g. block followed by unblock