- podman etcd: ignore learners when considering which node has higher revision
- podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery
- db2: set reintegration when promotion is successful

Resolves: RHEL-156709, RHEL-157146, RHEL-153158
This commit is contained in:
parent
283977cc84
commit
eea4aa580e
@ -0,0 +1,46 @@
|
||||
From 66885ea0227e847b571608015b150d391a6234d7 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 23 Feb 2026 13:35:58 +0100
|
||||
Subject: [PATCH] db2: set reintegration when promotion is successful
|
||||
|
||||
---
|
||||
heartbeat/db2 | 19 +++++++++++++++++++
|
||||
1 file changed, 19 insertions(+)
|
||||
|
||||
diff --git a/heartbeat/db2 b/heartbeat/db2
|
||||
index 82f2f82c3..4420b9989 100755
|
||||
--- a/heartbeat/db2
|
||||
+++ b/heartbeat/db2
|
||||
@@ -955,6 +955,16 @@ db2_promote() {
|
||||
PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer)
|
||||
# nothing to do, only update pacemaker's view
|
||||
echo MASTER > $STATE_FILE
|
||||
+
|
||||
+ if [ -n "$remote_host" ]; then
|
||||
+ for db in $dblist
|
||||
+ do
|
||||
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
|
||||
+ ocf_log debug "Promotion succeeded, setting $reint_attr = 1"
|
||||
+ crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
|
||||
+ done
|
||||
+ fi
|
||||
+
|
||||
return $OCF_SUCCESS
|
||||
;;
|
||||
|
||||
@@ -981,6 +991,15 @@ db2_promote() {
|
||||
# update pacemaker's view
|
||||
echo MASTER > $STATE_FILE
|
||||
|
||||
+ if [ -n "$remote_host" ]; then
|
||||
+ for db in $dblist
|
||||
+ do
|
||||
+ reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
|
||||
+ ocf_log debug "Promotion succeeded, setting $reint_attr = 1"
|
||||
+ crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
|
||||
+ done
|
||||
+ fi
|
||||
+
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
@ -0,0 +1,40 @@
|
||||
From 5890f47bc61703130cd27d767118367f03bca95f Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Tue, 10 Mar 2026 17:26:04 +0100
|
||||
Subject: [PATCH] podman-etcd: Preserve standalone voter identity during
|
||||
restart
|
||||
|
||||
If the standalone voter restarts before the peer is added to the member
|
||||
list, the learner_node attribute may not be set yet. Without checking
|
||||
is_standalone, the voter incorrectly joins as a learner, causing both
|
||||
nodes to become learners and creating an unrecoverable deadlock.
|
||||
|
||||
Check is_standalone to ensure the voter restarts in the same role it
|
||||
had before the restart.
|
||||
---
|
||||
heartbeat/podman-etcd | 11 +++++++++--
|
||||
1 file changed, 9 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 539ad33b2..2f8aa122f 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -2002,9 +2002,16 @@ podman_start()
|
||||
ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
|
||||
case "$active_resources_count" in
|
||||
1)
|
||||
- if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
|
||||
- ocf_log info "peer active but in learner mode: start normally"
|
||||
+ # is_standalone may return true here due to a restart: in the previous run,
|
||||
+ # this agent was the sole voter and the peer had not yet joined the member
|
||||
+ # list (learner_node unset). Since standalone_node was not cleared before
|
||||
+ # the restart, start normally to recover the previous cluster state.
|
||||
+ if is_standalone; then
|
||||
+ ocf_log info "peer active but not a voter: start normally to recover"
|
||||
+ elif [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
|
||||
+ ocf_log info "peer active but in learner mode: start normally to recover"
|
||||
else
|
||||
+ # Otherwise the peer is the active standalone node: we must join the peer's new cluster as a learner
|
||||
ocf_log info "peer is active standalone: joining as learner"
|
||||
JOIN_AS_LEARNER=true
|
||||
fi
|
||||
@ -0,0 +1,130 @@
|
||||
From 83d16b59a354a20bf7679af45a9acfa9f344959a Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Wed, 18 Mar 2026 11:30:31 +0100
|
||||
Subject: [PATCH] OCPBUGS-78482: podman-etcd: fix "Peer URLs already exists" in
|
||||
add_member_as_learner (#2136)
|
||||
|
||||
* podman-etcd: handle "Peer URLs already exists" in add_member_as_learner
|
||||
|
||||
When etcdctl member add fails with "Peer URLs already exists", the stale
|
||||
member entry is removed and the add is retried.
|
||||
|
||||
Without this fix, add_member_as_learner returns early without setting
|
||||
the learner_node attribute, causing the peer node to time out waiting
|
||||
for it.
|
||||
---
|
||||
heartbeat/podman-etcd | 85 +++++++++++++++++++++++++++++++++++++------
|
||||
1 file changed, 73 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 2f8aa122f..860aca817 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -1072,6 +1072,49 @@ attribute_node_member_id()
|
||||
esac
|
||||
}
|
||||
|
||||
+# remove an etcd member identified by its IP from the member list
|
||||
+remove_etcd_member_by_ip()
|
||||
+{
|
||||
+ local rc
|
||||
+ local out
|
||||
+ local member_ip
|
||||
+ local endpoint_url
|
||||
+ local member_list_json
|
||||
+ local stale_member_id
|
||||
+ local stale_member_id_hex
|
||||
+
|
||||
+ member_ip=$1
|
||||
+ member_list_json=$(get_member_list_json)
|
||||
+ rc=$?
|
||||
+ if [ "$rc" -ne 0 ] ; then
|
||||
+ ocf_log err "could not remove etcd member. Failed to get member list, error code: $rc"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+ stale_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$member_ip\")) | any).ID")
|
||||
+ if [ -z "$stale_member_id" ]; then
|
||||
+ ocf_log err "could not remove etcd member. Failed to find member ID"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+ # JSON member_id is decimal, while etcdctl command needs the hex version
|
||||
+ if ! stale_member_id_hex=$(decimal_to_hex "$stale_member_id"); then
|
||||
+ ocf_log err "could not remove etcd member. Failed to convert member_id '$stale_member_id' into hex format"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+ endpoint_url=$(ip_url $(attribute_node_ip get))
|
||||
+ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member remove "$stale_member_id_hex" 2>&1)
|
||||
+ rc=$?
|
||||
+ if [ "$rc" -ne 0 ] ; then
|
||||
+ ocf_log err "could not remove etcd member. etcdctl member remove command failed, error code: $rc, output: $out"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+ ocf_log info "$out"
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
+
|
||||
add_member_as_learner()
|
||||
{
|
||||
local rc
|
||||
@@ -1079,18 +1122,36 @@ add_member_as_learner()
|
||||
local member_ip=$2
|
||||
local endpoint_url=$(ip_url $(attribute_node_ip get))
|
||||
local peer_url=$(ip_url $member_ip)
|
||||
+ local i
|
||||
+ local max_retries
|
||||
+ local out
|
||||
|
||||
- ocf_log info "add $member_name ($member_ip) to the member list as learner"
|
||||
- out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner 2>&1)
|
||||
- rc=$?
|
||||
- if [ $rc -ne 0 ]; then
|
||||
- ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
|
||||
- return $rc
|
||||
- fi
|
||||
- ocf_log info "$out"
|
||||
+ i=0
|
||||
+ max_retries=3
|
||||
+ while [ "$i" -lt "$max_retries" ]; do
|
||||
+ i=$((i + 1))
|
||||
+ ocf_log info "adding $member_name ($member_ip) to the member list as learner (attempt $i of $max_retries)"
|
||||
+ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" \
|
||||
+ member add "$member_name" \
|
||||
+ --peer-urls="$peer_url:2380" \
|
||||
+ --learner 2>&1)
|
||||
+ rc=$?
|
||||
+ if [ "$rc" -eq 0 ]; then
|
||||
+ ocf_log info "$out"
|
||||
+ attribute_learner_node update "$member_name"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+ if echo "$out" | grep -q "Peer URLs already exists"; then
|
||||
+ # etcd data might have stale membership data
|
||||
+ ocf_log warn "could not add member: Peer URLs already exists"
|
||||
+ remove_etcd_member_by_ip "$member_ip"
|
||||
+ else
|
||||
+ ocf_log warn "could not add member: $out"
|
||||
+ fi
|
||||
+ done
|
||||
|
||||
- attribute_learner_node update "$member_name"
|
||||
- return $?
|
||||
+ ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
}
|
||||
|
||||
set_force_new_cluster()
|
||||
@@ -1454,8 +1515,8 @@ manage_peer_membership()
|
||||
# "https://<node IP>:2379"
|
||||
# ]
|
||||
# }
|
||||
- # NOTE that the "name" field is present in voting members only, while "isLearner"
|
||||
- # field in learner members only and the value is always true (not a string) in that case.
|
||||
+ # NOTE: voting members have a "name" field but no "isLearner" field,
|
||||
+ # while learner members have "isLearner": true (boolean) but no "name" field, so we look the peer up by matching its peerURLs.
|
||||
peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
|
||||
if [ -z "$peer_member_id" ]; then
|
||||
ocf_log info "$peer_member_name is not in the members list"
|
||||
@ -45,7 +45,7 @@
|
||||
Name: resource-agents
|
||||
Summary: Open Source HA Reusable Cluster Resource Scripts
|
||||
Version: 4.10.0
|
||||
Release: 109%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
Release: 110%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
License: GPLv2+ and LGPLv2+
|
||||
URL: https://github.com/ClusterLabs/resource-agents
|
||||
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
|
||||
@ -200,6 +200,9 @@ Patch147: RHEL-143527-powervs-move-ip-powervs-subnet-fix-error-logging.patch
|
||||
Patch148: RHEL-145628-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch
|
||||
Patch149: RHEL-150700-podman-etcd-set-attributes-if-they-fail-during-force-new-cluster.patch
|
||||
Patch150: RHEL-116151-4-portblock-check-inverse-action.patch
|
||||
Patch151: RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch
|
||||
Patch152: RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch
|
||||
Patch153: RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
Patch500: ha-cloud-support-aliyun.patch
|
||||
@ -495,6 +498,9 @@ exit 1
|
||||
%patch -p1 -P 148
|
||||
%patch -p1 -P 149
|
||||
%patch -p1 -P 150
|
||||
%patch -p1 -P 151
|
||||
%patch -p1 -P 152
|
||||
%patch -p1 -P 153
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
%patch -p1 -P 500
|
||||
@ -827,6 +833,13 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
|
||||
%changelog
|
||||
* Thu Mar 19 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-110
|
||||
- podman etcd: ignore learners when considering which node has higher revision
|
||||
- podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery
|
||||
- db2: set reintegration when promotion is successful
|
||||
|
||||
Resolves: RHEL-156709, RHEL-157146, RHEL-153158
|
||||
|
||||
* Fri Feb 27 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-109
|
||||
- portblock: check inverse action state file for non-promotable
|
||||
resources to avoid issues when doing e.g. block followed by unblock
|
||||
|
||||
Loading…
Reference in New Issue
Block a user