From 4ac67a87f2f2f1474493cd7ca6d9b76a58ae9430 Mon Sep 17 00:00:00 2001 From: Oyvind Albrigtsen Date: Thu, 9 Oct 2025 14:36:38 +0200 Subject: [PATCH] - podman-etcd: add automatic learner member promotion Resolves: RHEL-119495 --- ...d-automatic-learner-member-promotion.patch | 321 ++++++++++++++++++ resource-agents.spec | 9 +- 2 files changed, 329 insertions(+), 1 deletion(-) create mode 100644 RHEL-119495-podman-etcd-add-automatic-learner-member-promotion.patch diff --git a/RHEL-119495-podman-etcd-add-automatic-learner-member-promotion.patch b/RHEL-119495-podman-etcd-add-automatic-learner-member-promotion.patch new file mode 100644 index 0000000..74795e5 --- /dev/null +++ b/RHEL-119495-podman-etcd-add-automatic-learner-member-promotion.patch @@ -0,0 +1,321 @@ +From a31f15104fc712cd25f8a59d49f1bbcdbbbc5434 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Tue, 30 Sep 2025 11:54:44 +0200 +Subject: [PATCH 1/2] Refactor(podman-etcd): improve peer checking and + leadership loss detection + +The check_peers function is broken up into smaller, more manageable +functions. This refactoring separates the logic for detecting a loss of +cluster leadership from the logic for managing peer membership. + +The main function is renamed to check_peer as there is only 1 peer to +check (it was check_peers). +--- + heartbeat/podman-etcd | 78 +++++++++++++++++++++++++------------------ + 1 file changed, 45 insertions(+), 33 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index f3a6da5e2..3d1e4c520 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -1014,42 +1014,35 @@ get_member_list_json() { + podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json + } + +-check_peers() ++detect_cluster_leadership_loss() + { +- # Check peers endpoint status and locally accessible member list +- local member_list_json +- +- if ! container_exists; then +- # we need a running container to execute etcdctl. +- return $OCF_SUCCESS ++ endpoint_status_json=$(get_endpoint_status_json) ++ ocf_log info "endpoint status: $endpoint_status_json" ++ ++ count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) ++ if [ "$count_endpoints" -eq 1 ]; then ++ ocf_log info "one endpoint only: checking status errors" ++ endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors") ++ if echo "$endpoint_status_errors" | grep -q "no leader"; then ++ set_force_new_cluster ++ set_standalone_node ++ ocf_exit_reason "$NODENAME must force a new cluster" ++ return $OCF_ERR_GENERIC ++ fi ++ if [ "$endpoint_status_errors" != "null" ]; then ++ ocf_log err "unmanaged endpoint status error: $endpoint_status_errors" ++ fi + fi + +- member_list_json=$(get_member_list_json) +- rc=$? +- ocf_log debug "member list: $member_list_json" +- if [ $rc -ne 0 ]; then +- ocf_log info "podman failed to get member list, error code: $rc" +- +- endpoint_status_json=$(get_endpoint_status_json) +- ocf_log info "endpoint status: $endpoint_status_json" +- +- count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) +- if [ "$count_endpoints" -eq 1 ]; then +- ocf_log info "one endpoint only: checking status errors" +- endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors") +- if echo "$endpoint_status_errors" | grep -q "no leader"; then +- set_force_new_cluster +- set_standalone_node +- ocf_exit_reason "$NODENAME must force a new cluster" +- return $OCF_ERR_GENERIC +- fi +- if [ "$endpoint_status_errors" != "null" ]; then +- ocf_log err "unmanaged endpoint status error: $endpoint_status_errors" +- fi +- fi ++ return $OCF_SUCCESS ++} + +- return $OCF_SUCCESS +- fi ++manage_peer_membership() ++{ ++ # Read etcd member list to detect the status of the peer member. ++ # If the peer is missing from the member list, it will be added back as learner ++ # If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset ++ local member_list_json="$1" + + # Example of .members[] instance fields in member list json format: + # NOTE that "name" is present in voting members only, while "isLearner" in learner members only +@@ -1083,6 +1076,25 @@ check_peers() + clear_standalone_and_learner_if_not_learners "$member_list_json" + fi + done ++} ++ ++check_peer() ++{ ++ # Check peers endpoint status and locally accessible member list ++ local member_list_json ++ ++ # we need a running container to execute etcdctl. ++ if ! container_exists; then ++ return $OCF_SUCCESS ++ fi ++ ++ if ! member_list_json=$(get_member_list_json); then ++ ocf_log info "podman failed to get member list, error code: $?" ++ detect_cluster_leadership_loss ++ return $? ++ fi ++ ++ manage_peer_membership "$member_list_json" + return $OCF_SUCCESS + } + +@@ -1124,7 +1136,7 @@ podman_monitor() + # monitor operation to fail. + # TODO: move this inside check_peers where we already query member list json + attribute_node_member_id update +- if ! check_peers; then ++ if ! check_peer; then + return $OCF_ERR_GENERIC + fi + + +From de7c73a933cefb8f7b9e810bd23c3d12f6d6f29a Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Tue, 30 Sep 2025 18:38:06 +0200 +Subject: [PATCH 2/2] OCPBUGS-42808: podman-etcd: add automatic learner member + promotion + +Automatically promote etcd learner members to voting members when detected. +Includes refactored member management functions and improved validation. +--- + heartbeat/podman-etcd | 108 ++++++++++++++++++++++++++++++------------ + 1 file changed, 79 insertions(+), 29 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 3d1e4c520..e1425ec02 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -712,6 +712,22 @@ attribute_node_revision_peer() + crm_attribute --query --type nodes --node "$nodename" --name "revision" | awk -F"value=" '{print $2}' + } + ++# Converts a decimal number to hexadecimal format with validation ++# Args: $1 - decimal number (test for non-negative integer too) ++# Returns: 0 on success, OCF_ERR_GENERIC on invalid input ++# Outputs: hexadecimal representation to stdout ++decimal_to_hex() { ++ local dec=$1 ++ ++ if ! echo "$dec" | grep -q "^[1-9][0-9]*$"; then ++ ocf_log err "Invalid member ID format: '$dec' (expected decimal number)" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ printf "%x" "$dec" ++ return $OCF_SUCCESS ++} ++ + attribute_node_member_id() + { + local action="$1" +@@ -737,7 +753,7 @@ attribute_node_member_id() + return "$rc" + fi + +- local value ++ local value value_hex + if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then + rc=$? + ocf_log err "could not get $attribute from member list JSON, error code: $rc" +@@ -745,8 +761,11 @@ attribute_node_member_id() + fi + + # JSON member_id is decimal, while etcdctl command needs the hex version +- value=$(printf "%x" "$value") +- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then ++ if ! value_hex=$(decimal_to_hex "$value"); then ++ ocf_log err "could not convert decimal member_id '$value' to hex, error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"; then + rc=$? + ocf_log err "could not update etcd $attribute, error code: $rc" + return "$rc" +@@ -905,42 +924,70 @@ clear_standalone_node() + crm_attribute --name "standalone_node" --delete + } + +-clear_standalone_and_learner_if_not_learners() ++ ++# Promotes an etcd learner member to a voting member ++# Args: $1 - learner member ID in decimal format ++# Returns: OCF_SUCCESS (even on expected promotion failures), OCF_ERR_GENERIC on conversion errors ++# Note: Promotion failures are expected and logged as info (peer may not be up-to-date) ++promote_learner_member() ++{ ++ local learner_member_id=$1 ++ ++ # JSON member_id is decimal, while etcdctl command needs the hex version ++ if ! learner_member_id_hex=$(decimal_to_hex "$learner_member_id"); then ++ ocf_log err "could not convert decimal member_id '$learner_member_id' to hex, error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then ++ # promotion is expected to fail if the peer is not yet up-to-date ++ ocf_log info "could not promote member $learner_member_id_hex, error code: $?" ++ return $OCF_SUCCESS ++ fi ++ ocf_log info "successfully promoted member '$learner_member_id_hex'" ++ return $OCF_SUCCESS ++} ++ ++# Reconciles etcd cluster member states ++# Promotes learner members or clears standalone/learner attributes as needed ++# Args: $1 - member list JSON from etcdctl ++# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors ++# Note: Only operates when exactly 2 started members are present ++reconcile_member_state() + { + local rc + local member_list_json="$1" + +- number_of_members=$(printf "%s" "$member_list_json" | jq -r ".members[].ID" | wc -l) +- if [ "$number_of_members" -ne 2 ]; then +- ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_members members, need 2" ++ # count only the started members, which have the ".name" JSON field ++ number_of_started_members=$(printf "%s" "$member_list_json" | jq -r ".members[].name | select(. != null)" | wc -l) ++ if [ "$number_of_started_members" -ne 2 ]; then ++ ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_started_members members, need 2" + return $OCF_SUCCESS + fi + +- id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID") ++ learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID") + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "could not get isLearner field from member list, error code: $rc" + return $rc + fi + +- if [ -z "$id" ]; then +- clear_standalone_node +- rc=$? +- if [ $rc -ne 0 ]; then +- ocf_og error "could not clear standalone_node attribute, error code: $rc" +- return $rc +- fi ++ if [ -n "$learner_member_id" ]; then ++ promote_learner_member "$learner_member_id" ++ return $? + fi +- if [ -z "$id" ]; then +- attribute_learner_node clear +- rc=$? +- if [ $rc -ne 0 ]; then +- ocf_og error "could not clear learner_node attribute, error code: $rc" +- return $rc ++ ++ if [ -z "$learner_member_id" ]; then ++ if ! clear_standalone_node; then ++ ocf_log error "could not clear standalone_node attribute, error code: $?" ++ return $OCF_ERR_GENERIC ++ fi ++ if ! attribute_learner_node clear; then ++ ocf_log error "could not clear learner_node attribute, error code: $?" ++ return $OCF_ERR_GENERIC + fi + fi + +- return $rc ++ return $OCF_SUCCESS + } + + attribute_learner_node() +@@ -1019,7 +1066,7 @@ detect_cluster_leadership_loss() + endpoint_status_json=$(get_endpoint_status_json) + ocf_log info "endpoint status: $endpoint_status_json" + +- count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) ++ count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) + if [ "$count_endpoints" -eq 1 ]; then + ocf_log info "one endpoint only: checking status errors" + endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors") +@@ -1037,11 +1084,14 @@ detect_cluster_leadership_loss() + return $OCF_SUCCESS + } + ++ ++# Manages etcd peer membership by detecting and handling missing or rejoining peers ++# Adds missing peers as learners and reconciles member states when peers rejoin ++# Args: $1 - member list JSON from etcdctl ++# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors ++# Note: Iterates through all peer nodes to ensure proper cluster membership + manage_peer_membership() + { +- # Read etcd member list to detect the status of the peer member. +- # If the peer is missing from the member list, it will be added back as learner +- # If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset + local member_list_json="$1" + + # Example of .members[] instance fields in member list json format: +@@ -1066,14 +1116,14 @@ manage_peer_membership() + + # Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name. + ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6 +- id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID") +- if [ -z "$id" ]; then ++ peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID") ++ if [ -z "$peer_member_id" ]; then + ocf_log info "$name is not in the members list" + add_member_as_learner "$name" "$ip" + set_standalone_node + else + ocf_log debug "$name is in the members list by IP: $ip" +- clear_standalone_and_learner_if_not_learners "$member_list_json" ++ reconcile_member_state "$member_list_json" + fi + done + } diff --git a/resource-agents.spec b/resource-agents.spec index d26f85e..f29f880 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.10.0 -Release: 87%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 88%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -173,6 +173,7 @@ Patch120: RHEL-113766-podman-etcd-preserve-containers-for-debugging.patch Patch121: RHEL-116206-podman-etcd-add-cluster-wide-force_new_cluster-attribute-check.patch Patch122: RHEL-116151-1-ocf-shellfuncs-add-ocf_promotion_score.patch Patch123: RHEL-116151-2-portblock-add-promotable-support.patch +Patch124: RHEL-119495-podman-etcd-add-automatic-learner-member-promotion.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aliyun.patch @@ -441,6 +442,7 @@ exit 1 %patch -p1 -P 121 %patch -p1 -P 122 %patch -p1 -P 123 +%patch -p1 -P 124 # bundled ha-cloud-support libs %patch -p1 -P 500 @@ -773,6 +775,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Thu Oct 9 2025 Oyvind Albrigtsen - 4.10.0-88 +- podman-etcd: add automatic learner member promotion + + Resolves: RHEL-119495 + * Wed Oct 8 2025 Oyvind Albrigtsen - 4.10.0-87 - build: make nfs-utils a weak dependency