- podman-etcd: hardened monitor/stop actions

Resolves: RHEL-159203
This commit is contained in:
Oyvind Albrigtsen 2026-03-27 07:50:05 +01:00
parent eea4aa580e
commit ee02a536d4
2 changed files with 273 additions and 1 deletions

View File

@@ -0,0 +1,265 @@
From c909003639ef36f995f855f5b954a5ae2132f19c Mon Sep 17 00:00:00 2001
From: Vincenzo Mauro <43814449+vimauro@users.noreply.github.com>
Date: Mon, 23 Mar 2026 11:54:51 +0100
Subject: [PATCH] OCPBUGS-76538: podman-etcd: monitor/stop hardening (#2130)
* monitor/stop hardening
* removed noisy log
* Improved return code handling + PR comments
* reduced wait_timeout_sec in podman_start
* enriched log line on container_running
* Fixed detect_cluster_leadership_loss in case podman exec fails
* Reverted detect_cluster_leadership_loss changes
* Updated return code in attribute_node_revision
* restored initial attribute_node_revision logic and updated comment
* restored original log line in attribute_node_revision
* updated return codes on changed code path that reaches pacemaker
* updated log line in check_peer
---
heartbeat/podman-etcd | 133 ++++++++++++++++++++++++++----------------
1 file changed, 83 insertions(+), 50 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 860aca817..4c9bbd4fa 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -371,25 +371,34 @@ monitor_cmd_exec()
{
local rc=$OCF_SUCCESS
local out
-
- out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1)
- rc=$?
- # 125: no container with name or ID ${CONTAINER} found
- # 126: container state improper (not running)
- # 127: any other error
- # 255: podman 2+: container not running
- case "$rc" in
- 125|126|255)
- rc=$OCF_NOT_RUNNING
- ;;
- 0)
- ocf_log debug "monitor cmd passed: exit code = $rc"
- ;;
- *)
- ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out"
- rc=$OCF_ERR_GENERIC
- ;;
- esac
+ local attempt
+ # 3 attempts × 5s = 15s worst case, fits within the 25s monitor timeout.
+ # The health check normally completes in <1s; the 5s per-attempt timeout
+ # is a safety net for when the container's process namespace is slow.
+ local max_attempts=3
+ local attempt_timeout=5
+
+ for attempt in $(seq 1 $max_attempts); do
+ out=$(timeout $attempt_timeout podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1)
+ rc=$?
+ # 125: no container with name or ID ${CONTAINER} found
+ # 126: container state improper (not running)
+ # 127: any other error
+ # 255: podman 2+: container not running
+ case "$rc" in
+ 125|126|255)
+ return $OCF_NOT_RUNNING
+ ;;
+ 0)
+ ocf_log debug "monitor cmd passed: exit code = $rc"
+ return $OCF_SUCCESS
+ ;;
+ *)
+ ocf_log warn "monitor cmd failed (rc=$rc), output: $out"
+ rc=$OCF_ERR_GENERIC
+ ;;
+ esac
+ done
return $rc
}
@@ -527,8 +536,9 @@ get_env_from_manifest() {
exit "$OCF_ERR_INSTALLED"
fi
- if ! env_var_value=$(jq -r ".spec.containers[].env[] | select( .name == \"$env_var_name\" ).value" "$OCF_RESKEY_pod_manifest"); then
- rc=$?
+ env_var_value=$(jq -r ".spec.containers[].env[] | select( .name == \"$env_var_name\" ).value" "$OCF_RESKEY_pod_manifest")
+ rc=$?
+ if [ $rc -ne 0 ]; then
ocf_log err "could not find environment variable $env_var_name in etcd pod manifest, error code: $rc"
exit "$OCF_ERR_INSTALLED"
fi
@@ -934,10 +944,14 @@ attribute_node_cluster_id()
{
local action="$1"
local value
- if ! value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON"); then
- rc=$?
+ local rc
+ value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON")
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ # Log the error but return success to avoid monitor failure if the file is not available yet.
# This should not block cluster recovery.
ocf_log err "could not get cluster_id, error code: $rc"
- return "$rc"
+ return $OCF_SUCCESS
fi
case "$action" in
@@ -945,10 +959,12 @@ attribute_node_cluster_id()
echo "$value"
;;
update)
- if ! crm_attribute --type nodes --node "$NODENAME" --name "cluster_id" --update "$value"; then
- rc=$?
+ crm_attribute --type nodes --node "$NODENAME" --name "cluster_id" --update "$value"
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ # Log the error but return success to avoid monitor failure if we can not update the attribute.
ocf_log err "could not update cluster_id, error code: $rc"
- return "$rc"
+ return $OCF_SUCCESS
fi
;;
*)
@@ -983,10 +999,12 @@ attribute_node_revision()
echo "$value"
;;
update)
- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
- rc=$?
- ocf_log err "could not update etcd $revision, error code: $rc"
- return "$rc"
+ crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ # Log the error but return success to avoid monitor failure if we can not update the attribute.
+ ocf_log err "could not update etcd $attribute, error code: $rc"
+ return $OCF_SUCCESS
fi
;;
*)
@@ -1041,25 +1059,31 @@ attribute_node_member_id()
ocf_log info "member list: $member_list_json"
if [ -z "$member_list_json" ] ; then
ocf_log err "could not get $attribute: could not get member list JSON"
- return "$rc"
+ return $OCF_ERR_GENERIC
fi
- local value value_hex
- if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then
- rc=$?
+ local value value_hex rc
+ value=$(echo -n "$member_list_json" | jq -r ".header.member_id")
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ # Log the error but return success to avoid monitor failure if the member list JSON is not available yet.
ocf_log err "could not get $attribute from member list JSON, error code: $rc"
- return "$rc"
+ return $OCF_SUCCESS
fi
# JSON member_id is decimal, while etcdctl command needs the hex version
- if ! value_hex=$(decimal_to_hex "$value"); then
- ocf_log err "could not convert decimal member_id '$value' to hex, error code: $?"
+ value_hex=$(decimal_to_hex "$value")
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ ocf_log err "could not convert decimal member_id '$value' to hex, error code: $rc"
return $OCF_ERR_GENERIC
fi
- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"; then
- rc=$?
+ crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ # Log the error but return success to avoid monitor failure if we can not update the attribute.
ocf_log err "could not update etcd $attribute, error code: $rc"
- return "$rc"
+ return $OCF_SUCCESS
fi
;;
clear)
@@ -1446,7 +1470,7 @@ get_endpoint_status_json()
local all_etcd_endpoints
all_etcd_endpoints=$(get_all_etcd_endpoints)
- podman exec "${CONTAINER}" etcdctl endpoint status --endpoints="$all_etcd_endpoints" -w json
+ podman exec "${CONTAINER}" etcdctl endpoint status --command-timeout="$MONITOR_ETCDCTL_TIMEOUT" --endpoints="$all_etcd_endpoints" -w json
}
get_member_list_json() {
@@ -1454,7 +1478,7 @@ get_member_list_json() {
local this_node_endpoint
this_node_endpoint="$(ip_url $(attribute_node_ip get)):2379"
- podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json
+ podman exec "${CONTAINER}" etcdctl member list --command-timeout="$MONITOR_ETCDCTL_TIMEOUT" --endpoints="$this_node_endpoint" -w json
}
detect_cluster_leadership_loss()
@@ -1550,7 +1574,7 @@ check_peer()
fi
if ! member_list_json=$(get_member_list_json); then
- ocf_log info "podman failed to get member list, error code: $?"
+ ocf_log info "podman failed to get member list"
detect_cluster_leadership_loss
return $?
fi
@@ -2145,7 +2169,7 @@ podman_start()
run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
if ocf_is_true "$JOIN_AS_LEARNER"; then
- local wait_timeout_sec=$((10*60))
+ local wait_timeout_sec=$((2*60))
local poll_interval_sec=5
local retries=$(( wait_timeout_sec / poll_interval_sec ))
@@ -2354,8 +2378,9 @@ leave_etcd_member_list()
ocf_log info "leaving members list as member with ID $member_id"
local endpoint
endpoint="$(ip_url $(attribute_node_ip get)):2379"
- if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
- rc=$?
+ ocf_run timeout 30 podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"
+ rc=$?
+ if [ $rc -ne 0 ]; then
ocf_log err "error leaving members list, error code: $rc"
fi
}
@@ -2376,14 +2401,19 @@ podman_stop()
attribute_node_revision update
attribute_node_cluster_id update
- podman_simple_status
- if [ $? -eq $OCF_NOT_RUNNING ]; then
- ocf_log info "could not leave members list: etcd container not running"
+ # Use podman inspect instead of podman exec (podman_simple_status) to check
+ # container state. podman exec enters the container's process namespace and
+ # hangs when etcd is unresponsive — the typical scenario that triggers a stop.
+ local container_running
+ container_running=$(podman inspect --format '{{.State.Running}}' "$CONTAINER" 2>/dev/null)
+ if [ "$container_running" != "true" ]; then
+ ocf_log info "could not leave members list: $CONTAINER container not running, running state: ${container_running}"
attribute_node_member_id clear
return $OCF_SUCCESS
fi
leave_etcd_member_list
+
# clear node_member_id CIB attribute only after leaving the member list
attribute_node_member_id clear
@@ -2527,6 +2557,9 @@ ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
# This is intentional - reboots are controlled stops, not failures requiring detection.
CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
DELAY_SECOND_NODE_LEAVE_SEC=10
+# Shorter etcdctl command-timeout for monitor-path calls to prevent
+# consuming the 25s monitor budget. Non-monitor callers use the default 5s.
+MONITOR_ETCDCTL_TIMEOUT="3s"
# Note: we currently monitor podman containers with the "podman exec"
# command, so make sure that invocation is always valid by enforcing the

View File

@@ -45,7 +45,7 @@
Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.10.0
Release: 110%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
Release: 111%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPLv2+ and LGPLv2+
URL: https://github.com/ClusterLabs/resource-agents
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@@ -203,6 +203,7 @@ Patch150: RHEL-116151-4-portblock-check-inverse-action.patch
Patch151: RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch
Patch152: RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch
Patch153: RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch
Patch154: RHEL-159203-podman-etcd-hardened-monitor-stop-actions.patch
# bundled ha-cloud-support libs
Patch500: ha-cloud-support-aliyun.patch
@@ -501,6 +502,7 @@ exit 1
%patch -p1 -P 151
%patch -p1 -P 152
%patch -p1 -P 153
%patch -p1 -P 154
# bundled ha-cloud-support libs
%patch -p1 -P 500
@@ -833,6 +835,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog
* Fri Mar 27 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-111
- podman-etcd: hardened monitor/stop actions
Resolves: RHEL-159203
* Thu Mar 19 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-110
- podman etcd: ignore learners when considering which node has higher revision
- podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery