From ee02a536d4abb2b60fc9538b71d7546349c3fd47 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Fri, 27 Mar 2026 07:50:05 +0100
Subject: [PATCH] - podman-etcd: hardened monitor/stop actions

  Resolves: RHEL-159203
---
 ...n-etcd-hardened-monitor-stop-actions.patch | 265 ++++++++++++++++++
 resource-agents.spec                          |   9 +-
 2 files changed, 273 insertions(+), 1 deletion(-)
 create mode 100644 RHEL-159203-podman-etcd-hardened-monitor-stop-actions.patch

diff --git a/RHEL-159203-podman-etcd-hardened-monitor-stop-actions.patch b/RHEL-159203-podman-etcd-hardened-monitor-stop-actions.patch
new file mode 100644
index 0000000..3ef0d95
--- /dev/null
+++ b/RHEL-159203-podman-etcd-hardened-monitor-stop-actions.patch
@@ -0,0 +1,265 @@
+From c909003639ef36f995f855f5b954a5ae2132f19c Mon Sep 17 00:00:00 2001
+From: Vincenzo Mauro <43814449+vimauro@users.noreply.github.com>
+Date: Mon, 23 Mar 2026 11:54:51 +0100
+Subject: [PATCH] OCPBUGS-76538: podman-etcd: monitor/stop hardening (#2130)
+
+* monitor/stop hardening
+* removed noisy log
+* Improved return code handling + PR comments
+* reduced wait_timeout_sec in podman_start
+* enriched log line on container_running
+* Fixed detect_cluster_leadership_loss in case podman exec fails
+* Reverted detect_cluster_leadership_loss changes
+* Updated return code in attribute_node_revision
+* restored initial attribute_node_revision logic and updated comment
+* restored original log line in attribute_node_revision
+* updated return codes on changed code path that reaches pacemaker
+* updated log line in check_peer
+---
+ heartbeat/podman-etcd | 133 ++++++++++++++++++++++++++----------------
+ 1 file changed, 83 insertions(+), 50 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 860aca817..4c9bbd4fa 100755
+--- a/heartbeat/podman-etcd
++++ b/heartbeat/podman-etcd
+@@ -371,25 +371,34 @@ monitor_cmd_exec()
+ {
+ 	local rc=$OCF_SUCCESS
+ 	local out
+-
+-	out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1)
+-	rc=$?
+-	# 125: no container with name or ID ${CONTAINER} found
+-	# 126: container state improper (not running)
+-	# 127: any other error
+-	# 255: podman 2+: container not running
+-	case "$rc" in
+-		125|126|255)
+-			rc=$OCF_NOT_RUNNING
+-			;;
+-		0)
+-			ocf_log debug "monitor cmd passed: exit code = $rc"
+-			;;
+-		*)
+-			ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out"
+-			rc=$OCF_ERR_GENERIC
+-			;;
+-	esac
++	local attempt
++	# 3 attempts × 5s = 15s worst case, fits within the 25s monitor timeout.
++	# The health check normally completes in <1s; the 5s per-attempt timeout
++	# is a safety net for when the container's process namespace is slow.
++	local max_attempts=3
++	local attempt_timeout=5
++
++	for attempt in $(seq 1 $max_attempts); do
++		out=$(timeout $attempt_timeout podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1)
++		rc=$?
++		# 125: no container with name or ID ${CONTAINER} found
++		# 126: container state improper (not running)
++		# 127: any other error
++		# 255: podman 2+: container not running
++		case "$rc" in
++			125|126|255)
++				return $OCF_NOT_RUNNING
++				;;
++			0)
++				ocf_log debug "monitor cmd passed: exit code = $rc"
++				return $OCF_SUCCESS
++				;;
++			*)
++				ocf_log warn "monitor cmd failed (rc=$rc), output: $out"
++				rc=$OCF_ERR_GENERIC
++				;;
++		esac
++	done
+ 
+ 	return $rc
+ }
+@@ -527,8 +536,9 @@ get_env_from_manifest() {
+ 		exit "$OCF_ERR_INSTALLED"
+ 	fi
+ 
+-	if ! env_var_value=$(jq -r ".spec.containers[].env[] | select( .name == \"$env_var_name\" ).value" "$OCF_RESKEY_pod_manifest"); then
+-		rc=$?
++	env_var_value=$(jq -r ".spec.containers[].env[] | select( .name == \"$env_var_name\" ).value" "$OCF_RESKEY_pod_manifest")
++	rc=$?
++	if [ $rc -ne 0 ]; then
+ 		ocf_log err "could not find environment variable $env_var_name in etcd pod manifest, error code: $rc"
+ 		exit "$OCF_ERR_INSTALLED"
+ 	fi
+@@ -934,10 +944,14 @@ attribute_node_cluster_id()
+ {
+ 	local action="$1"
+ 	local value
+-	if ! value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON"); then
+-		rc=$?
++	local rc
++	value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON")
++	rc=$?
++	if [ $rc -ne 0 ]; then
++		# Log the error but return success to avoid monitor failure if the file is not available yet.
++		# This should not block cluster recovery.
+ 		ocf_log err "could not get cluster_id, error code: $rc"
+-		return "$rc"
++		return $OCF_SUCCESS
+ 	fi
+ 
+ 	case "$action" in
+@@ -945,10 +959,12 @@ attribute_node_cluster_id()
+ 			echo "$value"
+ 			;;
+ 		update)
+-			if ! crm_attribute --type nodes --node "$NODENAME" --name "cluster_id" --update "$value"; then
+-				rc=$?
++			crm_attribute --type nodes --node "$NODENAME" --name "cluster_id" --update "$value"
++			rc=$?
++			if [ $rc -ne 0 ]; then
++				# Log the error but return success to avoid monitor failure if we cannot update the attribute.
+ 				ocf_log err "could not update cluster_id, error code: $rc"
+-				return "$rc"
++				return $OCF_SUCCESS
+ 			fi
+ 			;;
+ 		*)
+@@ -983,10 +999,12 @@ attribute_node_revision()
+ 			echo "$value"
+ 			;;
+ 		update)
+-			if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
+-				rc=$?
+-				ocf_log err "could not update etcd $revision, error code: $rc"
+-				return "$rc"
++			crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"
++			rc=$?
++			if [ $rc -ne 0 ]; then
++				# Log the error but return success to avoid monitor failure if we cannot update the attribute.
++				ocf_log err "could not update etcd $attribute, error code: $rc"
++				return $OCF_SUCCESS
+ 			fi
+ 			;;
+ 		*)
+@@ -1041,25 +1059,31 @@ attribute_node_member_id()
+ 			ocf_log info "member list: $member_list_json"
+ 			if [ -z "$member_list_json" ] ; then
+ 				ocf_log err "could not get $attribute: could not get member list JSON"
+-				return "$rc"
++				return $OCF_ERR_GENERIC
+ 			fi
+ 
+-			local value value_hex
+-			if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then
+-				rc=$?
++			local value value_hex rc
++			value=$(echo -n "$member_list_json" | jq -r ".header.member_id")
++			rc=$?
++			if [ $rc -ne 0 ]; then
++				# Log the error but return success to avoid monitor failure if the member ID cannot be parsed from the member list JSON.
+ 				ocf_log err "could not get $attribute from member list JSON, error code: $rc"
+-				return "$rc"
++				return $OCF_SUCCESS
+ 			fi
+ 
+ 			# JSON member_id is decimal, while etcdctl command needs the hex version
+-			if ! value_hex=$(decimal_to_hex "$value"); then
+-				ocf_log err "could not convert decimal member_id '$value' to hex, error code: $?"
++			value_hex=$(decimal_to_hex "$value")
++			rc=$?
++			if [ $rc -ne 0 ]; then
++				ocf_log err "could not convert decimal member_id '$value' to hex, error code: $rc"
+ 				return $OCF_ERR_GENERIC
+ 			fi
+-			if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"; then
+-				rc=$?
++			crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"
++			rc=$?
++			if [ $rc -ne 0 ]; then
++				# Log the error but return success to avoid monitor failure if we cannot update the attribute.
+ 				ocf_log err "could not update etcd $attribute, error code: $rc"
+-				return "$rc"
++				return $OCF_SUCCESS
+ 			fi
+ 			;;
+ 		clear)
+@@ -1446,7 +1470,7 @@ get_endpoint_status_json()
+ 	local all_etcd_endpoints
+ 
+ 	all_etcd_endpoints=$(get_all_etcd_endpoints)
+-	podman exec "${CONTAINER}" etcdctl endpoint status --endpoints="$all_etcd_endpoints" -w json
++	podman exec "${CONTAINER}" etcdctl endpoint status --command-timeout="$MONITOR_ETCDCTL_TIMEOUT" --endpoints="$all_etcd_endpoints" -w json
+ }
+ 
+ get_member_list_json() {
+@@ -1454,7 +1478,7 @@ get_member_list_json() {
+ 	local this_node_endpoint
+ 
+ 	this_node_endpoint="$(ip_url $(attribute_node_ip get)):2379"
+-	podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json
++	podman exec "${CONTAINER}" etcdctl member list --command-timeout="$MONITOR_ETCDCTL_TIMEOUT" --endpoints="$this_node_endpoint" -w json
+ }
+ 
+ detect_cluster_leadership_loss()
+@@ -1550,7 +1574,7 @@ check_peer()
+ 	fi
+ 
+ 	if ! member_list_json=$(get_member_list_json); then
+-		ocf_log info "podman failed to get member list, error code: $?"
++		ocf_log info "podman failed to get member list"
+ 		detect_cluster_leadership_loss
+ 		return $?
+ 	fi
+@@ -2145,7 +2169,7 @@ podman_start()
+ 	run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
+ 
+ 	if ocf_is_true "$JOIN_AS_LEARNER"; then
+-		local wait_timeout_sec=$((10*60))
++		local wait_timeout_sec=$((2*60))
+ 		local poll_interval_sec=5
+ 		local retries=$(( wait_timeout_sec / poll_interval_sec ))
+ 
+@@ -2354,8 +2378,9 @@ leave_etcd_member_list()
+ 	ocf_log info "leaving members list as member with ID $member_id"
+ 	local endpoint
+ 	endpoint="$(ip_url $(attribute_node_ip get)):2379"
+-	if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
+-		rc=$?
++	ocf_run timeout 30 podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"
++	rc=$?
++	if [ $rc -ne 0 ]; then
+ 		ocf_log err "error leaving members list, error code: $rc"
+ 	fi
+ }
+@@ -2376,14 +2401,19 @@ podman_stop()
+ 	attribute_node_revision update
+ 	attribute_node_cluster_id update
+ 
+-	podman_simple_status
+-	if [ $? -eq  $OCF_NOT_RUNNING ]; then
+-		ocf_log info "could not leave members list: etcd container not running"
++	# Use podman inspect instead of podman exec (podman_simple_status) to check
++	# container state. podman exec enters the container's process namespace and
++	# hangs when etcd is unresponsive — the typical scenario that triggers a stop.
++	local container_running
++	container_running=$(podman inspect --format '{{.State.Running}}' "$CONTAINER" 2>/dev/null)
++	if [ "$container_running" != "true" ]; then
++		ocf_log info "could not leave members list: $CONTAINER container not running, running state: ${container_running}"
+ 		attribute_node_member_id clear
+ 		return $OCF_SUCCESS
+ 	fi
+ 
+ 	leave_etcd_member_list
++
+ 	# clear node_member_id CIB attribute only after leaving the member list
+ 	attribute_node_member_id clear
+ 
+@@ -2527,6 +2557,9 @@ ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
+ # This is intentional - reboots are controlled stops, not failures requiring detection.
+ CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
+ DELAY_SECOND_NODE_LEAVE_SEC=10
++# Shorter etcdctl command-timeout for monitor-path calls to prevent
++# consuming the 25s monitor budget. Non-monitor callers use the default 5s.
++MONITOR_ETCDCTL_TIMEOUT="3s"
+ 
+ # Note: we currently monitor podman containers by with the "podman exec"
+ # command, so make sure that invocation is always valid by enforcing the
diff --git a/resource-agents.spec b/resource-agents.spec
index 8a04283..88723ce 100644
--- a/resource-agents.spec
+++ b/resource-agents.spec
@@ -45,7 +45,7 @@
 Name:		resource-agents
 Summary:	Open Source HA Reusable Cluster Resource Scripts
 Version:	4.10.0
-Release:	110%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release:	111%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License:	GPLv2+ and LGPLv2+
 URL:		https://github.com/ClusterLabs/resource-agents
 Source0:	%{upstream_prefix}-%{upstream_version}.tar.gz
@@ -203,6 +203,7 @@ Patch150:	RHEL-116151-4-portblock-check-inverse-action.patch
 Patch151:	RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch
 Patch152:	RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch
 Patch153:	RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch
+Patch154:	RHEL-159203-podman-etcd-hardened-monitor-stop-actions.patch
 
 # bundled ha-cloud-support libs
 Patch500:	ha-cloud-support-aliyun.patch
@@ -501,6 +502,7 @@ exit 1
 %patch -p1 -P 151
 %patch -p1 -P 152
 %patch -p1 -P 153
+%patch -p1 -P 154
 
 # bundled ha-cloud-support libs
 %patch -p1 -P 500
@@ -833,6 +835,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
 
 %changelog
+* Fri Mar 27 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-111
+- podman-etcd: hardened monitor/stop actions
+
+  Resolves: RHEL-159203
+
 * Thu Mar 19 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-110
 - podman etcd: ignore learners when considering which node has higher revision
 - podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery