From a7e8b855e0194faa48ce4bf7c707c98e0f3fc031 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen
Date: Thu, 13 Nov 2025 11:33:32 +0100
Subject: [PATCH] - podman-etcd: exclude stopping resources from active count
 - podman-etcd: add container crash detection with coordinated recovery

  Resolves: RHEL-127840, RHEL-126083
---
 ...-detection-with-coordinated-recovery.patch | 222 ++++++++++++++++++
 ...stopping-resources-from-active-count.patch | 106 +++++++++
 resource-agents.spec                          |  12 +-
 3 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
 create mode 100644 RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch

diff --git a/RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch b/RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
new file mode 100644
index 0000000..0c2f3e9
--- /dev/null
+++ b/RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
@@ -0,0 +1,222 @@
+From e8fb2ad9cc14e91b74b5cde1e012d92afcddb1a5 Mon Sep 17 00:00:00 2001
+From: Carlo Lobrano
+Date: Sat, 25 Oct 2025 17:27:42 +0200
+Subject: [PATCH] podman-etcd: add container crash detection with coordinated
+ recovery
+
+This change prevents the agent from starting prematurely when the etcd
+container has failed. Previously, an early start would cause the agent
+to block while waiting for peer-initiated recovery. This blocking
+prevented Pacemaker from allowing the surviving agent to stop and
+properly recover the cluster.
+
+The change introduces `container_health_check` function to monitor the
+container's state and catch etcd failures. This check uses a state file
+to distinguish between a planned shutdown and an unexpected failure:
+
+* Container Running: The state file is created or updated with the
+  current epoch (timestamp). Returns: "healthy".
+* Container Not Running + No State File: It's the first check. Returns:
+  "not-running".
+* Container Not Running + State File: An unexpected failure is detected.
+  * If force_new_cluster is set, the status is: "failed-restart-now".
+  * Otherwise, the status is: "failed-wait-for-peer".
+
+The state file is written in a temporary directory (HA_RSCTMP) to ensure
+automatic cleanup on reboot. It is also explicitly removed in
+`podman_start` and `podman_stop` to mark planned transitions.
+
+A new helper function `get_time_since_last_heartbeat()` calculates
+elapsed time since the last healthy check for diagnostic logging.
+
+Monitor behavior changes:
+* failed-wait-for-peer: Returns OCF_SUCCESS to keep resource running
+  while waiting for peer-initiated recovery, as the agent is not able
+  to recover the cluster from a failed state.
+* failed-restart-now: Returns OCF_ERR_GENERIC to trigger restart once
+  peer has set force_new_cluster
+---
+ heartbeat/podman-etcd | 133 +++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 124 insertions(+), 9 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index b8dfb2f9e..d596c6f2a 100755
+--- a/heartbeat/podman-etcd
++++ b/heartbeat/podman-etcd
+@@ -1226,22 +1226,122 @@ podman_simple_status()
+     return $rc
+ }
+ 
+-podman_monitor()
++# get_time_since_last_heartbeat returns the time in seconds since the heartbeat file was last updated.
++#
++# Returns: time in seconds since last heartbeat, or empty string if file doesn't exist
++get_time_since_last_heartbeat()
+ {
++    local last_heartbeat
++
++    if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
++        return
++    fi
++
++    last_heartbeat=$(cat "$CONTAINER_HEARTBEAT_FILE")
++    echo $(($(date +%s) - last_heartbeat))
++}
++
++# container_health_check performs comprehensive health monitoring for the container.
++# This function allows coordinated failure handling where the agent waits for
++# peer-initiated cluster recovery in case of container failure.
++#
++# Uses a state file to track container state:
++# - Container running: Update state file with current epoch, return "healthy"
++# - Container not running + no state file: Return "not-running" (never checked before)
++# - Container not running + state file: Failure detected, check force_new_cluster
++#   - If force_new_cluster set: Return "failed-restart-now"
++#   - Otherwise: Return "failed-wait-for-peer"
++#
++# Returns: healthy, not-running, failed-restart-now, failed-wait-for-peer
++
++container_health_check()
++{
++    local rc
++
+     # We rely on running podman exec to monitor the container
+     # state because that command seems to be less prone to
+     # performance issue under IO load.
+     #
+     # For probes to work, we expect cmd_exec to be able to report
+-    # when a container is not running. Here, we're not interested
+-    # in distinguishing whether it's stopped or non existing
+-    # (there's function container_exists for that)
++    # when a container is not running. Here, we're not interested
++    # in distinguishing whether it's stopped or non existing
++    # (there's function container_exists for that)
++    # For monitor, however, we still need to know if it has stopped
++    # recently (i.e. a failure), or not (fresh start)
+     monitor_cmd_exec
+     rc=$?
+-    if [ $rc -ne 0 ]; then
+-        return $rc
++    if [ "$rc" -eq 0 ]; then
++        # Container is running - update state file with current epoch
++        local current_epoch
++        current_epoch=$(date +%s)
++        if ! echo "$current_epoch" > "$CONTAINER_HEARTBEAT_FILE"; then
++            ocf_log warn "Failed to update container heartbeat file, error code: $?"
++            # wait for peer to detect any real issue with the etcd cluster or wait for the
++            # next monitor interval
++            echo "failed-wait-for-peer"
++            return
++        fi
++        echo "healthy"
++        return
+     fi
+ 
++    # Check if state file exists (was container running on last check?)
++    if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
++        # No state file - container was never checked before
++        ocf_log debug "Container ${CONTAINER} has no previous state"
++        echo "not-running"
++        # NOTE: this is where the probe is expected to exit, keeping the logic
++        # quick and less prone to performance issue under IO load.
++        return
++    fi
++
++    # State file exists - the container failed, check recovery status in this lifecycle
++    local time_since_heartbeat
++    time_since_heartbeat=$(get_time_since_last_heartbeat)
++    ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
++
++    # Check if peer has set force_new_cluster for recovery
++    local fnc_holders
++    if ! fnc_holders=$(get_force_new_cluster); then
++        ocf_log err "Could not detect peer-initiated recovery. Checking again in the next monitor cycle"
++        echo "failed-wait-for-peer"
++        return
++    fi
++
++    if [ -n "$fnc_holders" ]; then
++        ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart"
++        echo "failed-restart-now"
++        return
++    fi
++
++    echo "failed-wait-for-peer"
++}
++
++podman_monitor()
++{
++    local container_health_state
++
++    container_health_state=$(container_health_check)
++    case "$container_health_state" in
++        healthy)
++            # Continue with normal monitoring
++            ;;
++        not-running)
++            return $OCF_NOT_RUNNING
++            ;;
++        failed-restart-now)
++            return $OCF_ERR_GENERIC
++            ;;
++        failed-wait-for-peer)
++            # Continue running, waiting for peer recovery
++            return $OCF_SUCCESS
++            ;;
++        *)
++            ocf_log err "Unknown health state: $container_health_state"
++            return $OCF_ERR_GENERIC
++            ;;
++    esac
++
+     # Check if certificate files have changed, if they have, etcd needs to be restarted
+     if ! etcd_certificates_hash_manager "check"; then
+         return $OCF_ERR_GENERIC
+@@ -1533,6 +1633,12 @@ podman_start()
+     local pod_was_running=false
+ 
+     ocf_log notice "podman-etcd start"
++
++    # Clear container health check state file
++    if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
++        ocf_log err "could not delete container health check state file"
++    fi
++
+     attribute_node_ip update
+     attribute_node_cluster_id update
+     attribute_node_revision update
+@@ -1849,15 +1955,21 @@ podman_stop()
+     local rc
+ 
+     ocf_log notice "podman-etcd stop"
++
++    # Clear container health check state file
++    if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
++        ocf_log err "could not delete container health check state file"
++    fi
++
++    attribute_node_revision update
++    attribute_node_cluster_id update
++
+     podman_simple_status
+     if [ $? -eq $OCF_NOT_RUNNING ]; then
+         ocf_log info "could not leave members list: etcd container not running"
+         return $OCF_SUCCESS
+     fi
+ 
+-    attribute_node_revision update
+-    attribute_node_cluster_id update
+-
+     if ! member_id=$(attribute_node_member_id get); then
+         ocf_log err "error leaving members list: could not get member-id"
+     else
+@@ -2007,6 +2119,9 @@ POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
+ ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
+ ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
+ ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
++# State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
++# This is intentional - reboots are controlled stops, not failures requiring detection.
++CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
+ 
+ # Note: we currently monitor podman containers by with the "podman exec"
+ # command, so make sure that invocation is always valid by enforcing the
diff --git a/RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch b/RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch
new file mode 100644
index 0000000..d065a34
--- /dev/null
+++ b/RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch
@@ -0,0 +1,106 @@
+From d5b4428e6cd66fd47680531ff0244d9b56e4e4c2 Mon Sep 17 00:00:00 2001
+From: Pablo Fontanilla
+Date: Tue, 14 Oct 2025 11:57:09 +0200
+Subject: [PATCH 1/2] Redo counting of active_resources
+
+---
+ heartbeat/podman-etcd | 46 +++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 44 insertions(+), 2 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index e1425ec02..dbf16918d 100755
+--- a/heartbeat/podman-etcd
++++ b/heartbeat/podman-etcd
+@@ -1029,6 +1029,48 @@ get_peer_node_name() {
+     crm_node -l | awk '{print $2}' | grep -v "$NODENAME"
+ }
+ 
++# Calculate the count of truly active resources by excluding those being stopped.
++# According to Pacemaker documentation, during "Post-notification (stop) /
++# Pre-notification (start)" transitions, the true active resource count should be:
++# Active resources = $OCF_RESKEY_CRM_meta_notify_active_resource
++# minus $OCF_RESKEY_CRM_meta_notify_stop_resource
++# This handles the case where a resource appears in both the active and stop lists
++# during rapid restart scenarios (e.g., process crash recovery).
++get_truly_active_resources_count() {
++    local active_list="$OCF_RESKEY_CRM_meta_notify_active_resource"
++    local stop_list="$OCF_RESKEY_CRM_meta_notify_stop_resource"
++    local truly_active=""
++
++    # If no active resources, return 0
++    if [ -z "$active_list" ]; then
++        echo "0"
++        return
++    fi
++
++    # If no resources being stopped, return count of active resources
++    if [ -z "$stop_list" ]; then
++        echo "$active_list" | wc -w
++        return
++    fi
++
++    # Filter out resources that are being stopped from the active list
++    for resource in $active_list; do
++        local is_stopping=0
++        for stop_resource in $stop_list; do
++            if [ "$resource" = "$stop_resource" ]; then
++                is_stopping=1
++                break
++            fi
++        done
++        if [ $is_stopping -eq 0 ]; then
++            truly_active="$truly_active $resource"
++        fi
++    done
++
++    # Count the truly active resources (trim leading space and count words)
++    echo "$truly_active" | wc -w
++}
++
+ get_all_etcd_endpoints() {
+     for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
+         name=$(echo "$node" | cut -d: -f1)
+@@ -1529,8 +1571,8 @@ podman_start()
+     # - 0 active agents, 1 starting: we are starting; the peer is not starting
+     # - 0 active agents, 2 starting: both agents are starting simultaneously
+     local active_resources_count
+-    active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
+-    ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
++    active_resources_count=$(get_truly_active_resources_count)
++    ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
+     case "$active_resources_count" in
+         1)
+             if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
+
+From 0114ddf83c95122a7f9fe9f704f864242cdb284a Mon Sep 17 00:00:00 2001
+From: Pablo Fontanilla
+Date: Wed, 29 Oct 2025 12:49:17 +0100
+Subject: [PATCH 2/2] Update truly active resources count with safer empty
+ calculation
+
+---
+ heartbeat/podman-etcd | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index dbf16918d..8fc92a537 100755
+--- a/heartbeat/podman-etcd
++++ b/heartbeat/podman-etcd
+@@ -1042,13 +1042,15 @@ get_truly_active_resources_count() {
+     local truly_active=""
+ 
+     # If no active resources, return 0
+-    if [ -z "$active_list" ]; then
++    # Use word count to handle whitespace-only values
++    if [ "$(echo "$active_list" | wc -w)" -eq 0 ]; then
+         echo "0"
+         return
+     fi
+ 
+     # If no resources being stopped, return count of active resources
+-    if [ -z "$stop_list" ]; then
++    # Use word count to handle whitespace-only values
++    if [ "$(echo "$stop_list" | wc -w)" -eq 0 ]; then
+         echo "$active_list" | wc -w
+         return
+     fi
diff --git a/resource-agents.spec b/resource-agents.spec
index 3b63a15..9495b48 100644
--- a/resource-agents.spec
+++ b/resource-agents.spec
@@ -45,7 +45,7 @@
 Name: resource-agents
 Summary: Open Source HA Reusable Cluster Resource Scripts
 Version: 4.16.0
-Release: 40%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release: 41%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License: GPL-2.0-or-later AND LGPL-2.1-or-later
 URL: https://github.com/ClusterLabs/resource-agents
 Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@@ -103,6 +103,8 @@ Patch50: RHEL-102779-pgsqlms-fix-validate-warnings.patch
 Patch51: RHEL-112443-nginx-fix-validate-warnings.patch
 Patch52: RHEL-121985-Filesystem-speed-up-get-PIDs.patch
 Patch53: RHEL-126791-storage_mon-fix-handling-of-4k-block-devices.patch
+Patch54: RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch
+Patch55: RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
 
 # bundled ha-cloud-support libs
 Patch500: ha-cloud-support-aliyun.patch
@@ -327,6 +329,8 @@ exit 1
 %patch -p1 -P 51
 %patch -p1 -P 52
 %patch -p1 -P 53
+%patch -p1 -P 54
+%patch -p1 -P 55 -F2
 
 # bundled ha-cloud-support libs
 %patch -p1 -P 500
@@ -659,6 +663,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
 
 %changelog
+* Thu Nov 13 2025 Oyvind Albrigtsen - 4.16.0-41
+- podman-etcd: exclude stopping resources from active count
+- podman-etcd: add container crash detection with coordinated recovery
+
+  Resolves: RHEL-127840, RHEL-126083
+
 * Mon Nov 10 2025 Oyvind Albrigtsen - 4.16.0-40
 - storage_mon: fix handling of 4k block devices