- podman-etcd: add container crash detection with coordinated recovery
  Resolves: RHEL-127840, RHEL-126083
From e8fb2ad9cc14e91b74b5cde1e012d92afcddb1a5 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Sat, 25 Oct 2025 17:27:42 +0200
Subject: [PATCH] podman-etcd: add container crash detection with coordinated
 recovery

This change prevents the agent from starting prematurely when the etcd
container has failed. Previously, an early start would cause the agent
to block while waiting for peer-initiated recovery. This blocking
prevented Pacemaker from allowing the surviving agent to stop and
properly recover the cluster.

The change introduces the `container_health_check` function to monitor
the container's state and catch etcd failures. The check uses a state
file to distinguish between a planned shutdown and an unexpected
failure:

* Container Running: The state file is created or updated with the
  current epoch (timestamp). Returns: "healthy".
* Container Not Running + No State File: It is the first check.
  Returns: "not-running".
* Container Not Running + State File: An unexpected failure is
  detected.
  * If force_new_cluster is set, the status is "failed-restart-now".
  * Otherwise, the status is "failed-wait-for-peer".

The state file is written to a temporary directory (HA_RSCTMP) to
ensure automatic cleanup on reboot. It is also explicitly removed in
`podman_start` and `podman_stop` to mark planned transitions.

A new helper function, `get_time_since_last_heartbeat()`, calculates
the elapsed time since the last healthy check for diagnostic logging.
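
As an illustration only (not part of the patch), the state file holds a
single epoch timestamp written with `date +%s`, so its age can be
computed by hand the same way the helper does; this assumes HA_RSCTMP
is set as it is inside the agent (it commonly resolves to
/run/resource-agents):

    # Read the heartbeat timestamp recorded by the monitor and print its age.
    last=$(cat "${HA_RSCTMP}/podman-container-last-running")
    echo "last healthy check: $(( $(date +%s) - last ))s ago"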

Monitor behavior changes (summarized in the sketch after this list):
* failed-wait-for-peer: Returns OCF_SUCCESS to keep the resource
  running while waiting for peer-initiated recovery, since the agent
  cannot recover the cluster from a failed state on its own.
* failed-restart-now: Returns OCF_ERR_GENERIC to trigger a restart once
  the peer has set force_new_cluster.
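
In condensed form, `podman_monitor` maps the health-check result onto
OCF return codes as follows (a simplified sketch of the logic in the
diff below, with logging and the unknown-state fallback omitted):

    case "$(container_health_check)" in
        healthy)              ;;                         # continue with the regular monitor checks
        not-running)          return $OCF_NOT_RUNNING ;;
        failed-restart-now)   return $OCF_ERR_GENERIC ;;
        failed-wait-for-peer) return $OCF_SUCCESS ;;
    esac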
---
 heartbeat/podman-etcd | 133 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 124 insertions(+), 9 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index b8dfb2f9e..d596c6f2a 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1226,22 +1226,122 @@ podman_simple_status()
     return $rc
 }
 
-podman_monitor()
+# get_time_since_last_heartbeat returns the time in seconds since the heartbeat file was last updated.
+#
+# Returns: time in seconds since last heartbeat, or empty string if file doesn't exist
+get_time_since_last_heartbeat()
 {
+    local last_heartbeat
+
+    if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
+        return
+    fi
+
+    last_heartbeat=$(cat "$CONTAINER_HEARTBEAT_FILE")
+    echo $(($(date +%s) - last_heartbeat))
+}
+
+# container_health_check performs comprehensive health monitoring for the container.
+# This function allows coordinated failure handling where the agent waits for
+# peer-initiated cluster recovery in case of container failure.
+#
+# Uses a state file to track container state:
+# - Container running: Update state file with current epoch, return "healthy"
+# - Container not running + no state file: Return "not-running" (never checked before)
+# - Container not running + state file: Failure detected, check force_new_cluster
+#   - If force_new_cluster set: Return "failed-restart-now"
+#   - Otherwise: Return "failed-wait-for-peer"
+#
+# Returns: healthy, not-running, failed-restart-now, failed-wait-for-peer
+
+container_health_check()
+{
+    local rc
+
     # We rely on running podman exec to monitor the container
     # state because that command seems to be less prone to
     # performance issue under IO load.
     #
     # For probes to work, we expect cmd_exec to be able to report
-    # when a container is not running. Here, we're not interested
-    # in distinguishing whether it's stopped or non existing
-    # (there's function container_exists for that)
+    # when a container is not running. Here, we're not interested
+    # in distinguishing whether it's stopped or non existing
+    # (there's function container_exists for that)
+    # For monitor, however, we still need to know if it has stopped
+    # recently (i.e. a failure), or not (fresh start)
     monitor_cmd_exec
     rc=$?
-    if [ $rc -ne 0 ]; then
-        return $rc
+    if [ "$rc" -eq 0 ]; then
+        # Container is running - update state file with current epoch
+        local current_epoch
+        current_epoch=$(date +%s)
+        if ! echo "$current_epoch" > "$CONTAINER_HEARTBEAT_FILE"; then
+            ocf_log warn "Failed to update container heartbeat file, error code: $?"
+            # wait for peer to detect any real issue with the etcd cluster or wait for the
+            # next monitor interval
+            echo "failed-wait-for-peer"
+            return
+        fi
+        echo "healthy"
+        return
     fi
 
+    # Check if state file exists (was container running on last check?)
+    if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
+        # No state file - container was never checked before
+        ocf_log debug "Container ${CONTAINER} has no previous state"
+        echo "not-running"
+        # NOTE: this is where the probe is expected to exit, keeping the logic
+        # quick and less prone to performance issue under IO load.
+        return
+    fi
+
+    # State file exists - the container failed, check recovery status in this lifecycle
+    local time_since_heartbeat
+    time_since_heartbeat=$(get_time_since_last_heartbeat)
+    ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+
+    # Check if peer has set force_new_cluster for recovery
+    local fnc_holders
+    if ! fnc_holders=$(get_force_new_cluster); then
+        ocf_log err "Could not detect peer-initiated recovery. Checking again in the next monitor cycle"
+        echo "failed-wait-for-peer"
+        return
+    fi
+
+    if [ -n "$fnc_holders" ]; then
+        ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart"
+        echo "failed-restart-now"
+        return
+    fi
+
+    echo "failed-wait-for-peer"
+}
+
+podman_monitor()
+{
+    local container_health_state
+
+    container_health_state=$(container_health_check)
+    case "$container_health_state" in
+        healthy)
+            # Continue with normal monitoring
+            ;;
+        not-running)
+            return $OCF_NOT_RUNNING
+            ;;
+        failed-restart-now)
+            return $OCF_ERR_GENERIC
+            ;;
+        failed-wait-for-peer)
+            # Continue running, waiting for peer recovery
+            return $OCF_SUCCESS
+            ;;
+        *)
+            ocf_log err "Unknown health state: $container_health_state"
+            return $OCF_ERR_GENERIC
+            ;;
+    esac
+
     # Check if certificate files have changed, if they have, etcd needs to be restarted
     if ! etcd_certificates_hash_manager "check"; then
         return $OCF_ERR_GENERIC
@@ -1533,6 +1633,12 @@ podman_start()
     local pod_was_running=false
 
     ocf_log notice "podman-etcd start"
+
+    # Clear container health check state file
+    if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
+        ocf_log err "could not delete container health check state file"
+    fi
+
     attribute_node_ip update
     attribute_node_cluster_id update
     attribute_node_revision update
@@ -1849,15 +1955,21 @@ podman_stop()
     local rc
 
     ocf_log notice "podman-etcd stop"
+
+    # Clear container health check state file
+    if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
+        ocf_log err "could not delete container health check state file"
+    fi
+
+    attribute_node_revision update
+    attribute_node_cluster_id update
+
     podman_simple_status
     if [ $? -eq $OCF_NOT_RUNNING ]; then
         ocf_log info "could not leave members list: etcd container not running"
         return $OCF_SUCCESS
     fi
 
-    attribute_node_revision update
-    attribute_node_cluster_id update
-
     if ! member_id=$(attribute_node_member_id get); then
         ocf_log err "error leaving members list: could not get member-id"
     else
@@ -2007,6 +2119,9 @@ POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
 ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
 ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
 ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
+# State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
+# This is intentional - reboots are controlled stops, not failures requiring detection.
+CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
 
 # Note: we currently monitor podman containers by with the "podman exec"
 # command, so make sure that invocation is always valid by enforcing the
|