- podman-etcd: exclude stopping resources from active count

- podman-etcd: add container crash detection with coordinated recovery

  Resolves: RHEL-127840, RHEL-126083
This commit is contained in:
Oyvind Albrigtsen 2025-11-13 11:33:32 +01:00
parent 42cfbb8ada
commit a7e8b855e0
3 changed files with 339 additions and 1 deletions

View File

@@ -0,0 +1,222 @@
From e8fb2ad9cc14e91b74b5cde1e012d92afcddb1a5 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Sat, 25 Oct 2025 17:27:42 +0200
Subject: [PATCH] podman-etcd: add container crash detection with coordinated
recovery
This change prevents the agent from starting prematurely when the etcd
container has failed. Previously, an early start would cause the agent
to block while waiting for peer-initiated recovery. This blocking
prevented Pacemaker from allowing the surviving agent to stop and
properly recover the cluster.
The change introduces `container_health_check` function to monitor the
container's state and catch etcd failures. This check uses a state file
to distinguish between a planned shutdown and an unexpected failure:
* Container Running: The state file is created or updated with the
current epoch (timestamp). Returns: "healthy".
* Container Not Running + No State File: It's the first check. Returns:
"not-running".
* Container Not Running + State File: An unexpected failure is detected.
* If force_new_cluster is set, the status is: "failed-restart-now".
* Otherwise, the status is: "failed-wait-for-peer".
The state file is written in a temporary directory (HA_RSCTMP) to ensure
automatic cleanup on reboot. It is also explicitly removed in
`podman_start` and `podman_stop` to mark planned transitions.
A new helper function `get_time_since_last_heartbeat()` calculates
elapsed time since the last healthy check for diagnostic logging.
Monitor behavior changes:
* failed-wait-for-peer: Returns OCF_SUCCESS to keep resource running
while waiting for peer-initiated recovery, as the agent is not able
to recover the cluster from a failed state.
* failed-restart-now: Returns OCF_ERR_GENERIC to trigger restart once
peer has set force_new_cluster
---
heartbeat/podman-etcd | 133 +++++++++++++++++++++++++++++++++++++++---
1 file changed, 124 insertions(+), 9 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index b8dfb2f9e..d596c6f2a 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1226,22 +1226,122 @@ podman_simple_status()
return $rc
}
-podman_monitor()
+# get_time_since_last_heartbeat returns the time in seconds since the heartbeat file was last updated.
+#
+# Returns: time in seconds since last heartbeat, or empty string if file doesn't exist
+get_time_since_last_heartbeat()
{
+ local last_heartbeat
+
+ if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
+ return
+ fi
+
+ last_heartbeat=$(cat "$CONTAINER_HEARTBEAT_FILE")
+ echo $(($(date +%s) - last_heartbeat))
+}
+
+# container_health_check performs comprehensive health monitoring for the container.
+# This function allows coordinated failure handling where the agent waits for
+# peer-initiated cluster recovery in case of container failure.
+#
+# Uses a state file to track container state:
+# - Container running: Update state file with current epoch, return "healthy"
+# - Container not running + no state file: Return "not-running" (never checked before)
+# - Container not running + state file: Failure detected, check force_new_cluster
+# - If force_new_cluster set: Return "failed-restart-now"
+# - Otherwise: Return "failed-wait-for-peer"
+#
+# Returns: healthy, not-running, failed-restart-now, failed-wait-for-peer
+
+container_health_check()
+{
+ local rc
+
# We rely on running podman exec to monitor the container
# state because that command seems to be less prone to
# performance issue under IO load.
#
# For probes to work, we expect cmd_exec to be able to report
- # when a container is not running. Here, we're not interested
- # in distinguishing whether it's stopped or non existing
- # (there's function container_exists for that)
+ # when a container is not running. Here, we're not interested
+ # in distinguishing whether it's stopped or non existing
+ # (there's function container_exists for that)
+ # For monitor, however, we still need to know if it has stopped
+ # recently (i.e. a failure), or not (fresh start)
monitor_cmd_exec
rc=$?
- if [ $rc -ne 0 ]; then
- return $rc
+ if [ "$rc" -eq 0 ]; then
+ # Container is running - update state file with current epoch
+ local current_epoch
+ current_epoch=$(date +%s)
+ if ! echo "$current_epoch" > "$CONTAINER_HEARTBEAT_FILE"; then
+ ocf_log warn "Failed to update container heartbeat file, error code: $?"
+ # wait for peer to detect any real issue with the etcd cluster or wait for the
+ # next monitor interval
+ echo "failed-wait-for-peer"
+ return
+ fi
+ echo "healthy"
+ return
fi
+ # Check if state file exists (was container running on last check?)
+ if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
+ # No state file - container was never checked before
+ ocf_log debug "Container ${CONTAINER} has no previous state"
+ echo "not-running"
+ # NOTE: this is where the probe is expected to exit, keeping the logic
+ # quick and less prone to performance issue under IO load.
+ return
+ fi
+
+ # State file exists - the container failed, check recovery status in this lifecycle
+ local time_since_heartbeat
+ time_since_heartbeat=$(get_time_since_last_heartbeat)
+ ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+
+ # Check if peer has set force_new_cluster for recovery
+ local fnc_holders
+ if ! fnc_holders=$(get_force_new_cluster); then
+ ocf_log err "Could not detect peer-initiated recovery. Checking again in the next monitor cycle"
+ echo "failed-wait-for-peer"
+ return
+ fi
+
+ if [ -n "$fnc_holders" ]; then
+ ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart"
+ echo "failed-restart-now"
+ return
+ fi
+
+ echo "failed-wait-for-peer"
+}
+
+podman_monitor()
+{
+ local container_health_state
+
+ container_health_state=$(container_health_check)
+ case "$container_health_state" in
+ healthy)
+ # Continue with normal monitoring
+ ;;
+ not-running)
+ return $OCF_NOT_RUNNING
+ ;;
+ failed-restart-now)
+ return $OCF_ERR_GENERIC
+ ;;
+ failed-wait-for-peer)
+ # Continue running, waiting for peer recovery
+ return $OCF_SUCCESS
+ ;;
+ *)
+ ocf_log err "Unknown health state: $container_health_state"
+ return $OCF_ERR_GENERIC
+ ;;
+ esac
+
# Check if certificate files have changed, if they have, etcd needs to be restarted
if ! etcd_certificates_hash_manager "check"; then
return $OCF_ERR_GENERIC
@@ -1533,6 +1633,12 @@ podman_start()
local pod_was_running=false
ocf_log notice "podman-etcd start"
+
+ # Clear container health check state file
+ if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
+ ocf_log err "could not delete container health check state file"
+ fi
+
attribute_node_ip update
attribute_node_cluster_id update
attribute_node_revision update
@@ -1849,15 +1955,21 @@ podman_stop()
local rc
ocf_log notice "podman-etcd stop"
+
+ # Clear container health check state file
+ if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
+ ocf_log err "could not delete container health check state file"
+ fi
+
+ attribute_node_revision update
+ attribute_node_cluster_id update
+
podman_simple_status
if [ $? -eq $OCF_NOT_RUNNING ]; then
ocf_log info "could not leave members list: etcd container not running"
return $OCF_SUCCESS
fi
- attribute_node_revision update
- attribute_node_cluster_id update
-
if ! member_id=$(attribute_node_member_id get); then
ocf_log err "error leaving members list: could not get member-id"
else
@@ -2007,6 +2119,9 @@ POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
+# State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
+# This is intentional - reboots are controlled stops, not failures requiring detection.
+CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
# Note: we currently monitor podman containers by with the "podman exec"
# command, so make sure that invocation is always valid by enforcing the

View File

@@ -0,0 +1,106 @@
From d5b4428e6cd66fd47680531ff0244d9b56e4e4c2 Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Tue, 14 Oct 2025 11:57:09 +0200
Subject: [PATCH 1/2] Redo counting of active_resources
---
heartbeat/podman-etcd | 46 +++++++++++++++++++++++++++++++++++++++++--
1 file changed, 44 insertions(+), 2 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index e1425ec02..dbf16918d 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1029,6 +1029,48 @@ get_peer_node_name() {
crm_node -l | awk '{print $2}' | grep -v "$NODENAME"
}
+# Calculate the count of truly active resources by excluding those being stopped.
+# According to Pacemaker documentation, during "Post-notification (stop) /
+# Pre-notification (start)" transitions, the true active resource count should be:
+# Active resources = $OCF_RESKEY_CRM_meta_notify_active_resource
+# minus $OCF_RESKEY_CRM_meta_notify_stop_resource
+# This handles the case where a resource appears in both the active and stop lists
+# during rapid restart scenarios (e.g., process crash recovery).
+get_truly_active_resources_count() {
+ local active_list="$OCF_RESKEY_CRM_meta_notify_active_resource"
+ local stop_list="$OCF_RESKEY_CRM_meta_notify_stop_resource"
+ local truly_active=""
+
+ # If no active resources, return 0
+ if [ -z "$active_list" ]; then
+ echo "0"
+ return
+ fi
+
+ # If no resources being stopped, return count of active resources
+ if [ -z "$stop_list" ]; then
+ echo "$active_list" | wc -w
+ return
+ fi
+
+ # Filter out resources that are being stopped from the active list
+ for resource in $active_list; do
+ local is_stopping=0
+ for stop_resource in $stop_list; do
+ if [ "$resource" = "$stop_resource" ]; then
+ is_stopping=1
+ break
+ fi
+ done
+ if [ $is_stopping -eq 0 ]; then
+ truly_active="$truly_active $resource"
+ fi
+ done
+
+ # Count the truly active resources (trim leading space and count words)
+ echo "$truly_active" | wc -w
+}
+
get_all_etcd_endpoints() {
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
name=$(echo "$node" | cut -d: -f1)
@@ -1529,8 +1571,8 @@ podman_start()
# - 0 active agents, 1 starting: we are starting; the peer is not starting
# - 0 active agents, 2 starting: both agents are starting simultaneously
local active_resources_count
- active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
- ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
+ active_resources_count=$(get_truly_active_resources_count)
+ ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
case "$active_resources_count" in
1)
if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
From 0114ddf83c95122a7f9fe9f704f864242cdb284a Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Wed, 29 Oct 2025 12:49:17 +0100
Subject: [PATCH 2/2] Update truly active resources count with safer empty
calculation
---
heartbeat/podman-etcd | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index dbf16918d..8fc92a537 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1042,13 +1042,15 @@ get_truly_active_resources_count() {
local truly_active=""
# If no active resources, return 0
- if [ -z "$active_list" ]; then
+ # Use word count to handle whitespace-only values
+ if [ "$(echo "$active_list" | wc -w)" -eq 0 ]; then
echo "0"
return
fi
# If no resources being stopped, return count of active resources
- if [ -z "$stop_list" ]; then
+ # Use word count to handle whitespace-only values
+ if [ "$(echo "$stop_list" | wc -w)" -eq 0 ]; then
echo "$active_list" | wc -w
return
fi

View File

@@ -45,7 +45,7 @@
Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.16.0
Release: 40%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
Release: 41%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPL-2.0-or-later AND LGPL-2.1-or-later
URL: https://github.com/ClusterLabs/resource-agents
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@@ -103,6 +103,8 @@ Patch50: RHEL-102779-pgsqlms-fix-validate-warnings.patch
Patch51: RHEL-112443-nginx-fix-validate-warnings.patch
Patch52: RHEL-121985-Filesystem-speed-up-get-PIDs.patch
Patch53: RHEL-126791-storage_mon-fix-handling-of-4k-block-devices.patch
Patch54: RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch
Patch55: RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
# bundled ha-cloud-support libs
Patch500: ha-cloud-support-aliyun.patch
@@ -327,6 +329,8 @@ exit 1
%patch -p1 -P 51
%patch -p1 -P 52
%patch -p1 -P 53
%patch -p1 -P 54
%patch -p1 -P 55 -F2
# bundled ha-cloud-support libs
%patch -p1 -P 500
@@ -659,6 +663,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog
* Thu Nov 13 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-41
- podman-etcd: exclude stopping resources from active count
- podman-etcd: add container crash detection with coordinated recovery
Resolves: RHEL-127840, RHEL-126083
* Mon Nov 10 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-40
- storage_mon: fix handling of 4k block devices