- podman-etcd: exclude stopping resources from active count

- podman-etcd: add container crash detection with coordinated recovery

  Resolves: RHEL-127840, RHEL-126083
This commit is contained in:
Oyvind Albrigtsen 2025-11-13 11:33:32 +01:00
parent 42cfbb8ada
commit a7e8b855e0
3 changed files with 339 additions and 1 deletions

View File

@@ -0,0 +1,222 @@
From e8fb2ad9cc14e91b74b5cde1e012d92afcddb1a5 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Sat, 25 Oct 2025 17:27:42 +0200
Subject: [PATCH] podman-etcd: add container crash detection with coordinated
recovery
This change prevents the agent from starting prematurely when the etcd
container has failed. Previously, an early start would cause the agent
to block while waiting for peer-initiated recovery. This blocking
prevented Pacemaker from allowing the surviving agent to stop and
properly recover the cluster.
The change introduces `container_health_check` function to monitor the
container's state and catch etcd failures. This check uses a state file
to distinguish between a planned shutdown and an unexpected failure:
* Container Running: The state file is created or updated with the
current epoch (timestamp). Returns: "healthy".
* Container Not Running + No State File: It's the first check. Returns:
"not-running".
* Container Not Running + State File: An unexpected failure is detected.
* If force_new_cluster is set, the status is: "failed-restart-now".
* Otherwise, the status is: "failed-wait-for-peer".
The state file is written in a temporary directory (HA_RSCTMP) to ensure
automatic cleanup on reboot. It is also explicitly removed in
`podman_start` and `podman_stop` to mark planned transitions.
A new helper function `get_time_since_last_heartbeat()` calculates
elapsed time since the last healthy check for diagnostic logging.
Monitor behavior changes:
* failed-wait-for-peer: Returns OCF_SUCCESS to keep resource running
while waiting for peer-initiated recovery, as the agent is not able
to recover the cluster from a failed state.
* failed-restart-now: Returns OCF_ERR_GENERIC to trigger restart once
peer has set force_new_cluster
---
heartbeat/podman-etcd | 133 +++++++++++++++++++++++++++++++++++++++---
1 file changed, 124 insertions(+), 9 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index b8dfb2f9e..d596c6f2a 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1226,22 +1226,122 @@ podman_simple_status()
return $rc
}
-podman_monitor()
+# get_time_since_last_heartbeat returns the time in seconds since the heartbeat file was last updated.
+#
+# Returns: time in seconds since last heartbeat, or empty string if file doesn't exist
+get_time_since_last_heartbeat()
{
+ local last_heartbeat
+
+ if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
+ return
+ fi
+
+ last_heartbeat=$(cat "$CONTAINER_HEARTBEAT_FILE")
+ echo $(($(date +%s) - last_heartbeat))
+}
+
+# container_health_check performs comprehensive health monitoring for the container.
+# This function allows coordinated failure handling where the agent waits for
+# peer-initiated cluster recovery in case of container failure.
+#
+# Uses a state file to track container state:
+# - Container running: Update state file with current epoch, return "healthy"
+# - Container not running + no state file: Return "not-running" (never checked before)
+# - Container not running + state file: Failure detected, check force_new_cluster
+# - If force_new_cluster set: Return "failed-restart-now"
+# - Otherwise: Return "failed-wait-for-peer"
+#
+# Returns: healthy, not-running, failed-restart-now, failed-wait-for-peer
+
+container_health_check()
+{
+ local rc
+
# We rely on running podman exec to monitor the container
# state because that command seems to be less prone to
# performance issue under IO load.
#
# For probes to work, we expect cmd_exec to be able to report
- # when a container is not running. Here, we're not interested
- # in distinguishing whether it's stopped or non existing
- # (there's function container_exists for that)
+ # when a container is not running. Here, we're not interested
+ # in distinguishing whether it's stopped or non existing
+ # (there's function container_exists for that)
+ # For monitor, however, we still need to know if it has stopped
+ # recently (i.e. a failure), or not (fresh start)
monitor_cmd_exec
rc=$?
- if [ $rc -ne 0 ]; then
- return $rc
+ if [ "$rc" -eq 0 ]; then
+ # Container is running - update state file with current epoch
+ local current_epoch
+ current_epoch=$(date +%s)
+ if ! echo "$current_epoch" > "$CONTAINER_HEARTBEAT_FILE"; then
+ ocf_log warn "Failed to update container heartbeat file, error code: $?"
+ # wait for peer to detect any real issue with the etcd cluster or wait for the
+ # next monitor interval
+ echo "failed-wait-for-peer"
+ return
+ fi
+ echo "healthy"
+ return
fi
+ # Check if state file exists (was container running on last check?)
+ if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
+ # No state file - container was never checked before
+ ocf_log debug "Container ${CONTAINER} has no previous state"
+ echo "not-running"
+ # NOTE: this is where the probe is expected to exit, keeping the logic
+ # quick and less prone to performance issue under IO load.
+ return
+ fi
+
+ # State file exists - the container failed, check recovery status in this lifecycle
+ local time_since_heartbeat
+ time_since_heartbeat=$(get_time_since_last_heartbeat)
+ ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+
+ # Check if peer has set force_new_cluster for recovery
+ local fnc_holders
+ if ! fnc_holders=$(get_force_new_cluster); then
+ ocf_log err "Could not detect peer-initiated recovery. Checking again in the next monitor cycle"
+ echo "failed-wait-for-peer"
+ return
+ fi
+
+ if [ -n "$fnc_holders" ]; then
+ ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart"
+ echo "failed-restart-now"
+ return
+ fi
+
+ echo "failed-wait-for-peer"
+}
+
+podman_monitor()
+{
+ local container_health_state
+
+ container_health_state=$(container_health_check)
+ case "$container_health_state" in
+ healthy)
+ # Continue with normal monitoring
+ ;;
+ not-running)
+ return $OCF_NOT_RUNNING
+ ;;
+ failed-restart-now)
+ return $OCF_ERR_GENERIC
+ ;;
+ failed-wait-for-peer)
+ # Continue running, waiting for peer recovery
+ return $OCF_SUCCESS
+ ;;
+ *)
+ ocf_log err "Unknown health state: $container_health_state"
+ return $OCF_ERR_GENERIC
+ ;;
+ esac
+
# Check if certificate files have changed, if they have, etcd needs to be restarted
if ! etcd_certificates_hash_manager "check"; then
return $OCF_ERR_GENERIC
@@ -1533,6 +1633,12 @@ podman_start()
local pod_was_running=false
ocf_log notice "podman-etcd start"
+
+ # Clear container health check state file
+ if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
+ ocf_log err "could not delete container health check state file"
+ fi
+
attribute_node_ip update
attribute_node_cluster_id update
attribute_node_revision update
@@ -1849,15 +1955,21 @@ podman_stop()
local rc
ocf_log notice "podman-etcd stop"
+
+ # Clear container health check state file
+ if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
+ ocf_log err "could not delete container health check state file"
+ fi
+
+ attribute_node_revision update
+ attribute_node_cluster_id update
+
podman_simple_status
if [ $? -eq $OCF_NOT_RUNNING ]; then
ocf_log info "could not leave members list: etcd container not running"
return $OCF_SUCCESS
fi
- attribute_node_revision update
- attribute_node_cluster_id update
-
if ! member_id=$(attribute_node_member_id get); then
ocf_log err "error leaving members list: could not get member-id"
else
@@ -2007,6 +2119,9 @@ POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
+# State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
+# This is intentional - reboots are controlled stops, not failures requiring detection.
+CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
# Note: we currently monitor podman containers by with the "podman exec"
# command, so make sure that invocation is always valid by enforcing the

View File

@@ -0,0 +1,106 @@
From d5b4428e6cd66fd47680531ff0244d9b56e4e4c2 Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Tue, 14 Oct 2025 11:57:09 +0200
Subject: [PATCH 1/2] Redo counting of active_resources
---
heartbeat/podman-etcd | 46 +++++++++++++++++++++++++++++++++++++++++--
1 file changed, 44 insertions(+), 2 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index e1425ec02..dbf16918d 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1029,6 +1029,48 @@ get_peer_node_name() {
crm_node -l | awk '{print $2}' | grep -v "$NODENAME"
}
+# Calculate the count of truly active resources by excluding those being stopped.
+# According to Pacemaker documentation, during "Post-notification (stop) /
+# Pre-notification (start)" transitions, the true active resource count should be:
+# Active resources = $OCF_RESKEY_CRM_meta_notify_active_resource
+# minus $OCF_RESKEY_CRM_meta_notify_stop_resource
+# This handles the case where a resource appears in both the active and stop lists
+# during rapid restart scenarios (e.g., process crash recovery).
+get_truly_active_resources_count() {
+ local active_list="$OCF_RESKEY_CRM_meta_notify_active_resource"
+ local stop_list="$OCF_RESKEY_CRM_meta_notify_stop_resource"
+ local truly_active=""
+
+ # If no active resources, return 0
+ if [ -z "$active_list" ]; then
+ echo "0"
+ return
+ fi
+
+ # If no resources being stopped, return count of active resources
+ if [ -z "$stop_list" ]; then
+ echo "$active_list" | wc -w
+ return
+ fi
+
+ # Filter out resources that are being stopped from the active list
+ for resource in $active_list; do
+ local is_stopping=0
+ for stop_resource in $stop_list; do
+ if [ "$resource" = "$stop_resource" ]; then
+ is_stopping=1
+ break
+ fi
+ done
+ if [ $is_stopping -eq 0 ]; then
+ truly_active="$truly_active $resource"
+ fi
+ done
+
+ # Count the truly active resources (trim leading space and count words)
+ echo "$truly_active" | wc -w
+}
+
get_all_etcd_endpoints() {
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
name=$(echo "$node" | cut -d: -f1)
@@ -1529,8 +1571,8 @@ podman_start()
# - 0 active agents, 1 starting: we are starting; the peer is not starting
# - 0 active agents, 2 starting: both agents are starting simultaneously
local active_resources_count
- active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
- ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
+ active_resources_count=$(get_truly_active_resources_count)
+ ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
case "$active_resources_count" in
1)
if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
From 0114ddf83c95122a7f9fe9f704f864242cdb284a Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Wed, 29 Oct 2025 12:49:17 +0100
Subject: [PATCH 2/2] Update truly active resources count with safer empty
calculation
---
heartbeat/podman-etcd | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index dbf16918d..8fc92a537 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1042,13 +1042,15 @@ get_truly_active_resources_count() {
local truly_active=""
# If no active resources, return 0
- if [ -z "$active_list" ]; then
+ # Use word count to handle whitespace-only values
+ if [ "$(echo "$active_list" | wc -w)" -eq 0 ]; then
echo "0"
return
fi
# If no resources being stopped, return count of active resources
- if [ -z "$stop_list" ]; then
+ # Use word count to handle whitespace-only values
+ if [ "$(echo "$stop_list" | wc -w)" -eq 0 ]; then
echo "$active_list" | wc -w
return
fi

View File

@@ -45,7 +45,7 @@
Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.16.0
Release: 40%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
Release: 41%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPL-2.0-or-later AND LGPL-2.1-or-later
URL: https://github.com/ClusterLabs/resource-agents
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@@ -103,6 +103,8 @@ Patch50: RHEL-102779-pgsqlms-fix-validate-warnings.patch
Patch51: RHEL-112443-nginx-fix-validate-warnings.patch
Patch52: RHEL-121985-Filesystem-speed-up-get-PIDs.patch
Patch53: RHEL-126791-storage_mon-fix-handling-of-4k-block-devices.patch
Patch54: RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch
Patch55: RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
# bundled ha-cloud-support libs
Patch500: ha-cloud-support-aliyun.patch
@@ -327,6 +329,8 @@ exit 1
%patch -p1 -P 51
%patch -p1 -P 52
%patch -p1 -P 53
%patch -p1 -P 54
%patch -p1 -P 55 -F2
# bundled ha-cloud-support libs
%patch -p1 -P 500
@@ -659,6 +663,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog
* Thu Nov 13 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-41
- podman-etcd: exclude stopping resources from active count
- podman-etcd: add container crash detection with coordinated recovery
Resolves: RHEL-127840, RHEL-126083
* Mon Nov 10 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-40
- storage_mon: fix handling of 4k block devices