- podman-etcd: exclude stopping resources from active count
- podman-etcd: add container crash detection with coordinated recovery

Resolves: RHEL-127840, RHEL-126083
parent 42cfbb8ada
commit a7e8b855e0
@@ -0,0 +1,222 @@
From e8fb2ad9cc14e91b74b5cde1e012d92afcddb1a5 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Sat, 25 Oct 2025 17:27:42 +0200
Subject: [PATCH] podman-etcd: add container crash detection with coordinated
 recovery

This change prevents the agent from starting prematurely when the etcd
container has failed. Previously, an early start would cause the agent
to block while waiting for peer-initiated recovery. This blocking
prevented Pacemaker from allowing the surviving agent to stop and
properly recover the cluster.

The change introduces the `container_health_check` function to monitor the
container's state and catch etcd failures. The check uses a state file
to distinguish a planned shutdown from an unexpected failure:

* Container running: the state file is created or updated with the
  current epoch (timestamp). Returns "healthy".
* Container not running + no state file: this is the first check.
  Returns "not-running".
* Container not running + state file present: an unexpected failure is
  detected.
  * If force_new_cluster is set, the status is "failed-restart-now".
  * Otherwise, the status is "failed-wait-for-peer".

The state file is written to a temporary directory (HA_RSCTMP) to ensure
automatic cleanup on reboot. It is also explicitly removed in
`podman_start` and `podman_stop` to mark planned transitions.

A new helper function, `get_time_since_last_heartbeat()`, calculates the
elapsed time since the last healthy check for diagnostic logging.

Monitor behavior changes:
* failed-wait-for-peer: returns OCF_SUCCESS to keep the resource running
  while waiting for peer-initiated recovery, since the agent cannot
  recover the cluster from a failed state on its own.
* failed-restart-now: returns OCF_ERR_GENERIC to trigger a restart once
  the peer has set force_new_cluster.
---
 heartbeat/podman-etcd | 133 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 124 insertions(+), 9 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index b8dfb2f9e..d596c6f2a 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1226,22 +1226,122 @@ podman_simple_status()
     return $rc
 }
 
-podman_monitor()
+# get_time_since_last_heartbeat returns the time in seconds since the heartbeat file was last updated.
+#
+# Returns: time in seconds since last heartbeat, or empty string if file doesn't exist
+get_time_since_last_heartbeat()
 {
+    local last_heartbeat
+
+    if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
+        return
+    fi
+
+    last_heartbeat=$(cat "$CONTAINER_HEARTBEAT_FILE")
+    echo $(($(date +%s) - last_heartbeat))
+}
+
+# container_health_check performs comprehensive health monitoring for the container.
+# This function allows coordinated failure handling where the agent waits for
+# peer-initiated cluster recovery in case of container failure.
+#
+# Uses a state file to track container state:
+# - Container running: Update state file with current epoch, return "healthy"
+# - Container not running + no state file: Return "not-running" (never checked before)
+# - Container not running + state file: Failure detected, check force_new_cluster
+#   - If force_new_cluster set: Return "failed-restart-now"
+#   - Otherwise: Return "failed-wait-for-peer"
+#
+# Returns: healthy, not-running, failed-restart-now, failed-wait-for-peer
+
+container_health_check()
+{
+    local rc
+
     # We rely on running podman exec to monitor the container
     # state because that command seems to be less prone to
     # performance issue under IO load.
     #
     # For probes to work, we expect cmd_exec to be able to report
-    # when a container is not running. Here, we're not interested
-    # in distinguishing whether it's stopped or non existing
-    # (there's function container_exists for that)
+    # when a container is not running. Here, we're not interested
+    # in distinguishing whether it's stopped or non existing
+    # (there's function container_exists for that)
+    # For monitor, however, we still need to know if it has stopped
+    # recently (i.e. a failure), or not (fresh start)
     monitor_cmd_exec
     rc=$?
-    if [ $rc -ne 0 ]; then
-        return $rc
+    if [ "$rc" -eq 0 ]; then
+        # Container is running - update state file with current epoch
+        local current_epoch
+        current_epoch=$(date +%s)
+        if ! echo "$current_epoch" > "$CONTAINER_HEARTBEAT_FILE"; then
+            ocf_log warn "Failed to update container heartbeat file, error code: $?"
+            # wait for peer to detect any real issue with the etcd cluster or wait for the
+            # next monitor interval
+            echo "failed-wait-for-peer"
+            return
+        fi
+        echo "healthy"
+        return
     fi
 
+    # Check if state file exists (was container running on last check?)
+    if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
+        # No state file - container was never checked before
+        ocf_log debug "Container ${CONTAINER} has no previous state"
+        echo "not-running"
+        # NOTE: this is where the probe is expected to exit, keeping the logic
+        # quick and less prone to performance issue under IO load.
+        return
+    fi
+
+    # State file exists - the container failed, check recovery status in this lifecycle
+    local time_since_heartbeat
+    time_since_heartbeat=$(get_time_since_last_heartbeat)
+    ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+
+    # Check if peer has set force_new_cluster for recovery
+    local fnc_holders
+    if ! fnc_holders=$(get_force_new_cluster); then
+        ocf_log err "Could not detect peer-initiated recovery. Checking again in the next monitor cycle"
+        echo "failed-wait-for-peer"
+        return
+    fi
+
+    if [ -n "$fnc_holders" ]; then
+        ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart"
+        echo "failed-restart-now"
+        return
+    fi
+
+    echo "failed-wait-for-peer"
+}
+
+podman_monitor()
+{
+    local container_health_state
+
+    container_health_state=$(container_health_check)
+    case "$container_health_state" in
+        healthy)
+            # Continue with normal monitoring
+            ;;
+        not-running)
+            return $OCF_NOT_RUNNING
+            ;;
+        failed-restart-now)
+            return $OCF_ERR_GENERIC
+            ;;
+        failed-wait-for-peer)
+            # Continue running, waiting for peer recovery
+            return $OCF_SUCCESS
+            ;;
+        *)
+            ocf_log err "Unknown health state: $container_health_state"
+            return $OCF_ERR_GENERIC
+            ;;
+    esac
+
     # Check if certificate files have changed, if they have, etcd needs to be restarted
     if ! etcd_certificates_hash_manager "check"; then
         return $OCF_ERR_GENERIC
@@ -1533,6 +1633,12 @@ podman_start()
     local pod_was_running=false
 
     ocf_log notice "podman-etcd start"
+
+    # Clear container health check state file
+    if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
+        ocf_log err "could not delete container health check state file"
+    fi
+
     attribute_node_ip update
     attribute_node_cluster_id update
     attribute_node_revision update
@@ -1849,15 +1955,21 @@ podman_stop()
     local rc
 
     ocf_log notice "podman-etcd stop"
+
+    # Clear container health check state file
+    if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
+        ocf_log err "could not delete container health check state file"
+    fi
+
+    attribute_node_revision update
+    attribute_node_cluster_id update
+
     podman_simple_status
     if [ $? -eq $OCF_NOT_RUNNING ]; then
         ocf_log info "could not leave members list: etcd container not running"
         return $OCF_SUCCESS
     fi
 
-    attribute_node_revision update
-    attribute_node_cluster_id update
-
     if ! member_id=$(attribute_node_member_id get); then
         ocf_log err "error leaving members list: could not get member-id"
     else
@@ -2007,6 +2119,9 @@ POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
 ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
 ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
 ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
+# State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
+# This is intentional - reboots are controlled stops, not failures requiring detection.
+CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
 
 # Note: we currently monitor podman containers by with the "podman exec"
 # command, so make sure that invocation is always valid by enforcing the
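The heartbeat-file logic above can be exercised outside the agent. The following minimal POSIX shell sketch reproduces the same state machine on its own; it is an illustration only, not part of the patch: `container_is_running` and `peer_requested_new_cluster` are hypothetical stand-ins for the agent's `monitor_cmd_exec` and `get_force_new_cluster` checks, and `HEARTBEAT_FILE` is a placeholder path rather than the agent's HA_RSCTMP location.

#!/bin/sh
# Illustration only: simplified version of the heartbeat-file state machine
# used by container_health_check. container_is_running and
# peer_requested_new_cluster are hypothetical stand-ins; HEARTBEAT_FILE is a
# placeholder path.

HEARTBEAT_FILE="${HEARTBEAT_FILE:-/tmp/podman-container-last-running}"

container_is_running() {
    # Stand-in for a real liveness probe (e.g. podman exec).
    [ "${DEMO_CONTAINER_STATE:-running}" = "running" ]
}

peer_requested_new_cluster() {
    # Stand-in for checking the force_new_cluster node attribute.
    [ "${DEMO_FORCE_NEW_CLUSTER:-0}" = "1" ]
}

health_state() {
    if container_is_running; then
        # Healthy: record the current epoch as the last time it was seen running.
        if ! date +%s > "$HEARTBEAT_FILE"; then
            echo "failed-wait-for-peer"
            return
        fi
        echo "healthy"
        return
    fi

    if [ ! -f "$HEARTBEAT_FILE" ]; then
        # Never seen running in this lifecycle: a fresh start, not a failure.
        echo "not-running"
        return
    fi

    # Heartbeat exists but the container is down: an unexpected failure.
    if peer_requested_new_cluster; then
        echo "failed-restart-now"
    else
        echo "failed-wait-for-peer"
    fi
}

health_state

Running it once prints "healthy" and creates the heartbeat file; running it again with DEMO_CONTAINER_STATE=stopped prints "failed-wait-for-peer", which podman_monitor maps to OCF_SUCCESS so the resource keeps waiting, and adding DEMO_FORCE_NEW_CLUSTER=1 yields "failed-restart-now", which maps to OCF_ERR_GENERIC to trigger the restart.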
@@ -0,0 +1,106 @@
From d5b4428e6cd66fd47680531ff0244d9b56e4e4c2 Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Tue, 14 Oct 2025 11:57:09 +0200
Subject: [PATCH 1/2] Redo counting of active_resources

---
 heartbeat/podman-etcd | 46 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index e1425ec02..dbf16918d 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1029,6 +1029,48 @@ get_peer_node_name() {
     crm_node -l | awk '{print $2}' | grep -v "$NODENAME"
 }
 
+# Calculate the count of truly active resources by excluding those being stopped.
+# According to Pacemaker documentation, during "Post-notification (stop) /
+# Pre-notification (start)" transitions, the true active resource count should be:
+#     Active resources = $OCF_RESKEY_CRM_meta_notify_active_resource
+#                        minus $OCF_RESKEY_CRM_meta_notify_stop_resource
+# This handles the case where a resource appears in both the active and stop lists
+# during rapid restart scenarios (e.g., process crash recovery).
+get_truly_active_resources_count() {
+    local active_list="$OCF_RESKEY_CRM_meta_notify_active_resource"
+    local stop_list="$OCF_RESKEY_CRM_meta_notify_stop_resource"
+    local truly_active=""
+
+    # If no active resources, return 0
+    if [ -z "$active_list" ]; then
+        echo "0"
+        return
+    fi
+
+    # If no resources being stopped, return count of active resources
+    if [ -z "$stop_list" ]; then
+        echo "$active_list" | wc -w
+        return
+    fi
+
+    # Filter out resources that are being stopped from the active list
+    for resource in $active_list; do
+        local is_stopping=0
+        for stop_resource in $stop_list; do
+            if [ "$resource" = "$stop_resource" ]; then
+                is_stopping=1
+                break
+            fi
+        done
+        if [ $is_stopping -eq 0 ]; then
+            truly_active="$truly_active $resource"
+        fi
+    done
+
+    # Count the truly active resources (trim leading space and count words)
+    echo "$truly_active" | wc -w
+}
+
 get_all_etcd_endpoints() {
     for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
         name=$(echo "$node" | cut -d: -f1)
@@ -1529,8 +1571,8 @@ podman_start()
     # - 0 active agents, 1 starting: we are starting; the peer is not starting
     # - 0 active agents, 2 starting: both agents are starting simultaneously
     local active_resources_count
-    active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
-    ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
+    active_resources_count=$(get_truly_active_resources_count)
+    ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
     case "$active_resources_count" in
         1)
             if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then

From 0114ddf83c95122a7f9fe9f704f864242cdb284a Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Wed, 29 Oct 2025 12:49:17 +0100
Subject: [PATCH 2/2] Update truly active resources count with safer empty
 calculation

---
 heartbeat/podman-etcd | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index dbf16918d..8fc92a537 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1042,13 +1042,15 @@ get_truly_active_resources_count() {
     local truly_active=""
 
     # If no active resources, return 0
-    if [ -z "$active_list" ]; then
+    # Use word count to handle whitespace-only values
+    if [ "$(echo "$active_list" | wc -w)" -eq 0 ]; then
        echo "0"
         return
     fi
 
     # If no resources being stopped, return count of active resources
-    if [ -z "$stop_list" ]; then
+    # Use word count to handle whitespace-only values
+    if [ "$(echo "$stop_list" | wc -w)" -eq 0 ]; then
         echo "$active_list" | wc -w
         return
     fi
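To make the subtraction concrete, here is a small self-contained sketch of the same counting rule applied to sample values; it is not the agent code. The two lists are plain variables standing in for Pacemaker's $OCF_RESKEY_CRM_meta_notify_active_resource and $OCF_RESKEY_CRM_meta_notify_stop_resource, and the clone instance names are made up.

#!/bin/sh
# Illustration only: count active clone instances, excluding those being stopped.
# The sample lists stand in for the Pacemaker notify environment variables.
active_list="etcd:0 etcd:1"   # would be $OCF_RESKEY_CRM_meta_notify_active_resource
stop_list="etcd:1"            # would be $OCF_RESKEY_CRM_meta_notify_stop_resource

truly_active=""
for resource in $active_list; do
    is_stopping=0
    for stop_resource in $stop_list; do
        if [ "$resource" = "$stop_resource" ]; then
            is_stopping=1
            break
        fi
    done
    if [ "$is_stopping" -eq 0 ]; then
        truly_active="$truly_active $resource"
    fi
done

# wc -w also copes with empty or whitespace-only lists, which is the point of
# the follow-up "safer empty calculation" commit.
echo "$truly_active" | wc -w   # prints 1: only etcd:0 is still truly active

If both instances also appear in stop_list, the count drops to 0, which is what lets podman_start treat a crash-triggered stop/start transition as "0 active agents" instead of counting the peer that is being stopped as still active.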
@@ -45,7 +45,7 @@
 Name: resource-agents
 Summary: Open Source HA Reusable Cluster Resource Scripts
 Version: 4.16.0
-Release: 40%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release: 41%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License: GPL-2.0-or-later AND LGPL-2.1-or-later
 URL: https://github.com/ClusterLabs/resource-agents
 Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@@ -103,6 +103,8 @@ Patch50: RHEL-102779-pgsqlms-fix-validate-warnings.patch
 Patch51: RHEL-112443-nginx-fix-validate-warnings.patch
 Patch52: RHEL-121985-Filesystem-speed-up-get-PIDs.patch
 Patch53: RHEL-126791-storage_mon-fix-handling-of-4k-block-devices.patch
+Patch54: RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch
+Patch55: RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
 
 # bundled ha-cloud-support libs
 Patch500: ha-cloud-support-aliyun.patch
@@ -327,6 +329,8 @@ exit 1
 %patch -p1 -P 51
 %patch -p1 -P 52
 %patch -p1 -P 53
+%patch -p1 -P 54
+%patch -p1 -P 55 -F2
 
 # bundled ha-cloud-support libs
 %patch -p1 -P 500
@@ -659,6 +663,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
 
 %changelog
+* Thu Nov 13 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-41
+- podman-etcd: exclude stopping resources from active count
+- podman-etcd: add container crash detection with coordinated recovery
+
+Resolves: RHEL-127840, RHEL-126083
+
 * Mon Nov 10 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-40
 - storage_mon: fix handling of 4k block devices
 