From a7e8b855e0194faa48ce4bf7c707c98e0f3fc031 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen
Date: Thu, 13 Nov 2025 11:33:32 +0100
Subject: [PATCH] - podman-etcd: exclude stopping resources from active count
 - podman-etcd: add container crash detection with coordinated recovery

  Resolves: RHEL-127840, RHEL-126083
---
 ...-detection-with-coordinated-recovery.patch | 222 ++++++++++++++++++
 ...stopping-resources-from-active-count.patch | 106 +++++++++
 resource-agents.spec                          |  12 +-
 3 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
 create mode 100644 RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch

diff --git a/RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch b/RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
new file mode 100644
index 0000000..0c2f3e9
--- /dev/null
+++ b/RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
@@ -0,0 +1,222 @@
+From e8fb2ad9cc14e91b74b5cde1e012d92afcddb1a5 Mon Sep 17 00:00:00 2001
+From: Carlo Lobrano
+Date: Sat, 25 Oct 2025 17:27:42 +0200
+Subject: [PATCH] podman-etcd: add container crash detection with coordinated
+ recovery
+
+This change prevents the agent from starting prematurely when the etcd
+container has failed. Previously, an early start would cause the agent
+to block while waiting for peer-initiated recovery. This blocking
+prevented Pacemaker from allowing the surviving agent to stop and
+properly recover the cluster.
+
+The change introduces `container_health_check` function to monitor the
+container's state and catch etcd failures. This check uses a state file
+to distinguish between a planned shutdown and an unexpected failure:
+
+* Container Running: The state file is created or updated with the
+  current epoch (timestamp). Returns: "healthy".
+* Container Not Running + No State File: It's the first check. Returns:
+  "not-running".
+* Container Not Running + State File: An unexpected failure is detected.
+  * If force_new_cluster is set, the status is: "failed-restart-now".
+  * Otherwise, the status is: "failed-wait-for-peer".
+
+The state file is written in a temporary directory (HA_RSCTMP) to ensure
+automatic cleanup on reboot. It is also explicitly removed in
+`podman_start` and `podman_stop` to mark planned transitions.
+
+A new helper function `get_time_since_last_heartbeat()` calculates
+elapsed time since the last healthy check for diagnostic logging.
+
+Monitor behavior changes:
+* failed-wait-for-peer: Returns OCF_SUCCESS to keep resource running
+  while waiting for peer-initiated recovery, as the agent is not able
+  to recover the cluster from a failed state.
+* failed-restart-now: Returns OCF_ERR_GENERIC to trigger restart once
+  peer has set force_new_cluster
+---
+ heartbeat/podman-etcd | 133 +++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 124 insertions(+), 9 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index b8dfb2f9e..d596c6f2a 100755
+--- a/heartbeat/podman-etcd
++++ b/heartbeat/podman-etcd
+@@ -1226,22 +1226,122 @@ podman_simple_status()
+     return $rc
+ }
+ 
+-podman_monitor()
++# get_time_since_last_heartbeat returns the time in seconds since the heartbeat file was last updated.
++#
++# Returns: time in seconds since last heartbeat, or empty string if file doesn't exist
++get_time_since_last_heartbeat()
+ {
++    local last_heartbeat
++
++    if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
++        return
++    fi
++
++    last_heartbeat=$(cat "$CONTAINER_HEARTBEAT_FILE")
++    echo $(($(date +%s) - last_heartbeat))
++}
++
++# container_health_check performs comprehensive health monitoring for the container.
++# This function allows coordinated failure handling where the agent waits for
++# peer-initiated cluster recovery in case of container failure.
++#
++# Uses a state file to track container state:
++# - Container running: Update state file with current epoch, return "healthy"
++# - Container not running + no state file: Return "not-running" (never checked before)
++# - Container not running + state file: Failure detected, check force_new_cluster
++#   - If force_new_cluster set: Return "failed-restart-now"
++#   - Otherwise: Return "failed-wait-for-peer"
++#
++# Returns: healthy, not-running, failed-restart-now, failed-wait-for-peer
++
++container_health_check()
++{
++    local rc
++
+     # We rely on running podman exec to monitor the container
+     # state because that command seems to be less prone to
+     # performance issue under IO load.
+     #
+     # For probes to work, we expect cmd_exec to be able to report
+-    # when a container is not running. Here, we're not interested
+-    # in distinguishing whether it's stopped or non existing
+-    # (there's function container_exists for that)
++    # when a container is not running. Here, we're not interested
++    # in distinguishing whether it's stopped or non existing
++    # (there's function container_exists for that)
++    # For monitor, however, we still need to know if it has stopped
++    # recently (i.e. a failure), or not (fresh start)
+     monitor_cmd_exec
+     rc=$?
+-    if [ $rc -ne 0 ]; then
+-        return $rc
++    if [ "$rc" -eq 0 ]; then
++        # Container is running - update state file with current epoch
++        local current_epoch
++        current_epoch=$(date +%s)
++        if ! echo "$current_epoch" > "$CONTAINER_HEARTBEAT_FILE"; then
++            ocf_log warn "Failed to update container heartbeat file, error code: $?"
++            # wait for peer to detect any real issue with the etcd cluster or wait for the
++            # next monitor interval
++            echo "failed-wait-for-peer"
++            return
++        fi
++        echo "healthy"
++        return
+     fi
+ 
++    # Check if state file exists (was container running on last check?)
++    if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
++        # No state file - container was never checked before
++        ocf_log debug "Container ${CONTAINER} has no previous state"
++        echo "not-running"
++        # NOTE: this is where the probe is expected to exit, keeping the logic
++        # quick and less prone to performance issue under IO load.
++        return
++    fi
++
++    # State file exists - the container failed, check recovery status in this lifecycle
++    local time_since_heartbeat
++    time_since_heartbeat=$(get_time_since_last_heartbeat)
++    ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
++
++    # Check if peer has set force_new_cluster for recovery
++    local fnc_holders
++    if ! fnc_holders=$(get_force_new_cluster); then
++        ocf_log err "Could not detect peer-initiated recovery. Checking again in the next monitor cycle"
++        echo "failed-wait-for-peer"
++        return
++    fi
++
++    if [ -n "$fnc_holders" ]; then
++        ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart"
++        echo "failed-restart-now"
++        return
++    fi
++
++    echo "failed-wait-for-peer"
++}
++
++podman_monitor()
++{
++    local container_health_state
++
++    container_health_state=$(container_health_check)
++    case "$container_health_state" in
++        healthy)
++            # Continue with normal monitoring
++            ;;
++        not-running)
++            return $OCF_NOT_RUNNING
++            ;;
++        failed-restart-now)
++            return $OCF_ERR_GENERIC
++            ;;
++        failed-wait-for-peer)
++            # Continue running, waiting for peer recovery
++            return $OCF_SUCCESS
++            ;;
++        *)
++            ocf_log err "Unknown health state: $container_health_state"
++            return $OCF_ERR_GENERIC
++            ;;
++    esac
++
+     # Check if certificate files have changed, if they have, etcd needs to be restarted
+     if ! etcd_certificates_hash_manager "check"; then
+         return $OCF_ERR_GENERIC
+@@ -1533,6 +1633,12 @@ podman_start()
+     local pod_was_running=false
+ 
+     ocf_log notice "podman-etcd start"
++
++    # Clear container health check state file
++    if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
++        ocf_log err "could not delete container health check state file"
++    fi
++
+     attribute_node_ip update
+     attribute_node_cluster_id update
+     attribute_node_revision update
+@@ -1849,15 +1955,21 @@ podman_stop()
+     local rc
+ 
+     ocf_log notice "podman-etcd stop"
++
++    # Clear container health check state file
++    if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
++        ocf_log err "could not delete container health check state file"
++    fi
++
++    attribute_node_revision update
++    attribute_node_cluster_id update
++
+     podman_simple_status
+     if [ $? -eq $OCF_NOT_RUNNING ]; then
+         ocf_log info "could not leave members list: etcd container not running"
+         return $OCF_SUCCESS
+     fi
+ 
+-    attribute_node_revision update
+-    attribute_node_cluster_id update
+-
+     if ! member_id=$(attribute_node_member_id get); then
+         ocf_log err "error leaving members list: could not get member-id"
+     else
+@@ -2007,6 +2119,9 @@ POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
+ ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
+ ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
+ ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
++# State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
++# This is intentional - reboots are controlled stops, not failures requiring detection.
++CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
+ 
+ # Note: we currently monitor podman containers by with the "podman exec"
+ # command, so make sure that invocation is always valid by enforcing the
diff --git a/RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch b/RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch
new file mode 100644
index 0000000..d065a34
--- /dev/null
+++ b/RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch
@@ -0,0 +1,106 @@
+From d5b4428e6cd66fd47680531ff0244d9b56e4e4c2 Mon Sep 17 00:00:00 2001
+From: Pablo Fontanilla
+Date: Tue, 14 Oct 2025 11:57:09 +0200
+Subject: [PATCH 1/2] Redo counting of active_resources
+
+---
+ heartbeat/podman-etcd | 46 +++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 44 insertions(+), 2 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index e1425ec02..dbf16918d 100755
+--- a/heartbeat/podman-etcd
++++ b/heartbeat/podman-etcd
+@@ -1029,6 +1029,48 @@ get_peer_node_name() {
+     crm_node -l | awk '{print $2}' | grep -v "$NODENAME"
+ }
+ 
++# Calculate the count of truly active resources by excluding those being stopped.
++# According to Pacemaker documentation, during "Post-notification (stop) /
++# Pre-notification (start)" transitions, the true active resource count should be:
++# Active resources = $OCF_RESKEY_CRM_meta_notify_active_resource
++# minus $OCF_RESKEY_CRM_meta_notify_stop_resource
++# This handles the case where a resource appears in both the active and stop lists
++# during rapid restart scenarios (e.g., process crash recovery).
++get_truly_active_resources_count() {
++    local active_list="$OCF_RESKEY_CRM_meta_notify_active_resource"
++    local stop_list="$OCF_RESKEY_CRM_meta_notify_stop_resource"
++    local truly_active=""
++
++    # If no active resources, return 0
++    if [ -z "$active_list" ]; then
++        echo "0"
++        return
++    fi
++
++    # If no resources being stopped, return count of active resources
++    if [ -z "$stop_list" ]; then
++        echo "$active_list" | wc -w
++        return
++    fi
++
++    # Filter out resources that are being stopped from the active list
++    for resource in $active_list; do
++        local is_stopping=0
++        for stop_resource in $stop_list; do
++            if [ "$resource" = "$stop_resource" ]; then
++                is_stopping=1
++                break
++            fi
++        done
++        if [ $is_stopping -eq 0 ]; then
++            truly_active="$truly_active $resource"
++        fi
++    done
++
++    # Count the truly active resources (trim leading space and count words)
++    echo "$truly_active" | wc -w
++}
++
+ get_all_etcd_endpoints() {
+     for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
+         name=$(echo "$node" | cut -d: -f1)
+@@ -1529,8 +1571,8 @@ podman_start()
+     # - 0 active agents, 1 starting: we are starting; the peer is not starting
+     # - 0 active agents, 2 starting: both agents are starting simultaneously
+     local active_resources_count
+-    active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
+-    ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
++    active_resources_count=$(get_truly_active_resources_count)
++    ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
+     case "$active_resources_count" in
+         1)
+             if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
+
+From 0114ddf83c95122a7f9fe9f704f864242cdb284a Mon Sep 17 00:00:00 2001
+From: Pablo Fontanilla
+Date: Wed, 29 Oct 2025 12:49:17 +0100
+Subject: [PATCH 2/2] Update truly active resources count with safer empty
+ calculation
+
+---
+ heartbeat/podman-etcd | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index dbf16918d..8fc92a537 100755
+--- a/heartbeat/podman-etcd
++++ b/heartbeat/podman-etcd
+@@ -1042,13 +1042,15 @@ get_truly_active_resources_count() {
+     local truly_active=""
+ 
+     # If no active resources, return 0
+-    if [ -z "$active_list" ]; then
++    # Use word count to handle whitespace-only values
++    if [ "$(echo "$active_list" | wc -w)" -eq 0 ]; then
+         echo "0"
+         return
+     fi
+ 
+     # If no resources being stopped, return count of active resources
+-    if [ -z "$stop_list" ]; then
++    # Use word count to handle whitespace-only values
++    if [ "$(echo "$stop_list" | wc -w)" -eq 0 ]; then
+         echo "$active_list" | wc -w
+         return
+     fi
diff --git a/resource-agents.spec b/resource-agents.spec
index 3b63a15..9495b48 100644
--- a/resource-agents.spec
+++ b/resource-agents.spec
@@ -45,7 +45,7 @@
 Name: resource-agents
 Summary: Open Source HA Reusable Cluster Resource Scripts
 Version: 4.16.0
-Release: 40%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release: 41%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License: GPL-2.0-or-later AND LGPL-2.1-or-later
 URL: https://github.com/ClusterLabs/resource-agents
 Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@@ -103,6 +103,8 @@ Patch50: RHEL-102779-pgsqlms-fix-validate-warnings.patch
 Patch51: RHEL-112443-nginx-fix-validate-warnings.patch
 Patch52: RHEL-121985-Filesystem-speed-up-get-PIDs.patch
 Patch53: RHEL-126791-storage_mon-fix-handling-of-4k-block-devices.patch
+Patch54: RHEL-127840-podman-etcd-exclude-stopping-resources-from-active-count.patch
+Patch55: RHEL-126083-podman-etcd-add-container-crash-detection-with-coordinated-recovery.patch
 
 # bundled ha-cloud-support libs
 Patch500: ha-cloud-support-aliyun.patch
@@ -327,6 +329,8 @@ exit 1
 %patch -p1 -P 51
 %patch -p1 -P 52
 %patch -p1 -P 53
+%patch -p1 -P 54
+%patch -p1 -P 55 -F2
 
 # bundled ha-cloud-support libs
 %patch -p1 -P 500
@@ -659,6 +663,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
 
 %changelog
+* Thu Nov 13 2025 Oyvind Albrigtsen - 4.16.0-41
+- podman-etcd: exclude stopping resources from active count
+- podman-etcd: add container crash detection with coordinated recovery
+
+  Resolves: RHEL-127840, RHEL-126083
+
 * Mon Nov 10 2025 Oyvind Albrigtsen - 4.16.0-40
 - storage_mon: fix handling of 4k block devices