- podman-etcd: hardened monitor/stop actions
Resolves: RHEL-159203
This commit is contained in:
parent
eea4aa580e
commit
ee02a536d4
265
RHEL-159203-podman-etcd-hardened-monitor-stop-actions.patch
Normal file
265
RHEL-159203-podman-etcd-hardened-monitor-stop-actions.patch
Normal file
@ -0,0 +1,265 @@
|
||||
From c909003639ef36f995f855f5b954a5ae2132f19c Mon Sep 17 00:00:00 2001
|
||||
From: Vincenzo Mauro <43814449+vimauro@users.noreply.github.com>
|
||||
Date: Mon, 23 Mar 2026 11:54:51 +0100
|
||||
Subject: [PATCH] OCPBUGS-76538: podman-etcd: monitor/stop hardening (#2130)
|
||||
|
||||
* monitor/stop hardening
|
||||
* removed noisy log
|
||||
* Improved return code handling + PR comments
|
||||
* reduced wait_timeout_sec in podman_start
|
||||
* enriched log line on container_running
|
||||
* Fixed detect_cluster_leadership_loss in case podman exec fails
|
||||
* Reverted detect_cluster_leadership_loss changes
|
||||
* Updated return code in attribute_node_revision
|
||||
* restored initial attribute_node_revision logic and updated comment
|
||||
* restored original log line in attribute_node_revision
|
||||
* updated return codes on changed code path that reaches pacemaker
|
||||
* updated log line in check_peer
|
||||
---
|
||||
heartbeat/podman-etcd | 133 ++++++++++++++++++++++++++----------------
|
||||
1 file changed, 83 insertions(+), 50 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 860aca817..4c9bbd4fa 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -371,25 +371,34 @@ monitor_cmd_exec()
|
||||
{
|
||||
local rc=$OCF_SUCCESS
|
||||
local out
|
||||
-
|
||||
- out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1)
|
||||
- rc=$?
|
||||
- # 125: no container with name or ID ${CONTAINER} found
|
||||
- # 126: container state improper (not running)
|
||||
- # 127: any other error
|
||||
- # 255: podman 2+: container not running
|
||||
- case "$rc" in
|
||||
- 125|126|255)
|
||||
- rc=$OCF_NOT_RUNNING
|
||||
- ;;
|
||||
- 0)
|
||||
- ocf_log debug "monitor cmd passed: exit code = $rc"
|
||||
- ;;
|
||||
- *)
|
||||
- ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out"
|
||||
- rc=$OCF_ERR_GENERIC
|
||||
- ;;
|
||||
- esac
|
||||
+ local attempt
|
||||
+ # 3 attempts × 5s = 15s worst case, fits within the 25s monitor timeout.
|
||||
+ # The health check normally completes in <1s; the 5s per-attempt timeout
|
||||
+ # is a safety net for when the container's process namespace is slow.
|
||||
+ local max_attempts=3
|
||||
+ local attempt_timeout=5
|
||||
+
|
||||
+ for attempt in $(seq 1 $max_attempts); do
|
||||
+ out=$(timeout $attempt_timeout podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1)
|
||||
+ rc=$?
|
||||
+ # 125: no container with name or ID ${CONTAINER} found
|
||||
+ # 126: container state improper (not running)
|
||||
+ # 127: any other error
|
||||
+ # 255: podman 2+: container not running
|
||||
+ case "$rc" in
|
||||
+ 125|126|255)
|
||||
+ return $OCF_NOT_RUNNING
|
||||
+ ;;
|
||||
+ 0)
|
||||
+ ocf_log debug "monitor cmd passed: exit code = $rc"
|
||||
+ return $OCF_SUCCESS
|
||||
+ ;;
|
||||
+ *)
|
||||
+ ocf_log warn "monitor cmd failed (rc=$rc), output: $out"
|
||||
+ rc=$OCF_ERR_GENERIC
|
||||
+ ;;
|
||||
+ esac
|
||||
+ done
|
||||
|
||||
return $rc
|
||||
}
|
||||
@@ -527,8 +536,9 @@ get_env_from_manifest() {
|
||||
exit "$OCF_ERR_INSTALLED"
|
||||
fi
|
||||
|
||||
- if ! env_var_value=$(jq -r ".spec.containers[].env[] | select( .name == \"$env_var_name\" ).value" "$OCF_RESKEY_pod_manifest"); then
|
||||
- rc=$?
|
||||
+ env_var_value=$(jq -r ".spec.containers[].env[] | select( .name == \"$env_var_name\" ).value" "$OCF_RESKEY_pod_manifest")
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
ocf_log err "could not find environment variable $env_var_name in etcd pod manifest, error code: $rc"
|
||||
exit "$OCF_ERR_INSTALLED"
|
||||
fi
|
||||
@@ -934,10 +944,14 @@ attribute_node_cluster_id()
|
||||
{
|
||||
local action="$1"
|
||||
local value
|
||||
- if ! value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON"); then
|
||||
- rc=$?
|
||||
+ local rc
|
||||
+ value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON")
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
+ # Log the error but return success to avoid monitor failure if the file is not available yet.
|
||||
+ # This should not block cluster recovery.
|
||||
ocf_log err "could not get cluster_id, error code: $rc"
|
||||
- return "$rc"
|
||||
+ return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
case "$action" in
|
||||
@@ -945,10 +959,12 @@ attribute_node_cluster_id()
|
||||
echo "$value"
|
||||
;;
|
||||
update)
|
||||
- if ! crm_attribute --type nodes --node "$NODENAME" --name "cluster_id" --update "$value"; then
|
||||
- rc=$?
|
||||
+ crm_attribute --type nodes --node "$NODENAME" --name "cluster_id" --update "$value"
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
+ # Log the error but return success to avoid monitor failure if we cannot update the attribute.
|
||||
ocf_log err "could not update cluster_id, error code: $rc"
|
||||
- return "$rc"
|
||||
+ return $OCF_SUCCESS
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
@@ -983,10 +999,12 @@ attribute_node_revision()
|
||||
echo "$value"
|
||||
;;
|
||||
update)
|
||||
- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
|
||||
- rc=$?
|
||||
- ocf_log err "could not update etcd $revision, error code: $rc"
|
||||
- return "$rc"
|
||||
+ crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
+ # Log the error but return success to avoid monitor failure if we cannot update the attribute.
|
||||
+ ocf_log err "could not update etcd $attribute, error code: $rc"
|
||||
+ return $OCF_SUCCESS
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
@@ -1041,25 +1059,31 @@ attribute_node_member_id()
|
||||
ocf_log info "member list: $member_list_json"
|
||||
if [ -z "$member_list_json" ] ; then
|
||||
ocf_log err "could not get $attribute: could not get member list JSON"
|
||||
- return "$rc"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
- local value value_hex
|
||||
- if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then
|
||||
- rc=$?
|
||||
+ local value value_hex rc
|
||||
+ value=$(echo -n "$member_list_json" | jq -r ".header.member_id")
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
+ # Log the error but return success to avoid monitor failure if the member list JSON is not available yet.
|
||||
ocf_log err "could not get $attribute from member list JSON, error code: $rc"
|
||||
- return "$rc"
|
||||
+ return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
# JSON member_id is decimal, while etcdctl command needs the hex version
|
||||
- if ! value_hex=$(decimal_to_hex "$value"); then
|
||||
- ocf_log err "could not convert decimal member_id '$value' to hex, error code: $?"
|
||||
+ value_hex=$(decimal_to_hex "$value")
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
+ ocf_log err "could not convert decimal member_id '$value' to hex, error code: $rc"
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"; then
|
||||
- rc=$?
|
||||
+ crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
+ # Log the error but return success to avoid monitor failure if we cannot update the attribute.
|
||||
ocf_log err "could not update etcd $attribute, error code: $rc"
|
||||
- return "$rc"
|
||||
+ return $OCF_SUCCESS
|
||||
fi
|
||||
;;
|
||||
clear)
|
||||
@@ -1446,7 +1470,7 @@ get_endpoint_status_json()
|
||||
local all_etcd_endpoints
|
||||
|
||||
all_etcd_endpoints=$(get_all_etcd_endpoints)
|
||||
- podman exec "${CONTAINER}" etcdctl endpoint status --endpoints="$all_etcd_endpoints" -w json
|
||||
+ podman exec "${CONTAINER}" etcdctl endpoint status --command-timeout="$MONITOR_ETCDCTL_TIMEOUT" --endpoints="$all_etcd_endpoints" -w json
|
||||
}
|
||||
|
||||
get_member_list_json() {
|
||||
@@ -1454,7 +1478,7 @@ get_member_list_json() {
|
||||
local this_node_endpoint
|
||||
|
||||
this_node_endpoint="$(ip_url $(attribute_node_ip get)):2379"
|
||||
- podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json
|
||||
+ podman exec "${CONTAINER}" etcdctl member list --command-timeout="$MONITOR_ETCDCTL_TIMEOUT" --endpoints="$this_node_endpoint" -w json
|
||||
}
|
||||
|
||||
detect_cluster_leadership_loss()
|
||||
@@ -1550,7 +1574,7 @@ check_peer()
|
||||
fi
|
||||
|
||||
if ! member_list_json=$(get_member_list_json); then
|
||||
- ocf_log info "podman failed to get member list, error code: $?"
|
||||
+ ocf_log info "podman failed to get member list"
|
||||
detect_cluster_leadership_loss
|
||||
return $?
|
||||
fi
|
||||
@@ -2145,7 +2169,7 @@ podman_start()
|
||||
run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
|
||||
|
||||
if ocf_is_true "$JOIN_AS_LEARNER"; then
|
||||
- local wait_timeout_sec=$((10*60))
|
||||
+ local wait_timeout_sec=$((2*60))
|
||||
local poll_interval_sec=5
|
||||
local retries=$(( wait_timeout_sec / poll_interval_sec ))
|
||||
|
||||
@@ -2354,8 +2378,9 @@ leave_etcd_member_list()
|
||||
ocf_log info "leaving members list as member with ID $member_id"
|
||||
local endpoint
|
||||
endpoint="$(ip_url $(attribute_node_ip get)):2379"
|
||||
- if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
|
||||
- rc=$?
|
||||
+ ocf_run timeout 30 podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
ocf_log err "error leaving members list, error code: $rc"
|
||||
fi
|
||||
}
|
||||
@@ -2376,14 +2401,19 @@ podman_stop()
|
||||
attribute_node_revision update
|
||||
attribute_node_cluster_id update
|
||||
|
||||
- podman_simple_status
|
||||
- if [ $? -eq $OCF_NOT_RUNNING ]; then
|
||||
- ocf_log info "could not leave members list: etcd container not running"
|
||||
+ # Use podman inspect instead of podman exec (podman_simple_status) to check
|
||||
+ # container state. podman exec enters the container's process namespace and
|
||||
+ # hangs when etcd is unresponsive — the typical scenario that triggers a stop.
|
||||
+ local container_running
|
||||
+ container_running=$(podman inspect --format '{{.State.Running}}' "$CONTAINER" 2>/dev/null)
|
||||
+ if [ "$container_running" != "true" ]; then
|
||||
+ ocf_log info "could not leave members list: $CONTAINER container not running, running state: ${container_running}"
|
||||
attribute_node_member_id clear
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
leave_etcd_member_list
|
||||
+
|
||||
# clear node_member_id CIB attribute only after leaving the member list
|
||||
attribute_node_member_id clear
|
||||
|
||||
@@ -2527,6 +2557,9 @@ ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
|
||||
# This is intentional - reboots are controlled stops, not failures requiring detection.
|
||||
CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
|
||||
DELAY_SECOND_NODE_LEAVE_SEC=10
|
||||
+# Shorter etcdctl command-timeout for monitor-path calls to prevent
|
||||
+# consuming the 25s monitor budget. Non-monitor callers use the default 5s.
|
||||
+MONITOR_ETCDCTL_TIMEOUT="3s"
|
||||
|
||||
# Note: we currently monitor podman containers by with the "podman exec"
|
||||
# command, so make sure that invocation is always valid by enforcing the
|
||||
@ -45,7 +45,7 @@
|
||||
Name: resource-agents
|
||||
Summary: Open Source HA Reusable Cluster Resource Scripts
|
||||
Version: 4.10.0
|
||||
Release: 110%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
Release: 111%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
License: GPLv2+ and LGPLv2+
|
||||
URL: https://github.com/ClusterLabs/resource-agents
|
||||
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
|
||||
@ -203,6 +203,7 @@ Patch150: RHEL-116151-4-portblock-check-inverse-action.patch
|
||||
Patch151: RHEL-156709-podman-etcd-ignore-learners-when-considering-which-node-has-higher-revision.patch
|
||||
Patch152: RHEL-157146-podman-etcd-handle-existing-peer-URLs-gracefully-during-force_new_cluster-recovery.patch
|
||||
Patch153: RHEL-153158-db2-set-reintegration-when-promotion-is-successful.patch
|
||||
Patch154: RHEL-159203-podman-etcd-hardened-monitor-stop-actions.patch
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
Patch500: ha-cloud-support-aliyun.patch
|
||||
@ -501,6 +502,7 @@ exit 1
|
||||
%patch -p1 -P 151
|
||||
%patch -p1 -P 152
|
||||
%patch -p1 -P 153
|
||||
%patch -p1 -P 154
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
%patch -p1 -P 500
|
||||
@ -833,6 +835,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
|
||||
%changelog
|
||||
* Fri Mar 27 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-111
|
||||
- podman-etcd: hardened monitor/stop actions
|
||||
|
||||
Resolves: RHEL-159203
|
||||
|
||||
* Thu Mar 19 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-110
|
||||
- podman etcd: ignore learners when considering which node has higher revision
|
||||
- podman etcd: handle existing peer URLs gracefully during force_new_cluster recovery
|
||||
|
||||
Loading…
Reference in New Issue
Block a user