- podman-etcd: prevent retries on fatal errors

Resolves: RHEL-132052
This commit is contained in:
Oyvind Albrigtsen 2025-12-02 10:25:32 +01:00
parent 67fbe64ac6
commit b77913a2bd
2 changed files with 154 additions and 1 deletions

View File

@ -0,0 +1,146 @@
From 192b0ecbe015e8b8a4d32f8b066ead3a6dba0589 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Tue, 2 Dec 2025 10:01:01 +0100
Subject: [PATCH] OCPEDGE-2231: podman-etcd: improve error handling to support
retry on start errors (#2105)
* podman-etcd: improve add_member_as_learner error log
Improve the add_member_as_learner error log to help debug a rare issue
where the podman exec command returns an error, but the etcd member is
added to the list anyway. This is critical because the `learner_node`
attribute won't be cleaned up anymore.
Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
* podman-etcd: remove duplicated check for container already started
* podman-etcd: improve error return codes to support start retries
Improved and/or changed some return codes to allow or forbid retries in
case of start errors.
see: OCPEDGE-2231
---------
Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
---
heartbeat/podman-etcd | 40 +++++++++++++++++++++++++---------------
1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 3e3f1d60e..242226bb1 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -617,9 +617,13 @@ prepare_env() {
LISTEN_CLIENT_URLS="0.0.0.0"
LISTEN_PEER_URLS="0.0.0.0"
LISTEN_METRICS_URLS="0.0.0.0"
+
+ return $OCF_SUCCESS
}
compute_bump_revision() {
+ local rc
+
# Same logic used by cluster-etcd-operator quorum-restore-pod utility.
# see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
# set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
@@ -691,7 +695,13 @@ experimental-max-learners: 1
experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
EOF
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ ocf_log err "could not create etcd configuration, 'cat' error code: $rc"
+ return $OCF_ERR_CONFIGURED
+ fi
+ # Append cipher suites from the env variable where the entries are comma separated.
{
if [ -n "$ETCD_CIPHER_SUITES" ]; then
echo "cipher-suites:"
@@ -700,6 +710,13 @@ EOF
done
fi
} >> "$ETCD_CONFIGURATION_FILE"
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ ocf_log err "could not append cipher suites to etcd configuration, error code: $rc"
+ return $OCF_ERR_CONFIGURED
+ fi
+
+ return $OCF_SUCCESS
}
archive_data_folder()
@@ -884,7 +901,7 @@ add_member_as_learner()
out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
rc=$?
if [ $rc -ne 0 ]; then
- ocf_log err "could not add $member_name as learner, error code: $rc"
+ ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
return $rc
fi
ocf_log info "$out"
@@ -1763,7 +1780,7 @@ podman_start()
fnc_holder_count=$(echo "$fnc_holders" | wc -w)
if [ "$fnc_holder_count" -gt 1 ]; then
ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)"
- return "$OCF_ERR_GENERIC"
+ return "$OCF_ERR_CONFIGURED"
fi
if [ "$fnc_holder_count" -eq 1 ]; then
@@ -1837,7 +1854,7 @@ podman_start()
ocf_log info "same cluster_id and revision: start normal"
else
ocf_exit_reason "same revision but different cluster id"
- return "$OCF_ERR_GENERIC"
+ return "$OCF_ERR_CONFIGURED"
fi
fi
;;
@@ -1862,12 +1879,6 @@ podman_start()
run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
- # check to see if the container has already started
- podman_simple_status
- if [ $? -eq $OCF_SUCCESS ]; then
- return "$OCF_SUCCESS"
- fi
-
if ocf_is_true "$JOIN_AS_LEARNER"; then
local wait_timeout_sec=$((10*60))
local poll_interval_sec=5
@@ -1894,9 +1905,8 @@ podman_start()
ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
if ! can_reuse_container ; then
- rc="$?"
- ocf_log err "could not determine etcd container reuse strategy, rc: $rc"
- return "$rc"
+ ocf_log err "could not determine etcd container reuse strategy"
+ return $OCF_ERR_GENERIC
fi
# Archive current container and its configuration before creating
@@ -1912,13 +1922,13 @@ podman_start()
fi
if ! prepare_env; then
- ocf_log err "Could not prepare environment for podman, error code: $?"
+ ocf_log err "Could not prepare environment for podman"
return $OCF_ERR_GENERIC
fi
if ! generate_etcd_configuration; then
- ocf_log err "Could not generate etcd configuration, error code: $?"
- return $OCF_ERR_GENERIC
+ ocf_log err "Could not generate etcd configuration"
+ return $OCF_ERR_CONFIGURED
fi
run_opts="$run_opts \

View File

@ -45,7 +45,7 @@
Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.10.0
Release: 101%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
Release: 102%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPLv2+ and LGPLv2+
URL: https://github.com/ClusterLabs/resource-agents
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@ -192,6 +192,7 @@ Patch139: RHEL-130580-1-podman-etcd-prevent-last-active-member-from-leaving.patc
Patch140: RHEL-130580-2-podman-etcd-remove-test-code.patch
Patch141: RHEL-126087-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch
Patch142: RHEL-131185-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch
Patch143: RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch
# bundled ha-cloud-support libs
Patch500: ha-cloud-support-aliyun.patch
@ -479,6 +480,7 @@ exit 1
%patch -p1 -P 140
%patch -p1 -P 141
%patch -p1 -P 142
%patch -p1 -P 143
# bundled ha-cloud-support libs
%patch -p1 -P 500
@ -811,6 +813,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog
* Tue Dec 2 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-102
- podman-etcd: prevent retries on fatal errors
Resolves: RHEL-132052
* Thu Nov 27 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-101
- podman-etcd: prevent learner from starting before cluster is ready