- podman-etcd: prevent retries on fatal errors
Resolves: RHEL-132052
This commit is contained in:
parent
67fbe64ac6
commit
b77913a2bd
146
RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch
Normal file
146
RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch
Normal file
@ -0,0 +1,146 @@
|
||||
From 192b0ecbe015e8b8a4d32f8b066ead3a6dba0589 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Tue, 2 Dec 2025 10:01:01 +0100
|
||||
Subject: [PATCH] OCPEDGE-2231: podman-etcd: improve error handling to support
|
||||
retry on start errors (#2105)
|
||||
|
||||
* podman-etcd: improve add_member_as_learner error log
|
||||
|
||||
Improve the add_member_as_learner error log to better debug a rare issue
|
||||
where the podman exec command returns an error, but the etcd member is added
|
||||
to the list anyway. This is critical as the `learner_node` attribute
|
||||
won't be cleaned up anymore.
|
||||
|
||||
Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
|
||||
* podman-etcd: remove duplicated check for container already started
|
||||
|
||||
* podman-etcd: improve error return codes to support start retries
|
||||
|
||||
Improved and/or changed some return codes to allow or forbid retries in
|
||||
case of start errors.
|
||||
|
||||
see: OCPEDGE-2231
|
||||
|
||||
---------
|
||||
|
||||
Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
---
|
||||
heartbeat/podman-etcd | 40 +++++++++++++++++++++++++---------------
|
||||
1 file changed, 25 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index 3e3f1d60e..242226bb1 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -617,9 +617,13 @@ prepare_env() {
|
||||
LISTEN_CLIENT_URLS="0.0.0.0"
|
||||
LISTEN_PEER_URLS="0.0.0.0"
|
||||
LISTEN_METRICS_URLS="0.0.0.0"
|
||||
+
|
||||
+ return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
compute_bump_revision() {
|
||||
+ local rc
|
||||
+
|
||||
# Same logic used by cluster-etcd-operator quorum-restore-pod utility.
|
||||
# see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
|
||||
# set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
|
||||
@@ -691,7 +695,13 @@ experimental-max-learners: 1
|
||||
experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
|
||||
experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
|
||||
EOF
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
+ ocf_log err "could not create etcd configuration, 'cat' error code: $rc"
|
||||
+ return $OCF_ERR_CONFIGURED
|
||||
+ fi
|
||||
|
||||
+ # Append cipher suites from the env variable where the entries are comma separated.
|
||||
{
|
||||
if [ -n "$ETCD_CIPHER_SUITES" ]; then
|
||||
echo "cipher-suites:"
|
||||
@@ -700,6 +710,13 @@ EOF
|
||||
done
|
||||
fi
|
||||
} >> "$ETCD_CONFIGURATION_FILE"
|
||||
+ rc=$?
|
||||
+ if [ $rc -ne 0 ]; then
|
||||
+ ocf_log err "could not append cipher suites to etcd configuration, error code: $rc"
|
||||
+ return $OCF_ERR_CONFIGURED
|
||||
+ fi
|
||||
+
|
||||
+ return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
archive_data_folder()
|
||||
@@ -884,7 +901,7 @@ add_member_as_learner()
|
||||
out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
- ocf_log err "could not add $member_name as learner, error code: $rc"
|
||||
+ ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
|
||||
return $rc
|
||||
fi
|
||||
ocf_log info "$out"
|
||||
@@ -1763,7 +1780,7 @@ podman_start()
|
||||
fnc_holder_count=$(echo "$fnc_holders" | wc -w)
|
||||
if [ "$fnc_holder_count" -gt 1 ]; then
|
||||
ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)"
|
||||
- return "$OCF_ERR_GENERIC"
|
||||
+ return "$OCF_ERR_CONFIGURED"
|
||||
fi
|
||||
|
||||
if [ "$fnc_holder_count" -eq 1 ]; then
|
||||
@@ -1837,7 +1854,7 @@ podman_start()
|
||||
ocf_log info "same cluster_id and revision: start normal"
|
||||
else
|
||||
ocf_exit_reason "same revision but different cluster id"
|
||||
- return "$OCF_ERR_GENERIC"
|
||||
+ return "$OCF_ERR_CONFIGURED"
|
||||
fi
|
||||
fi
|
||||
;;
|
||||
@@ -1862,12 +1879,6 @@ podman_start()
|
||||
|
||||
run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
|
||||
|
||||
- # check to see if the container has already started
|
||||
- podman_simple_status
|
||||
- if [ $? -eq $OCF_SUCCESS ]; then
|
||||
- return "$OCF_SUCCESS"
|
||||
- fi
|
||||
-
|
||||
if ocf_is_true "$JOIN_AS_LEARNER"; then
|
||||
local wait_timeout_sec=$((10*60))
|
||||
local poll_interval_sec=5
|
||||
@@ -1894,9 +1905,8 @@ podman_start()
|
||||
|
||||
ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
|
||||
if ! can_reuse_container ; then
|
||||
- rc="$?"
|
||||
- ocf_log err "could not determine etcd container reuse strategy, rc: $rc"
|
||||
- return "$rc"
|
||||
+ ocf_log err "could not determine etcd container reuse strategy"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
# Archive current container and its configuration before creating
|
||||
@@ -1912,13 +1922,13 @@ podman_start()
|
||||
fi
|
||||
|
||||
if ! prepare_env; then
|
||||
- ocf_log err "Could not prepare environment for podman, error code: $?"
|
||||
+ ocf_log err "Could not prepare environment for podman"
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
if ! generate_etcd_configuration; then
|
||||
- ocf_log err "Could not generate etcd configuration, error code: $?"
|
||||
- return $OCF_ERR_GENERIC
|
||||
+ ocf_log err "Could not generate etcd configuration"
|
||||
+ return $OCF_ERR_CONFIGURED
|
||||
fi
|
||||
|
||||
run_opts="$run_opts \
|
||||
@ -45,7 +45,7 @@
|
||||
Name: resource-agents
|
||||
Summary: Open Source HA Reusable Cluster Resource Scripts
|
||||
Version: 4.10.0
|
||||
Release: 101%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
Release: 102%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
License: GPLv2+ and LGPLv2+
|
||||
URL: https://github.com/ClusterLabs/resource-agents
|
||||
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
|
||||
@ -192,6 +192,7 @@ Patch139: RHEL-130580-1-podman-etcd-prevent-last-active-member-from-leaving.patc
|
||||
Patch140: RHEL-130580-2-podman-etcd-remove-test-code.patch
|
||||
Patch141: RHEL-126087-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch
|
||||
Patch142: RHEL-131185-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch
|
||||
Patch143: RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
Patch500: ha-cloud-support-aliyun.patch
|
||||
@ -479,6 +480,7 @@ exit 1
|
||||
%patch -p1 -P 140
|
||||
%patch -p1 -P 141
|
||||
%patch -p1 -P 142
|
||||
%patch -p1 -P 143
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
%patch -p1 -P 500
|
||||
@ -811,6 +813,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
|
||||
%changelog
|
||||
* Tue Dec 2 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-102
|
||||
- podman-etcd: prevent retries on fatal errors
|
||||
|
||||
Resolves: RHEL-132052
|
||||
|
||||
* Thu Nov 27 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-101
|
||||
- podman-etcd: prevent learner from starting before cluster is ready
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user