- podman-etcd: prevent retries on fatal errors

Resolves: RHEL-132052
2025-12-02 10:25:32 +01:00 · 2025-12-02 10:25:32 +01:00 · b77913a2bd
commit b77913a2bd
parent 67fbe64ac6
2 changed files with 154 additions and 1 deletions
--- a/RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch
+++ b/RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch
@@ -0,0 +1,146 @@
+From 192b0ecbe015e8b8a4d32f8b066ead3a6dba0589 Mon Sep 17 00:00:00 2001
+From: Carlo Lobrano <c.lobrano@gmail.com>
+Date: Tue, 2 Dec 2025 10:01:01 +0100
+Subject: [PATCH] OCPEDGE-2231: podman-etcd: improve error handling to support
+ retry on start errors (#2105)
+
+* podman-etcd: improve add_member_as_learner error log
+
+Improve the add_member_as_learner error log to better debug a rare issue
+where the podman exec command returns an error, but the etcd member is added
+to the member list anyway. This is critical because the `learner_node`
+attribute won't be cleaned up anymore.
+
+Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
+
+* podman-etcd: remove duplicated check for container already started
+
+* podman-etcd: improve error return codes to support start retries
+
+Improved and/or changed some return codes to allow or forbid retries in
+case of start errors.
+
+see: OCPEDGE-2231
+
+---------
+
+Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
+---
+ heartbeat/podman-etcd | 40 +++++++++++++++++++++++++---------------
+ 1 file changed, 25 insertions(+), 15 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 3e3f1d60e..242226bb1 100755
+--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
+@@ -617,9 +617,13 @@ prepare_env() {
+ 	LISTEN_CLIENT_URLS="0.0.0.0"
+ 	LISTEN_PEER_URLS="0.0.0.0"
+ 	LISTEN_METRICS_URLS="0.0.0.0"
++
++	return $OCF_SUCCESS
+ }
+ 
+ compute_bump_revision() {
++	local rc
++
+ 	# Same logic used by cluster-etcd-operator quorum-restore-pod utility.
+ 	# see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
+ 	# set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
+@@ -691,7 +695,13 @@ experimental-max-learners: 1
+ experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
+ experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
+ EOF
++	rc=$?
++	if [ $rc -ne 0 ]; then
++		ocf_log err "could not create etcd configuration, 'cat' error code: $rc"
++		return $OCF_ERR_CONFIGURED
++	fi
+ 
++	# Append cipher suites from the env variable where the entries are comma separated.
+ 	{
+ 		if [ -n "$ETCD_CIPHER_SUITES" ]; then
+ 			echo "cipher-suites:"
+@@ -700,6 +710,13 @@ EOF
+ 			done
+ 		fi
+ 	} >> "$ETCD_CONFIGURATION_FILE"
++	rc=$?
++	if [ $rc -ne 0 ]; then
++		ocf_log err "could not append cipher suites to etcd configuration, error code: $rc"
++		return $OCF_ERR_CONFIGURED
++	fi
++
++	return $OCF_SUCCESS
+ }
+ 
+ archive_data_folder()
+@@ -884,7 +901,7 @@ add_member_as_learner()
+ 	out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
+ 	rc=$?
+ 	if [ $rc -ne 0 ]; then
+-		ocf_log err "could not add $member_name as learner, error code: $rc"
++		ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
+ 		return $rc
+ 	fi
+ 	ocf_log info "$out"
+@@ -1763,7 +1780,7 @@ podman_start()
+ 		fnc_holder_count=$(echo "$fnc_holders" | wc -w)
+ 		if [ "$fnc_holder_count" -gt 1 ]; then
+ 			ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)"
+-			return "$OCF_ERR_GENERIC"
++			return "$OCF_ERR_CONFIGURED"
+ 		fi
+ 
+ 		if [ "$fnc_holder_count" -eq 1 ]; then
+@@ -1837,7 +1854,7 @@ podman_start()
+ 							ocf_log info "same cluster_id and revision: start normal"
+ 						else
+ 							ocf_exit_reason "same revision but different cluster id"
+-							return "$OCF_ERR_GENERIC"
++							return "$OCF_ERR_CONFIGURED"
+ 						fi
+ 					fi
+ 					;;
+@@ -1862,12 +1879,6 @@ podman_start()
+ 
+ 	run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
+ 
+-	# check to see if the container has already started
+-	podman_simple_status
+-	if [ $? -eq $OCF_SUCCESS ]; then
+-		return "$OCF_SUCCESS"
+-	fi
+-
+ 	if ocf_is_true "$JOIN_AS_LEARNER"; then
+ 		local wait_timeout_sec=$((10*60))
+ 		local poll_interval_sec=5
+@@ -1894,9 +1905,8 @@ podman_start()
+ 
+ 	ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
+ 	if ! can_reuse_container ; then
+-		rc="$?"
+-		ocf_log err "could not determine etcd container reuse strategy, rc: $rc"
+-		return "$rc"
++		ocf_log err "could not determine etcd container reuse strategy"
++		return $OCF_ERR_GENERIC
+ 	fi
+ 
+ 	# Archive current container and its configuration before creating
+@@ -1912,13 +1922,13 @@ podman_start()
+ 	fi
+ 
+ 	if ! prepare_env; then
+-		ocf_log err "Could not prepare environment for podman, error code: $?"
++		ocf_log err "Could not prepare environment for podman"
+ 		return $OCF_ERR_GENERIC
+ 	fi
+ 
+ 	if ! generate_etcd_configuration; then
+-		ocf_log err "Could not generate etcd configuration, error code: $?"
+-		return $OCF_ERR_GENERIC
++		ocf_log err "Could not generate etcd configuration"
++		return $OCF_ERR_CONFIGURED
+ 	fi
+ 
+ 	run_opts="$run_opts \
--- a/resource-agents.spec
+++ b/resource-agents.spec
@@ -45,7 +45,7 @@
 Name:		resource-agents
 Summary:	Open Source HA Reusable Cluster Resource Scripts
 Version:	4.10.0
-Release:	101%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release:	102%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License:	GPLv2+ and LGPLv2+
 URL:		https://github.com/ClusterLabs/resource-agents
 Source0:	%{upstream_prefix}-%{upstream_version}.tar.gz
@@ -192,6 +192,7 @@ Patch139:	RHEL-130580-1-podman-etcd-prevent-last-active-member-from-leaving.patc
 Patch140:	RHEL-130580-2-podman-etcd-remove-test-code.patch
 Patch141:	RHEL-126087-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch
 Patch142:	RHEL-131185-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch
+Patch143:	RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch

 # bundled ha-cloud-support libs
 Patch500:	ha-cloud-support-aliyun.patch
@@ -479,6 +480,7 @@ exit 1
 %patch -p1 -P 140
 %patch -p1 -P 141
 %patch -p1 -P 142
+%patch -p1 -P 143

 # bundled ha-cloud-support libs
 %patch -p1 -P 500
@@ -811,6 +813,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm

 %changelog
+* Tue Dec  2 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-102
+- podman-etcd: prevent retries on fatal errors
+
+  Resolves: RHEL-132052
+
 * Thu Nov 27 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-101
 - podman-etcd: prevent learner from starting before cluster is ready