From b77913a2bdb0351362a059dce4c8d335d91375e8 Mon Sep 17 00:00:00 2001 From: Oyvind Albrigtsen Date: Tue, 2 Dec 2025 10:25:32 +0100 Subject: [PATCH] - podman-etcd: prevent retries on fatal errors Resolves: RHEL-132052 --- ...etcd-prevent-retries-on-fatal-errors.patch | 146 ++++++++++++++++++ resource-agents.spec | 9 +- 2 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch diff --git a/RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch b/RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch new file mode 100644 index 0000000..3297c6c --- /dev/null +++ b/RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch @@ -0,0 +1,146 @@ +From 192b0ecbe015e8b8a4d32f8b066ead3a6dba0589 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Tue, 2 Dec 2025 10:01:01 +0100 +Subject: [PATCH] OCPEDGE-2231: podman-etcd: improve error handling to support + retry on start errors (#2105) + +* podman-etcd: improve add_member_as_learner error log + +Improving add_member_as_learner error log to better debug rare issue +when the podman exec command returns error, but the etcd member is added +to the list anyway. This is critical as the `learner_node` attribute +won't be cleaned up anymore. + +Signed-off-by: Carlo Lobrano + +* podman-etcd: remove duplicated check for container already started + +* podman-etcd: improve error return codes to support start retries + +Improved and/or changed some returns code to allow or forbid retry in +case of start errors. + +see: OCPEDGE-2231 + +--------- + +Signed-off-by: Carlo Lobrano +--- + heartbeat/podman-etcd | 40 +++++++++++++++++++++++++--------------- + 1 file changed, 25 insertions(+), 15 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 3e3f1d60e..242226bb1 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -617,9 +617,13 @@ prepare_env() { + LISTEN_CLIENT_URLS="0.0.0.0" + LISTEN_PEER_URLS="0.0.0.0" + LISTEN_METRICS_URLS="0.0.0.0" ++ ++ return $OCF_SUCCESS + } + + compute_bump_revision() { ++ local rc ++ + # Same logic used by cluster-etcd-operator quorum-restore-pod utility. + # see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34 + # set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days. +@@ -691,7 +695,13 @@ experimental-max-learners: 1 + experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") + experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") + EOF ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_log err "could not create etcd configuration, 'cat' error code: $rc" ++ return $OCF_ERR_CONFIGURED ++ fi + ++ # Append cipher suites from the env variable where the entries are comma separated. + { + if [ -n "$ETCD_CIPHER_SUITES" ]; then + echo "cipher-suites:" +@@ -700,6 +710,13 @@ EOF + done + fi + } >> "$ETCD_CONFIGURATION_FILE" ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_log err "could not append cipher suites to etcd configuration, error code: $rc" ++ return $OCF_ERR_CONFIGURED ++ fi ++ ++ return $OCF_SUCCESS + } + + archive_data_folder() +@@ -884,7 +901,7 @@ add_member_as_learner() + out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner) + rc=$? + if [ $rc -ne 0 ]; then +- ocf_log err "could not add $member_name as learner, error code: $rc" ++ ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out" + return $rc + fi + ocf_log info "$out" +@@ -1763,7 +1780,7 @@ podman_start() + fnc_holder_count=$(echo "$fnc_holders" | wc -w) + if [ "$fnc_holder_count" -gt 1 ]; then + ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)" +- return "$OCF_ERR_GENERIC" ++ return "$OCF_ERR_CONFIGURED" + fi + + if [ "$fnc_holder_count" -eq 1 ]; then +@@ -1837,7 +1854,7 @@ podman_start() + ocf_log info "same cluster_id and revision: start normal" + else + ocf_exit_reason "same revision but different cluster id" +- return "$OCF_ERR_GENERIC" ++ return "$OCF_ERR_CONFIGURED" + fi + fi + ;; +@@ -1862,12 +1879,6 @@ podman_start() + + run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}" + +- # check to see if the container has already started +- podman_simple_status +- if [ $? -eq $OCF_SUCCESS ]; then +- return "$OCF_SUCCESS" +- fi +- + if ocf_is_true "$JOIN_AS_LEARNER"; then + local wait_timeout_sec=$((10*60)) + local poll_interval_sec=5 +@@ -1894,9 +1905,8 @@ podman_start() + + ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced" + if ! can_reuse_container ; then +- rc="$?" +- ocf_log err "could not determine etcd container reuse strategy, rc: $rc" +- return "$rc" ++ ocf_log err "could not determine etcd container reuse strategy" ++ return $OCF_ERR_GENERIC + fi + + # Archive current container and its configuration before creating +@@ -1912,13 +1922,13 @@ podman_start() + fi + + if ! prepare_env; then +- ocf_log err "Could not prepare environment for podman, error code: $?" ++ ocf_log err "Could not prepare environment for podman" + return $OCF_ERR_GENERIC + fi + + if ! generate_etcd_configuration; then +- ocf_log err "Could not generate etcd configuration, error code: $?" +- return $OCF_ERR_GENERIC ++ ocf_log err "Could not generate etcd configuration" ++ return $OCF_ERR_CONFIGURED + fi + + run_opts="$run_opts \ diff --git a/resource-agents.spec b/resource-agents.spec index 7313f82..ab67332 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.10.0 -Release: 101%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 102%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -192,6 +192,7 @@ Patch139: RHEL-130580-1-podman-etcd-prevent-last-active-member-from-leaving.patc Patch140: RHEL-130580-2-podman-etcd-remove-test-code.patch Patch141: RHEL-126087-2-podman-etcd-fix-count-of-fnc-holders-in-container_health_check.patch Patch142: RHEL-131185-podman-etcd-prevent-learner-from-starting-before-cluster-is-ready.patch +Patch143: RHEL-132052-podman-etcd-prevent-retries-on-fatal-errors.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aliyun.patch @@ -479,6 +480,7 @@ exit 1 %patch -p1 -P 140 %patch -p1 -P 141 %patch -p1 -P 142 +%patch -p1 -P 143 # bundled ha-cloud-support libs %patch -p1 -P 500 @@ -811,6 +813,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Tue Dec 2 2025 Oyvind Albrigtsen - 4.10.0-102 +- podman-etcd: prevent retries on fatal errors + + Resolves: RHEL-132052 + * Thu Nov 27 2025 Oyvind Albrigtsen - 4.10.0-101 - podman-etcd: prevent learner from starting before cluster is ready