From 580971cd281c07f94afb5cb83a0da7b90a8b642b Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 10 Apr 2024 14:31:48 +0200
Subject: [PATCH] - galera: fix issue where joiner promotion fails if the node
 reports being in non-primary state

  Resolves: RHEL-31763
---
 ...era-fix-joiner-promotion-fails-issue.patch | 75 +++++++++++++++++++
 resource-agents.spec                          | 10 ++-
 2 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch

diff --git a/RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch b/RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch
new file mode 100644
index 0000000..acdfc9c
--- /dev/null
+++ b/RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch
@@ -0,0 +1,75 @@
+From 4357f0dbb8668ac4090cd7070c2ea195e5683326 Mon Sep 17 00:00:00 2001
+From: Damien Ciabrini <dciabrin@redhat.com>
+Date: Wed, 24 Jan 2024 13:27:26 +0100
+Subject: [PATCH] galera: allow joiner to report non-Primary during initial IST
+
+It seems that with recent galera versions, when a galera node
+joins a cluster, there is a small time window where the node is
+connected to the primary component of the galera cluster, but it
+might still be preparing its IST. During this time, it can report
+itself as being 'not ready' and in 'non-primary' state.
+
+Update the galera resource agent to allow the node to be in
+non-primary state, but only if running a "promote" operation. Any
+network partition during the promotion will be caught by the
+promote timeout.
+
+In reworking the promotion code, we move the check for primary
+partition into the "galera_monitor" function. The check works
+as before for regular "monitor" or "probe" operations.
+
+Related-Bug: rhbz#2255414
+---
+ heartbeat/galera.in | 25 +++++++++++++++++--------
+ 1 file changed, 17 insertions(+), 8 deletions(-)
+
+diff --git a/heartbeat/galera.in b/heartbeat/galera.in
+index 6aed3e4b6d..b518595cb0 100755
+--- a/heartbeat/galera.in
++++ b/heartbeat/galera.in
+@@ -822,6 +822,11 @@ galera_promote()
+         return $rc
+     fi
+ 
++    # At this point, the mysql pidfile is created on disk and the
+    # mysql server is reachable via its UNIX socket. If we are a
++    # joiner, SST transfers (rsync) have finished, but an IST may
++    # still be requested or ongoing
++
+     galera_monitor
+     rc=$?
+     if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then
+@@ -835,12 +840,6 @@ galera_promote()
+         return $OCF_ERR_GENERIC
+     fi
+ 
+-    is_primary
+-    if [ $? -ne 0 ]; then
+-        ocf_exit_reason "Failure. Master instance started, but is not in Primary mode."
+-        return $OCF_ERR_GENERIC
+-    fi
+-
+     if ocf_is_true $bootstrap; then
+         promote_everyone
+         clear_bootstrap_node
+@@ -991,8 +990,18 @@ galera_monitor()
+         fi
+         rc=$OCF_RUNNING_MASTER
+     else
+-        ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state."
+-        rc=$OCF_ERR_GENERIC
++        # It seems that with recent galera (26.4+), a joiner that is
++        # connected to a Primary component and is preparing its IST
++        # request might still temporarily report its state as
++        # Non-Primary.  Do not fail in this case as the promote
++        # operation will loop until the IST finishes or the promote
++        # times out.
++        if [ "$__OCF_ACTION" = "promote" ] && ! ocf_is_true $(is_bootstrap); then
++                ocf_log info "local node <${NODENAME}> is receiving a State Transfer."
++        else
++            ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state."
++            rc=$OCF_ERR_GENERIC
++        fi
+     fi
+ 
+     return $rc
diff --git a/resource-agents.spec b/resource-agents.spec
index e079dca..64a9725 100644
--- a/resource-agents.spec
+++ b/resource-agents.spec
@@ -45,7 +45,7 @@
 Name:		resource-agents
 Summary:	Open Source HA Reusable Cluster Resource Scripts
 Version:	4.10.0
-Release:	53%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release:	54%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License:	GPLv2+ and LGPLv2+
 URL:		https://github.com/ClusterLabs/resource-agents
 Source0:	%{upstream_prefix}-%{upstream_version}.tar.gz
@@ -119,6 +119,7 @@ Patch66:	RHEL-16247-aws-vpc-move-ip-aws-vpc-route53-awseip-awsvip-auth_type-role
 Patch67:	RHEL-17072-1-storage_mon-findif-leak-unitialized-values-EOS-fixes.patch
 Patch68:	RHEL-17072-2-storage_mon-use-memset-to-fix-covscan-error.patch
 Patch69:	RHEL-15304-2-findif.sh-dont-use-table-parameter.patch
+Patch70:	RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch
 
 # bundled ha-cloud-support libs
 Patch500:	ha-cloud-support-aws.patch
@@ -312,6 +313,7 @@ exit 1
 %patch -p1 -P 67
 %patch -p1 -P 68
 %patch -p1 -P 69
+%patch -p1 -P 70
 
 # bundled ha-cloud-support libs
 %patch -p1 -P 500
@@ -633,6 +635,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
 
 %changelog
+* Wed Apr 10 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-54
+- galera: fix issue where joiner promotion fails if the node reports
+  being in non-primary state
+
+  Resolves: RHEL-31763
+
 * Wed Mar  6 2024 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-53
 - aliyun-vpc-move-ip: use new aliyun-cli