From 580971cd281c07f94afb5cb83a0da7b90a8b642b Mon Sep 17 00:00:00 2001 From: Oyvind Albrigtsen Date: Wed, 10 Apr 2024 14:31:48 +0200 Subject: [PATCH] - galera: fix issue where joiner promotion fails if the node reports being in non-primary state Resolves: RHEL-31763 --- ...era-fix-joiner-promotion-fails-issue.patch | 75 +++++++++++++++++++ resource-agents.spec | 10 ++- 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch diff --git a/RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch b/RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch new file mode 100644 index 0000000..acdfc9c --- /dev/null +++ b/RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch @@ -0,0 +1,75 @@ +From 4357f0dbb8668ac4090cd7070c2ea195e5683326 Mon Sep 17 00:00:00 2001 +From: Damien Ciabrini +Date: Wed, 24 Jan 2024 13:27:26 +0100 +Subject: [PATCH] galera: allow joiner to report non-Primary during initial IST + +It seems that with recent galera versions, when a galera node +joins a cluster, there is a small time window where the node is +connected to the primary component of the galera cluster, but it +might still be preparing its IST. During this time, it can report +itself as being 'not ready' and in 'non-primary' state. + +Update the galera resource agent to allow the node to be in +non-primary state, but only if running a "promote" operation. Any +network partition during the promotion will be caught by the +promote timeout. + +In reworking the promotion code, we move the check for primary +partition into the "galera_monitor" function. The check works +as before for regular "monitor" or "probe" operations. 
+ +Related-Bug: rhbz#2255414 +--- + heartbeat/galera.in | 25 +++++++++++++++++-------- + 1 file changed, 17 insertions(+), 8 deletions(-) + +diff --git a/heartbeat/galera.in b/heartbeat/galera.in +index 6aed3e4b6d..b518595cb0 100755 +--- a/heartbeat/galera.in ++++ b/heartbeat/galera.in +@@ -822,6 +822,11 @@ galera_promote() + return $rc + fi + ++ # At this point, the mysql pidfile is created on disk and the ++ # mysql server is reacheable via its UNIX socket. If we are a ++ # joiner, SST transfers (rsync) have finished, but an IST may ++ # still be requested or ongoing ++ + galera_monitor + rc=$? + if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then +@@ -835,12 +840,6 @@ galera_promote() + return $OCF_ERR_GENERIC + fi + +- is_primary +- if [ $? -ne 0 ]; then +- ocf_exit_reason "Failure. Master instance started, but is not in Primary mode." +- return $OCF_ERR_GENERIC +- fi +- + if ocf_is_true $bootstrap; then + promote_everyone + clear_bootstrap_node +@@ -991,8 +990,18 @@ galera_monitor() + fi + rc=$OCF_RUNNING_MASTER + else +- ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." +- rc=$OCF_ERR_GENERIC ++ # It seems that with recent galera (26.4+), a joiner that is ++ # connected to a Primary component and is preparing its IST ++ # request might still temporarily report its state as ++ # Non-Primary. Do not fail in this case as the promote ++ # operation will loop until the IST finishes or the promote ++ # times out. ++ if [ "$__OCF_ACTION" = "promote" ] && ! ocf_is_true $(is_bootstrap); then ++ ocf_log info "local node <${NODENAME}> is receiving a State Transfer." ++ else ++ ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." 
++ rc=$OCF_ERR_GENERIC ++ fi + fi + + return $rc diff --git a/resource-agents.spec b/resource-agents.spec index e079dca..64a9725 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.10.0 -Release: 53%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 54%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -119,6 +119,7 @@ Patch66: RHEL-16247-aws-vpc-move-ip-aws-vpc-route53-awseip-awsvip-auth_type-role Patch67: RHEL-17072-1-storage_mon-findif-leak-unitialized-values-EOS-fixes.patch Patch68: RHEL-17072-2-storage_mon-use-memset-to-fix-covscan-error.patch Patch69: RHEL-15304-2-findif.sh-dont-use-table-parameter.patch +Patch70: RHEL-31763-galera-fix-joiner-promotion-fails-issue.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aws.patch @@ -312,6 +313,7 @@ exit 1 %patch -p1 -P 67 %patch -p1 -P 68 %patch -p1 -P 69 +%patch -p1 -P 70 # bundled ha-cloud-support libs %patch -p1 -P 500 @@ -633,6 +635,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Wed Apr 10 2024 Oyvind Albrigtsen - 4.10.0-54 +- galera: fix issue where joiner promotion fails if the node reports + being in non-primary state + + Resolves: RHEL-31763 + * Wed Mar 6 2024 Oyvind Albrigtsen - 4.10.0-53 - aliyun-vpc-move-ip: use new aliyun-cli