Backport a couple of fixes from upstream

- Fix watchdog agent version information
- Ensure transient attributes are cleared when multiple nodes are lost
- Resolves: rhbz1443666
- Resolves: rhbz1986998
This commit is contained in:
Ken Gaillot 2021-08-10 15:42:12 -05:00
parent f08eacae17
commit 103449b89e
3 changed files with 191 additions and 3 deletions

58
017-watchdog-fixes.patch Normal file
View File

@ -0,0 +1,58 @@
From 61eb9c240004d1dbd0b5973e2fecda3686bb4c53 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 10 Aug 2021 09:06:55 +0200
Subject: [PATCH 1/2] Build: rpm: package fence_watchdog in base-package
---
rpm/pacemaker.spec.in | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rpm/pacemaker.spec.in b/rpm/pacemaker.spec.in
index f58357a77..0c569b9ca 100644
--- a/rpm/pacemaker.spec.in
+++ b/rpm/pacemaker.spec.in
@@ -734,6 +734,7 @@ exit 0
%{_sbindir}/crm_attribute
%{_sbindir}/crm_master
%{_sbindir}/fence_legacy
+%{_sbindir}/fence_watchdog
%doc %{_mandir}/man7/pacemaker-controld.*
%doc %{_mandir}/man7/pacemaker-schedulerd.*
@@ -797,7 +798,6 @@ exit 0
%{_sbindir}/crm_simulate
%{_sbindir}/crm_report
%{_sbindir}/crm_ticket
-%{_sbindir}/fence_watchdog
%{_sbindir}/stonith_admin
# "dirname" is owned by -schemas, which is a prerequisite
%{_datadir}/pacemaker/report.collector
--
2.27.0
From 88e75d5b98df197fa731e7642434951a24a67095 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 10 Aug 2021 09:10:23 +0200
Subject: [PATCH 2/2] Fix: fence_watchdog: fix version output needed for
help2man
---
daemons/fenced/fence_watchdog.in | 1 +
1 file changed, 1 insertion(+)
diff --git a/daemons/fenced/fence_watchdog.in b/daemons/fenced/fence_watchdog.in
index c83304f1d..700065e0e 100755
--- a/daemons/fenced/fence_watchdog.in
+++ b/daemons/fenced/fence_watchdog.in
@@ -12,6 +12,7 @@ import sys
import atexit
import getopt
+AGENT_VERSION = "1.0.0"
SHORT_DESC = "Dummy watchdog fence agent"
LONG_DESC = """fence_watchdog just provides
meta-data - actual fencing is done by the pacemaker internal watchdog agent."""
--
2.27.0

122
018-controller.patch Normal file
View File

@ -0,0 +1,122 @@
From ee7eba6a7a05bdf0a12d60ebabb334d8ee021101 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 9 Aug 2021 14:48:57 -0500
Subject: [PATCH] Fix: controller: ensure lost node's transient attributes are
cleared without DC
Previously, peer_update_callback() cleared a lost node's transient attributes
if either the local node is DC, or there is no DC.
However, that left the possibility of the DC being lost at the same time as
another node -- the local node would still have fsa_our_dc set while processing
the leave notifications, so no node would clear the attributes for the non-DC
node.
Now, the controller has its own CPG configuration change callback, which sets a
global boolean before calling the usual one, so that peer_update_callback() can
know when the DC has been lost.
---
daemons/controld/controld_callbacks.c | 4 +-
daemons/controld/controld_corosync.c | 57 ++++++++++++++++++++++++++-
2 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c
index af24856ae..e564b3dcd 100644
--- a/daemons/controld/controld_callbacks.c
+++ b/daemons/controld/controld_callbacks.c
@@ -99,6 +99,8 @@ node_alive(const crm_node_t *node)
#define state_text(state) ((state)? (const char *)(state) : "in unknown state")
+bool controld_dc_left = false;
+
void
peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
{
@@ -217,7 +219,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
cib_scope_local);
}
- } else if (AM_I_DC || (fsa_our_dc == NULL)) {
+ } else if (AM_I_DC || controld_dc_left || (fsa_our_dc == NULL)) {
/* This only needs to be done once, so normally the DC should do
* it. However if there is no DC, every node must do it, since
* there is no other way to ensure some one node does it.
diff --git a/daemons/controld/controld_corosync.c b/daemons/controld/controld_corosync.c
index db99630fb..c5ab6580a 100644
--- a/daemons/controld/controld_corosync.c
+++ b/daemons/controld/controld_corosync.c
@@ -87,6 +87,61 @@ crmd_cs_destroy(gpointer user_data)
}
}
+extern bool controld_dc_left;
+
+/*!
+ * \brief Handle a Corosync notification of a CPG configuration change
+ *
+ * \param[in] handle CPG connection
+ * \param[in] cpg_name CPG group name
+ * \param[in] member_list List of current CPG members
+ * \param[in] member_list_entries Number of entries in \p member_list
+ * \param[in] left_list List of CPG members that left
+ * \param[in] left_list_entries Number of entries in \p left_list
+ * \param[in] joined_list List of CPG members that joined
+ * \param[in] joined_list_entries Number of entries in \p joined_list
+ */
+static void
+cpg_membership_callback(cpg_handle_t handle, const struct cpg_name *cpg_name,
+ const struct cpg_address *member_list,
+ size_t member_list_entries,
+ const struct cpg_address *left_list,
+ size_t left_list_entries,
+ const struct cpg_address *joined_list,
+ size_t joined_list_entries)
+{
+ /* When nodes leave CPG, the DC clears their transient node attributes.
+ *
+ * However if there is no DC, or the DC is among the nodes that left, each
+ * remaining node needs to do the clearing, to ensure it gets done.
+ * Otherwise, the attributes would persist when the nodes rejoin, which
+ * could have serious consequences for unfencing, agents that use attributes
+ * for internal logic, etc.
+ *
+ * Here, we set a global boolean if the DC is among the nodes that left, for
+ * use by the peer callback.
+ */
+ if (fsa_our_dc != NULL) {
+ crm_node_t *peer = pcmk__search_cluster_node_cache(0, fsa_our_dc);
+
+ if (peer != NULL) {
+ for (int i = 0; i < left_list_entries; ++i) {
+ if (left_list[i].nodeid == peer->id) {
+ controld_dc_left = true;
+ break;
+ }
+ }
+ }
+ }
+
+ // Process the change normally, which will call the peer callback as needed
+ pcmk_cpg_membership(handle, cpg_name, member_list, member_list_entries,
+ left_list, left_list_entries,
+ joined_list, joined_list_entries);
+
+ controld_dc_left = false;
+}
+
extern gboolean crm_connect_corosync(crm_cluster_t * cluster);
gboolean
@@ -95,7 +150,7 @@ crm_connect_corosync(crm_cluster_t * cluster)
if (is_corosync_cluster()) {
crm_set_status_callback(&peer_update_callback);
cluster->cpg.cpg_deliver_fn = crmd_cs_dispatch;
- cluster->cpg.cpg_confchg_fn = pcmk_cpg_membership;
+ cluster->cpg.cpg_confchg_fn = cpg_membership_callback;
cluster->destroy = crmd_cs_destroy;
if (crm_cluster_connect(cluster)) {
--
2.27.0

View File

@ -36,7 +36,7 @@
## can be incremented to build packages reliably considered "newer"
## than previously built packages with the same pcmkversion)
%global pcmkversion 2.1.0
%global specversion 7
%global specversion 8
## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
%global commit 7c3f660707a495a1331716ad32cd3ac9d9f8ff58
@ -226,7 +226,7 @@
Name: pacemaker
Summary: Scalable High-Availability cluster resource manager
Version: %{pcmkversion}
Release: %{pcmk_release}%{?dist}.1
Release: %{pcmk_release}%{?dist}
License: GPLv2+ and LGPLv2+
Url: https://www.clusterlabs.org/
@ -258,6 +258,8 @@ Patch13: 013-leaks.patch
Patch14: 014-str-list.patch
Patch15: 015-sbd.patch
Patch16: 016-cts.patch
Patch17: 017-watchdog-fixes.patch
Patch18: 018-controller.patch
Requires: resource-agents
Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
@ -695,6 +697,7 @@ exit 0
%{_sbindir}/crm_attribute
%{_sbindir}/crm_master
%{_sbindir}/fence_watchdog
%doc %{_mandir}/man7/pacemaker-controld.*
%doc %{_mandir}/man7/pacemaker-schedulerd.*
@ -744,7 +747,6 @@ exit 0
%{_sbindir}/crm_simulate
%{_sbindir}/crm_report
%{_sbindir}/crm_ticket
%{_sbindir}/fence_watchdog
%{_sbindir}/stonith_admin
# "dirname" is owned by -schemas, which is a prerequisite
%{_datadir}/pacemaker/report.collector
@ -860,6 +862,12 @@ exit 0
%license %{nagios_name}-%{nagios_hash}/COPYING
%changelog
* Tue Aug 10 2021 Ken Gaillot <kgaillot@redhat.com> - 2.1.0-8
- Fix watchdog agent version information
- Ensure transient attributes are cleared when multiple nodes are lost
- Resolves: rhbz1443666
- Resolves: rhbz1986998
* Mon Aug 09 2021 Mohan Boddu <mboddu@redhat.com> - 2.1.0-7.1
- Rebuilt for IMA sigs, glibc 2.34, aarch64 flags
Related: rhbz#1991688