pacemaker/007-unfencing-loop.patch

734 lines
28 KiB
Diff
Raw Normal View History

From 6dcd6b51d7d3993bc483588d6ed75077518ed600 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 4 Jun 2021 16:30:55 -0500
Subject: [PATCH 01/11] Low: controller: check whether unfenced node was remote
node
... so the controller can indicate the node is remote (if known at that point,
which is not guaranteed) when setting unfencing-related node attributes.
---
daemons/controld/controld_fencing.c | 21 ++++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
index 23dff28..0fba661 100644
--- a/daemons/controld/controld_fencing.c
+++ b/daemons/controld/controld_fencing.c
@@ -757,15 +757,30 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
if (pcmk__str_eq("on", op, pcmk__str_casei)) {
const char *value = NULL;
char *now = pcmk__ttoa(time(NULL));
+ gboolean is_remote_node = FALSE;
+
+ /* This check is not 100% reliable, since this node is not
+ * guaranteed to have the remote node cached. However, it
+ * doesn't have to be reliable, since the attribute manager can
+ * learn a node's "remoteness" by other means sooner or later.
+ * This allows it to learn more quickly if this node does have
+ * the information.
+ */
+ if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) {
+ is_remote_node = TRUE;
+ }
- update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, FALSE);
+ update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
+ is_remote_node);
free(now);
value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
- update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, FALSE);
+ update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
+ is_remote_node);
value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
- update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, FALSE);
+ update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
+ is_remote_node);
} else if (action->sent_update == FALSE) {
send_stonith_update(action, target, uuid);
--
1.8.3.1
From 3ef6d9403f68ab8559c45cc99f5a8da05ca6420b Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 7 Jun 2021 10:50:36 -0500
Subject: [PATCH 02/11] Refactor: pacemaker-attrd: functionize adding remote
node to cache
... for future reuse
---
daemons/attrd/attrd_commands.c | 34 +++++++++++++++++++++++-----------
1 file changed, 23 insertions(+), 11 deletions(-)
diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c
index 731c243..93a165b 100644
--- a/daemons/attrd/attrd_commands.c
+++ b/daemons/attrd/attrd_commands.c
@@ -102,6 +102,28 @@ free_attribute(gpointer data)
}
}
+/*!
+ * \internal
+ * \brief Ensure a Pacemaker Remote node is in the correct peer cache
+ *
+ * \param[in] node_name Name of Pacemaker Remote node to cache
+ */
+static void
+cache_remote_node(const char *node_name)
+{
+ /* If we previously assumed this node was an unseen cluster node,
+ * remove its entry from the cluster peer cache.
+ */
+ crm_node_t *dup = pcmk__search_cluster_node_cache(0, node_name);
+
+ if (dup && (dup->uuid == NULL)) {
+ reap_crm_member(0, node_name);
+ }
+
+ // Ensure node is in the remote peer cache
+ CRM_ASSERT(crm_remote_peer_get(node_name) != NULL);
+}
+
static xmlNode *
build_attribute_xml(
xmlNode *parent, const char *name, const char *set, const char *uuid, unsigned int timeout_ms, const char *user,
@@ -709,17 +731,7 @@ attrd_lookup_or_create_value(GHashTable *values, const char *host, xmlNode *xml)
crm_element_value_int(xml, PCMK__XA_ATTR_IS_REMOTE, &is_remote);
if (is_remote) {
- /* If we previously assumed this node was an unseen cluster node,
- * remove its entry from the cluster peer cache.
- */
- crm_node_t *dup = pcmk__search_cluster_node_cache(0, host);
-
- if (dup && (dup->uuid == NULL)) {
- reap_crm_member(0, host);
- }
-
- /* Ensure this host is in the remote peer cache */
- CRM_ASSERT(crm_remote_peer_get(host) != NULL);
+ cache_remote_node(host);
}
if (v == NULL) {
--
1.8.3.1
From 6fac2c71bc2c56870ac828d7cd7b7c799279c47e Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 7 Jun 2021 10:39:34 -0500
Subject: [PATCH 03/11] Refactor: pacemaker-attrd: don't try to remove votes
for remote nodes
Remote nodes never vote.
This has no effect in practice since the removal would simply do nothing,
but we might as well not waste time trying.
---
daemons/attrd/attrd_commands.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c
index 93a165b..dbe777e 100644
--- a/daemons/attrd/attrd_commands.c
+++ b/daemons/attrd/attrd_commands.c
@@ -976,7 +976,8 @@ attrd_election_cb(gpointer user_data)
void
attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *data)
{
- bool remove_voter = FALSE;
+ bool gone = false;
+ bool is_remote = pcmk_is_set(peer->flags, crm_remote_node);
switch (kind) {
case crm_status_uname:
@@ -984,7 +985,7 @@ attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *da
case crm_status_processes:
if (!pcmk_is_set(peer->processes, crm_get_cluster_proc())) {
- remove_voter = TRUE;
+ gone = true;
}
break;
@@ -1000,13 +1001,13 @@ attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *da
} else {
// Remove all attribute values associated with lost nodes
attrd_peer_remove(peer->uname, FALSE, "loss");
- remove_voter = TRUE;
+ gone = true;
}
break;
}
- // In case an election is in progress, remove any vote by the node
- if (remove_voter) {
+ // Remove votes from cluster nodes that leave, in case election in progress
+ if (gone && !is_remote) {
attrd_remove_voter(peer);
}
}
--
1.8.3.1
From 54089fc663d6aaf10ca164c6c94b3b17237788de Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 7 Jun 2021 10:40:06 -0500
Subject: [PATCH 04/11] Low: pacemaker-attrd: check for remote nodes in peer
update callback
If a remote node was started before the local cluster node joined the cluster,
the cluster node will assume its node attributes are for a cluster node until
it learns otherwise. Check for remoteness in the peer update callback, to have
another way we can learn it.
---
daemons/attrd/attrd_commands.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c
index dbe777e..5f6a754 100644
--- a/daemons/attrd/attrd_commands.c
+++ b/daemons/attrd/attrd_commands.c
@@ -1009,6 +1009,10 @@ attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *da
// Remove votes from cluster nodes that leave, in case election in progress
if (gone && !is_remote) {
attrd_remove_voter(peer);
+
+ // Ensure remote nodes that come up are in the remote node cache
+ } else if (!gone && is_remote) {
+ cache_remote_node(peer->uname);
}
}
--
1.8.3.1
From 8c048df0312d0d9c857d87b570a352429a710928 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 7 Jun 2021 11:29:12 -0500
Subject: [PATCH 05/11] Log: pacemaker-attrd: log peer status changes
---
daemons/attrd/attrd_commands.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c
index 5f6a754..d6d179b 100644
--- a/daemons/attrd/attrd_commands.c
+++ b/daemons/attrd/attrd_commands.c
@@ -972,6 +972,7 @@ attrd_election_cb(gpointer user_data)
return FALSE;
}
+#define state_text(state) ((state)? (const char *)(state) : "in unknown state")
void
attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *data)
@@ -981,15 +982,23 @@ attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *da
switch (kind) {
case crm_status_uname:
+ crm_debug("%s node %s is now %s",
+ (is_remote? "Remote" : "Cluster"),
+ peer->uname, state_text(peer->state));
break;
case crm_status_processes:
if (!pcmk_is_set(peer->processes, crm_get_cluster_proc())) {
gone = true;
}
+ crm_debug("Node %s is %s a peer",
+ peer->uname, (gone? "no longer" : "now"));
break;
case crm_status_nstate:
+ crm_debug("%s node %s is now %s (was %s)",
+ (is_remote? "Remote" : "Cluster"),
+ peer->uname, state_text(peer->state), state_text(data));
if (pcmk__str_eq(peer->state, CRM_NODE_MEMBER, pcmk__str_casei)) {
/* If we're the writer, send new peers a list of all attributes
* (unless it's a remote node, which doesn't run its own attrd)
--
1.8.3.1
From 1dcc8dee4990cf0dbdec0e14db6d9a3ad67a41d5 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 7 Jun 2021 11:13:53 -0500
Subject: [PATCH 06/11] Low: pacemaker-attrd: ensure node ID is only set for
attributes when known
In most cases, attribute updates contained the node ID, and the node ID was
used by other code, only if known (i.e. positive). However, a couple of places
did not check this, so add that check.
I am unsure whether the missing check caused problems in practice, but there
appears to be the possibility that a remote node would wrongly be added to the
cluster node cache.
---
daemons/attrd/attrd_commands.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c
index d6d179b..b3f441c 100644
--- a/daemons/attrd/attrd_commands.c
+++ b/daemons/attrd/attrd_commands.c
@@ -136,7 +136,9 @@ build_attribute_xml(
crm_xml_add(xml, PCMK__XA_ATTR_UUID, uuid);
crm_xml_add(xml, PCMK__XA_ATTR_USER, user);
crm_xml_add(xml, PCMK__XA_ATTR_NODE_NAME, peer);
- crm_xml_add_int(xml, PCMK__XA_ATTR_NODE_ID, peerid);
+ if (peerid > 0) {
+ crm_xml_add_int(xml, PCMK__XA_ATTR_NODE_ID, peerid);
+ }
crm_xml_add(xml, PCMK__XA_ATTR_VALUE, value);
crm_xml_add_int(xml, PCMK__XA_ATTR_DAMPENING, timeout_ms/1000);
crm_xml_add_int(xml, PCMK__XA_ATTR_IS_PRIVATE, is_private);
@@ -937,7 +939,7 @@ attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter)
/* If this is a cluster node whose node ID we are learning, remember it */
if ((v->nodeid == 0) && (v->is_remote == FALSE)
&& (crm_element_value_int(xml, PCMK__XA_ATTR_NODE_ID,
- (int*)&v->nodeid) == 0)) {
+ (int*)&v->nodeid) == 0) && (v->nodeid > 0)) {
crm_node_t *known_peer = crm_get_peer(v->nodeid, host);
--
1.8.3.1
From 8d12490e88b558d01db37a38f7d35175c6d2d69a Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 10 Jun 2021 17:25:57 -0500
Subject: [PATCH 07/11] Refactor: pacemaker-attrd: functionize processing a
sync response
... for code isolation, and because we need to add more to it
---
daemons/attrd/attrd_commands.c | 59 ++++++++++++++++++++++++++++--------------
1 file changed, 39 insertions(+), 20 deletions(-)
diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c
index b3f441c..d02d3e6 100644
--- a/daemons/attrd/attrd_commands.c
+++ b/daemons/attrd/attrd_commands.c
@@ -572,6 +572,43 @@ attrd_peer_clear_failure(crm_node_t *peer, xmlNode *xml)
}
/*!
+ * \internal
+ * \brief Load attributes from a peer sync response
+ *
+ * \param[in] peer Peer that sent sync response
+ * \param[in] peer_won Whether peer is the attribute writer
+ * \param[in] xml Sync response XML (each child is an attribute update)
+ */
+static void
+process_peer_sync_response(crm_node_t *peer, bool peer_won, xmlNode *xml)
+{
+ crm_info("Processing " PCMK__ATTRD_CMD_SYNC_RESPONSE " from %s",
+ peer->uname);
+
+ if (peer_won) {
+ /* Initialize the "seen" flag for all attributes to cleared, so we can
+ * detect attributes that local node has but the writer doesn't.
+ */
+ clear_attribute_value_seen();
+ }
+
+ // Process each attribute update in the sync response
+ for (xmlNode *child = pcmk__xml_first_child(xml); child != NULL;
+ child = pcmk__xml_next(child)) {
+ attrd_peer_update(peer, child,
+ crm_element_value(child, PCMK__XA_ATTR_NODE_NAME),
+ TRUE);
+ }
+
+ if (peer_won) {
+ /* If any attributes are still not marked as seen, the writer doesn't
+ * know about them, so send all peers an update with them.
+ */
+ attrd_current_only_attribute_update(peer, xml);
+ }
+}
+
+/*!
\internal
\brief Broadcast private attribute for local node with protocol version
*/
@@ -596,7 +633,7 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml)
const char *op = crm_element_value(xml, PCMK__XA_TASK);
const char *election_op = crm_element_value(xml, F_CRM_TASK);
const char *host = crm_element_value(xml, PCMK__XA_ATTR_NODE_NAME);
- bool peer_won = FALSE;
+ bool peer_won = false;
if (election_op) {
attrd_handle_election_op(peer, xml);
@@ -631,25 +668,7 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml)
} else if (pcmk__str_eq(op, PCMK__ATTRD_CMD_SYNC_RESPONSE, pcmk__str_casei)
&& !pcmk__str_eq(peer->uname, attrd_cluster->uname, pcmk__str_casei)) {
- xmlNode *child = NULL;
-
- crm_info("Processing %s from %s", op, peer->uname);
-
- /* Clear the seen flag for attribute processing held only in the own node. */
- if (peer_won) {
- clear_attribute_value_seen();
- }
-
- for (child = pcmk__xml_first_child(xml); child != NULL;
- child = pcmk__xml_next(child)) {
- host = crm_element_value(child, PCMK__XA_ATTR_NODE_NAME);
- attrd_peer_update(peer, child, host, TRUE);
- }
-
- if (peer_won) {
- /* Synchronize if there is an attribute held only by own node that Writer does not have. */
- attrd_current_only_attribute_update(peer, xml);
- }
+ process_peer_sync_response(peer, peer_won, xml);
} else if (pcmk__str_eq(op, PCMK__ATTRD_CMD_FLUSH, pcmk__str_casei)) {
/* Ignore. The flush command was removed in 2.0.0 but may be
--
1.8.3.1
From a890a0e5bbbcabf907f51ed0460868035f72464d Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 11 Jun 2021 14:40:39 -0500
Subject: [PATCH 08/11] Refactor: pacemaker-attrd: functionize broadcasting
local override
... for code isolation
---
daemons/attrd/attrd_commands.c | 42 +++++++++++++++++++++++++++++-------------
1 file changed, 29 insertions(+), 13 deletions(-)
diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c
index d02d3e6..4783427 100644
--- a/daemons/attrd/attrd_commands.c
+++ b/daemons/attrd/attrd_commands.c
@@ -804,6 +804,34 @@ attrd_current_only_attribute_update(crm_node_t *peer, xmlNode *xml)
free_xml(sync);
}
+/*!
+ * \internal
+ * \brief Override an attribute sync with a local value
+ *
+ * Broadcast the local node's value for an attribute that's different from the
+ * value provided in a peer's attribute synchronization response. This ensures a
+ * node's values for itself take precedence and all peers are kept in sync.
+ *
+ * \param[in] a Attribute entry to override
+ *
+ * \return Local instance of attribute value
+ */
+static attribute_value_t *
+broadcast_local_value(attribute_t *a)
+{
+ attribute_value_t *v = g_hash_table_lookup(a->values, attrd_cluster->uname);
+ xmlNode *sync = create_xml_node(NULL, __func__);
+
+ crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE);
+ build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms,
+ a->user, a->is_private, v->nodename, v->nodeid,
+ v->current, FALSE);
+ attrd_xml_add_writer(sync);
+ send_attrd_message(NULL, sync);
+ free_xml(sync);
+ return v;
+}
+
void
attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter)
{
@@ -899,21 +927,9 @@ attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter)
if (filter && !pcmk__str_eq(v->current, value, pcmk__str_casei)
&& pcmk__str_eq(host, attrd_cluster->uname, pcmk__str_casei)) {
- xmlNode *sync = create_xml_node(NULL, __func__);
-
crm_notice("%s[%s]: local value '%s' takes priority over '%s' from %s",
attr, host, v->current, value, peer->uname);
-
- crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE);
- v = g_hash_table_lookup(a->values, host);
- build_attribute_xml(sync, attr, a->set, a->uuid, a->timeout_ms, a->user,
- a->is_private, v->nodename, v->nodeid, v->current, FALSE);
-
- attrd_xml_add_writer(sync);
-
- /* Broadcast in case any other nodes had the inconsistent value */
- send_attrd_message(NULL, sync);
- free_xml(sync);
+ v = broadcast_local_value(a);
} else if (!pcmk__str_eq(v->current, value, pcmk__str_casei)) {
crm_notice("Setting %s[%s]: %s -> %s " CRM_XS " from %s",
--
1.8.3.1
From f6f65e3dab070f1bbdf6d1383f4d6173a8840bc9 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 11 Jun 2021 14:50:29 -0500
Subject: [PATCH 09/11] Log: pacemaker-attrd: improve messages when
broadcasting local-only values
The traces aren't necessary since build_attribute_xml() already logs the same
info at debug. Also, rename function for clarity, and make static.
---
daemons/attrd/attrd_commands.c | 35 ++++++++++++++++-------------------
1 file changed, 16 insertions(+), 19 deletions(-)
diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c
index 4783427..356defb 100644
--- a/daemons/attrd/attrd_commands.c
+++ b/daemons/attrd/attrd_commands.c
@@ -51,11 +51,12 @@ GHashTable *attributes = NULL;
void write_attribute(attribute_t *a, bool ignore_delay);
void write_or_elect_attribute(attribute_t *a);
-void attrd_current_only_attribute_update(crm_node_t *peer, xmlNode *xml);
void attrd_peer_update(crm_node_t *peer, xmlNode *xml, const char *host, bool filter);
void attrd_peer_sync(crm_node_t *peer, xmlNode *xml);
void attrd_peer_remove(const char *host, gboolean uncache, const char *source);
+static void broadcast_unseen_local_values(crm_node_t *peer, xmlNode *xml);
+
static gboolean
send_attrd_message(crm_node_t * node, xmlNode * data)
{
@@ -604,7 +605,7 @@ process_peer_sync_response(crm_node_t *peer, bool peer_won, xmlNode *xml)
/* If any attributes are still not marked as seen, the writer doesn't
* know about them, so send all peers an update with them.
*/
- attrd_current_only_attribute_update(peer, xml);
+ broadcast_unseen_local_values(peer, xml);
}
}
@@ -768,40 +769,36 @@ attrd_lookup_or_create_value(GHashTable *values, const char *host, xmlNode *xml)
return(v);
}
-void
-attrd_current_only_attribute_update(crm_node_t *peer, xmlNode *xml)
+void
+broadcast_unseen_local_values(crm_node_t *peer, xmlNode *xml)
{
GHashTableIter aIter;
GHashTableIter vIter;
- attribute_t *a;
+ attribute_t *a = NULL;
attribute_value_t *v = NULL;
- xmlNode *sync = create_xml_node(NULL, __func__);
- gboolean build = FALSE;
-
- crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE);
+ xmlNode *sync = NULL;
g_hash_table_iter_init(&aIter, attributes);
while (g_hash_table_iter_next(&aIter, NULL, (gpointer *) & a)) {
g_hash_table_iter_init(&vIter, a->values);
while (g_hash_table_iter_next(&vIter, NULL, (gpointer *) & v)) {
- if (pcmk__str_eq(v->nodename, attrd_cluster->uname, pcmk__str_casei) && v->seen == FALSE) {
- crm_trace("Syncing %s[%s] = %s to everyone.(from local only attributes)", a->id, v->nodename, v->current);
-
- build = TRUE;
+ if (!(v->seen) && pcmk__str_eq(v->nodename, attrd_cluster->uname,
+ pcmk__str_casei)) {
+ if (sync == NULL) {
+ sync = create_xml_node(NULL, __func__);
+ crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE);
+ }
build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms, a->user, a->is_private,
v->nodename, v->nodeid, v->current, (a->timeout_ms && a->timer ? TRUE : FALSE));
- } else {
- crm_trace("Local attribute(%s[%s] = %s) was ignore.(another host) : [%s]", a->id, v->nodename, v->current, attrd_cluster->uname);
- continue;
}
}
}
- if (build) {
- crm_debug("Syncing values to everyone.(from local only attributes)");
+ if (sync != NULL) {
+ crm_debug("Broadcasting local-only values");
send_attrd_message(NULL, sync);
+ free_xml(sync);
}
- free_xml(sync);
}
/*!
--
1.8.3.1
From ab90ffb785ea018556f216b8f540f8c3429a3947 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 11 Jun 2021 15:04:20 -0500
Subject: [PATCH 10/11] Refactor: pacemaker-attrd: simplify attribute XML
creation function
... and rename for clarity
---
daemons/attrd/attrd_commands.c | 48 ++++++++++++++++++++++++------------------
1 file changed, 27 insertions(+), 21 deletions(-)
diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c
index 356defb..5b32a77 100644
--- a/daemons/attrd/attrd_commands.c
+++ b/daemons/attrd/attrd_commands.c
@@ -125,25 +125,35 @@ cache_remote_node(const char *node_name)
CRM_ASSERT(crm_remote_peer_get(node_name) != NULL);
}
+/*!
+ * \internal
+ * \brief Create an XML representation of an attribute for use in peer messages
+ *
+ * \param[in] parent Create attribute XML as child element of this element
+ * \param[in] a Attribute to represent
+ * \param[in] v Attribute value to represent
+ * \param[in] force_write If true, value should be written even if unchanged
+ *
+ * \return XML representation of attribute
+ */
static xmlNode *
-build_attribute_xml(
- xmlNode *parent, const char *name, const char *set, const char *uuid, unsigned int timeout_ms, const char *user,
- gboolean is_private, const char *peer, uint32_t peerid, const char *value, gboolean is_force_write)
+add_attribute_value_xml(xmlNode *parent, attribute_t *a, attribute_value_t *v,
+ bool force_write)
{
xmlNode *xml = create_xml_node(parent, __func__);
- crm_xml_add(xml, PCMK__XA_ATTR_NAME, name);
- crm_xml_add(xml, PCMK__XA_ATTR_SET, set);
- crm_xml_add(xml, PCMK__XA_ATTR_UUID, uuid);
- crm_xml_add(xml, PCMK__XA_ATTR_USER, user);
- crm_xml_add(xml, PCMK__XA_ATTR_NODE_NAME, peer);
- if (peerid > 0) {
- crm_xml_add_int(xml, PCMK__XA_ATTR_NODE_ID, peerid);
+ crm_xml_add(xml, PCMK__XA_ATTR_NAME, a->id);
+ crm_xml_add(xml, PCMK__XA_ATTR_SET, a->set);
+ crm_xml_add(xml, PCMK__XA_ATTR_UUID, a->uuid);
+ crm_xml_add(xml, PCMK__XA_ATTR_USER, a->user);
+ crm_xml_add(xml, PCMK__XA_ATTR_NODE_NAME, v->nodename);
+ if (v->nodeid > 0) {
+ crm_xml_add_int(xml, PCMK__XA_ATTR_NODE_ID, v->nodeid);
}
- crm_xml_add(xml, PCMK__XA_ATTR_VALUE, value);
- crm_xml_add_int(xml, PCMK__XA_ATTR_DAMPENING, timeout_ms/1000);
- crm_xml_add_int(xml, PCMK__XA_ATTR_IS_PRIVATE, is_private);
- crm_xml_add_int(xml, PCMK__XA_ATTR_FORCE, is_force_write);
+ crm_xml_add(xml, PCMK__XA_ATTR_VALUE, v->current);
+ crm_xml_add_int(xml, PCMK__XA_ATTR_DAMPENING, a->timeout_ms / 1000);
+ crm_xml_add_int(xml, PCMK__XA_ATTR_IS_PRIVATE, a->is_private);
+ crm_xml_add_int(xml, PCMK__XA_ATTR_FORCE, force_write);
return xml;
}
@@ -695,8 +705,7 @@ attrd_peer_sync(crm_node_t *peer, xmlNode *xml)
g_hash_table_iter_init(&vIter, a->values);
while (g_hash_table_iter_next(&vIter, NULL, (gpointer *) & v)) {
crm_debug("Syncing %s[%s] = %s to %s", a->id, v->nodename, v->current, peer?peer->uname:"everyone");
- build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms, a->user, a->is_private,
- v->nodename, v->nodeid, v->current, FALSE);
+ add_attribute_value_xml(sync, a, v, false);
}
}
@@ -788,8 +797,7 @@ broadcast_unseen_local_values(crm_node_t *peer, xmlNode *xml)
sync = create_xml_node(NULL, __func__);
crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE);
}
- build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms, a->user, a->is_private,
- v->nodename, v->nodeid, v->current, (a->timeout_ms && a->timer ? TRUE : FALSE));
+ add_attribute_value_xml(sync, a, v, a->timeout_ms && a->timer);
}
}
}
@@ -820,9 +828,7 @@ broadcast_local_value(attribute_t *a)
xmlNode *sync = create_xml_node(NULL, __func__);
crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE);
- build_attribute_xml(sync, a->id, a->set, a->uuid, a->timeout_ms,
- a->user, a->is_private, v->nodename, v->nodeid,
- v->current, FALSE);
+ add_attribute_value_xml(sync, a, v, false);
attrd_xml_add_writer(sync);
send_attrd_message(NULL, sync);
free_xml(sync);
--
1.8.3.1
From 540d74130c5c8d9c626d6c50475e4dc4f64234e7 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 4 Jun 2021 16:34:26 -0500
Subject: [PATCH 11/11] Fix: pacemaker-attrd: avoid repeated unfencing of
remote nodes
The attribute manager can't record a remote node's attributes to the CIB until
it knows the node is remote. Normally, this is learned when the remote node
starts, because the controller clears the CRM_OP_PROBED attribute and indicates
that it is for a remote node.
However, if a cluster node is down when a remote node starts, and later comes
up, it learns the remote node's existing attributes as part of the attribute
sync. Previously, this did not include whether each value is for a cluster or
remote node, so the newly joined attribute manager couldn't write out remote
nodes' attributes until it learned that via some other event -- which might not
happen before the node becomes DC, in which case its scheduler will not see any
unfencing-related node attributes and may wrongly schedule unfencing.
The sync response handling already calls attrd_lookup_or_create_value(), which
checks PCMK__XA_ATTR_IS_REMOTE, so all we need to do is add that to the sync
response.
---
daemons/attrd/attrd_commands.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/daemons/attrd/attrd_commands.c b/daemons/attrd/attrd_commands.c
index 5b32a77..0142383 100644
--- a/daemons/attrd/attrd_commands.c
+++ b/daemons/attrd/attrd_commands.c
@@ -43,8 +43,9 @@
* 1 1.1.15 PCMK__ATTRD_CMD_UPDATE_BOTH,
* PCMK__ATTRD_CMD_UPDATE_DELAY
* 2 1.1.17 PCMK__ATTRD_CMD_CLEAR_FAILURE
+ * 3 2.1.1 PCMK__ATTRD_CMD_SYNC_RESPONSE indicates remote nodes
*/
-#define ATTRD_PROTOCOL_VERSION "2"
+#define ATTRD_PROTOCOL_VERSION "3"
int last_cib_op_done = 0;
GHashTable *attributes = NULL;
@@ -150,6 +151,9 @@ add_attribute_value_xml(xmlNode *parent, attribute_t *a, attribute_value_t *v,
if (v->nodeid > 0) {
crm_xml_add_int(xml, PCMK__XA_ATTR_NODE_ID, v->nodeid);
}
+ if (v->is_remote != 0) {
+ crm_xml_add_int(xml, PCMK__XA_ATTR_IS_REMOTE, 1);
+ }
crm_xml_add(xml, PCMK__XA_ATTR_VALUE, v->current);
crm_xml_add_int(xml, PCMK__XA_ATTR_DAMPENING, a->timeout_ms / 1000);
crm_xml_add_int(xml, PCMK__XA_ATTR_IS_PRIVATE, a->is_private);
--
1.8.3.1