diff --git a/001-sync-points.patch b/001-sync-points.patch deleted file mode 100644 index c034c78..0000000 --- a/001-sync-points.patch +++ /dev/null @@ -1,2429 +0,0 @@ -From de05f6b52c667155d262ceeb541dc1041d079d71 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 8 Sep 2022 11:36:58 -0400 -Subject: [PATCH 01/26] Refactor: tools: Use a uint32_t for attr_options. - ---- - tools/attrd_updater.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/attrd_updater.c b/tools/attrd_updater.c -index d90567a..b85a281 100644 ---- a/tools/attrd_updater.c -+++ b/tools/attrd_updater.c -@@ -47,7 +47,7 @@ struct { - gchar *attr_node; - gchar *attr_set; - char *attr_value; -- int attr_options; -+ uint32_t attr_options; - gboolean query_all; - gboolean quiet; - } options = { --- -2.31.1 - -From c6637520b474d44553ade52c0dbe9e36e873135f Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 21 Oct 2022 14:31:16 -0400 -Subject: [PATCH 02/26] Refactor: libcrmcommon: Make pcmk__xe_match more - broadly useful. - -If attr_v is NULL, simply return the first node with a matching name. ---- - lib/common/xml.c | 10 ++++++---- - 1 file changed, 6 insertions(+), 4 deletions(-) - -diff --git a/lib/common/xml.c b/lib/common/xml.c -index 036dd87..ac6f46a 100644 ---- a/lib/common/xml.c -+++ b/lib/common/xml.c -@@ -510,7 +510,7 @@ find_xml_node(const xmlNode *root, const char *search_path, gboolean must_find) - * \param[in] parent XML element to search - * \param[in] node_name If not NULL, only match children of this type - * \param[in] attr_n If not NULL, only match children with an attribute -- * of this name and a value of \p attr_v -+ * of this name. - * \param[in] attr_v If \p attr_n and this are not NULL, only match children - * with an attribute named \p attr_n and this value - * -@@ -520,14 +520,16 @@ xmlNode * - pcmk__xe_match(const xmlNode *parent, const char *node_name, - const char *attr_n, const char *attr_v) - { -- /* ensure attr_v specified when attr_n is */ -- CRM_CHECK(attr_n == NULL || attr_v != NULL, return NULL); -+ CRM_CHECK(parent != NULL, return NULL); -+ CRM_CHECK(attr_v == NULL || attr_n != NULL, return NULL); - - for (xmlNode *child = pcmk__xml_first_child(parent); child != NULL; - child = pcmk__xml_next(child)) { - if (pcmk__str_eq(node_name, (const char *) (child->name), - pcmk__str_null_matches) -- && ((attr_n == NULL) || attr_matches(child, attr_n, attr_v))) { -+ && ((attr_n == NULL) || -+ (attr_v == NULL && xmlHasProp(child, (pcmkXmlStr) attr_n)) || -+ (attr_v != NULL && attr_matches(child, attr_n, attr_v)))) { - return child; - } - } --- -2.31.1 - -From dd520579484c6ec091f7fbb550347941302dad0e Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 21 Oct 2022 14:32:46 -0400 -Subject: [PATCH 03/26] Tests: libcrmcommon: Add tests for pcmk__xe_match. - ---- - lib/common/tests/xml/Makefile.am | 3 +- - lib/common/tests/xml/pcmk__xe_match_test.c | 105 +++++++++++++++++++++ - 2 files changed, 107 insertions(+), 1 deletion(-) - create mode 100644 lib/common/tests/xml/pcmk__xe_match_test.c - -diff --git a/lib/common/tests/xml/Makefile.am b/lib/common/tests/xml/Makefile.am -index 342ca07..0ccdcc3 100644 ---- a/lib/common/tests/xml/Makefile.am -+++ b/lib/common/tests/xml/Makefile.am -@@ -11,6 +11,7 @@ include $(top_srcdir)/mk/tap.mk - include $(top_srcdir)/mk/unittest.mk - - # Add "_test" to the end of all test program names to simplify .gitignore. --check_PROGRAMS = pcmk__xe_foreach_child_test -+check_PROGRAMS = pcmk__xe_foreach_child_test \ -+ pcmk__xe_match_test - - TESTS = $(check_PROGRAMS) -diff --git a/lib/common/tests/xml/pcmk__xe_match_test.c b/lib/common/tests/xml/pcmk__xe_match_test.c -new file mode 100644 -index 0000000..fd529ba ---- /dev/null -+++ b/lib/common/tests/xml/pcmk__xe_match_test.c -@@ -0,0 +1,105 @@ -+/* -+ * Copyright 2022 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU Lesser General Public License -+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. -+ */ -+ -+#include -+ -+#include -+#include -+ -+const char *str1 = -+ "\n" -+ " \n" -+ " \n" -+ " content\n" -+ " \n" -+ " \n" -+ " \n" -+ " content\n" -+ " \n" -+ " \n" -+ " \n" -+ " content\n" -+ " \n" -+ " \n" -+ " \n" -+ " content\n" -+ " \n" -+ " \n" -+ " \n" -+ " content\n" -+ " \n" -+ ""; -+ -+static void -+bad_input(void **state) { -+ xmlNode *xml = string2xml(str1); -+ -+ assert_null(pcmk__xe_match(NULL, NULL, NULL, NULL)); -+ assert_null(pcmk__xe_match(NULL, NULL, NULL, "attrX")); -+ -+ free_xml(xml); -+} -+ -+static void -+not_found(void **state) { -+ xmlNode *xml = string2xml(str1); -+ -+ /* No node with an attrX attribute */ -+ assert_null(pcmk__xe_match(xml, NULL, "attrX", NULL)); -+ /* No nodeX node */ -+ assert_null(pcmk__xe_match(xml, "nodeX", NULL, NULL)); -+ /* No nodeA node with attrX */ -+ assert_null(pcmk__xe_match(xml, "nodeA", "attrX", NULL)); -+ /* No nodeA node with attrA=XYZ */ -+ assert_null(pcmk__xe_match(xml, "nodeA", "attrA", "XYZ")); -+ -+ free_xml(xml); -+} -+ -+static void -+find_attrB(void **state) { -+ xmlNode *xml = string2xml(str1); -+ xmlNode *result = NULL; -+ -+ /* Find the first node with attrB */ -+ result = pcmk__xe_match(xml, NULL, "attrB", NULL); -+ assert_non_null(result); -+ assert_string_equal(crm_element_value(result, "id"), "3"); -+ -+ /* Find the first nodeB with attrB */ -+ result = pcmk__xe_match(xml, "nodeB", "attrB", NULL); -+ assert_non_null(result); -+ assert_string_equal(crm_element_value(result, "id"), "5"); -+ -+ free_xml(xml); -+} -+ -+static void -+find_attrA_matching(void **state) { -+ xmlNode *xml = string2xml(str1); -+ xmlNode *result = NULL; -+ -+ /* Find attrA=456 */ -+ result = pcmk__xe_match(xml, NULL, "attrA", "456"); -+ assert_non_null(result); -+ assert_string_equal(crm_element_value(result, "id"), "2"); -+ -+ /* Find a nodeB with attrA=123 */ -+ result = pcmk__xe_match(xml, "nodeB", "attrA", "123"); -+ assert_non_null(result); -+ assert_string_equal(crm_element_value(result, "id"), "4"); -+ -+ free_xml(xml); -+} -+ -+PCMK__UNIT_TEST(NULL, NULL, -+ cmocka_unit_test(bad_input), -+ cmocka_unit_test(not_found), -+ cmocka_unit_test(find_attrB), -+ cmocka_unit_test(find_attrA_matching)); --- -2.31.1 - -From 03af8498d8aaf21c509cec9b0ec4b78475da41d7 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 8 Sep 2022 12:22:26 -0400 -Subject: [PATCH 04/26] Feature: libcrmcommon: Add attrd options for specifying - a sync point. - ---- - include/crm/common/attrd_internal.h | 16 +++++++++------- - 1 file changed, 9 insertions(+), 7 deletions(-) - -diff --git a/include/crm/common/attrd_internal.h b/include/crm/common/attrd_internal.h -index f7033ad..389be48 100644 ---- a/include/crm/common/attrd_internal.h -+++ b/include/crm/common/attrd_internal.h -@@ -16,13 +16,15 @@ extern "C" { - - // Options for clients to use with functions below - enum pcmk__node_attr_opts { -- pcmk__node_attr_none = 0, -- pcmk__node_attr_remote = (1 << 0), -- pcmk__node_attr_private = (1 << 1), -- pcmk__node_attr_pattern = (1 << 2), -- pcmk__node_attr_value = (1 << 3), -- pcmk__node_attr_delay = (1 << 4), -- pcmk__node_attr_perm = (1 << 5), -+ pcmk__node_attr_none = 0, -+ pcmk__node_attr_remote = (1 << 0), -+ pcmk__node_attr_private = (1 << 1), -+ pcmk__node_attr_pattern = (1 << 2), -+ pcmk__node_attr_value = (1 << 3), -+ pcmk__node_attr_delay = (1 << 4), -+ pcmk__node_attr_perm = (1 << 5), -+ pcmk__node_attr_sync_local = (1 << 6), -+ pcmk__node_attr_sync_cluster = (1 << 7), - }; - - #define pcmk__set_node_attr_flags(node_attr_flags, flags_to_set) do { \ --- -2.31.1 - -From 5c8825293ee21d3823bdcd01b0df9c7d39739940 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 8 Sep 2022 12:23:09 -0400 -Subject: [PATCH 05/26] Feature: libcrmcommon: Add sync point to IPC request - XML. - -If one of the pcmk__node_attr_sync_* options is provided, add an -attribute to the request XML. This will later be inspected by the -server to determine when to send the reply to the client. ---- - include/crm/common/options_internal.h | 2 ++ - include/crm_internal.h | 1 + - lib/common/ipc_attrd.c | 6 ++++++ - 3 files changed, 9 insertions(+) - -diff --git a/include/crm/common/options_internal.h b/include/crm/common/options_internal.h -index b153c67..f29ba3f 100644 ---- a/include/crm/common/options_internal.h -+++ b/include/crm/common/options_internal.h -@@ -145,9 +145,11 @@ bool pcmk__valid_sbd_timeout(const char *value); - #define PCMK__META_ALLOW_UNHEALTHY_NODES "allow-unhealthy-nodes" - - // Constants for enumerated values for various options -+#define PCMK__VALUE_CLUSTER "cluster" - #define PCMK__VALUE_CUSTOM "custom" - #define PCMK__VALUE_FENCING "fencing" - #define PCMK__VALUE_GREEN "green" -+#define PCMK__VALUE_LOCAL "local" - #define PCMK__VALUE_MIGRATE_ON_RED "migrate-on-red" - #define PCMK__VALUE_NONE "none" - #define PCMK__VALUE_NOTHING "nothing" -diff --git a/include/crm_internal.h b/include/crm_internal.h -index e6e2e96..08193c3 100644 ---- a/include/crm_internal.h -+++ b/include/crm_internal.h -@@ -71,6 +71,7 @@ - #define PCMK__XA_ATTR_RESOURCE "attr_resource" - #define PCMK__XA_ATTR_SECTION "attr_section" - #define PCMK__XA_ATTR_SET "attr_set" -+#define PCMK__XA_ATTR_SYNC_POINT "attr_sync_point" - #define PCMK__XA_ATTR_USER "attr_user" - #define PCMK__XA_ATTR_UUID "attr_key" - #define PCMK__XA_ATTR_VALUE "attr_value" -diff --git a/lib/common/ipc_attrd.c b/lib/common/ipc_attrd.c -index f6cfbc4..4606509 100644 ---- a/lib/common/ipc_attrd.c -+++ b/lib/common/ipc_attrd.c -@@ -431,6 +431,12 @@ populate_update_op(xmlNode *op, const char *node, const char *name, const char * - pcmk_is_set(options, pcmk__node_attr_remote)); - crm_xml_add_int(op, PCMK__XA_ATTR_IS_PRIVATE, - pcmk_is_set(options, pcmk__node_attr_private)); -+ -+ if (pcmk_is_set(options, pcmk__node_attr_sync_local)) { -+ crm_xml_add(op, PCMK__XA_ATTR_SYNC_POINT, PCMK__VALUE_LOCAL); -+ } else if (pcmk_is_set(options, pcmk__node_attr_sync_cluster)) { -+ crm_xml_add(op, PCMK__XA_ATTR_SYNC_POINT, PCMK__VALUE_CLUSTER); -+ } - } - - int --- -2.31.1 - -From e2b3fee630caf0846ca8bbffcef4d6d2acfd32a5 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 8 Sep 2022 12:26:28 -0400 -Subject: [PATCH 06/26] Feature: tools: Add --wait= parameter to attrd_updater. - -This command line option is used to specify the sync point to use. For -the moment, it has no effect. ---- - tools/attrd_updater.c | 24 ++++++++++++++++++++++++ - 1 file changed, 24 insertions(+) - -diff --git a/tools/attrd_updater.c b/tools/attrd_updater.c -index b85a281..c4779a6 100644 ---- a/tools/attrd_updater.c -+++ b/tools/attrd_updater.c -@@ -97,6 +97,22 @@ section_cb (const gchar *option_name, const gchar *optarg, gpointer data, GError - return TRUE; - } - -+static gboolean -+wait_cb (const gchar *option_name, const gchar *optarg, gpointer data, GError **err) { -+ if (pcmk__str_eq(optarg, "no", pcmk__str_none)) { -+ pcmk__clear_node_attr_flags(options.attr_options, pcmk__node_attr_sync_local | pcmk__node_attr_sync_cluster); -+ return TRUE; -+ } else if (pcmk__str_eq(optarg, PCMK__VALUE_LOCAL, pcmk__str_none)) { -+ pcmk__clear_node_attr_flags(options.attr_options, pcmk__node_attr_sync_local | pcmk__node_attr_sync_cluster); -+ pcmk__set_node_attr_flags(options.attr_options, pcmk__node_attr_sync_local); -+ return TRUE; -+ } else { -+ g_set_error(err, PCMK__EXITC_ERROR, CRM_EX_USAGE, -+ "--wait= must be one of 'no', 'local', 'cluster'"); -+ return FALSE; -+ } -+} -+ - #define INDENT " " - - static GOptionEntry required_entries[] = { -@@ -175,6 +191,14 @@ static GOptionEntry addl_entries[] = { - "If this creates a new attribute, never write the attribute to CIB", - NULL }, - -+ { "wait", 'W', 0, G_OPTION_ARG_CALLBACK, wait_cb, -+ "Wait for some event to occur before returning. Values are 'no' (wait\n" -+ INDENT "only for the attribute daemon to acknowledge the request) or\n" -+ INDENT "'local' (wait until the change has propagated to where a local\n" -+ INDENT "query will return the request value, or the value set by a\n" -+ INDENT "later request). Default is 'no'.", -+ "UNTIL" }, -+ - { NULL } - }; - --- -2.31.1 - -From 52d51ab41b2f00e72724ab39835b3db86605a96b Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 20 Oct 2022 14:40:13 -0400 -Subject: [PATCH 07/26] Feature: daemons: Add functions for checking a request - for a sync point. - ---- - daemons/attrd/Makefile.am | 1 + - daemons/attrd/attrd_sync.c | 38 +++++++++++++++++++++++++++++++++ - daemons/attrd/pacemaker-attrd.h | 3 +++ - 3 files changed, 42 insertions(+) - create mode 100644 daemons/attrd/attrd_sync.c - -diff --git a/daemons/attrd/Makefile.am b/daemons/attrd/Makefile.am -index 1a3d360..6bb81c4 100644 ---- a/daemons/attrd/Makefile.am -+++ b/daemons/attrd/Makefile.am -@@ -32,6 +32,7 @@ pacemaker_attrd_SOURCES = attrd_alerts.c \ - attrd_elections.c \ - attrd_ipc.c \ - attrd_messages.c \ -+ attrd_sync.c \ - attrd_utils.c \ - pacemaker-attrd.c - -diff --git a/daemons/attrd/attrd_sync.c b/daemons/attrd/attrd_sync.c -new file mode 100644 -index 0000000..92759d2 ---- /dev/null -+++ b/daemons/attrd/attrd_sync.c -@@ -0,0 +1,38 @@ -+/* -+ * Copyright 2022 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU General Public License version 2 -+ * or later (GPLv2+) WITHOUT ANY WARRANTY. -+ */ -+ -+#include -+ -+#include -+#include -+ -+#include "pacemaker-attrd.h" -+ -+const char * -+attrd_request_sync_point(xmlNode *xml) -+{ -+ if (xml_has_children(xml)) { -+ xmlNode *child = pcmk__xe_match(xml, XML_ATTR_OP, PCMK__XA_ATTR_SYNC_POINT, NULL); -+ -+ if (child) { -+ return crm_element_value(child, PCMK__XA_ATTR_SYNC_POINT); -+ } else { -+ return NULL; -+ } -+ -+ } else { -+ return crm_element_value(xml, PCMK__XA_ATTR_SYNC_POINT); -+ } -+} -+ -+bool -+attrd_request_has_sync_point(xmlNode *xml) -+{ -+ return attrd_request_sync_point(xml) != NULL; -+} -diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h -index 71ce90a..ff850bb 100644 ---- a/daemons/attrd/pacemaker-attrd.h -+++ b/daemons/attrd/pacemaker-attrd.h -@@ -182,4 +182,7 @@ mainloop_timer_t *attrd_add_timer(const char *id, int timeout_ms, attribute_t *a - void attrd_unregister_handlers(void); - void attrd_handle_request(pcmk__request_t *request); - -+const char *attrd_request_sync_point(xmlNode *xml); -+bool attrd_request_has_sync_point(xmlNode *xml); -+ - #endif /* PACEMAKER_ATTRD__H */ --- -2.31.1 - -From 2e0509a12ee7d4a612133ee65b75245eea7d271d Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 20 Oct 2022 14:42:04 -0400 -Subject: [PATCH 08/26] Refactor: daemons: Don't ACK update requests that give - a sync point. - -The ACK is the only response from the server for update messages. If -the message specified that it wanted to wait for a sync point, we need -to delay sending that response until the sync point is reached. -Therefore, do not always immediately send the ACK. ---- - daemons/attrd/attrd_messages.c | 19 ++++++++++++++----- - 1 file changed, 14 insertions(+), 5 deletions(-) - -diff --git a/daemons/attrd/attrd_messages.c b/daemons/attrd/attrd_messages.c -index de4a28a..9e8ae40 100644 ---- a/daemons/attrd/attrd_messages.c -+++ b/daemons/attrd/attrd_messages.c -@@ -137,12 +137,21 @@ handle_update_request(pcmk__request_t *request) - attrd_peer_update(peer, request->xml, host, false); - pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - return NULL; -+ - } else { -- /* Because attrd_client_update can be called recursively, we send the ACK -- * here to ensure that the client only ever receives one. -- */ -- attrd_send_ack(request->ipc_client, request->ipc_id, -- request->flags|crm_ipc_client_response); -+ if (!attrd_request_has_sync_point(request->xml)) { -+ /* If the client doesn't want to wait for a sync point, go ahead and send -+ * the ACK immediately. Otherwise, we'll send the ACK when the appropriate -+ * sync point is reached. -+ * -+ * In the normal case, attrd_client_update can be called recursively which -+ * makes where to send the ACK tricky. Doing it here ensures the client -+ * only ever receives one. -+ */ -+ attrd_send_ack(request->ipc_client, request->ipc_id, -+ request->flags|crm_ipc_client_response); -+ } -+ - return attrd_client_update(request); - } - } --- -2.31.1 - -From 2a0ff66cdf0085c4c8ab1992ef7e785a4facc8c7 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 20 Oct 2022 14:48:48 -0400 -Subject: [PATCH 09/26] Feature: daemons: Add support for local sync points on - updates. - -In the IPC dispatcher for attrd, add the client to a wait list if its -request specifies a sync point. When the attribute's value is changed -on the local attrd, alert any clients waiting on a local sync point by -then sending the previously delayed ACK. - -Sync points for other requests and the global sync point are not yet -supported. - -Fixes T35. ---- - daemons/attrd/attrd_corosync.c | 18 +++++ - daemons/attrd/attrd_messages.c | 12 ++- - daemons/attrd/attrd_sync.c | 137 ++++++++++++++++++++++++++++++++ - daemons/attrd/pacemaker-attrd.h | 7 ++ - 4 files changed, 173 insertions(+), 1 deletion(-) - -diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c -index 539e5bf..4337280 100644 ---- a/daemons/attrd/attrd_corosync.c -+++ b/daemons/attrd/attrd_corosync.c -@@ -568,14 +568,32 @@ void - attrd_peer_update(const crm_node_t *peer, xmlNode *xml, const char *host, - bool filter) - { -+ bool handle_sync_point = false; -+ - if (xml_has_children(xml)) { - for (xmlNode *child = first_named_child(xml, XML_ATTR_OP); child != NULL; - child = crm_next_same_xml(child)) { - copy_attrs(xml, child); - attrd_peer_update_one(peer, child, filter); -+ -+ if (attrd_request_has_sync_point(child)) { -+ handle_sync_point = true; -+ } - } - - } else { - attrd_peer_update_one(peer, xml, filter); -+ -+ if (attrd_request_has_sync_point(xml)) { -+ handle_sync_point = true; -+ } -+ } -+ -+ /* If the update XML specified that the client wanted to wait for a sync -+ * point, process that now. -+ */ -+ if (handle_sync_point) { -+ crm_debug("Hit local sync point for attribute update"); -+ attrd_ack_waitlist_clients(attrd_sync_point_local, xml); - } - } -diff --git a/daemons/attrd/attrd_messages.c b/daemons/attrd/attrd_messages.c -index 9e8ae40..c96700f 100644 ---- a/daemons/attrd/attrd_messages.c -+++ b/daemons/attrd/attrd_messages.c -@@ -139,7 +139,17 @@ handle_update_request(pcmk__request_t *request) - return NULL; - - } else { -- if (!attrd_request_has_sync_point(request->xml)) { -+ if (attrd_request_has_sync_point(request->xml)) { -+ /* If this client supplied a sync point it wants to wait for, add it to -+ * the wait list. Clients on this list will not receive an ACK until -+ * their sync point is hit which will result in the client stalled there -+ * until it receives a response. -+ * -+ * All other clients will receive the expected response as normal. -+ */ -+ attrd_add_client_to_waitlist(request); -+ -+ } else { - /* If the client doesn't want to wait for a sync point, go ahead and send - * the ACK immediately. Otherwise, we'll send the ACK when the appropriate - * sync point is reached. -diff --git a/daemons/attrd/attrd_sync.c b/daemons/attrd/attrd_sync.c -index 92759d2..2981bd0 100644 ---- a/daemons/attrd/attrd_sync.c -+++ b/daemons/attrd/attrd_sync.c -@@ -14,6 +14,143 @@ - - #include "pacemaker-attrd.h" - -+/* A hash table storing clients that are waiting on a sync point to be reached. -+ * The key is waitlist_client - just a plain int. The obvious key would be -+ * the IPC client's ID, but this is not guaranteed to be unique. A single client -+ * could be waiting on a sync point for multiple attributes at the same time. -+ * -+ * It is not expected that this hash table will ever be especially large. -+ */ -+static GHashTable *waitlist = NULL; -+static int waitlist_client = 0; -+ -+struct waitlist_node { -+ /* What kind of sync point does this node describe? */ -+ enum attrd_sync_point sync_point; -+ -+ /* Information required to construct and send a reply to the client. */ -+ char *client_id; -+ uint32_t ipc_id; -+ uint32_t flags; -+}; -+ -+static void -+next_key(void) -+{ -+ do { -+ waitlist_client++; -+ if (waitlist_client < 0) { -+ waitlist_client = 1; -+ } -+ } while (g_hash_table_contains(waitlist, GINT_TO_POINTER(waitlist_client))); -+} -+ -+static void -+free_waitlist_node(gpointer data) -+{ -+ struct waitlist_node *wl = (struct waitlist_node *) data; -+ -+ free(wl->client_id); -+ free(wl); -+} -+ -+static const char * -+sync_point_str(enum attrd_sync_point sync_point) -+{ -+ if (sync_point == attrd_sync_point_local) { -+ return PCMK__VALUE_LOCAL; -+ } else if (sync_point == attrd_sync_point_cluster) { -+ return PCMK__VALUE_CLUSTER; -+ } else { -+ return "unknown"; -+ } -+} -+ -+void -+attrd_add_client_to_waitlist(pcmk__request_t *request) -+{ -+ const char *sync_point = attrd_request_sync_point(request->xml); -+ struct waitlist_node *wl = NULL; -+ -+ if (sync_point == NULL) { -+ return; -+ } -+ -+ if (waitlist == NULL) { -+ waitlist = pcmk__intkey_table(free_waitlist_node); -+ } -+ -+ wl = calloc(sizeof(struct waitlist_node), 1); -+ -+ CRM_ASSERT(wl != NULL); -+ -+ wl->client_id = strdup(request->ipc_client->id); -+ -+ CRM_ASSERT(wl->client_id); -+ -+ if (pcmk__str_eq(sync_point, PCMK__VALUE_LOCAL, pcmk__str_none)) { -+ wl->sync_point = attrd_sync_point_local; -+ } else if (pcmk__str_eq(sync_point, PCMK__VALUE_CLUSTER, pcmk__str_none)) { -+ wl->sync_point = attrd_sync_point_cluster; -+ } else { -+ free_waitlist_node(wl); -+ return; -+ } -+ -+ wl->ipc_id = request->ipc_id; -+ wl->flags = request->flags; -+ -+ crm_debug("Added client %s to waitlist for %s sync point", -+ wl->client_id, sync_point_str(wl->sync_point)); -+ -+ next_key(); -+ pcmk__intkey_table_insert(waitlist, waitlist_client, wl); -+ -+ /* And then add the key to the request XML so we can uniquely identify -+ * it when it comes time to issue the ACK. -+ */ -+ crm_xml_add_int(request->xml, XML_LRM_ATTR_CALLID, waitlist_client); -+} -+ -+void -+attrd_ack_waitlist_clients(enum attrd_sync_point sync_point, const xmlNode *xml) -+{ -+ int callid; -+ gpointer value; -+ -+ if (waitlist == NULL) { -+ return; -+ } -+ -+ if (crm_element_value_int(xml, XML_LRM_ATTR_CALLID, &callid) == -1) { -+ crm_warn("Could not get callid from request XML"); -+ return; -+ } -+ -+ value = pcmk__intkey_table_lookup(waitlist, callid); -+ if (value != NULL) { -+ struct waitlist_node *wl = (struct waitlist_node *) value; -+ pcmk__client_t *client = NULL; -+ -+ if (wl->sync_point != sync_point) { -+ return; -+ } -+ -+ crm_debug("Alerting client %s for reached %s sync point", -+ wl->client_id, sync_point_str(wl->sync_point)); -+ -+ client = pcmk__find_client_by_id(wl->client_id); -+ if (client == NULL) { -+ return; -+ } -+ -+ attrd_send_ack(client, wl->ipc_id, wl->flags | crm_ipc_client_response); -+ -+ /* And then remove the client so it doesn't get alerted again. */ -+ pcmk__intkey_table_remove(waitlist, callid); -+ } -+} -+ - const char * - attrd_request_sync_point(xmlNode *xml) - { -diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h -index ff850bb..9dd8320 100644 ---- a/daemons/attrd/pacemaker-attrd.h -+++ b/daemons/attrd/pacemaker-attrd.h -@@ -182,6 +182,13 @@ mainloop_timer_t *attrd_add_timer(const char *id, int timeout_ms, attribute_t *a - void attrd_unregister_handlers(void); - void attrd_handle_request(pcmk__request_t *request); - -+enum attrd_sync_point { -+ attrd_sync_point_local, -+ attrd_sync_point_cluster, -+}; -+ -+void attrd_add_client_to_waitlist(pcmk__request_t *request); -+void attrd_ack_waitlist_clients(enum attrd_sync_point sync_point, const xmlNode *xml); - const char *attrd_request_sync_point(xmlNode *xml); - bool attrd_request_has_sync_point(xmlNode *xml); - --- -2.31.1 - -From 59caaf1682191a91d6062358b770f8b9457ba3eb Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 20 Oct 2022 14:56:58 -0400 -Subject: [PATCH 10/26] Feature: daemons: If a client disconnects, remove it - from the waitlist. - ---- - daemons/attrd/attrd_ipc.c | 5 +++++ - daemons/attrd/attrd_sync.c | 21 +++++++++++++++++++++ - daemons/attrd/pacemaker-attrd.h | 1 + - 3 files changed, 27 insertions(+) - -diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c -index 7e4a1c0..8aa39c2 100644 ---- a/daemons/attrd/attrd_ipc.c -+++ b/daemons/attrd/attrd_ipc.c -@@ -438,8 +438,13 @@ attrd_ipc_closed(qb_ipcs_connection_t *c) - crm_trace("Ignoring request to clean up unknown connection %p", c); - } else { - crm_trace("Cleaning up closed client connection %p", c); -+ -+ /* Remove the client from the sync point waitlist if it's present. */ -+ attrd_remove_client_from_waitlist(client); -+ - pcmk__free_client(client); - } -+ - return FALSE; - } - -diff --git a/daemons/attrd/attrd_sync.c b/daemons/attrd/attrd_sync.c -index 2981bd0..7293318 100644 ---- a/daemons/attrd/attrd_sync.c -+++ b/daemons/attrd/attrd_sync.c -@@ -112,6 +112,27 @@ attrd_add_client_to_waitlist(pcmk__request_t *request) - crm_xml_add_int(request->xml, XML_LRM_ATTR_CALLID, waitlist_client); - } - -+void -+attrd_remove_client_from_waitlist(pcmk__client_t *client) -+{ -+ GHashTableIter iter; -+ gpointer value; -+ -+ if (waitlist == NULL) { -+ return; -+ } -+ -+ g_hash_table_iter_init(&iter, waitlist); -+ -+ while (g_hash_table_iter_next(&iter, NULL, &value)) { -+ struct waitlist_node *wl = (struct waitlist_node *) value; -+ -+ if (wl->client_id == client->id) { -+ g_hash_table_iter_remove(&iter); -+ } -+ } -+} -+ - void - attrd_ack_waitlist_clients(enum attrd_sync_point sync_point, const xmlNode *xml) - { -diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h -index 9dd8320..b6ecb75 100644 ---- a/daemons/attrd/pacemaker-attrd.h -+++ b/daemons/attrd/pacemaker-attrd.h -@@ -189,6 +189,7 @@ enum attrd_sync_point { - - void attrd_add_client_to_waitlist(pcmk__request_t *request); - void attrd_ack_waitlist_clients(enum attrd_sync_point sync_point, const xmlNode *xml); -+void attrd_remove_client_from_waitlist(pcmk__client_t *client); - const char *attrd_request_sync_point(xmlNode *xml); - bool attrd_request_has_sync_point(xmlNode *xml); - --- -2.31.1 - -From b28042e1d64b48c96dbd9da1e9ee3ff481bbf620 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 10 Oct 2022 11:00:20 -0400 -Subject: [PATCH 11/26] Feature: daemons: Add support for local sync points on - clearing failures. - -attrd_clear_client_failure just calls attrd_client_update underneath, so -that function will handle all the rest of the sync point functionality -for us. ---- - daemons/attrd/attrd_ipc.c | 2 -- - daemons/attrd/attrd_messages.c | 19 +++++++++++++++++++ - 2 files changed, 19 insertions(+), 2 deletions(-) - -diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c -index 8aa39c2..2e614e8 100644 ---- a/daemons/attrd/attrd_ipc.c -+++ b/daemons/attrd/attrd_ipc.c -@@ -101,8 +101,6 @@ attrd_client_clear_failure(pcmk__request_t *request) - xmlNode *xml = request->xml; - const char *rsc, *op, *interval_spec; - -- attrd_send_ack(request->ipc_client, request->ipc_id, request->ipc_flags); -- - if (minimum_protocol_version >= 2) { - /* Propagate to all peers (including ourselves). - * This ends up at attrd_peer_message(). -diff --git a/daemons/attrd/attrd_messages.c b/daemons/attrd/attrd_messages.c -index c96700f..3ba14a6 100644 ---- a/daemons/attrd/attrd_messages.c -+++ b/daemons/attrd/attrd_messages.c -@@ -42,6 +42,25 @@ handle_clear_failure_request(pcmk__request_t *request) - pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - return NULL; - } else { -+ if (attrd_request_has_sync_point(request->xml)) { -+ /* If this client supplied a sync point it wants to wait for, add it to -+ * the wait list. Clients on this list will not receive an ACK until -+ * their sync point is hit which will result in the client stalled there -+ * until it receives a response. -+ * -+ * All other clients will receive the expected response as normal. -+ */ -+ attrd_add_client_to_waitlist(request); -+ -+ } else { -+ /* If the client doesn't want to wait for a sync point, go ahead and send -+ * the ACK immediately. Otherwise, we'll send the ACK when the appropriate -+ * sync point is reached. -+ */ -+ attrd_send_ack(request->ipc_client, request->ipc_id, -+ request->ipc_flags); -+ } -+ - return attrd_client_clear_failure(request); - } - } --- -2.31.1 - -From 291dc3b91e57f2584bbf88cfbe3a360e0332e814 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 10 Oct 2022 13:17:24 -0400 -Subject: [PATCH 12/26] Refactor: daemons: Free the waitlist on attrd exit. - ---- - daemons/attrd/attrd_sync.c | 11 +++++++++++ - daemons/attrd/attrd_utils.c | 2 ++ - daemons/attrd/pacemaker-attrd.c | 1 + - daemons/attrd/pacemaker-attrd.h | 1 + - 4 files changed, 15 insertions(+) - -diff --git a/daemons/attrd/attrd_sync.c b/daemons/attrd/attrd_sync.c -index 7293318..557e49a 100644 ---- a/daemons/attrd/attrd_sync.c -+++ b/daemons/attrd/attrd_sync.c -@@ -112,6 +112,17 @@ attrd_add_client_to_waitlist(pcmk__request_t *request) - crm_xml_add_int(request->xml, XML_LRM_ATTR_CALLID, waitlist_client); - } - -+void -+attrd_free_waitlist(void) -+{ -+ if (waitlist == NULL) { -+ return; -+ } -+ -+ g_hash_table_destroy(waitlist); -+ waitlist = NULL; -+} -+ - void - attrd_remove_client_from_waitlist(pcmk__client_t *client) - { -diff --git a/daemons/attrd/attrd_utils.c b/daemons/attrd/attrd_utils.c -index 6a19009..00b879b 100644 ---- a/daemons/attrd/attrd_utils.c -+++ b/daemons/attrd/attrd_utils.c -@@ -93,6 +93,8 @@ attrd_shutdown(int nsig) - mainloop_destroy_signal(SIGUSR2); - mainloop_destroy_signal(SIGTRAP); - -+ attrd_free_waitlist(); -+ - if ((mloop == NULL) || !g_main_loop_is_running(mloop)) { - /* If there's no main loop active, just exit. This should be possible - * only if we get SIGTERM in brief windows at start-up and shutdown. -diff --git a/daemons/attrd/pacemaker-attrd.c b/daemons/attrd/pacemaker-attrd.c -index 2100db4..1336542 100644 ---- a/daemons/attrd/pacemaker-attrd.c -+++ b/daemons/attrd/pacemaker-attrd.c -@@ -300,6 +300,7 @@ main(int argc, char **argv) - attrd_ipc_fini(); - attrd_lrmd_disconnect(); - attrd_cib_disconnect(); -+ attrd_free_waitlist(); - g_hash_table_destroy(attributes); - } - -diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h -index b6ecb75..537bf85 100644 ---- a/daemons/attrd/pacemaker-attrd.h -+++ b/daemons/attrd/pacemaker-attrd.h -@@ -52,6 +52,7 @@ void attrd_run_mainloop(void); - - void attrd_set_requesting_shutdown(void); - void attrd_clear_requesting_shutdown(void); -+void attrd_free_waitlist(void); - bool attrd_requesting_shutdown(void); - bool attrd_shutting_down(void); - void attrd_shutdown(int nsig); --- -2.31.1 - -From 7715ce617c520e14687a82e11ff794c93cd7f64a Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 10 Oct 2022 13:21:16 -0400 -Subject: [PATCH 13/26] Feature: includes: Bump CRM_FEATURE_SET for local sync - points. - ---- - include/crm/crm.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/crm/crm.h b/include/crm/crm.h -index 5710e4b..7c5c602 100644 ---- a/include/crm/crm.h -+++ b/include/crm/crm.h -@@ -66,7 +66,7 @@ extern "C" { - * >=3.0.13: Fail counts include operation name and interval - * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED - */ --# define CRM_FEATURE_SET "3.16.1" -+# define CRM_FEATURE_SET "3.16.2" - - /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and - * recipient of a CPG message. This imposes an arbitrary limit on cluster node --- -2.31.1 - -From b9054425a76d03f538cd0b3ae27490b1874eee8a Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 28 Oct 2022 14:23:49 -0400 -Subject: [PATCH 14/26] Refactor: daemons: Add comments for previously added - sync point code. - ---- - daemons/attrd/attrd_sync.c | 63 ++++++++++++++++++++++++++++++++++++++ - 1 file changed, 63 insertions(+) - -diff --git a/daemons/attrd/attrd_sync.c b/daemons/attrd/attrd_sync.c -index 557e49a..e9690b5 100644 ---- a/daemons/attrd/attrd_sync.c -+++ b/daemons/attrd/attrd_sync.c -@@ -66,6 +66,20 @@ sync_point_str(enum attrd_sync_point sync_point) - } - } - -+/*! -+ * \internal -+ * \brief Add a client to the attrd waitlist -+ * -+ * Typically, a client receives an ACK for its XML IPC request immediately. However, -+ * some clients want to wait until their request has been processed and taken effect. -+ * This is called a sync point. Any client placed on this waitlist will have its -+ * ACK message delayed until either its requested sync point is hit, or until it -+ * times out. -+ * -+ * The XML IPC request must specify the type of sync point it wants to wait for. -+ * -+ * \param[in,out] request The request describing the client to place on the waitlist. -+ */ - void - attrd_add_client_to_waitlist(pcmk__request_t *request) - { -@@ -112,6 +126,11 @@ attrd_add_client_to_waitlist(pcmk__request_t *request) - crm_xml_add_int(request->xml, XML_LRM_ATTR_CALLID, waitlist_client); - } - -+/*! -+ * \internal -+ * \brief Free all memory associated with the waitlist. This is most typically -+ * used when attrd shuts down. -+ */ - void - attrd_free_waitlist(void) - { -@@ -123,6 +142,13 @@ attrd_free_waitlist(void) - waitlist = NULL; - } - -+/*! -+ * \internal -+ * \brief Unconditionally remove a client from the waitlist, such as when the client -+ * node disconnects from the cluster -+ * -+ * \param[in] client The client to remove -+ */ - void - attrd_remove_client_from_waitlist(pcmk__client_t *client) - { -@@ -144,6 +170,18 @@ attrd_remove_client_from_waitlist(pcmk__client_t *client) - } - } - -+/*! -+ * \internal -+ * \brief Send an IPC ACK message to all awaiting clients -+ * -+ * This function will search the waitlist for all clients that are currently awaiting -+ * an ACK indicating their attrd operation is complete. Only those clients with a -+ * matching sync point type and callid from their original XML IPC request will be -+ * ACKed. Once they have received an ACK, they will be removed from the waitlist. -+ * -+ * \param[in] sync_point What kind of sync point have we hit? -+ * \param[in] xml The original XML IPC request. -+ */ - void - attrd_ack_waitlist_clients(enum attrd_sync_point sync_point, const xmlNode *xml) - { -@@ -183,6 +221,23 @@ attrd_ack_waitlist_clients(enum attrd_sync_point sync_point, const xmlNode *xml) - } - } - -+/*! -+ * \internal -+ * \brief Return the sync point attribute for an IPC request -+ * -+ * This function will check both the top-level element of \p xml for a sync -+ * point attribute, as well as all of its \p op children, if any. The latter -+ * is useful for newer versions of attrd that can put multiple IPC requests -+ * into a single message. -+ * -+ * \param[in] xml An XML IPC request -+ * -+ * \note It is assumed that if one child element has a sync point attribute, -+ * all will have a sync point attribute and they will all be the same -+ * sync point. No other configuration is supported. -+ * -+ * \return The sync point attribute of \p xml, or NULL if none. -+ */ - const char * - attrd_request_sync_point(xmlNode *xml) - { -@@ -200,6 +255,14 @@ attrd_request_sync_point(xmlNode *xml) - } - } - -+/*! -+ * \internal -+ * \brief Does an IPC request contain any sync point attribute? -+ * -+ * \param[in] xml An XML IPC request -+ * -+ * \return true if there's a sync point attribute, false otherwise -+ */ - bool - attrd_request_has_sync_point(xmlNode *xml) - { --- -2.31.1 - -From 64219fb7075ee58d29f94f077a3b8f94174bb32a Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 26 Oct 2022 12:43:05 -0400 -Subject: [PATCH 15/26] Feature: tools: Add --wait=cluster option to - attrd_updater. - ---- - tools/attrd_updater.c | 10 ++++++++-- - 1 file changed, 8 insertions(+), 2 deletions(-) - -diff --git a/tools/attrd_updater.c b/tools/attrd_updater.c -index c4779a6..3cd766d 100644 ---- a/tools/attrd_updater.c -+++ b/tools/attrd_updater.c -@@ -106,6 +106,10 @@ wait_cb (const gchar *option_name, const gchar *optarg, gpointer data, GError ** - pcmk__clear_node_attr_flags(options.attr_options, pcmk__node_attr_sync_local | pcmk__node_attr_sync_cluster); - pcmk__set_node_attr_flags(options.attr_options, pcmk__node_attr_sync_local); - return TRUE; -+ } else if (pcmk__str_eq(optarg, PCMK__VALUE_CLUSTER, pcmk__str_none)) { -+ pcmk__clear_node_attr_flags(options.attr_options, pcmk__node_attr_sync_local | pcmk__node_attr_sync_cluster); -+ pcmk__set_node_attr_flags(options.attr_options, pcmk__node_attr_sync_cluster); -+ return TRUE; - } else { - g_set_error(err, PCMK__EXITC_ERROR, CRM_EX_USAGE, - "--wait= must be one of 'no', 'local', 'cluster'"); -@@ -193,10 +197,12 @@ static GOptionEntry addl_entries[] = { - - { "wait", 'W', 0, G_OPTION_ARG_CALLBACK, wait_cb, - "Wait for some event to occur before returning. Values are 'no' (wait\n" -- INDENT "only for the attribute daemon to acknowledge the request) or\n" -+ INDENT "only for the attribute daemon to acknowledge the request),\n" - INDENT "'local' (wait until the change has propagated to where a local\n" - INDENT "query will return the request value, or the value set by a\n" -- INDENT "later request). Default is 'no'.", -+ INDENT "later request), or 'cluster' (wait until the change has propagated\n" -+ INDENT "to where a query anywhere on the cluster will return the requested\n" -+ INDENT "value, or the value set by a later request). Default is 'no'.", - "UNTIL" }, - - { NULL } --- -2.31.1 - -From 1bc5511fadf6ad670508bd3a2a55129bde16f774 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 16 Sep 2022 14:55:06 -0400 -Subject: [PATCH 16/26] Refactor: daemons: Add a confirm= attribute to attrd - messages. - -This allows informing the originator of a message that the message has -been received and processed. As yet, there is no mechanism for handling -and returning the confirmation, only for requesting it. ---- - daemons/attrd/attrd_corosync.c | 6 +++--- - daemons/attrd/attrd_ipc.c | 26 +++++++++++++++++++++----- - daemons/attrd/attrd_messages.c | 11 +++++++++-- - daemons/attrd/pacemaker-attrd.h | 7 ++++--- - include/crm_internal.h | 1 + - 5 files changed, 38 insertions(+), 13 deletions(-) - -diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c -index 4337280..e86ca07 100644 ---- a/daemons/attrd/attrd_corosync.c -+++ b/daemons/attrd/attrd_corosync.c -@@ -124,7 +124,7 @@ broadcast_local_value(const attribute_t *a) - - crm_xml_add(sync, PCMK__XA_TASK, PCMK__ATTRD_CMD_SYNC_RESPONSE); - attrd_add_value_xml(sync, a, v, false); -- attrd_send_message(NULL, sync); -+ attrd_send_message(NULL, sync, false); - free_xml(sync); - return v; - } -@@ -387,7 +387,7 @@ broadcast_unseen_local_values(void) - - if (sync != NULL) { - crm_debug("Broadcasting local-only values"); -- attrd_send_message(NULL, sync); -+ attrd_send_message(NULL, sync, false); - free_xml(sync); - } - } -@@ -539,7 +539,7 @@ attrd_peer_sync(crm_node_t *peer, xmlNode *xml) - } - - crm_debug("Syncing values to %s", peer?peer->uname:"everyone"); -- attrd_send_message(peer, sync); -+ attrd_send_message(peer, sync, false); - free_xml(sync); - } - -diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c -index 2e614e8..0fc5e93 100644 ---- a/daemons/attrd/attrd_ipc.c -+++ b/daemons/attrd/attrd_ipc.c -@@ -105,7 +105,7 @@ attrd_client_clear_failure(pcmk__request_t *request) - /* Propagate to all peers (including ourselves). - * This ends up at attrd_peer_message(). - */ -- attrd_send_message(NULL, xml); -+ attrd_send_message(NULL, xml, false); - pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - return NULL; - } -@@ -184,7 +184,7 @@ attrd_client_peer_remove(pcmk__request_t *request) - if (host) { - crm_info("Client %s is requesting all values for %s be removed", - pcmk__client_name(request->ipc_client), host); -- attrd_send_message(NULL, xml); /* ends up at attrd_peer_message() */ -+ attrd_send_message(NULL, xml, false); /* ends up at attrd_peer_message() */ - free(host_alloc); - } else { - crm_info("Ignoring request by client %s to remove all peer values without specifying peer", -@@ -314,7 +314,7 @@ attrd_client_update(pcmk__request_t *request) - } - } - -- attrd_send_message(NULL, xml); -+ attrd_send_message(NULL, xml, false); - pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - - } else { -@@ -358,7 +358,7 @@ attrd_client_update(pcmk__request_t *request) - if (status == 0) { - crm_trace("Matched %s with %s", attr, regex); - crm_xml_add(xml, PCMK__XA_ATTR_NAME, attr); -- attrd_send_message(NULL, xml); -+ attrd_send_message(NULL, xml, false); - } - } - -@@ -388,7 +388,23 @@ attrd_client_update(pcmk__request_t *request) - crm_debug("Broadcasting %s[%s]=%s%s", attr, crm_element_value(xml, PCMK__XA_ATTR_NODE_NAME), - value, (attrd_election_won()? " (writer)" : "")); - -- attrd_send_message(NULL, xml); /* ends up at attrd_peer_message() */ -+ if (pcmk__str_eq(attrd_request_sync_point(xml), PCMK__VALUE_CLUSTER, pcmk__str_none)) { -+ /* The client is waiting on the cluster-wide sync point. In this case, -+ * the response ACK is not sent until this attrd broadcasts the update -+ * and receives its own confirmation back from all peers. -+ */ -+ attrd_send_message(NULL, xml, true); /* ends up at attrd_peer_message() */ -+ -+ } else { -+ /* The client is either waiting on the local sync point or was not -+ * waiting on any sync point at all. For the local sync point, the -+ * response ACK is sent in attrd_peer_update. For clients not -+ * waiting on any sync point, the response ACK is sent in -+ * handle_update_request immediately before this function was called. -+ */ -+ attrd_send_message(NULL, xml, false); /* ends up at attrd_peer_message() */ -+ } -+ - pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - return NULL; - } -diff --git a/daemons/attrd/attrd_messages.c b/daemons/attrd/attrd_messages.c -index 3ba14a6..78df0d0 100644 ---- a/daemons/attrd/attrd_messages.c -+++ b/daemons/attrd/attrd_messages.c -@@ -279,16 +279,23 @@ attrd_broadcast_protocol(void) - crm_debug("Broadcasting attrd protocol version %s for node %s", - ATTRD_PROTOCOL_VERSION, attrd_cluster->uname); - -- attrd_send_message(NULL, attrd_op); /* ends up at attrd_peer_message() */ -+ attrd_send_message(NULL, attrd_op, false); /* ends up at attrd_peer_message() */ - - free_xml(attrd_op); - } - - gboolean --attrd_send_message(crm_node_t * node, xmlNode * data) -+attrd_send_message(crm_node_t *node, xmlNode *data, bool confirm) - { - crm_xml_add(data, F_TYPE, T_ATTRD); - crm_xml_add(data, PCMK__XA_ATTR_VERSION, ATTRD_PROTOCOL_VERSION); -+ -+ /* Request a confirmation from the destination peer node (which could -+ * be all if node is NULL) that the message has been received and -+ * acted upon. -+ */ -+ pcmk__xe_set_bool_attr(data, PCMK__XA_CONFIRM, confirm); -+ - attrd_xml_add_writer(data); - return send_cluster_message(node, crm_msg_attrd, data, TRUE); - } -diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h -index 537bf85..25f7c8a 100644 ---- a/daemons/attrd/pacemaker-attrd.h -+++ b/daemons/attrd/pacemaker-attrd.h -@@ -39,10 +39,11 @@ - * PCMK__ATTRD_CMD_UPDATE_DELAY - * 2 1.1.17 PCMK__ATTRD_CMD_CLEAR_FAILURE - * 3 2.1.1 PCMK__ATTRD_CMD_SYNC_RESPONSE indicates remote nodes -- * 4 2.2.0 Multiple attributes can be updated in a single IPC -+ * 4 2.1.5 Multiple attributes can be updated in a single IPC - * message -+ * 5 2.1.5 Peers can request confirmation of a sent message - */ --#define ATTRD_PROTOCOL_VERSION "4" -+#define ATTRD_PROTOCOL_VERSION "5" - - #define attrd_send_ack(client, id, flags) \ - pcmk__ipc_send_ack((client), (id), (flags), "ack", ATTRD_PROTOCOL_VERSION, CRM_EX_INDETERMINATE) -@@ -162,7 +163,7 @@ xmlNode *attrd_client_clear_failure(pcmk__request_t *request); - xmlNode *attrd_client_update(pcmk__request_t *request); - xmlNode *attrd_client_refresh(pcmk__request_t *request); - xmlNode *attrd_client_query(pcmk__request_t *request); --gboolean attrd_send_message(crm_node_t * node, xmlNode * data); -+gboolean attrd_send_message(crm_node_t *node, xmlNode *data, bool confirm); - - xmlNode *attrd_add_value_xml(xmlNode *parent, const attribute_t *a, - const attribute_value_t *v, bool force_write); -diff --git a/include/crm_internal.h b/include/crm_internal.h -index 08193c3..63a1726 100644 ---- a/include/crm_internal.h -+++ b/include/crm_internal.h -@@ -79,6 +79,7 @@ - #define PCMK__XA_ATTR_WRITER "attr_writer" - #define PCMK__XA_CONFIG_ERRORS "config-errors" - #define PCMK__XA_CONFIG_WARNINGS "config-warnings" -+#define PCMK__XA_CONFIRM "confirm" - #define PCMK__XA_GRAPH_ERRORS "graph-errors" - #define PCMK__XA_GRAPH_WARNINGS "graph-warnings" - #define PCMK__XA_MODE "mode" --- -2.31.1 - -From 6f389038fc0b11f6291c022c99f188666c65f530 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 26 Oct 2022 14:44:42 -0400 -Subject: [PATCH 17/26] Feature: daemons: Respond to received attrd - confirmation requests. - -On the receiving peer side, if the XML request contains confirm="true", -construct a confirmation message after handling the request completes -and send it back to the originating peer. - -On the originating peer side, add a skeleton handler for confirmation -messages. This does nothing at the moment except log it. ---- - daemons/attrd/attrd_corosync.c | 38 ++++++++++++++++++++++++++++++++++ - daemons/attrd/attrd_messages.c | 13 ++++++++++++ - include/crm_internal.h | 1 + - 3 files changed, 52 insertions(+) - -diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c -index e86ca07..1245d9c 100644 ---- a/daemons/attrd/attrd_corosync.c -+++ b/daemons/attrd/attrd_corosync.c -@@ -25,6 +25,19 @@ - - extern crm_exit_t attrd_exit_status; - -+static xmlNode * -+attrd_confirmation(int callid) -+{ -+ xmlNode *node = create_xml_node(NULL, __func__); -+ -+ crm_xml_add(node, F_TYPE, T_ATTRD); -+ crm_xml_add(node, F_ORIG, get_local_node_name()); -+ crm_xml_add(node, PCMK__XA_TASK, PCMK__ATTRD_CMD_CONFIRM); -+ crm_xml_add_int(node, XML_LRM_ATTR_CALLID, callid); -+ -+ return node; -+} -+ - static void - attrd_peer_message(crm_node_t *peer, xmlNode *xml) - { -@@ -57,6 +70,31 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml) - CRM_CHECK(request.op != NULL, return); - - attrd_handle_request(&request); -+ -+ /* Having finished handling the request, check to see if the originating -+ * peer requested confirmation. If so, send that confirmation back now. -+ */ -+ if (pcmk__xe_attr_is_true(xml, PCMK__XA_CONFIRM)) { -+ int callid = 0; -+ xmlNode *reply = NULL; -+ -+ /* Add the confirmation ID for the message we are confirming to the -+ * response so the originating peer knows what they're a confirmation -+ * for. -+ */ -+ crm_element_value_int(xml, XML_LRM_ATTR_CALLID, &callid); -+ reply = attrd_confirmation(callid); -+ -+ /* And then send the confirmation back to the originating peer. This -+ * ends up right back in this same function (attrd_peer_message) on the -+ * peer where it will have to do something with a PCMK__XA_CONFIRM type -+ * message. -+ */ -+ crm_debug("Sending %s a confirmation", peer->uname); -+ attrd_send_message(peer, reply, false); -+ free_xml(reply); -+ } -+ - pcmk__reset_request(&request); - } - } -diff --git a/daemons/attrd/attrd_messages.c b/daemons/attrd/attrd_messages.c -index 78df0d0..9c792b2 100644 ---- a/daemons/attrd/attrd_messages.c -+++ b/daemons/attrd/attrd_messages.c -@@ -65,6 +65,18 @@ handle_clear_failure_request(pcmk__request_t *request) - } - } - -+static xmlNode * -+handle_confirm_request(pcmk__request_t *request) -+{ -+ if (request->peer != NULL) { -+ crm_debug("Received confirmation from %s", request->peer); -+ pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ return NULL; -+ } else { -+ return handle_unknown_request(request); -+ } -+} -+ - static xmlNode * - handle_flush_request(pcmk__request_t *request) - { -@@ -190,6 +202,7 @@ attrd_register_handlers(void) - { - pcmk__server_command_t handlers[] = { - { PCMK__ATTRD_CMD_CLEAR_FAILURE, handle_clear_failure_request }, -+ { PCMK__ATTRD_CMD_CONFIRM, handle_confirm_request }, - { PCMK__ATTRD_CMD_FLUSH, handle_flush_request }, - { PCMK__ATTRD_CMD_PEER_REMOVE, handle_remove_request }, - { PCMK__ATTRD_CMD_QUERY, handle_query_request }, -diff --git a/include/crm_internal.h b/include/crm_internal.h -index 63a1726..f60e7b4 100644 ---- a/include/crm_internal.h -+++ b/include/crm_internal.h -@@ -108,6 +108,7 @@ - #define PCMK__ATTRD_CMD_SYNC "sync" - #define PCMK__ATTRD_CMD_SYNC_RESPONSE "sync-response" - #define PCMK__ATTRD_CMD_CLEAR_FAILURE "clear-failure" -+#define PCMK__ATTRD_CMD_CONFIRM "confirm" - - #define PCMK__CONTROLD_CMD_NODES "list-nodes" - --- -2.31.1 - -From dfb730e9ced9dc75886fda9452c584860573fe30 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 26 Oct 2022 15:58:00 -0400 -Subject: [PATCH 18/26] Feature: daemons: Keep track of #attrd-protocol from - each peer. - -This information can be used in the future when dealing with -cluster-wide sync points to know which peers we are waiting on a reply -from. ---- - daemons/attrd/attrd_corosync.c | 3 +- - daemons/attrd/attrd_utils.c | 60 ++++++++++++++++++++++++++++++--- - daemons/attrd/pacemaker-attrd.h | 4 ++- - 3 files changed, 60 insertions(+), 7 deletions(-) - -diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c -index 1245d9c..6f88ab6 100644 ---- a/daemons/attrd/attrd_corosync.c -+++ b/daemons/attrd/attrd_corosync.c -@@ -268,6 +268,7 @@ attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *da - // Remove votes from cluster nodes that leave, in case election in progress - if (gone && !is_remote) { - attrd_remove_voter(peer); -+ attrd_remove_peer_protocol_ver(peer->uname); - - // Ensure remote nodes that come up are in the remote node cache - } else if (!gone && is_remote) { -@@ -395,7 +396,7 @@ attrd_peer_update_one(const crm_node_t *peer, xmlNode *xml, bool filter) - * version, check to see if it's a new minimum version. - */ - if (pcmk__str_eq(attr, CRM_ATTR_PROTOCOL, pcmk__str_none)) { -- attrd_update_minimum_protocol_ver(value); -+ attrd_update_minimum_protocol_ver(peer->uname, value); - } - } - -diff --git a/daemons/attrd/attrd_utils.c b/daemons/attrd/attrd_utils.c -index 00b879b..421faed 100644 ---- a/daemons/attrd/attrd_utils.c -+++ b/daemons/attrd/attrd_utils.c -@@ -29,6 +29,11 @@ static bool requesting_shutdown = false; - static bool shutting_down = false; - static GMainLoop *mloop = NULL; - -+/* A hash table storing information on the protocol version of each peer attrd. -+ * The key is the peer's uname, and the value is the protocol version number. -+ */ -+GHashTable *peer_protocol_vers = NULL; -+ - /*! - * \internal - * \brief Set requesting_shutdown state -@@ -94,6 +99,10 @@ attrd_shutdown(int nsig) - mainloop_destroy_signal(SIGTRAP); - - attrd_free_waitlist(); -+ if (peer_protocol_vers != NULL) { -+ g_hash_table_destroy(peer_protocol_vers); -+ peer_protocol_vers = NULL; -+ } - - if ((mloop == NULL) || !g_main_loop_is_running(mloop)) { - /* If there's no main loop active, just exit. This should be possible -@@ -273,16 +282,57 @@ attrd_free_attribute(gpointer data) - } - } - -+/*! -+ * \internal -+ * \brief When a peer node leaves the cluster, stop tracking its protocol version. -+ * -+ * \param[in] host The peer node's uname to be removed -+ */ -+void -+attrd_remove_peer_protocol_ver(const char *host) -+{ -+ if (peer_protocol_vers != NULL) { -+ g_hash_table_remove(peer_protocol_vers, host); -+ } -+} -+ -+/*! -+ * \internal -+ * \brief When a peer node broadcasts a message with its protocol version, keep -+ * track of that information. -+ * -+ * We keep track of each peer's protocol version so we know which peers to -+ * expect confirmation messages from when handling cluster-wide sync points. -+ * We additionally keep track of the lowest protocol version supported by all -+ * peers so we know when we can send IPC messages containing more than one -+ * request. -+ * -+ * \param[in] host The peer node's uname to be tracked -+ * \param[in] value The peer node's protocol version -+ */ - void --attrd_update_minimum_protocol_ver(const char *value) -+attrd_update_minimum_protocol_ver(const char *host, const char *value) - { - int ver; - -+ if (peer_protocol_vers == NULL) { -+ peer_protocol_vers = pcmk__strkey_table(free, NULL); -+ } -+ - pcmk__scan_min_int(value, &ver, 0); - -- if (ver > 0 && (minimum_protocol_version == -1 || ver < minimum_protocol_version)) { -- minimum_protocol_version = ver; -- crm_trace("Set minimum attrd protocol version to %d", -- minimum_protocol_version); -+ if (ver > 0) { -+ char *host_name = strdup(host); -+ -+ /* Record the peer attrd's protocol version. */ -+ CRM_ASSERT(host_name != NULL); -+ g_hash_table_insert(peer_protocol_vers, host_name, GINT_TO_POINTER(ver)); -+ -+ /* If the protocol version is a new minimum, record it as such. */ -+ if (minimum_protocol_version == -1 || ver < minimum_protocol_version) { -+ minimum_protocol_version = ver; -+ crm_trace("Set minimum attrd protocol version to %d", -+ minimum_protocol_version); -+ } - } - } -diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h -index 25f7c8a..302ef63 100644 ---- a/daemons/attrd/pacemaker-attrd.h -+++ b/daemons/attrd/pacemaker-attrd.h -@@ -145,6 +145,7 @@ typedef struct attribute_value_s { - - extern crm_cluster_t *attrd_cluster; - extern GHashTable *attributes; -+extern GHashTable *peer_protocol_vers; - - #define CIB_OP_TIMEOUT_S 120 - -@@ -177,7 +178,8 @@ void attrd_write_attributes(bool all, bool ignore_delay); - void attrd_write_or_elect_attribute(attribute_t *a); - - extern int minimum_protocol_version; --void attrd_update_minimum_protocol_ver(const char *value); -+void attrd_remove_peer_protocol_ver(const char *host); -+void attrd_update_minimum_protocol_ver(const char *host, const char *value); - - mainloop_timer_t *attrd_add_timer(const char *id, int timeout_ms, attribute_t *attr); - --- -2.31.1 - -From 945f0fe51d3bf69c2cb1258b394f2f11b8996525 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 27 Oct 2022 14:42:59 -0400 -Subject: [PATCH 19/26] Feature: daemons: Handle cluster-wide sync points in - attrd. - -When an attrd receives an IPC request to update some value, record the -protocol versions of all peer attrds. Additionally register a function -that will be called when all confirmations are received. - -The originating IPC cilent (attrd_updater for instance) will sit there -waiting for an ACK until its timeout is hit. - -As each confirmation message comes back to attrd, mark it off the list -of peers we are waiting on. When no more peers are expected, call the -previously registered function. - -For attribute updates, this function just sends an ack back to -attrd_updater. - -Fixes T35 ---- - daemons/attrd/attrd_corosync.c | 1 + - daemons/attrd/attrd_ipc.c | 4 + - daemons/attrd/attrd_messages.c | 10 ++ - daemons/attrd/attrd_sync.c | 260 +++++++++++++++++++++++++++++++- - daemons/attrd/attrd_utils.c | 2 + - daemons/attrd/pacemaker-attrd.h | 8 + - 6 files changed, 281 insertions(+), 4 deletions(-) - -diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c -index 6f88ab6..37701aa 100644 ---- a/daemons/attrd/attrd_corosync.c -+++ b/daemons/attrd/attrd_corosync.c -@@ -269,6 +269,7 @@ attrd_peer_change_cb(enum crm_status_type kind, crm_node_t *peer, const void *da - if (gone && !is_remote) { - attrd_remove_voter(peer); - attrd_remove_peer_protocol_ver(peer->uname); -+ attrd_do_not_expect_from_peer(peer->uname); - - // Ensure remote nodes that come up are in the remote node cache - } else if (!gone && is_remote) { -diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c -index 0fc5e93..c70aa1b 100644 ---- a/daemons/attrd/attrd_ipc.c -+++ b/daemons/attrd/attrd_ipc.c -@@ -393,6 +393,7 @@ attrd_client_update(pcmk__request_t *request) - * the response ACK is not sent until this attrd broadcasts the update - * and receives its own confirmation back from all peers. - */ -+ attrd_expect_confirmations(request, attrd_cluster_sync_point_update); - attrd_send_message(NULL, xml, true); /* ends up at attrd_peer_message() */ - - } else { -@@ -456,6 +457,9 @@ attrd_ipc_closed(qb_ipcs_connection_t *c) - /* Remove the client from the sync point waitlist if it's present. */ - attrd_remove_client_from_waitlist(client); - -+ /* And no longer wait for confirmations from any peers. */ -+ attrd_do_not_wait_for_client(client); -+ - pcmk__free_client(client); - } - -diff --git a/daemons/attrd/attrd_messages.c b/daemons/attrd/attrd_messages.c -index 9c792b2..f7b9c7c 100644 ---- a/daemons/attrd/attrd_messages.c -+++ b/daemons/attrd/attrd_messages.c -@@ -69,7 +69,17 @@ static xmlNode * - handle_confirm_request(pcmk__request_t *request) - { - if (request->peer != NULL) { -+ int callid; -+ - crm_debug("Received confirmation from %s", request->peer); -+ -+ if (crm_element_value_int(request->xml, XML_LRM_ATTR_CALLID, &callid) == -1) { -+ pcmk__set_result(&request->result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, -+ "Could not get callid from XML"); -+ } else { -+ attrd_handle_confirmation(callid, request->peer); -+ } -+ - pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - return NULL; - } else { -diff --git a/daemons/attrd/attrd_sync.c b/daemons/attrd/attrd_sync.c -index e9690b5..d3d7108 100644 ---- a/daemons/attrd/attrd_sync.c -+++ b/daemons/attrd/attrd_sync.c -@@ -34,6 +34,51 @@ struct waitlist_node { - uint32_t flags; - }; - -+/* A hash table storing information on in-progress IPC requests that are awaiting -+ * confirmations. These requests are currently being processed by peer attrds and -+ * we are waiting to receive confirmation messages from each peer indicating that -+ * processing is complete. -+ * -+ * Multiple requests could be waiting on confirmations at the same time. -+ * -+ * The key is the unique callid for the IPC request, and the value is a -+ * confirmation_action struct. -+ */ -+static GHashTable *expected_confirmations = NULL; -+ -+/*! -+ * \internal -+ * \brief A structure describing a single IPC request that is awaiting confirmations -+ */ -+struct confirmation_action { -+ /*! -+ * \brief A list of peer attrds that we are waiting to receive confirmation -+ * messages from -+ * -+ * This list is dynamic - as confirmations arrive from peer attrds, they will -+ * be removed from this list. When the list is empty, all peers have processed -+ * the request and the associated confirmation action will be taken. -+ */ -+ GList *respondents; -+ -+ /*! -+ * \brief A function to run when all confirmations have been received -+ */ -+ attrd_confirmation_action_fn fn; -+ -+ /*! -+ * \brief Information required to construct and send a reply to the client -+ */ -+ char *client_id; -+ uint32_t ipc_id; -+ uint32_t flags; -+ -+ /*! -+ * \brief The XML request containing the callid associated with this action -+ */ -+ void *xml; -+}; -+ - static void - next_key(void) - { -@@ -114,12 +159,13 @@ attrd_add_client_to_waitlist(pcmk__request_t *request) - wl->ipc_id = request->ipc_id; - wl->flags = request->flags; - -- crm_debug("Added client %s to waitlist for %s sync point", -- wl->client_id, sync_point_str(wl->sync_point)); -- - next_key(); - pcmk__intkey_table_insert(waitlist, waitlist_client, wl); - -+ crm_trace("Added client %s to waitlist for %s sync point", -+ wl->client_id, sync_point_str(wl->sync_point)); -+ crm_trace("%d clients now on waitlist", g_hash_table_size(waitlist)); -+ - /* And then add the key to the request XML so we can uniquely identify - * it when it comes time to issue the ACK. - */ -@@ -166,6 +212,7 @@ attrd_remove_client_from_waitlist(pcmk__client_t *client) - - if (wl->client_id == client->id) { - g_hash_table_iter_remove(&iter); -+ crm_trace("%d clients now on waitlist", g_hash_table_size(waitlist)); - } - } - } -@@ -206,7 +253,7 @@ attrd_ack_waitlist_clients(enum attrd_sync_point sync_point, const xmlNode *xml) - return; - } - -- crm_debug("Alerting client %s for reached %s sync point", -+ crm_trace("Alerting client %s for reached %s sync point", - wl->client_id, sync_point_str(wl->sync_point)); - - client = pcmk__find_client_by_id(wl->client_id); -@@ -218,9 +265,28 @@ attrd_ack_waitlist_clients(enum attrd_sync_point sync_point, const xmlNode *xml) - - /* And then remove the client so it doesn't get alerted again. */ - pcmk__intkey_table_remove(waitlist, callid); -+ -+ crm_trace("%d clients now on waitlist", g_hash_table_size(waitlist)); - } - } - -+/*! -+ * \internal -+ * \brief Action to take when a cluster sync point is hit for a -+ * PCMK__ATTRD_CMD_UPDATE* message. -+ * -+ * \param[in] xml The request that should be passed along to -+ * attrd_ack_waitlist_clients. This should be the original -+ * IPC request containing the callid for this update message. -+ */ -+int -+attrd_cluster_sync_point_update(xmlNode *xml) -+{ -+ crm_trace("Hit cluster sync point for attribute update"); -+ attrd_ack_waitlist_clients(attrd_sync_point_cluster, xml); -+ return pcmk_rc_ok; -+} -+ - /*! - * \internal - * \brief Return the sync point attribute for an IPC request -@@ -268,3 +334,189 @@ attrd_request_has_sync_point(xmlNode *xml) - { - return attrd_request_sync_point(xml) != NULL; - } -+ -+static void -+free_action(gpointer data) -+{ -+ struct confirmation_action *action = (struct confirmation_action *) data; -+ g_list_free_full(action->respondents, free); -+ free_xml(action->xml); -+ free(action->client_id); -+ free(action); -+} -+ -+/*! -+ * \internal -+ * \brief When a peer disconnects from the cluster, no longer wait for its confirmation -+ * for any IPC action. If this peer is the last one being waited on, this will -+ * trigger the confirmation action. -+ * -+ * \param[in] host The disconnecting peer attrd's uname -+ */ -+void -+attrd_do_not_expect_from_peer(const char *host) -+{ -+ GList *keys = g_hash_table_get_keys(expected_confirmations); -+ -+ crm_trace("Removing peer %s from expected confirmations", host); -+ -+ for (GList *node = keys; node != NULL; node = node->next) { -+ int callid = *(int *) node->data; -+ attrd_handle_confirmation(callid, host); -+ } -+ -+ g_list_free(keys); -+} -+ -+/*! -+ * \internal -+ * \brief When a client disconnects from the cluster, no longer wait on confirmations -+ * for it. Because the peer attrds may still be processing the original IPC -+ * message, they may still send us confirmations. However, we will take no -+ * action on them. -+ * -+ * \param[in] client The disconnecting client -+ */ -+void -+attrd_do_not_wait_for_client(pcmk__client_t *client) -+{ -+ GHashTableIter iter; -+ gpointer value; -+ -+ if (expected_confirmations == NULL) { -+ return; -+ } -+ -+ g_hash_table_iter_init(&iter, expected_confirmations); -+ -+ while (g_hash_table_iter_next(&iter, NULL, &value)) { -+ struct confirmation_action *action = (struct confirmation_action *) value; -+ -+ if (pcmk__str_eq(action->client_id, client->id, pcmk__str_none)) { -+ crm_trace("Removing client %s from expected confirmations", client->id); -+ g_hash_table_iter_remove(&iter); -+ crm_trace("%d requests now in expected confirmations table", g_hash_table_size(expected_confirmations)); -+ break; -+ } -+ } -+} -+ -+/*! -+ * \internal -+ * \brief Register some action to be taken when IPC request confirmations are -+ * received -+ * -+ * When this function is called, a list of all peer attrds that support confirming -+ * requests is generated. As confirmations from these peer attrds are received, -+ * they are removed from this list. When the list is empty, the registered action -+ * will be called. -+ * -+ * \note This function should always be called before attrd_send_message is called -+ * to broadcast to the peers to ensure that we know what replies we are -+ * waiting on. Otherwise, it is possible the peer could finish and confirm -+ * before we know to expect it. -+ * -+ * \param[in] request The request that is awaiting confirmations -+ * \param[in] fn A function to be run after all confirmations are received -+ */ -+void -+attrd_expect_confirmations(pcmk__request_t *request, attrd_confirmation_action_fn fn) -+{ -+ struct confirmation_action *action = NULL; -+ GHashTableIter iter; -+ gpointer host, ver; -+ GList *respondents = NULL; -+ int callid; -+ -+ if (expected_confirmations == NULL) { -+ expected_confirmations = pcmk__intkey_table((GDestroyNotify) free_action); -+ } -+ -+ if (crm_element_value_int(request->xml, XML_LRM_ATTR_CALLID, &callid) == -1) { -+ crm_err("Could not get callid from xml"); -+ return; -+ } -+ -+ if (pcmk__intkey_table_lookup(expected_confirmations, callid)) { -+ crm_err("Already waiting on confirmations for call id %d", callid); -+ return; -+ } -+ -+ g_hash_table_iter_init(&iter, peer_protocol_vers); -+ while (g_hash_table_iter_next(&iter, &host, &ver)) { -+ if (GPOINTER_TO_INT(ver) >= 5) { -+ char *s = strdup((char *) host); -+ -+ CRM_ASSERT(s != NULL); -+ respondents = g_list_prepend(respondents, s); -+ } -+ } -+ -+ action = calloc(1, sizeof(struct confirmation_action)); -+ CRM_ASSERT(action != NULL); -+ -+ action->respondents = respondents; -+ action->fn = fn; -+ action->xml = copy_xml(request->xml); -+ -+ action->client_id = strdup(request->ipc_client->id); -+ CRM_ASSERT(action->client_id != NULL); -+ -+ action->ipc_id = request->ipc_id; -+ action->flags = request->flags; -+ -+ pcmk__intkey_table_insert(expected_confirmations, callid, action); -+ crm_trace("Callid %d now waiting on %d confirmations", callid, g_list_length(respondents)); -+ crm_trace("%d requests now in expected confirmations table", g_hash_table_size(expected_confirmations)); -+} -+ -+void -+attrd_free_confirmations(void) -+{ -+ if (expected_confirmations != NULL) { -+ g_hash_table_destroy(expected_confirmations); -+ expected_confirmations = NULL; -+ } -+} -+ -+/*! -+ * \internal -+ * \brief Process a confirmation message from a peer attrd -+ * -+ * This function is called every time a PCMK__ATTRD_CMD_CONFIRM message is -+ * received from a peer attrd. If this is the last confirmation we are waiting -+ * on for a given operation, the registered action will be called. -+ * -+ * \param[in] callid The unique callid for the XML IPC request -+ * \param[in] host The confirming peer attrd's uname -+ */ -+void -+attrd_handle_confirmation(int callid, const char *host) -+{ -+ struct confirmation_action *action = NULL; -+ GList *node = NULL; -+ -+ if (expected_confirmations == NULL) { -+ return; -+ } -+ -+ action = pcmk__intkey_table_lookup(expected_confirmations, callid); -+ if (action == NULL) { -+ return; -+ } -+ -+ node = g_list_find_custom(action->respondents, host, (GCompareFunc) strcasecmp); -+ -+ if (node == NULL) { -+ return; -+ } -+ -+ action->respondents = g_list_remove(action->respondents, node->data); -+ crm_trace("Callid %d now waiting on %d confirmations", callid, g_list_length(action->respondents)); -+ -+ if (action->respondents == NULL) { -+ action->fn(action->xml); -+ pcmk__intkey_table_remove(expected_confirmations, callid); -+ crm_trace("%d requests now in expected confirmations table", g_hash_table_size(expected_confirmations)); -+ } -+} -diff --git a/daemons/attrd/attrd_utils.c b/daemons/attrd/attrd_utils.c -index 421faed..f3a2059 100644 ---- a/daemons/attrd/attrd_utils.c -+++ b/daemons/attrd/attrd_utils.c -@@ -99,6 +99,8 @@ attrd_shutdown(int nsig) - mainloop_destroy_signal(SIGTRAP); - - attrd_free_waitlist(); -+ attrd_free_confirmations(); -+ - if (peer_protocol_vers != NULL) { - g_hash_table_destroy(peer_protocol_vers); - peer_protocol_vers = NULL; -diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h -index 302ef63..bcc329d 100644 ---- a/daemons/attrd/pacemaker-attrd.h -+++ b/daemons/attrd/pacemaker-attrd.h -@@ -191,8 +191,16 @@ enum attrd_sync_point { - attrd_sync_point_cluster, - }; - -+typedef int (*attrd_confirmation_action_fn)(xmlNode *); -+ - void attrd_add_client_to_waitlist(pcmk__request_t *request); - void attrd_ack_waitlist_clients(enum attrd_sync_point sync_point, const xmlNode *xml); -+int attrd_cluster_sync_point_update(xmlNode *xml); -+void attrd_do_not_expect_from_peer(const char *host); -+void attrd_do_not_wait_for_client(pcmk__client_t *client); -+void attrd_expect_confirmations(pcmk__request_t *request, attrd_confirmation_action_fn fn); -+void attrd_free_confirmations(void); -+void attrd_handle_confirmation(int callid, const char *host); - void attrd_remove_client_from_waitlist(pcmk__client_t *client); - const char *attrd_request_sync_point(xmlNode *xml); - bool attrd_request_has_sync_point(xmlNode *xml); --- -2.31.1 - -From 07a032a7eb2f03dce18a7c94c56b8c837dedda15 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 28 Oct 2022 14:54:15 -0400 -Subject: [PATCH 20/26] Refactor: daemons: Add some attrd version checking - macros. - -These are just to make it a little more obvious what is actually being -asked in the code, instead of having magic numbers sprinkled around. ---- - daemons/attrd/attrd_ipc.c | 2 +- - daemons/attrd/attrd_sync.c | 2 +- - daemons/attrd/pacemaker-attrd.h | 3 +++ - 3 files changed, 5 insertions(+), 2 deletions(-) - -diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c -index c70aa1b..16bfff4 100644 ---- a/daemons/attrd/attrd_ipc.c -+++ b/daemons/attrd/attrd_ipc.c -@@ -294,7 +294,7 @@ attrd_client_update(pcmk__request_t *request) - * two ways we can handle that. - */ - if (xml_has_children(xml)) { -- if (minimum_protocol_version >= 4) { -+ if (ATTRD_SUPPORTS_MULTI_MESSAGE(minimum_protocol_version)) { - /* First, if all peers support a certain protocol version, we can - * just broadcast the big message and they'll handle it. However, - * we also need to apply all the transformations in this function -diff --git a/daemons/attrd/attrd_sync.c b/daemons/attrd/attrd_sync.c -index d3d7108..e48f82e 100644 ---- a/daemons/attrd/attrd_sync.c -+++ b/daemons/attrd/attrd_sync.c -@@ -444,7 +444,7 @@ attrd_expect_confirmations(pcmk__request_t *request, attrd_confirmation_action_f - - g_hash_table_iter_init(&iter, peer_protocol_vers); - while (g_hash_table_iter_next(&iter, &host, &ver)) { -- if (GPOINTER_TO_INT(ver) >= 5) { -+ if (ATTRD_SUPPORTS_CONFIRMATION(GPOINTER_TO_INT(ver))) { - char *s = strdup((char *) host); - - CRM_ASSERT(s != NULL); -diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h -index bcc329d..83d7c6b 100644 ---- a/daemons/attrd/pacemaker-attrd.h -+++ b/daemons/attrd/pacemaker-attrd.h -@@ -45,6 +45,9 @@ - */ - #define ATTRD_PROTOCOL_VERSION "5" - -+#define ATTRD_SUPPORTS_MULTI_MESSAGE(x) ((x) >= 4) -+#define ATTRD_SUPPORTS_CONFIRMATION(x) ((x) >= 5) -+ - #define attrd_send_ack(client, id, flags) \ - pcmk__ipc_send_ack((client), (id), (flags), "ack", ATTRD_PROTOCOL_VERSION, CRM_EX_INDETERMINATE) - --- -2.31.1 - -From 811361b96c6f26a1f5eccc54b6e8bf6e6fd003be Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 31 Oct 2022 12:53:22 -0400 -Subject: [PATCH 21/26] Low: attrd: Fix removing clients from the waitlist when - they disconnect. - -The client ID is a string, so it must be compared like a string. ---- - daemons/attrd/attrd_sync.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/daemons/attrd/attrd_sync.c b/daemons/attrd/attrd_sync.c -index e48f82e..c9b4784 100644 ---- a/daemons/attrd/attrd_sync.c -+++ b/daemons/attrd/attrd_sync.c -@@ -210,7 +210,7 @@ attrd_remove_client_from_waitlist(pcmk__client_t *client) - while (g_hash_table_iter_next(&iter, NULL, &value)) { - struct waitlist_node *wl = (struct waitlist_node *) value; - -- if (wl->client_id == client->id) { -+ if (pcmk__str_eq(wl->client_id, client->id, pcmk__str_none)) { - g_hash_table_iter_remove(&iter); - crm_trace("%d clients now on waitlist", g_hash_table_size(waitlist)); - } --- -2.31.1 - -From 4e933ad14456af85c60701410c3b23b4eab03f86 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 1 Nov 2022 12:35:12 -0400 -Subject: [PATCH 22/26] Feature: daemons: Handle an attrd client timing out. - -If the update confirmations do not come back in time, use a main loop -timer to remove the client from the table. ---- - daemons/attrd/attrd_sync.c | 49 ++++++++++++++++++++++++++++++++++++++ - 1 file changed, 49 insertions(+) - -diff --git a/daemons/attrd/attrd_sync.c b/daemons/attrd/attrd_sync.c -index c9b4784..9d07796 100644 ---- a/daemons/attrd/attrd_sync.c -+++ b/daemons/attrd/attrd_sync.c -@@ -61,6 +61,12 @@ struct confirmation_action { - */ - GList *respondents; - -+ /*! -+ * \brief A timer that will be used to remove the client should it time out -+ * before receiving all confirmations -+ */ -+ mainloop_timer_t *timer; -+ - /*! - * \brief A function to run when all confirmations have been received - */ -@@ -340,11 +346,51 @@ free_action(gpointer data) - { - struct confirmation_action *action = (struct confirmation_action *) data; - g_list_free_full(action->respondents, free); -+ mainloop_timer_del(action->timer); - free_xml(action->xml); - free(action->client_id); - free(action); - } - -+/* Remove an IPC request from the expected_confirmations table if the peer attrds -+ * don't respond before the timeout is hit. We set the timeout to 15s. The exact -+ * number isn't critical - we just want to make sure that the table eventually gets -+ * cleared of things that didn't complete. -+ */ -+static gboolean -+confirmation_timeout_cb(gpointer data) -+{ -+ struct confirmation_action *action = (struct confirmation_action *) data; -+ -+ GHashTableIter iter; -+ gpointer value; -+ -+ if (expected_confirmations == NULL) { -+ return G_SOURCE_REMOVE; -+ } -+ -+ g_hash_table_iter_init(&iter, expected_confirmations); -+ -+ while (g_hash_table_iter_next(&iter, NULL, &value)) { -+ if (value == action) { -+ pcmk__client_t *client = pcmk__find_client_by_id(action->client_id); -+ if (client == NULL) { -+ return G_SOURCE_REMOVE; -+ } -+ -+ crm_trace("Timed out waiting for confirmations for client %s", client->id); -+ pcmk__ipc_send_ack(client, action->ipc_id, action->flags | crm_ipc_client_response, -+ "ack", ATTRD_PROTOCOL_VERSION, CRM_EX_TIMEOUT); -+ -+ g_hash_table_iter_remove(&iter); -+ crm_trace("%d requests now in expected confirmations table", g_hash_table_size(expected_confirmations)); -+ break; -+ } -+ } -+ -+ return G_SOURCE_REMOVE; -+} -+ - /*! - * \internal - * \brief When a peer disconnects from the cluster, no longer wait for its confirmation -@@ -465,6 +511,9 @@ attrd_expect_confirmations(pcmk__request_t *request, attrd_confirmation_action_f - action->ipc_id = request->ipc_id; - action->flags = request->flags; - -+ action->timer = mainloop_timer_add(NULL, 15000, FALSE, confirmation_timeout_cb, action); -+ mainloop_timer_start(action->timer); -+ - pcmk__intkey_table_insert(expected_confirmations, callid, action); - crm_trace("Callid %d now waiting on %d confirmations", callid, g_list_length(respondents)); - crm_trace("%d requests now in expected confirmations table", g_hash_table_size(expected_confirmations)); --- -2.31.1 - -From 101896383cbe0103c98078e46540c076af08f040 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 2 Nov 2022 14:40:30 -0400 -Subject: [PATCH 23/26] Refactor: Demote a sync point related message to trace. - ---- - daemons/attrd/attrd_corosync.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c -index 37701aa..5cbed7e 100644 ---- a/daemons/attrd/attrd_corosync.c -+++ b/daemons/attrd/attrd_corosync.c -@@ -633,7 +633,7 @@ attrd_peer_update(const crm_node_t *peer, xmlNode *xml, const char *host, - * point, process that now. - */ - if (handle_sync_point) { -- crm_debug("Hit local sync point for attribute update"); -+ crm_trace("Hit local sync point for attribute update"); - attrd_ack_waitlist_clients(attrd_sync_point_local, xml); - } - } --- -2.31.1 - -From acd13246d4c2bef7982ca103e34896efcad22348 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 3 Nov 2022 10:29:20 -0400 -Subject: [PATCH 24/26] Low: daemons: Avoid infinite confirm loops in attrd. - -On the sending side, do not add confirm="yes" to a message with -op="confirm". On the receiving side, do not confirm a message with -op="confirm" even if confirm="yes" is set. ---- - daemons/attrd/attrd_corosync.c | 3 ++- - daemons/attrd/attrd_messages.c | 6 +++++- - 2 files changed, 7 insertions(+), 2 deletions(-) - -diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c -index 5cbed7e..88c1ecc 100644 ---- a/daemons/attrd/attrd_corosync.c -+++ b/daemons/attrd/attrd_corosync.c -@@ -74,7 +74,8 @@ attrd_peer_message(crm_node_t *peer, xmlNode *xml) - /* Having finished handling the request, check to see if the originating - * peer requested confirmation. If so, send that confirmation back now. - */ -- if (pcmk__xe_attr_is_true(xml, PCMK__XA_CONFIRM)) { -+ if (pcmk__xe_attr_is_true(xml, PCMK__XA_CONFIRM) && -+ !pcmk__str_eq(request.op, PCMK__ATTRD_CMD_CONFIRM, pcmk__str_none)) { - int callid = 0; - xmlNode *reply = NULL; - -diff --git a/daemons/attrd/attrd_messages.c b/daemons/attrd/attrd_messages.c -index f7b9c7c..184176a 100644 ---- a/daemons/attrd/attrd_messages.c -+++ b/daemons/attrd/attrd_messages.c -@@ -310,6 +310,8 @@ attrd_broadcast_protocol(void) - gboolean - attrd_send_message(crm_node_t *node, xmlNode *data, bool confirm) - { -+ const char *op = crm_element_value(data, PCMK__XA_TASK); -+ - crm_xml_add(data, F_TYPE, T_ATTRD); - crm_xml_add(data, PCMK__XA_ATTR_VERSION, ATTRD_PROTOCOL_VERSION); - -@@ -317,7 +319,9 @@ attrd_send_message(crm_node_t *node, xmlNode *data, bool confirm) - * be all if node is NULL) that the message has been received and - * acted upon. - */ -- pcmk__xe_set_bool_attr(data, PCMK__XA_CONFIRM, confirm); -+ if (!pcmk__str_eq(op, PCMK__ATTRD_CMD_CONFIRM, pcmk__str_none)) { -+ pcmk__xe_set_bool_attr(data, PCMK__XA_CONFIRM, confirm); -+ } - - attrd_xml_add_writer(data); - return send_cluster_message(node, crm_msg_attrd, data, TRUE); --- -2.31.1 - -From 115e6c3a0d8db4df3eccf6da1c344168799f890d Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 15 Nov 2022 09:35:28 -0500 -Subject: [PATCH 25/26] Fix: daemons: Check for NULL in - attrd_do_not_expect_from_peer. - ---- - daemons/attrd/attrd_sync.c | 8 +++++++- - 1 file changed, 7 insertions(+), 1 deletion(-) - -diff --git a/daemons/attrd/attrd_sync.c b/daemons/attrd/attrd_sync.c -index 9d07796..6936771 100644 ---- a/daemons/attrd/attrd_sync.c -+++ b/daemons/attrd/attrd_sync.c -@@ -402,7 +402,13 @@ confirmation_timeout_cb(gpointer data) - void - attrd_do_not_expect_from_peer(const char *host) - { -- GList *keys = g_hash_table_get_keys(expected_confirmations); -+ GList *keys = NULL; -+ -+ if (expected_confirmations == NULL) { -+ return; -+ } -+ -+ keys = g_hash_table_get_keys(expected_confirmations); - - crm_trace("Removing peer %s from expected confirmations", host); - --- -2.31.1 - -From 05da14f97ccd4f63f53801acc107ad661e5fd0c8 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 16 Nov 2022 17:37:44 -0500 -Subject: [PATCH 26/26] Low: daemons: Support cluster-wide sync points for - multi IPC messages. - -Supporting cluster-wide sync points means attrd_expect_confirmations -needs to be called, and then attrd_send_message needs "true" as a third -argument. This indicates attrd wants confirmations back from all its -peers when they have applied the update. - -We're already doing this at the end of attrd_client_update for -single-update IPC messages, and handling it for multi-update messages is -a simple matter of breaking that code out into a function and making -sure it's called. - -Note that this leaves two other spots where sync points still need to be -dealt with: - -* An update message that uses a regex. See - https://projects.clusterlabs.org/T600 for details. - -* A multi-update IPC message in a cluster where that is not supported. - See https://projects.clusterlabs.org/T601 for details. ---- - daemons/attrd/attrd_ipc.c | 43 ++++++++++++++++++++++----------------- - 1 file changed, 24 insertions(+), 19 deletions(-) - -diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c -index 16bfff4..8c5660d 100644 ---- a/daemons/attrd/attrd_ipc.c -+++ b/daemons/attrd/attrd_ipc.c -@@ -283,6 +283,28 @@ handle_value_expansion(const char **value, xmlNode *xml, const char *op, - return pcmk_rc_ok; - } - -+static void -+send_update_msg_to_cluster(pcmk__request_t *request, xmlNode *xml) -+{ -+ if (pcmk__str_eq(attrd_request_sync_point(xml), PCMK__VALUE_CLUSTER, pcmk__str_none)) { -+ /* The client is waiting on the cluster-wide sync point. In this case, -+ * the response ACK is not sent until this attrd broadcasts the update -+ * and receives its own confirmation back from all peers. -+ */ -+ attrd_expect_confirmations(request, attrd_cluster_sync_point_update); -+ attrd_send_message(NULL, xml, true); /* ends up at attrd_peer_message() */ -+ -+ } else { -+ /* The client is either waiting on the local sync point or was not -+ * waiting on any sync point at all. For the local sync point, the -+ * response ACK is sent in attrd_peer_update. For clients not -+ * waiting on any sync point, the response ACK is sent in -+ * handle_update_request immediately before this function was called. -+ */ -+ attrd_send_message(NULL, xml, false); /* ends up at attrd_peer_message() */ -+ } -+} -+ - xmlNode * - attrd_client_update(pcmk__request_t *request) - { -@@ -314,7 +336,7 @@ attrd_client_update(pcmk__request_t *request) - } - } - -- attrd_send_message(NULL, xml, false); -+ send_update_msg_to_cluster(request, xml); - pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - - } else { -@@ -388,24 +410,7 @@ attrd_client_update(pcmk__request_t *request) - crm_debug("Broadcasting %s[%s]=%s%s", attr, crm_element_value(xml, PCMK__XA_ATTR_NODE_NAME), - value, (attrd_election_won()? " (writer)" : "")); - -- if (pcmk__str_eq(attrd_request_sync_point(xml), PCMK__VALUE_CLUSTER, pcmk__str_none)) { -- /* The client is waiting on the cluster-wide sync point. In this case, -- * the response ACK is not sent until this attrd broadcasts the update -- * and receives its own confirmation back from all peers. -- */ -- attrd_expect_confirmations(request, attrd_cluster_sync_point_update); -- attrd_send_message(NULL, xml, true); /* ends up at attrd_peer_message() */ -- -- } else { -- /* The client is either waiting on the local sync point or was not -- * waiting on any sync point at all. For the local sync point, the -- * response ACK is sent in attrd_peer_update. For clients not -- * waiting on any sync point, the response ACK is sent in -- * handle_update_request immediately before this function was called. -- */ -- attrd_send_message(NULL, xml, false); /* ends up at attrd_peer_message() */ -- } -- -+ send_update_msg_to_cluster(request, xml); - pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - return NULL; - } --- -2.31.1 - diff --git a/002-remote-regression.patch b/002-remote-regression.patch deleted file mode 100644 index 0f0bea8..0000000 --- a/002-remote-regression.patch +++ /dev/null @@ -1,98 +0,0 @@ -From d8e08729ad5e3dc62f774172f992210902fc0ed4 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 23 Jan 2023 14:25:56 -0600 -Subject: [PATCH] High: executor: fix regression in remote node shutdown - -This reverts the essential part of d61494347, which was based on misdiagnosing -a remote node shutdown issue. Initially, it was thought that a "TLS server -session ended" log just after a remote node requested shutdown indicated that -the proxy connection coincidentally dropped at that moment. It actually is the -routine stopping of accepting new proxy connections, and existing when that -happens makes the remote node exit immediately without waiting for the -all-clear from the cluster. - -Fixes T361 ---- - daemons/execd/pacemaker-execd.c | 19 +------------------ - daemons/execd/pacemaker-execd.h | 3 +-- - daemons/execd/remoted_tls.c | 6 +----- - 3 files changed, 3 insertions(+), 25 deletions(-) - -diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c -index db12674f13..491808974a 100644 ---- a/daemons/execd/pacemaker-execd.c -+++ b/daemons/execd/pacemaker-execd.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2012-2022 the Pacemaker project contributors -+ * Copyright 2012-2023 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -305,23 +305,6 @@ lrmd_exit(gpointer data) - return FALSE; - } - --/*! -- * \internal -- * \brief Clean up and exit if shutdown has started -- * -- * \return Doesn't return -- */ --void --execd_exit_if_shutting_down(void) --{ --#ifdef PCMK__COMPILE_REMOTE -- if (shutting_down) { -- crm_warn("exit because TLS connection was closed and 'shutting_down' set"); -- lrmd_exit(NULL); -- } --#endif --} -- - /*! - * \internal - * \brief Request cluster shutdown if appropriate, otherwise exit immediately -diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h -index 6646ae29e3..f78e8dcdde 100644 ---- a/daemons/execd/pacemaker-execd.h -+++ b/daemons/execd/pacemaker-execd.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2012-2022 the Pacemaker project contributors -+ * Copyright 2012-2023 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -105,6 +105,5 @@ void remoted_spawn_pidone(int argc, char **argv, char **envp); - int process_lrmd_alert_exec(pcmk__client_t *client, uint32_t id, - xmlNode *request); - void lrmd_drain_alerts(GMainLoop *mloop); --void execd_exit_if_shutting_down(void); - - #endif // PACEMAKER_EXECD__H -diff --git a/daemons/execd/remoted_tls.c b/daemons/execd/remoted_tls.c -index 6f4b2d0062..c65e3f394d 100644 ---- a/daemons/execd/remoted_tls.c -+++ b/daemons/execd/remoted_tls.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2012-2022 the Pacemaker project contributors -+ * Copyright 2012-2023 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -250,10 +250,6 @@ static void - tls_server_dropped(gpointer user_data) - { - crm_notice("TLS server session ended"); -- /* If we are in the process of shutting down, then we should actually exit. -- * bz#1804259 -- */ -- execd_exit_if_shutting_down(); - return; - } - --- -2.31.1 - diff --git a/003-history-cleanup.patch b/003-history-cleanup.patch deleted file mode 100644 index 87a3e27..0000000 --- a/003-history-cleanup.patch +++ /dev/null @@ -1,2829 +0,0 @@ -From e953591a9796edebd4796c344df0eddcbc7a2dff Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 30 Jan 2023 16:34:32 -0600 -Subject: [PATCH 01/14] Refactor: scheduler: drop unneeded arguments from - process_rsc_state() - -migrate_op has been unused since at least 2011 ---- - lib/pengine/unpack.c | 36 +++++++++++++++--------------------- - 1 file changed, 15 insertions(+), 21 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 5fcba3b..9524def 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -1963,8 +1963,7 @@ process_orphan_resource(xmlNode * rsc_entry, pe_node_t * node, pe_working_set_t - - static void - process_rsc_state(pe_resource_t * rsc, pe_node_t * node, -- enum action_fail_response on_fail, -- xmlNode * migrate_op, pe_working_set_t * data_set) -+ enum action_fail_response on_fail) - { - pe_node_t *tmpnode = NULL; - char *reason = NULL; -@@ -2016,7 +2015,7 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, - pe__set_resource_flags(rsc, pe_rsc_failed|pe_rsc_stop); - should_fence = TRUE; - -- } else if (pcmk_is_set(data_set->flags, pe_flag_stonith_enabled)) { -+ } else if (pcmk_is_set(rsc->cluster->flags, pe_flag_stonith_enabled)) { - if (pe__is_remote_node(node) && node->details->remote_rsc - && !pcmk_is_set(node->details->remote_rsc->flags, pe_rsc_failed)) { - -@@ -2039,7 +2038,7 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, - if (reason == NULL) { - reason = crm_strdup_printf("%s is thought to be active there", rsc->id); - } -- pe_fence_node(data_set, node, reason, FALSE); -+ pe_fence_node(rsc->cluster, node, reason, FALSE); - } - free(reason); - } -@@ -2069,7 +2068,7 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, - * but also mark the node as unclean - */ - reason = crm_strdup_printf("%s failed there", rsc->id); -- pe_fence_node(data_set, node, reason, FALSE); -+ pe_fence_node(rsc->cluster, node, reason, FALSE); - free(reason); - break; - -@@ -2090,7 +2089,8 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, - /* make sure it comes up somewhere else - * or not at all - */ -- resource_location(rsc, node, -INFINITY, "__action_migration_auto__", data_set); -+ resource_location(rsc, node, -INFINITY, "__action_migration_auto__", -+ rsc->cluster); - break; - - case action_fail_stop: -@@ -2112,8 +2112,8 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, - * container is running yet, so remember it and add a stop - * action for it later. - */ -- data_set->stop_needed = g_list_prepend(data_set->stop_needed, -- rsc->container); -+ rsc->cluster->stop_needed = -+ g_list_prepend(rsc->cluster->stop_needed, rsc->container); - } else if (rsc->container) { - stop_action(rsc->container, node, FALSE); - } else if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { -@@ -2123,10 +2123,10 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, - - case action_fail_reset_remote: - pe__set_resource_flags(rsc, pe_rsc_failed|pe_rsc_stop); -- if (pcmk_is_set(data_set->flags, pe_flag_stonith_enabled)) { -+ if (pcmk_is_set(rsc->cluster->flags, pe_flag_stonith_enabled)) { - tmpnode = NULL; - if (rsc->is_remote_node) { -- tmpnode = pe_find_node(data_set->nodes, rsc->id); -+ tmpnode = pe_find_node(rsc->cluster->nodes, rsc->id); - } - if (tmpnode && - pe__is_remote_node(tmpnode) && -@@ -2135,7 +2135,7 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, - /* The remote connection resource failed in a way that - * should result in fencing the remote node. - */ -- pe_fence_node(data_set, tmpnode, -+ pe_fence_node(rsc->cluster, tmpnode, - "remote connection is unrecoverable", FALSE); - } - } -@@ -2158,7 +2158,7 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, - * result in a fencing operation regardless if we're going to attempt to - * reconnect to the remote-node in this transition or not. */ - if (pcmk_is_set(rsc->flags, pe_rsc_failed) && rsc->is_remote_node) { -- tmpnode = pe_find_node(data_set->nodes, rsc->id); -+ tmpnode = pe_find_node(rsc->cluster->nodes, rsc->id); - if (tmpnode && tmpnode->details->unclean) { - tmpnode->details->unseen = FALSE; - } -@@ -2177,7 +2177,8 @@ process_rsc_state(pe_resource_t * rsc, pe_node_t * node, - } - } - -- native_add_running(rsc, node, data_set, (save_on_fail != action_fail_ignore)); -+ native_add_running(rsc, node, rsc->cluster, -+ (save_on_fail != action_fail_ignore)); - switch (on_fail) { - case action_fail_ignore: - break; -@@ -2376,14 +2377,12 @@ unpack_lrm_resource(pe_node_t *node, xmlNode *lrm_resource, - int start_index = -1; - enum rsc_role_e req_role = RSC_ROLE_UNKNOWN; - -- const char *task = NULL; - const char *rsc_id = ID(lrm_resource); - - pe_resource_t *rsc = NULL; - GList *op_list = NULL; - GList *sorted_op_list = NULL; - -- xmlNode *migrate_op = NULL; - xmlNode *rsc_op = NULL; - xmlNode *last_failure = NULL; - -@@ -2437,11 +2436,6 @@ unpack_lrm_resource(pe_node_t *node, xmlNode *lrm_resource, - for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) { - xmlNode *rsc_op = (xmlNode *) gIter->data; - -- task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); -- if (pcmk__str_eq(task, CRMD_ACTION_MIGRATED, pcmk__str_casei)) { -- migrate_op = rsc_op; -- } -- - unpack_rsc_op(rsc, node, rsc_op, &last_failure, &on_fail, data_set); - } - -@@ -2452,7 +2446,7 @@ unpack_lrm_resource(pe_node_t *node, xmlNode *lrm_resource, - /* no need to free the contents */ - g_list_free(sorted_op_list); - -- process_rsc_state(rsc, node, on_fail, migrate_op, data_set); -+ process_rsc_state(rsc, node, on_fail); - - if (get_target_role(rsc, &req_role)) { - if (rsc->next_role == RSC_ROLE_UNKNOWN || req_role < rsc->next_role) { --- -2.31.1 - -From 6f4e34cccc4864961d2020a2dd547450ac53a44e Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 1 Feb 2023 16:30:20 -0600 -Subject: [PATCH 02/14] Log: scheduler: improve trace logs when unpacking - resource history - ---- - lib/pengine/unpack.c | 112 +++++++++++++++++++++++++++---------------- - 1 file changed, 71 insertions(+), 41 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 9524def..b7b2873 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3363,6 +3363,24 @@ check_recoverable(pe_resource_t *rsc, pe_node_t *node, const char *task, - pe__set_resource_flags(rsc, pe_rsc_block); - } - -+/*! -+ * \internal -+ * \brief Update an integer value and why -+ * -+ * \param[in,out] i Pointer to integer to update -+ * \param[in,out] why Where to store reason for update -+ * \param[in] value New value -+ * \param[in,out] reason Description of why value was changed -+ */ -+static inline void -+remap_because(int *i, const char **why, int value, const char *reason) -+{ -+ if (*i != value) { -+ *i = value; -+ *why = reason; -+ } -+} -+ - /*! - * \internal - * \brief Remap informational monitor results and operation status -@@ -3393,29 +3411,34 @@ check_recoverable(pe_resource_t *rsc, pe_node_t *node, const char *task, - static void - remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - pe_working_set_t *data_set, enum action_fail_response *on_fail, -- int target_rc, int *rc, int *status) { -+ int target_rc, int *rc, int *status) -+{ - bool is_probe = false; -+ int orig_exit_status = *rc; -+ int orig_exec_status = *status; -+ const char *why = NULL; - const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); - const char *key = get_op_key(xml_op); - const char *exit_reason = crm_element_value(xml_op, - XML_LRM_ATTR_EXIT_REASON); - - if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none)) { -- int remapped_rc = pcmk__effective_rc(*rc); -- -- if (*rc != remapped_rc) { -- crm_trace("Remapping monitor result %d to %d", *rc, remapped_rc); -+ // Remap degraded results to their usual counterparts -+ *rc = pcmk__effective_rc(*rc); -+ if (*rc != orig_exit_status) { -+ why = "degraded monitor result"; - if (!node->details->shutdown || node->details->online) { - record_failed_op(xml_op, node, rsc, data_set); - } -- -- *rc = remapped_rc; - } - } - - if (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op)) { -- *status = PCMK_EXEC_DONE; -- *rc = PCMK_OCF_NOT_RUNNING; -+ if ((*status != PCMK_EXEC_DONE) || (*rc != PCMK_OCF_NOT_RUNNING)) { -+ *status = PCMK_EXEC_DONE; -+ *rc = PCMK_OCF_NOT_RUNNING; -+ why = "irrelevant probe result"; -+ } - } - - /* If the executor reported an operation status of anything but done or -@@ -3423,22 +3446,19 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - * it should be treated as a failure or not, because we know the expected - * result. - */ -- if (*status != PCMK_EXEC_DONE && *status != PCMK_EXEC_ERROR) { -- return; -+ switch (*status) { -+ case PCMK_EXEC_DONE: -+ case PCMK_EXEC_ERROR: -+ break; -+ default: -+ goto remap_done; - } - -- CRM_ASSERT(rsc); -- CRM_CHECK(task != NULL, -- *status = PCMK_EXEC_ERROR; return); -- -- *status = PCMK_EXEC_DONE; -- - if (exit_reason == NULL) { - exit_reason = ""; - } - - is_probe = pcmk_xe_is_probe(xml_op); -- - if (is_probe) { - task = "probe"; - } -@@ -3452,12 +3472,15 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - * those versions or processing of saved CIB files from those versions, - * so we do not need to care much about this case. - */ -- *status = PCMK_EXEC_ERROR; -+ remap_because(status, &why, PCMK_EXEC_ERROR, "obsolete history format"); - crm_warn("Expected result not found for %s on %s (corrupt or obsolete CIB?)", - key, pe__node_name(node)); - -- } else if (target_rc != *rc) { -- *status = PCMK_EXEC_ERROR; -+ } else if (*rc == target_rc) { -+ remap_because(status, &why, PCMK_EXEC_DONE, "expected result"); -+ -+ } else { -+ remap_because(status, &why, PCMK_EXEC_ERROR, "unexpected result"); - pe_rsc_debug(rsc, "%s on %s: expected %d (%s), got %d (%s%s%s)", - key, pe__node_name(node), - target_rc, services_ocf_exitcode_str(target_rc), -@@ -3468,7 +3491,7 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - switch (*rc) { - case PCMK_OCF_OK: - if (is_probe && (target_rc == PCMK_OCF_NOT_RUNNING)) { -- *status = PCMK_EXEC_DONE; -+ remap_because(status, &why,PCMK_EXEC_DONE, "probe"); - pe_rsc_info(rsc, "Probe found %s active on %s at %s", - rsc->id, pe__node_name(node), - last_change_str(xml_op)); -@@ -3479,7 +3502,7 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - if (is_probe || (target_rc == *rc) - || !pcmk_is_set(rsc->flags, pe_rsc_managed)) { - -- *status = PCMK_EXEC_DONE; -+ remap_because(status, &why, PCMK_EXEC_DONE, "exit status"); - rsc->role = RSC_ROLE_STOPPED; - - /* clear any previous failure actions */ -@@ -3490,7 +3513,7 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - - case PCMK_OCF_RUNNING_PROMOTED: - if (is_probe && (*rc != target_rc)) { -- *status = PCMK_EXEC_DONE; -+ remap_because(status, &why, PCMK_EXEC_DONE, "probe"); - pe_rsc_info(rsc, - "Probe found %s active and promoted on %s at %s", - rsc->id, pe__node_name(node), -@@ -3502,11 +3525,11 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - case PCMK_OCF_DEGRADED_PROMOTED: - case PCMK_OCF_FAILED_PROMOTED: - rsc->role = RSC_ROLE_PROMOTED; -- *status = PCMK_EXEC_ERROR; -+ remap_because(status, &why, PCMK_EXEC_ERROR, "exit status"); - break; - - case PCMK_OCF_NOT_CONFIGURED: -- *status = PCMK_EXEC_ERROR_FATAL; -+ remap_because(status, &why, PCMK_EXEC_ERROR_FATAL, "exit status"); - break; - - case PCMK_OCF_UNIMPLEMENT_FEATURE: -@@ -3517,9 +3540,11 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - - if (interval_ms == 0) { - check_recoverable(rsc, node, task, *rc, xml_op); -- *status = PCMK_EXEC_ERROR_HARD; -+ remap_because(status, &why, PCMK_EXEC_ERROR_HARD, -+ "exit status"); - } else { -- *status = PCMK_EXEC_NOT_SUPPORTED; -+ remap_because(status, &why, PCMK_EXEC_NOT_SUPPORTED, -+ "exit status"); - } - } - break; -@@ -3528,7 +3553,7 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - case PCMK_OCF_INVALID_PARAM: - case PCMK_OCF_INSUFFICIENT_PRIV: - check_recoverable(rsc, node, task, *rc, xml_op); -- *status = PCMK_EXEC_ERROR_HARD; -+ remap_because(status, &why, PCMK_EXEC_ERROR_HARD, "exit status"); - break; - - default: -@@ -3537,13 +3562,21 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - "on %s at %s as failure", - *rc, task, rsc->id, pe__node_name(node), - last_change_str(xml_op)); -- *status = PCMK_EXEC_ERROR; -+ remap_because(status, &why, PCMK_EXEC_ERROR, -+ "unknown exit status"); - } - break; - } - -- pe_rsc_trace(rsc, "Remapped %s status to '%s'", -- key, pcmk_exec_status_str(*status)); -+remap_done: -+ if (why != NULL) { -+ pe_rsc_trace(rsc, -+ "Remapped %s result from [%s: %s] to [%s: %s] " -+ "because of %s", -+ key, pcmk_exec_status_str(orig_exec_status), -+ crm_exit_str(orig_exit_status), -+ pcmk_exec_status_str(*status), crm_exit_str(*rc), why); -+ } - } - - // return TRUE if start or monitor last failure but parameters changed -@@ -3947,9 +3980,9 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - parent = uber_parent(rsc); - } - -- pe_rsc_trace(rsc, "Unpacking task %s/%s (call_id=%d, status=%d, rc=%d) on %s (role=%s)", -- task_key, task, task_id, status, rc, pe__node_name(node), -- role2text(rsc->role)); -+ pe_rsc_trace(rsc, "Unpacking %s (%s call %d on %s): %s (%s)", -+ ID(xml_op), task, task_id, pe__node_name(node), -+ pcmk_exec_status_str(status), crm_exit_str(rc)); - - if (node->details->unclean) { - pe_rsc_trace(rsc, -@@ -4077,9 +4110,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - goto done; - - case PCMK_EXEC_DONE: -- pe_rsc_trace(rsc, "%s of %s on %s completed at %s " CRM_XS " id=%s", -- task, rsc->id, pe__node_name(node), -- last_change_str(xml_op), ID(xml_op)); - update_resource_state(rsc, node, xml_op, task, rc, *last_failure, on_fail, data_set); - goto done; - -@@ -4175,9 +4205,9 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - } - - done: -- pe_rsc_trace(rsc, "Resource %s after %s: role=%s, next=%s", -- rsc->id, task, role2text(rsc->role), -- role2text(rsc->next_role)); -+ pe_rsc_trace(rsc, "%s role on %s after %s is %s (next %s)", -+ rsc->id, pe__node_name(node), ID(xml_op), -+ role2text(rsc->role), role2text(rsc->next_role)); - } - - static void --- -2.31.1 - -From 5a1d2a3ba58fa73225433dab40cee0a6e0ef9bda Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 1 Feb 2023 12:08:55 -0600 -Subject: [PATCH 03/14] Low: scheduler: improve migration history validation - -Instead of a simple CRM_CHECK(), functionize parsing the source and target node -names from a migration action's resource history entry. This reduces -duplication and allows us to log more helpful errors. - -Also, CRM_CHECK() tries to dump core for debugging, and that's not helpful for -corrupted CIB entries. ---- - lib/pengine/unpack.c | 87 ++++++++++++++++++++++++++++++++++++++------ - 1 file changed, 75 insertions(+), 12 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index b7b2873..cd1b038 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -2786,6 +2786,60 @@ newer_state_after_migrate(const char *rsc_id, const char *node_name, - || monitor_not_running_after(rsc_id, node_name, xml_op, same_node, - data_set); - } -+ -+/*! -+ * \internal -+ * \brief Parse migration source and target node names from history entry -+ * -+ * \param[in] entry Resource history entry for a migration action -+ * \param[in] source_node If not NULL, source must match this node -+ * \param[in] target_node If not NULL, target must match this node -+ * \param[out] source_name Where to store migration source node name -+ * \param[out] target_name Where to store migration target node name -+ * -+ * \return Standard Pacemaker return code -+ */ -+static int -+get_migration_node_names(const xmlNode *entry, const pe_node_t *source_node, -+ const pe_node_t *target_node, -+ const char **source_name, const char **target_name) -+{ -+ const char *id = ID(entry); -+ -+ if (id == NULL) { -+ crm_err("Ignoring resource history entry without ID"); -+ return pcmk_rc_unpack_error; -+ } -+ -+ *source_name = crm_element_value(entry, XML_LRM_ATTR_MIGRATE_SOURCE); -+ *target_name = crm_element_value(entry, XML_LRM_ATTR_MIGRATE_TARGET); -+ if ((*source_name == NULL) || (*target_name == NULL)) { -+ crm_err("Ignoring resource history entry %s without " -+ XML_LRM_ATTR_MIGRATE_SOURCE " and " XML_LRM_ATTR_MIGRATE_TARGET, -+ id); -+ return pcmk_rc_unpack_error; -+ } -+ -+ if ((source_node != NULL) -+ && !pcmk__str_eq(*source_name, source_node->details->uname, -+ pcmk__str_casei|pcmk__str_null_matches)) { -+ crm_err("Ignoring resource history entry %s because " -+ XML_LRM_ATTR_MIGRATE_SOURCE "='%s' does not match %s", -+ id, pcmk__s(*source_name, ""), pe__node_name(source_node)); -+ return pcmk_rc_unpack_error; -+ } -+ -+ if ((target_node != NULL) -+ && !pcmk__str_eq(*target_name, target_node->details->uname, -+ pcmk__str_casei|pcmk__str_null_matches)) { -+ crm_err("Ignoring resource history entry %s because " -+ XML_LRM_ATTR_MIGRATE_TARGET "='%s' does not match %s", -+ id, pcmk__s(*target_name, ""), pe__node_name(target_node)); -+ return pcmk_rc_unpack_error; -+ } -+ -+ return pcmk_rc_ok; -+} - - static void - unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, -@@ -2834,13 +2888,16 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - pe_node_t *target_node = NULL; - pe_node_t *source_node = NULL; - xmlNode *migrate_from = NULL; -- const char *source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); -- const char *target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); -+ const char *source = NULL; -+ const char *target = NULL; - bool source_newer_op = false; - bool target_newer_state = false; - -- // Sanity check -- CRM_CHECK(source && target && !strcmp(source, node->details->uname), return); -+ // Get source and target node names from XML -+ if (get_migration_node_names(xml_op, node, NULL, &source, -+ &target) != pcmk_rc_ok) { -+ return; -+ } - - /* If there's any newer non-monitor operation on the source, this migrate_to - * potentially no longer matters for the source. -@@ -2949,11 +3006,14 @@ unpack_migrate_to_failure(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - pe_working_set_t *data_set) - { - xmlNode *target_migrate_from = NULL; -- const char *source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); -- const char *target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); -+ const char *source = NULL; -+ const char *target = NULL; - -- // Sanity check -- CRM_CHECK(source && target && !strcmp(source, node->details->uname), return); -+ // Get source and target node names from XML -+ if (get_migration_node_names(xml_op, node, NULL, &source, -+ &target) != pcmk_rc_ok) { -+ return; -+ } - - /* If a migration failed, we have to assume the resource is active. Clones - * are not allowed to migrate, so role can't be promoted. -@@ -3001,11 +3061,14 @@ unpack_migrate_from_failure(pe_resource_t *rsc, pe_node_t *node, - xmlNode *xml_op, pe_working_set_t *data_set) - { - xmlNode *source_migrate_to = NULL; -- const char *source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); -- const char *target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); -+ const char *source = NULL; -+ const char *target = NULL; - -- // Sanity check -- CRM_CHECK(source && target && !strcmp(target, node->details->uname), return); -+ // Get source and target node names from XML -+ if (get_migration_node_names(xml_op, NULL, node, &source, -+ &target) != pcmk_rc_ok) { -+ return; -+ } - - /* If a migration failed, we have to assume the resource is active. Clones - * are not allowed to migrate, so role can't be promoted. --- -2.31.1 - -From 5139e5369769e733b05bc28940d3dccb4f7fca95 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 31 Jan 2023 14:30:16 -0600 -Subject: [PATCH 04/14] Refactor: scheduler: functionize adding a dangling - migration - -... for code isolation and readability ---- - lib/pengine/unpack.c | 31 +++++++++++++++++++++++-------- - 1 file changed, 23 insertions(+), 8 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index cd1b038..fa7c2cc 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -2841,6 +2841,28 @@ get_migration_node_names(const xmlNode *entry, const pe_node_t *source_node, - return pcmk_rc_ok; - } - -+/* -+ * \internal -+ * \brief Add a migration source to a resource's list of dangling migrations -+ * -+ * If the migrate_to and migrate_from actions in a live migration both -+ * succeeded, but there is no stop on the source, the migration is considered -+ * "dangling." Add the source to the resource's dangling migration list, which -+ * will be used to schedule a stop on the source without affecting the target. -+ * -+ * \param[in,out] rsc Resource involved in migration -+ * \param[in] node Migration source -+ */ -+static void -+add_dangling_migration(pe_resource_t *rsc, const pe_node_t *node) -+{ -+ pe_rsc_trace(rsc, "Dangling migration of %s requires stop on %s", -+ rsc->id, pe__node_name(node)); -+ rsc->role = RSC_ROLE_STOPPED; -+ rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations, -+ (gpointer) node); -+} -+ - static void - unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - pe_working_set_t *data_set) -@@ -2941,14 +2963,7 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - - if (migrate_from && from_rc == PCMK_OCF_OK - && (from_status == PCMK_EXEC_DONE)) { -- /* The migrate_to and migrate_from both succeeded, so mark the migration -- * as "dangling". This will be used to schedule a stop action on the -- * source without affecting the target. -- */ -- pe_rsc_trace(rsc, "Detected dangling migration op: %s on %s", ID(xml_op), -- source); -- rsc->role = RSC_ROLE_STOPPED; -- rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations, node); -+ add_dangling_migration(rsc, node); - - } else if (migrate_from && (from_status != PCMK_EXEC_PENDING)) { // Failed - /* If the resource has newer state on the target, this migrate_to no --- -2.31.1 - -From da71c04463d31338dd5da54d1d48b53e413716dc Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 31 Jan 2023 16:57:55 -0600 -Subject: [PATCH 05/14] Refactor: scheduler: check for dangling migration - before setting role - -Previously, unpack_migrate_to_success() set rsc->role = RSC_ROLE_STARTED -then checked for dangling migration, which would reset it to RSC_ROLE_STOPPED. - -For clarity, do the dangling migration check first. ---- - lib/pengine/unpack.c | 47 ++++++++++++++++++++++++-------------------- - 1 file changed, 26 insertions(+), 21 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index fa7c2cc..b858b59 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -2905,8 +2905,8 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - * migration is considered to be "dangling". Schedule a stop on the source - * in this case. - */ -- int from_rc = 0; -- int from_status = 0; -+ int from_rc = PCMK_OCF_OK; -+ int from_status = PCMK_EXEC_PENDING; - pe_node_t *target_node = NULL; - pe_node_t *source_node = NULL; - xmlNode *migrate_from = NULL; -@@ -2930,12 +2930,17 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - // Check whether there was a migrate_from action on the target - migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, target, - source, -1, data_set); -- -- /* Even if there's a newer non-monitor operation on the source, we still -- * need to check how this migrate_to might matter for the target. -- */ -- if (source_newer_op && migrate_from) { -- return; -+ if (migrate_from != NULL) { -+ if (source_newer_op) { -+ /* There's a newer non-monitor operation on the source and a -+ * migrate_from on the target, so this migrate_to is irrelevant to -+ * the resource's state. -+ */ -+ return; -+ } -+ crm_element_value_int(migrate_from, XML_LRM_ATTR_RC, &from_rc); -+ crm_element_value_int(migrate_from, XML_LRM_ATTR_OPSTATUS, -+ &from_status); - } - - /* If the resource has newer state on the target after the migration -@@ -2948,24 +2953,24 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - return; - } - -- // Clones are not allowed to migrate, so role can't be promoted -+ /* Check for dangling migration (migrate_from succeeded but stop not done). -+ * We know there's no stop because we already returned if the target has a -+ * migrate_from and the source has any newer non-monitor operation. -+ */ -+ if ((from_rc == PCMK_OCF_OK) && (from_status == PCMK_EXEC_DONE)) { -+ add_dangling_migration(rsc, node); -+ return; -+ } -+ -+ /* Without newer state, this migrate_to implies the resource is active. -+ * (Clones are not allowed to migrate, so role can't be promoted.) -+ */ - rsc->role = RSC_ROLE_STARTED; - - target_node = pe_find_node(data_set->nodes, target); - source_node = pe_find_node(data_set->nodes, source); - -- if (migrate_from) { -- crm_element_value_int(migrate_from, XML_LRM_ATTR_RC, &from_rc); -- crm_element_value_int(migrate_from, XML_LRM_ATTR_OPSTATUS, &from_status); -- pe_rsc_trace(rsc, "%s op on %s exited with status=%d, rc=%d", -- ID(migrate_from), target, from_status, from_rc); -- } -- -- if (migrate_from && from_rc == PCMK_OCF_OK -- && (from_status == PCMK_EXEC_DONE)) { -- add_dangling_migration(rsc, node); -- -- } else if (migrate_from && (from_status != PCMK_EXEC_PENDING)) { // Failed -+ if (from_status != PCMK_EXEC_PENDING) { // migrate_from failed on target - /* If the resource has newer state on the target, this migrate_to no - * longer matters for the target. - */ --- -2.31.1 - -From d98a2687d68747b0598554939dea05c420456a12 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 31 Jan 2023 17:05:50 -0600 -Subject: [PATCH 06/14] Refactor: scheduler: avoid duplication of - active-on-target check - ---- - lib/pengine/unpack.c | 24 ++++++------------------ - 1 file changed, 6 insertions(+), 18 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index b858b59..8cfc0ef 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -2914,6 +2914,7 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - const char *target = NULL; - bool source_newer_op = false; - bool target_newer_state = false; -+ bool active_on_target = false; - - // Get source and target node names from XML - if (get_migration_node_names(xml_op, node, NULL, &source, -@@ -2969,23 +2970,14 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - - target_node = pe_find_node(data_set->nodes, target); - source_node = pe_find_node(data_set->nodes, source); -+ active_on_target = !target_newer_state && (target_node != NULL) -+ && target_node->details->online; - - if (from_status != PCMK_EXEC_PENDING) { // migrate_from failed on target -- /* If the resource has newer state on the target, this migrate_to no -- * longer matters for the target. -- */ -- if (!target_newer_state -- && target_node && target_node->details->online) { -- pe_rsc_trace(rsc, "Marking active on %s %p %d", target, target_node, -- target_node->details->online); -+ if (active_on_target) { - native_add_running(rsc, target_node, data_set, TRUE); -- - } else { -- /* With the earlier bail logic, migrate_from != NULL here implies -- * source_newer_op is false, meaning this migrate_to still matters -- * for the source. -- * Consider it failed here - forces a restart, prevents migration -- */ -+ // Mark resource as failed, require recovery, and prevent migration - pe__set_resource_flags(rsc, pe_rsc_failed|pe_rsc_stop); - pe__clear_resource_flags(rsc, pe_rsc_allow_migrate); - } -@@ -2994,11 +2986,7 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - /* If the resource has newer state on the target, this migrate_to no - * longer matters for the target. - */ -- if (!target_newer_state -- && target_node && target_node->details->online) { -- pe_rsc_trace(rsc, "Marking active on %s %p %d", target, target_node, -- target_node->details->online); -- -+ if (active_on_target) { - native_add_running(rsc, target_node, data_set, FALSE); - if (source_node && source_node->details->online) { - /* This is a partial migration: the migrate_to completed --- -2.31.1 - -From ae145309e3fdb26608e99f6d1fe1a7859d98efd0 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 31 Jan 2023 17:07:58 -0600 -Subject: [PATCH 07/14] Refactor: scheduler: improve unpacking of successful - migrate_to - -Improve log messages, comments, and formatting, and avoid doing things until -needed, to improve efficiency of early returns. ---- - lib/pengine/unpack.c | 109 +++++++++++++++++++------------------------ - 1 file changed, 48 insertions(+), 61 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 8cfc0ef..224b7b5 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -2867,48 +2867,40 @@ static void - unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - pe_working_set_t *data_set) - { -- /* A successful migration sequence is: -- * migrate_to on source node -- * migrate_from on target node -- * stop on source node -+ /* A complete migration sequence is: -+ * 1. migrate_to on source node (which succeeded if we get to this function) -+ * 2. migrate_from on target node -+ * 3. stop on source node - * -- * But there could be scenarios like (It's easier to produce with cluster -- * property batch-limit=1): -- * -- * - rscA is live-migrating from node1 to node2. -- * -- * - Before migrate_to on node1 returns, put node2 into standby. -- * -- * - Transition aborts upon return of successful migrate_to on node1. New -- * transition is going to stop the rscA on both nodes and start it on -- * node1. -+ * If no migrate_from has happened, the migration is considered to be -+ * "partial". If the migrate_from succeeded but no stop has happened, the -+ * migration is considered to be "dangling". - * -- * - While it is stopping on node1, run something that is going to make -- * the transition abort again like: -- * crm_resource --resource rscA --ban --node node2 -+ * If a successful migrate_to and stop have happened on the source node, we -+ * still need to check for a partial migration, due to scenarios (easier to -+ * produce with batch-limit=1) like: - * -- * - Transition aborts upon return of stop on node1. -+ * - A resource is migrating from node1 to node2, and a migrate_to is -+ * initiated for it on node1. - * -- * Now although there's a stop on node1, it's still a partial migration and -- * rscA is still potentially active on node2. -+ * - node2 goes into standby mode while the migrate_to is pending, which -+ * aborts the transition. - * -- * So even if a migrate_to is followed by a stop, we still need to check -- * whether there's a corresponding migrate_from or any newer operation on -- * the target. -+ * - Upon completion of the migrate_to, a new transition schedules a stop -+ * on both nodes and a start on node1. - * -- * If no migrate_from has happened, the migration is considered to be -- * "partial". If the migrate_from failed, make sure the resource gets -- * stopped on both source and target (if up). -+ * - If the new transition is aborted for any reason while the resource is -+ * stopping on node1, the transition after that stop completes will see -+ * the migrate_from and stop on the source, but it's still a partial -+ * migration, and the resource must be stopped on node2 because it is -+ * potentially active there due to the migrate_to. - * -- * If the migrate_to and migrate_from both succeeded (which also implies the -- * resource is no longer running on the source), but there is no stop, the -- * migration is considered to be "dangling". Schedule a stop on the source -- * in this case. -+ * We also need to take into account that either node's history may be -+ * cleared at any point in the migration process. - */ - int from_rc = PCMK_OCF_OK; - int from_status = PCMK_EXEC_PENDING; - pe_node_t *target_node = NULL; -- pe_node_t *source_node = NULL; - xmlNode *migrate_from = NULL; - const char *source = NULL; - const char *target = NULL; -@@ -2922,13 +2914,11 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - return; - } - -- /* If there's any newer non-monitor operation on the source, this migrate_to -- * potentially no longer matters for the source. -- */ -+ // Check for newer state on the source - source_newer_op = non_monitor_after(rsc->id, source, xml_op, true, - data_set); - -- // Check whether there was a migrate_from action on the target -+ // Check for a migrate_from action from this source on the target - migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, target, - source, -1, data_set); - if (migrate_from != NULL) { -@@ -2944,12 +2934,11 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - &from_status); - } - -- /* If the resource has newer state on the target after the migration -- * events, this migrate_to no longer matters for the target. -+ /* If the resource has newer state on both the source and target after the -+ * migration events, this migrate_to is irrelevant to the resource's state. - */ - target_newer_state = newer_state_after_migrate(rsc->id, target, xml_op, - migrate_from, data_set); -- - if (source_newer_op && target_newer_state) { - return; - } -@@ -2969,7 +2958,6 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - rsc->role = RSC_ROLE_STARTED; - - target_node = pe_find_node(data_set->nodes, target); -- source_node = pe_find_node(data_set->nodes, source); - active_on_target = !target_newer_state && (target_node != NULL) - && target_node->details->online; - -@@ -2981,31 +2969,30 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - pe__set_resource_flags(rsc, pe_rsc_failed|pe_rsc_stop); - pe__clear_resource_flags(rsc, pe_rsc_allow_migrate); - } -+ return; -+ } - -- } else { // Pending, or complete but erased -- /* If the resource has newer state on the target, this migrate_to no -- * longer matters for the target. -- */ -- if (active_on_target) { -- native_add_running(rsc, target_node, data_set, FALSE); -- if (source_node && source_node->details->online) { -- /* This is a partial migration: the migrate_to completed -- * successfully on the source, but the migrate_from has not -- * completed. Remember the source and target; if the newly -- * chosen target remains the same when we schedule actions -- * later, we may continue with the migration. -- */ -- rsc->partial_migration_target = target_node; -- rsc->partial_migration_source = source_node; -- } -- } else if (!source_newer_op) { -- /* This migrate_to matters for the source only if it's the last -- * non-monitor operation here. -- * Consider it failed here - forces a restart, prevents migration -+ // The migrate_from is pending, complete but erased, or to be scheduled -+ -+ if (active_on_target) { -+ pe_node_t *source_node = pe_find_node(data_set->nodes, source); -+ -+ native_add_running(rsc, target_node, data_set, FALSE); -+ if ((source_node != NULL) && source_node->details->online) { -+ /* This is a partial migration: the migrate_to completed -+ * successfully on the source, but the migrate_from has not -+ * completed. Remember the source and target; if the newly -+ * chosen target remains the same when we schedule actions -+ * later, we may continue with the migration. - */ -- pe__set_resource_flags(rsc, pe_rsc_failed|pe_rsc_stop); -- pe__clear_resource_flags(rsc, pe_rsc_allow_migrate); -+ rsc->partial_migration_target = target_node; -+ rsc->partial_migration_source = source_node; - } -+ -+ } else if (!source_newer_op) { -+ // Mark resource as failed, require recovery, and prevent migration -+ pe__set_resource_flags(rsc, pe_rsc_failed|pe_rsc_stop); -+ pe__clear_resource_flags(rsc, pe_rsc_allow_migrate); - } - } - --- -2.31.1 - -From 7d63ed8d52f64d2523367cff36bf77bd85296bd9 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 31 Jan 2023 17:14:57 -0600 -Subject: [PATCH 08/14] Refactor: scheduler: drop redundant argument from - unpack_migrate_to_success() - ---- - lib/pengine/unpack.c | 19 +++++++++---------- - 1 file changed, 9 insertions(+), 10 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 224b7b5..6222115 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -2864,8 +2864,7 @@ add_dangling_migration(pe_resource_t *rsc, const pe_node_t *node) - } - - static void --unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, -- pe_working_set_t *data_set) -+unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op) - { - /* A complete migration sequence is: - * 1. migrate_to on source node (which succeeded if we get to this function) -@@ -2916,11 +2915,11 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - - // Check for newer state on the source - source_newer_op = non_monitor_after(rsc->id, source, xml_op, true, -- data_set); -+ rsc->cluster); - - // Check for a migrate_from action from this source on the target - migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, target, -- source, -1, data_set); -+ source, -1, rsc->cluster); - if (migrate_from != NULL) { - if (source_newer_op) { - /* There's a newer non-monitor operation on the source and a -@@ -2938,7 +2937,7 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - * migration events, this migrate_to is irrelevant to the resource's state. - */ - target_newer_state = newer_state_after_migrate(rsc->id, target, xml_op, -- migrate_from, data_set); -+ migrate_from, rsc->cluster); - if (source_newer_op && target_newer_state) { - return; - } -@@ -2957,13 +2956,13 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - */ - rsc->role = RSC_ROLE_STARTED; - -- target_node = pe_find_node(data_set->nodes, target); -+ target_node = pe_find_node(rsc->cluster->nodes, target); - active_on_target = !target_newer_state && (target_node != NULL) - && target_node->details->online; - - if (from_status != PCMK_EXEC_PENDING) { // migrate_from failed on target - if (active_on_target) { -- native_add_running(rsc, target_node, data_set, TRUE); -+ native_add_running(rsc, target_node, rsc->cluster, TRUE); - } else { - // Mark resource as failed, require recovery, and prevent migration - pe__set_resource_flags(rsc, pe_rsc_failed|pe_rsc_stop); -@@ -2975,9 +2974,9 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - // The migrate_from is pending, complete but erased, or to be scheduled - - if (active_on_target) { -- pe_node_t *source_node = pe_find_node(data_set->nodes, source); -+ pe_node_t *source_node = pe_find_node(rsc->cluster->nodes, source); - -- native_add_running(rsc, target_node, data_set, FALSE); -+ native_add_running(rsc, target_node, rsc->cluster, FALSE); - if ((source_node != NULL) && source_node->details->online) { - /* This is a partial migration: the migrate_to completed - * successfully on the source, but the migrate_from has not -@@ -3946,7 +3945,7 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c - clear_past_failure = TRUE; - - } else if (pcmk__str_eq(task, CRMD_ACTION_MIGRATE, pcmk__str_casei)) { -- unpack_migrate_to_success(rsc, node, xml_op, data_set); -+ unpack_migrate_to_success(rsc, node, xml_op); - - } else if (rsc->role < RSC_ROLE_STARTED) { - pe_rsc_trace(rsc, "%s active on %s", rsc->id, pe__node_name(node)); --- -2.31.1 - -From 3be487f87bf5e26277379148922525fd98d29681 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 2 Feb 2023 09:13:30 -0600 -Subject: [PATCH 09/14] Doc: scheduler: clarify comments about unpacking - migration history - -per review ---- - lib/pengine/unpack.c | 20 ++++++++++---------- - 1 file changed, 10 insertions(+), 10 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 6222115..ec2cf26 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -2791,9 +2791,9 @@ newer_state_after_migrate(const char *rsc_id, const char *node_name, - * \internal - * \brief Parse migration source and target node names from history entry - * -- * \param[in] entry Resource history entry for a migration action -- * \param[in] source_node If not NULL, source must match this node -- * \param[in] target_node If not NULL, target must match this node -+ * \param[in] entry Resource history entry for a migration action -+ * \param[in] source_node If not NULL, source must match this node -+ * \param[in] target_node If not NULL, target must match this node - * \param[out] source_name Where to store migration source node name - * \param[out] target_name Where to store migration target node name - * -@@ -2825,7 +2825,7 @@ get_migration_node_names(const xmlNode *entry, const pe_node_t *source_node, - pcmk__str_casei|pcmk__str_null_matches)) { - crm_err("Ignoring resource history entry %s because " - XML_LRM_ATTR_MIGRATE_SOURCE "='%s' does not match %s", -- id, pcmk__s(*source_name, ""), pe__node_name(source_node)); -+ id, *source_name, pe__node_name(source_node)); - return pcmk_rc_unpack_error; - } - -@@ -2834,7 +2834,7 @@ get_migration_node_names(const xmlNode *entry, const pe_node_t *source_node, - pcmk__str_casei|pcmk__str_null_matches)) { - crm_err("Ignoring resource history entry %s because " - XML_LRM_ATTR_MIGRATE_TARGET "='%s' does not match %s", -- id, pcmk__s(*target_name, ""), pe__node_name(target_node)); -+ id, *target_name, pe__node_name(target_node)); - return pcmk_rc_unpack_error; - } - -@@ -2890,7 +2890,7 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op) - * - * - If the new transition is aborted for any reason while the resource is - * stopping on node1, the transition after that stop completes will see -- * the migrate_from and stop on the source, but it's still a partial -+ * the migrate_to and stop on the source, but it's still a partial - * migration, and the resource must be stopped on node2 because it is - * potentially active there due to the migrate_to. - * -@@ -3425,9 +3425,9 @@ check_recoverable(pe_resource_t *rsc, pe_node_t *node, const char *task, - * \brief Update an integer value and why - * - * \param[in,out] i Pointer to integer to update -- * \param[in,out] why Where to store reason for update -+ * \param[out] why Where to store reason for update - * \param[in] value New value -- * \param[in,out] reason Description of why value was changed -+ * \param[in] reason Description of why value was changed - */ - static inline void - remap_because(int *i, const char **why, int value, const char *reason) -@@ -3456,7 +3456,7 @@ remap_because(int *i, const char **why, int value, const char *reason) - * \param[in] data_set Current cluster working set - * \param[in,out] on_fail What should be done about the result - * \param[in] target_rc Expected return code of operation -- * \param[in,out] rc Actual return code of operation -+ * \param[in,out] rc Actual return code of operation (treated as OCF) - * \param[in,out] status Operation execution status - * - * \note If the result is remapped and the node is not shutting down or failed, -@@ -3548,7 +3548,7 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - switch (*rc) { - case PCMK_OCF_OK: - if (is_probe && (target_rc == PCMK_OCF_NOT_RUNNING)) { -- remap_because(status, &why,PCMK_EXEC_DONE, "probe"); -+ remap_because(status, &why, PCMK_EXEC_DONE, "probe"); - pe_rsc_info(rsc, "Probe found %s active on %s at %s", - rsc->id, pe__node_name(node), - last_change_str(xml_op)); --- -2.31.1 - -From 3ef6c84a7b0dd434731e72d91f2724bdb52e292e Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 2 Feb 2023 09:42:01 -0600 -Subject: [PATCH 10/14] Refactor: scheduler: improve xpath efficiency when - unpacking - -Using "//" means that every child must be searched recursively. If we know the -exact path, we should explicitly specify it. ---- - lib/pengine/unpack.c | 20 ++++++++++++-------- - 1 file changed, 12 insertions(+), 8 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index ec2cf26..8aead58 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -2571,6 +2571,13 @@ set_node_score(gpointer key, gpointer value, gpointer user_data) - node->weight = *score; - } - -+#define XPATH_NODE_STATE "/" XML_TAG_CIB "/" XML_CIB_TAG_STATUS \ -+ "/" XML_CIB_TAG_STATE -+#define SUB_XPATH_LRM_RESOURCE "/" XML_CIB_TAG_LRM \ -+ "/" XML_LRM_TAG_RESOURCES \ -+ "/" XML_LRM_TAG_RESOURCE -+#define SUB_XPATH_LRM_RSC_OP "/" XML_LRM_TAG_RSC_OP -+ - static xmlNode * - find_lrm_op(const char *resource, const char *op, const char *node, const char *source, - int target_rc, pe_working_set_t *data_set) -@@ -2583,10 +2590,9 @@ find_lrm_op(const char *resource, const char *op, const char *node, const char * - - xpath = g_string_sized_new(256); - pcmk__g_strcat(xpath, -- "//" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='", node, "']" -- "//" XML_LRM_TAG_RESOURCE -- "[@" XML_ATTR_ID "='", resource, "']" -- "/" XML_LRM_TAG_RSC_OP "[@" XML_LRM_ATTR_TASK "='", op, "'", -+ XPATH_NODE_STATE "[@" XML_ATTR_UNAME "='", node, "']" -+ SUB_XPATH_LRM_RESOURCE "[@" XML_ATTR_ID "='", resource, "']" -+ SUB_XPATH_LRM_RSC_OP "[@" XML_LRM_ATTR_TASK "='", op, "'", - NULL); - - /* Need to check against transition_magic too? */ -@@ -2631,10 +2637,8 @@ find_lrm_resource(const char *rsc_id, const char *node_name, - - xpath = g_string_sized_new(256); - pcmk__g_strcat(xpath, -- "//" XML_CIB_TAG_STATE -- "[@" XML_ATTR_UNAME "='", node_name, "']" -- "//" XML_LRM_TAG_RESOURCE -- "[@" XML_ATTR_ID "='", rsc_id, "']", -+ XPATH_NODE_STATE "[@" XML_ATTR_UNAME "='", node_name, "']" -+ SUB_XPATH_LRM_RESOURCE "[@" XML_ATTR_ID "='", rsc_id, "']", - NULL); - - xml = get_xpath_object((const char *) xpath->str, data_set->input, --- -2.31.1 - -From 1869f99bc8eeedb976f96f0f1cc3d4dd86735504 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 2 Feb 2023 10:25:53 -0600 -Subject: [PATCH 11/14] Low: scheduler: unknown_on_node() should ignore pending - actions - -Previously, unknown_on_node() looked for any lrm_rsc_op at all to decide -whether a resource is known on a node. However if the only action is pending, -the resource is not yet known. - -Also drop a redundant argument and add a doxygen block. (The rsc argument is -not const due to a getDocPtr() call in the chain, as well as libxml2 calls that -are likely const in practice but aren't marked as such.) ---- - lib/pengine/unpack.c | 37 +++++++++++++++++++++++++------------ - 1 file changed, 25 insertions(+), 12 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 8aead58..14dc202 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -2648,19 +2648,32 @@ find_lrm_resource(const char *rsc_id, const char *node_name, - return xml; - } - -+/*! -+ * \internal -+ * \brief Check whether a resource has no completed action history on a node -+ * -+ * \param[in,out] rsc Resource to check -+ * \param[in] node_name Node to check -+ * -+ * \return true if \p rsc_id is unknown on \p node_name, otherwise false -+ */ - static bool --unknown_on_node(const char *rsc_id, const char *node_name, -- pe_working_set_t *data_set) -+unknown_on_node(pe_resource_t *rsc, const char *node_name) - { -- xmlNode *lrm_resource = NULL; -- -- lrm_resource = find_lrm_resource(rsc_id, node_name, data_set); -+ bool result = false; -+ xmlXPathObjectPtr search; -+ GString *xpath = g_string_sized_new(256); - -- /* If the resource has no lrm_rsc_op history on the node, that means its -- * state is unknown there. -- */ -- return (lrm_resource == NULL -- || first_named_child(lrm_resource, XML_LRM_TAG_RSC_OP) == NULL); -+ pcmk__g_strcat(xpath, -+ XPATH_NODE_STATE "[@" XML_ATTR_UNAME "='", node_name, "']" -+ SUB_XPATH_LRM_RESOURCE "[@" XML_ATTR_ID "='", rsc->id, "']" -+ SUB_XPATH_LRM_RSC_OP "[@" XML_LRM_ATTR_RC "!='193']", -+ NULL); -+ search = xpath_search(rsc->cluster->input, (const char *) xpath->str); -+ result = (numXpathResults(search) == 0); -+ freeXpathObject(search); -+ g_string_free(xpath, TRUE); -+ return result; - } - - /*! -@@ -3027,7 +3040,7 @@ unpack_migrate_to_failure(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - * Don't just consider it running there. We will get back here anyway in - * case the probe detects it's running there. - */ -- !unknown_on_node(rsc->id, target, data_set) -+ !unknown_on_node(rsc, target) - /* If the resource has newer state on the target after the migration - * events, this migrate_to no longer matters for the target. - */ -@@ -3082,7 +3095,7 @@ unpack_migrate_from_failure(pe_resource_t *rsc, pe_node_t *node, - * Don't just consider it running there. We will get back here anyway in - * case the probe detects it's running there. - */ -- !unknown_on_node(rsc->id, source, data_set) -+ !unknown_on_node(rsc, source) - /* If the resource has newer state on the source after the migration - * events, this migrate_from no longer matters for the source. - */ --- -2.31.1 - -From 22fbab8e0d449d2accb231dfcec94294ded27f4e Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 31 Jan 2023 12:11:19 -0600 -Subject: [PATCH 12/14] Test: scheduler: add regression test for migration - intermediary - -As of this commit, the cluster wrongly restarts the migrated resource ---- - cts/cts-scheduler.in | 3 + - .../dot/migration-intermediary-cleaned.dot | 46 ++ - .../exp/migration-intermediary-cleaned.exp | 316 +++++++++++ - .../migration-intermediary-cleaned.scores | 201 +++++++ - .../migration-intermediary-cleaned.summary | 94 ++++ - .../xml/migration-intermediary-cleaned.xml | 513 ++++++++++++++++++ - 6 files changed, 1173 insertions(+) - create mode 100644 cts/scheduler/dot/migration-intermediary-cleaned.dot - create mode 100644 cts/scheduler/exp/migration-intermediary-cleaned.exp - create mode 100644 cts/scheduler/scores/migration-intermediary-cleaned.scores - create mode 100644 cts/scheduler/summary/migration-intermediary-cleaned.summary - create mode 100644 cts/scheduler/xml/migration-intermediary-cleaned.xml - -diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in -index feb5dc8..9899c36 100644 ---- a/cts/cts-scheduler.in -+++ b/cts/cts-scheduler.in -@@ -387,6 +387,9 @@ TESTS = [ - [ "probe-target-of-failed-migrate_to-1", "Failed migrate_to, target rejoins" ], - [ "probe-target-of-failed-migrate_to-2", "Failed migrate_to, target rejoined and probed" ], - [ "partial-live-migration-multiple-active", "Prevent running on multiple nodes due to partial live migration" ], -+ [ "migration-intermediary-cleaned", -+ "Probe live-migration intermediary with no history" -+ ], - [ "bug-lf-2422", "Dependency on partially active group - stop ocfs:*" ], - ], - [ -diff --git a/cts/scheduler/dot/migration-intermediary-cleaned.dot b/cts/scheduler/dot/migration-intermediary-cleaned.dot -new file mode 100644 -index 0000000..09568d0 ---- /dev/null -+++ b/cts/scheduler/dot/migration-intermediary-cleaned.dot -@@ -0,0 +1,46 @@ -+ digraph "g" { -+"Connectivity_running_0" [ style=bold color="green" fontcolor="orange"] -+"Connectivity_start_0" -> "Connectivity_running_0" [ style = bold] -+"Connectivity_start_0" -> "ping-1_start_0 rhel8-2" [ style = bold] -+"Connectivity_start_0" [ style=bold color="green" fontcolor="orange"] -+"FencingFail_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"FencingPass_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"Fencing_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"lsb-dummy_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"migrator_monitor_0 rhel8-2" -> "migrator_start_0 rhel8-5" [ style = bold] -+"migrator_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"migrator_monitor_10000 rhel8-5" [ style=bold color="green" fontcolor="black"] -+"migrator_start_0 rhel8-5" -> "migrator_monitor_10000 rhel8-5" [ style = bold] -+"migrator_start_0 rhel8-5" [ style=bold color="green" fontcolor="black"] -+"migrator_stop_0 rhel8-2" -> "migrator_start_0 rhel8-5" [ style = bold] -+"migrator_stop_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"migrator_stop_0 rhel8-5" -> "migrator_start_0 rhel8-5" [ style = bold] -+"migrator_stop_0 rhel8-5" [ style=bold color="green" fontcolor="black"] -+"petulant_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"ping-1_monitor_0 rhel8-2" -> "Connectivity_start_0" [ style = bold] -+"ping-1_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"ping-1_monitor_60000 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"ping-1_start_0 rhel8-2" -> "Connectivity_running_0" [ style = bold] -+"ping-1_start_0 rhel8-2" -> "ping-1_monitor_60000 rhel8-2" [ style = bold] -+"ping-1_start_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"r192.168.122.207_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"r192.168.122.208_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-1_monitor_0 rhel8-2" -> "rsc_rhel8-1_start_0 rhel8-2" [ style = bold] -+"rsc_rhel8-1_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-1_monitor_5000 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-1_start_0 rhel8-2" -> "rsc_rhel8-1_monitor_5000 rhel8-2" [ style = bold] -+"rsc_rhel8-1_start_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-1_stop_0 rhel8-3" -> "rsc_rhel8-1_start_0 rhel8-2" [ style = bold] -+"rsc_rhel8-1_stop_0 rhel8-3" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-2_monitor_0 rhel8-2" -> "rsc_rhel8-2_start_0 rhel8-2" [ style = bold] -+"rsc_rhel8-2_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-2_monitor_5000 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-2_start_0 rhel8-2" -> "rsc_rhel8-2_monitor_5000 rhel8-2" [ style = bold] -+"rsc_rhel8-2_start_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-2_stop_0 rhel8-4" -> "rsc_rhel8-2_start_0 rhel8-2" [ style = bold] -+"rsc_rhel8-2_stop_0 rhel8-4" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-3_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-4_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"rsc_rhel8-5_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+"stateful-1_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -+} -diff --git a/cts/scheduler/exp/migration-intermediary-cleaned.exp b/cts/scheduler/exp/migration-intermediary-cleaned.exp -new file mode 100644 -index 0000000..28fa776 ---- /dev/null -+++ b/cts/scheduler/exp/migration-intermediary-cleaned.exp -@@ -0,0 +1,316 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -diff --git a/cts/scheduler/scores/migration-intermediary-cleaned.scores b/cts/scheduler/scores/migration-intermediary-cleaned.scores -new file mode 100644 -index 0000000..b3b8dff ---- /dev/null -+++ b/cts/scheduler/scores/migration-intermediary-cleaned.scores -@@ -0,0 +1,201 @@ -+ -+pcmk__clone_allocate: Connectivity allocation score on rhel8-1: 0 -+pcmk__clone_allocate: Connectivity allocation score on rhel8-2: 0 -+pcmk__clone_allocate: Connectivity allocation score on rhel8-3: 0 -+pcmk__clone_allocate: Connectivity allocation score on rhel8-4: 0 -+pcmk__clone_allocate: Connectivity allocation score on rhel8-5: 0 -+pcmk__clone_allocate: ping-1:0 allocation score on rhel8-1: 0 -+pcmk__clone_allocate: ping-1:0 allocation score on rhel8-2: 0 -+pcmk__clone_allocate: ping-1:0 allocation score on rhel8-3: 1 -+pcmk__clone_allocate: ping-1:0 allocation score on rhel8-4: 0 -+pcmk__clone_allocate: ping-1:0 allocation score on rhel8-5: 0 -+pcmk__clone_allocate: ping-1:1 allocation score on rhel8-1: 0 -+pcmk__clone_allocate: ping-1:1 allocation score on rhel8-2: 0 -+pcmk__clone_allocate: ping-1:1 allocation score on rhel8-3: 0 -+pcmk__clone_allocate: ping-1:1 allocation score on rhel8-4: 1 -+pcmk__clone_allocate: ping-1:1 allocation score on rhel8-5: 0 -+pcmk__clone_allocate: ping-1:2 allocation score on rhel8-1: 0 -+pcmk__clone_allocate: ping-1:2 allocation score on rhel8-2: 0 -+pcmk__clone_allocate: ping-1:2 allocation score on rhel8-3: 0 -+pcmk__clone_allocate: ping-1:2 allocation score on rhel8-4: 0 -+pcmk__clone_allocate: ping-1:2 allocation score on rhel8-5: 1 -+pcmk__clone_allocate: ping-1:3 allocation score on rhel8-1: 0 -+pcmk__clone_allocate: ping-1:3 allocation score on rhel8-2: 0 -+pcmk__clone_allocate: ping-1:3 allocation score on rhel8-3: 0 -+pcmk__clone_allocate: ping-1:3 allocation score on rhel8-4: 0 -+pcmk__clone_allocate: ping-1:3 allocation score on rhel8-5: 0 -+pcmk__clone_allocate: ping-1:4 allocation score on rhel8-1: 0 -+pcmk__clone_allocate: ping-1:4 allocation score on rhel8-2: 0 -+pcmk__clone_allocate: ping-1:4 allocation score on rhel8-3: 0 -+pcmk__clone_allocate: ping-1:4 allocation score on rhel8-4: 0 -+pcmk__clone_allocate: ping-1:4 allocation score on rhel8-5: 0 -+pcmk__clone_allocate: promotable-1 allocation score on rhel8-1: -INFINITY -+pcmk__clone_allocate: promotable-1 allocation score on rhel8-2: -INFINITY -+pcmk__clone_allocate: promotable-1 allocation score on rhel8-3: 0 -+pcmk__clone_allocate: promotable-1 allocation score on rhel8-4: 0 -+pcmk__clone_allocate: promotable-1 allocation score on rhel8-5: 0 -+pcmk__clone_allocate: stateful-1:0 allocation score on rhel8-1: -INFINITY -+pcmk__clone_allocate: stateful-1:0 allocation score on rhel8-2: -INFINITY -+pcmk__clone_allocate: stateful-1:0 allocation score on rhel8-3: 11 -+pcmk__clone_allocate: stateful-1:0 allocation score on rhel8-4: 0 -+pcmk__clone_allocate: stateful-1:0 allocation score on rhel8-5: 0 -+pcmk__clone_allocate: stateful-1:1 allocation score on rhel8-1: -INFINITY -+pcmk__clone_allocate: stateful-1:1 allocation score on rhel8-2: -INFINITY -+pcmk__clone_allocate: stateful-1:1 allocation score on rhel8-3: 0 -+pcmk__clone_allocate: stateful-1:1 allocation score on rhel8-4: 6 -+pcmk__clone_allocate: stateful-1:1 allocation score on rhel8-5: 0 -+pcmk__clone_allocate: stateful-1:2 allocation score on rhel8-1: -INFINITY -+pcmk__clone_allocate: stateful-1:2 allocation score on rhel8-2: -INFINITY -+pcmk__clone_allocate: stateful-1:2 allocation score on rhel8-3: 0 -+pcmk__clone_allocate: stateful-1:2 allocation score on rhel8-4: 0 -+pcmk__clone_allocate: stateful-1:2 allocation score on rhel8-5: 6 -+pcmk__clone_allocate: stateful-1:3 allocation score on rhel8-1: -INFINITY -+pcmk__clone_allocate: stateful-1:3 allocation score on rhel8-2: -INFINITY -+pcmk__clone_allocate: stateful-1:3 allocation score on rhel8-3: 0 -+pcmk__clone_allocate: stateful-1:3 allocation score on rhel8-4: 0 -+pcmk__clone_allocate: stateful-1:3 allocation score on rhel8-5: 0 -+pcmk__clone_allocate: stateful-1:4 allocation score on rhel8-1: -INFINITY -+pcmk__clone_allocate: stateful-1:4 allocation score on rhel8-2: -INFINITY -+pcmk__clone_allocate: stateful-1:4 allocation score on rhel8-3: 10 -+pcmk__clone_allocate: stateful-1:4 allocation score on rhel8-4: 5 -+pcmk__clone_allocate: stateful-1:4 allocation score on rhel8-5: 5 -+pcmk__group_assign: group-1 allocation score on rhel8-1: 0 -+pcmk__group_assign: group-1 allocation score on rhel8-2: 0 -+pcmk__group_assign: group-1 allocation score on rhel8-3: 0 -+pcmk__group_assign: group-1 allocation score on rhel8-4: 0 -+pcmk__group_assign: group-1 allocation score on rhel8-5: 0 -+pcmk__group_assign: petulant allocation score on rhel8-1: 0 -+pcmk__group_assign: petulant allocation score on rhel8-2: 0 -+pcmk__group_assign: petulant allocation score on rhel8-3: 0 -+pcmk__group_assign: petulant allocation score on rhel8-4: 0 -+pcmk__group_assign: petulant allocation score on rhel8-5: 0 -+pcmk__group_assign: r192.168.122.207 allocation score on rhel8-1: 0 -+pcmk__group_assign: r192.168.122.207 allocation score on rhel8-2: 0 -+pcmk__group_assign: r192.168.122.207 allocation score on rhel8-3: 0 -+pcmk__group_assign: r192.168.122.207 allocation score on rhel8-4: 0 -+pcmk__group_assign: r192.168.122.207 allocation score on rhel8-5: 0 -+pcmk__group_assign: r192.168.122.208 allocation score on rhel8-1: 0 -+pcmk__group_assign: r192.168.122.208 allocation score on rhel8-2: 0 -+pcmk__group_assign: r192.168.122.208 allocation score on rhel8-3: 0 -+pcmk__group_assign: r192.168.122.208 allocation score on rhel8-4: 0 -+pcmk__group_assign: r192.168.122.208 allocation score on rhel8-5: 0 -+pcmk__primitive_assign: Fencing allocation score on rhel8-1: 0 -+pcmk__primitive_assign: Fencing allocation score on rhel8-2: 0 -+pcmk__primitive_assign: Fencing allocation score on rhel8-3: 0 -+pcmk__primitive_assign: Fencing allocation score on rhel8-4: 0 -+pcmk__primitive_assign: Fencing allocation score on rhel8-5: 0 -+pcmk__primitive_assign: FencingFail allocation score on rhel8-1: 0 -+pcmk__primitive_assign: FencingFail allocation score on rhel8-2: 0 -+pcmk__primitive_assign: FencingFail allocation score on rhel8-3: 0 -+pcmk__primitive_assign: FencingFail allocation score on rhel8-4: 0 -+pcmk__primitive_assign: FencingFail allocation score on rhel8-5: 0 -+pcmk__primitive_assign: FencingPass allocation score on rhel8-1: 0 -+pcmk__primitive_assign: FencingPass allocation score on rhel8-2: 0 -+pcmk__primitive_assign: FencingPass allocation score on rhel8-3: 0 -+pcmk__primitive_assign: FencingPass allocation score on rhel8-4: 0 -+pcmk__primitive_assign: FencingPass allocation score on rhel8-5: 0 -+pcmk__primitive_assign: lsb-dummy allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: lsb-dummy allocation score on rhel8-2: -INFINITY -+pcmk__primitive_assign: lsb-dummy allocation score on rhel8-3: 0 -+pcmk__primitive_assign: lsb-dummy allocation score on rhel8-4: -INFINITY -+pcmk__primitive_assign: lsb-dummy allocation score on rhel8-5: -INFINITY -+pcmk__primitive_assign: migrator allocation score on rhel8-1: 0 -+pcmk__primitive_assign: migrator allocation score on rhel8-2: 0 -+pcmk__primitive_assign: migrator allocation score on rhel8-3: 0 -+pcmk__primitive_assign: migrator allocation score on rhel8-4: 0 -+pcmk__primitive_assign: migrator allocation score on rhel8-5: 0 -+pcmk__primitive_assign: petulant allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: petulant allocation score on rhel8-2: -INFINITY -+pcmk__primitive_assign: petulant allocation score on rhel8-3: 0 -+pcmk__primitive_assign: petulant allocation score on rhel8-4: -INFINITY -+pcmk__primitive_assign: petulant allocation score on rhel8-5: -INFINITY -+pcmk__primitive_assign: ping-1:0 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: ping-1:0 allocation score on rhel8-2: 0 -+pcmk__primitive_assign: ping-1:0 allocation score on rhel8-3: 1 -+pcmk__primitive_assign: ping-1:0 allocation score on rhel8-4: 0 -+pcmk__primitive_assign: ping-1:0 allocation score on rhel8-5: 0 -+pcmk__primitive_assign: ping-1:1 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: ping-1:1 allocation score on rhel8-2: 0 -+pcmk__primitive_assign: ping-1:1 allocation score on rhel8-3: -INFINITY -+pcmk__primitive_assign: ping-1:1 allocation score on rhel8-4: 1 -+pcmk__primitive_assign: ping-1:1 allocation score on rhel8-5: 0 -+pcmk__primitive_assign: ping-1:2 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: ping-1:2 allocation score on rhel8-2: 0 -+pcmk__primitive_assign: ping-1:2 allocation score on rhel8-3: -INFINITY -+pcmk__primitive_assign: ping-1:2 allocation score on rhel8-4: -INFINITY -+pcmk__primitive_assign: ping-1:2 allocation score on rhel8-5: 1 -+pcmk__primitive_assign: ping-1:3 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: ping-1:3 allocation score on rhel8-2: 0 -+pcmk__primitive_assign: ping-1:3 allocation score on rhel8-3: -INFINITY -+pcmk__primitive_assign: ping-1:3 allocation score on rhel8-4: -INFINITY -+pcmk__primitive_assign: ping-1:3 allocation score on rhel8-5: -INFINITY -+pcmk__primitive_assign: ping-1:4 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: ping-1:4 allocation score on rhel8-2: -INFINITY -+pcmk__primitive_assign: ping-1:4 allocation score on rhel8-3: -INFINITY -+pcmk__primitive_assign: ping-1:4 allocation score on rhel8-4: -INFINITY -+pcmk__primitive_assign: ping-1:4 allocation score on rhel8-5: -INFINITY -+pcmk__primitive_assign: r192.168.122.207 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: r192.168.122.207 allocation score on rhel8-2: -INFINITY -+pcmk__primitive_assign: r192.168.122.207 allocation score on rhel8-3: 11 -+pcmk__primitive_assign: r192.168.122.207 allocation score on rhel8-4: -INFINITY -+pcmk__primitive_assign: r192.168.122.207 allocation score on rhel8-5: -INFINITY -+pcmk__primitive_assign: r192.168.122.208 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: r192.168.122.208 allocation score on rhel8-2: -INFINITY -+pcmk__primitive_assign: r192.168.122.208 allocation score on rhel8-3: 0 -+pcmk__primitive_assign: r192.168.122.208 allocation score on rhel8-4: -INFINITY -+pcmk__primitive_assign: r192.168.122.208 allocation score on rhel8-5: -INFINITY -+pcmk__primitive_assign: rsc_rhel8-1 allocation score on rhel8-1: 100 -+pcmk__primitive_assign: rsc_rhel8-1 allocation score on rhel8-2: 0 -+pcmk__primitive_assign: rsc_rhel8-1 allocation score on rhel8-3: 0 -+pcmk__primitive_assign: rsc_rhel8-1 allocation score on rhel8-4: 0 -+pcmk__primitive_assign: rsc_rhel8-1 allocation score on rhel8-5: 0 -+pcmk__primitive_assign: rsc_rhel8-2 allocation score on rhel8-1: 0 -+pcmk__primitive_assign: rsc_rhel8-2 allocation score on rhel8-2: 100 -+pcmk__primitive_assign: rsc_rhel8-2 allocation score on rhel8-3: 0 -+pcmk__primitive_assign: rsc_rhel8-2 allocation score on rhel8-4: 0 -+pcmk__primitive_assign: rsc_rhel8-2 allocation score on rhel8-5: 0 -+pcmk__primitive_assign: rsc_rhel8-3 allocation score on rhel8-1: 0 -+pcmk__primitive_assign: rsc_rhel8-3 allocation score on rhel8-2: 0 -+pcmk__primitive_assign: rsc_rhel8-3 allocation score on rhel8-3: 100 -+pcmk__primitive_assign: rsc_rhel8-3 allocation score on rhel8-4: 0 -+pcmk__primitive_assign: rsc_rhel8-3 allocation score on rhel8-5: 0 -+pcmk__primitive_assign: rsc_rhel8-4 allocation score on rhel8-1: 0 -+pcmk__primitive_assign: rsc_rhel8-4 allocation score on rhel8-2: 0 -+pcmk__primitive_assign: rsc_rhel8-4 allocation score on rhel8-3: 0 -+pcmk__primitive_assign: rsc_rhel8-4 allocation score on rhel8-4: 100 -+pcmk__primitive_assign: rsc_rhel8-4 allocation score on rhel8-5: 0 -+pcmk__primitive_assign: rsc_rhel8-5 allocation score on rhel8-1: 0 -+pcmk__primitive_assign: rsc_rhel8-5 allocation score on rhel8-2: 0 -+pcmk__primitive_assign: rsc_rhel8-5 allocation score on rhel8-3: 0 -+pcmk__primitive_assign: rsc_rhel8-5 allocation score on rhel8-4: 0 -+pcmk__primitive_assign: rsc_rhel8-5 allocation score on rhel8-5: 100 -+pcmk__primitive_assign: stateful-1:0 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: stateful-1:0 allocation score on rhel8-2: -INFINITY -+pcmk__primitive_assign: stateful-1:0 allocation score on rhel8-3: 11 -+pcmk__primitive_assign: stateful-1:0 allocation score on rhel8-4: 0 -+pcmk__primitive_assign: stateful-1:0 allocation score on rhel8-5: 0 -+pcmk__primitive_assign: stateful-1:1 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: stateful-1:1 allocation score on rhel8-2: -INFINITY -+pcmk__primitive_assign: stateful-1:1 allocation score on rhel8-3: -INFINITY -+pcmk__primitive_assign: stateful-1:1 allocation score on rhel8-4: 6 -+pcmk__primitive_assign: stateful-1:1 allocation score on rhel8-5: 0 -+pcmk__primitive_assign: stateful-1:2 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: stateful-1:2 allocation score on rhel8-2: -INFINITY -+pcmk__primitive_assign: stateful-1:2 allocation score on rhel8-3: -INFINITY -+pcmk__primitive_assign: stateful-1:2 allocation score on rhel8-4: -INFINITY -+pcmk__primitive_assign: stateful-1:2 allocation score on rhel8-5: 6 -+pcmk__primitive_assign: stateful-1:3 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: stateful-1:3 allocation score on rhel8-2: -INFINITY -+pcmk__primitive_assign: stateful-1:3 allocation score on rhel8-3: -INFINITY -+pcmk__primitive_assign: stateful-1:3 allocation score on rhel8-4: -INFINITY -+pcmk__primitive_assign: stateful-1:3 allocation score on rhel8-5: -INFINITY -+pcmk__primitive_assign: stateful-1:4 allocation score on rhel8-1: -INFINITY -+pcmk__primitive_assign: stateful-1:4 allocation score on rhel8-2: -INFINITY -+pcmk__primitive_assign: stateful-1:4 allocation score on rhel8-3: -INFINITY -+pcmk__primitive_assign: stateful-1:4 allocation score on rhel8-4: -INFINITY -+pcmk__primitive_assign: stateful-1:4 allocation score on rhel8-5: -INFINITY -+stateful-1:0 promotion score on rhel8-3: 10 -+stateful-1:1 promotion score on rhel8-4: 5 -+stateful-1:2 promotion score on rhel8-5: 5 -+stateful-1:3 promotion score on none: 0 -+stateful-1:4 promotion score on none: 0 -diff --git a/cts/scheduler/summary/migration-intermediary-cleaned.summary b/cts/scheduler/summary/migration-intermediary-cleaned.summary -new file mode 100644 -index 0000000..5de1355 ---- /dev/null -+++ b/cts/scheduler/summary/migration-intermediary-cleaned.summary -@@ -0,0 +1,94 @@ -+Using the original execution date of: 2023-01-19 21:05:59Z -+Current cluster status: -+ * Node List: -+ * Online: [ rhel8-2 rhel8-3 rhel8-4 rhel8-5 ] -+ * OFFLINE: [ rhel8-1 ] -+ -+ * Full List of Resources: -+ * Fencing (stonith:fence_xvm): Started rhel8-3 -+ * FencingPass (stonith:fence_dummy): Started rhel8-4 -+ * FencingFail (stonith:fence_dummy): Started rhel8-5 -+ * rsc_rhel8-1 (ocf:heartbeat:IPaddr2): Started rhel8-3 -+ * rsc_rhel8-2 (ocf:heartbeat:IPaddr2): Started rhel8-4 -+ * rsc_rhel8-3 (ocf:heartbeat:IPaddr2): Started rhel8-3 -+ * rsc_rhel8-4 (ocf:heartbeat:IPaddr2): Started rhel8-4 -+ * rsc_rhel8-5 (ocf:heartbeat:IPaddr2): Started rhel8-5 -+ * migrator (ocf:pacemaker:Dummy): Started [ rhel8-5 rhel8-2 ] -+ * Clone Set: Connectivity [ping-1]: -+ * Started: [ rhel8-3 rhel8-4 rhel8-5 ] -+ * Stopped: [ rhel8-1 rhel8-2 ] -+ * Clone Set: promotable-1 [stateful-1] (promotable): -+ * Promoted: [ rhel8-3 ] -+ * Unpromoted: [ rhel8-4 rhel8-5 ] -+ * Stopped: [ rhel8-1 rhel8-2 ] -+ * Resource Group: group-1: -+ * r192.168.122.207 (ocf:heartbeat:IPaddr2): Started rhel8-3 -+ * petulant (service:pacemaker-cts-dummyd@10): Started rhel8-3 -+ * r192.168.122.208 (ocf:heartbeat:IPaddr2): Started rhel8-3 -+ * lsb-dummy (lsb:LSBDummy): Started rhel8-3 -+ -+Transition Summary: -+ * Move rsc_rhel8-1 ( rhel8-3 -> rhel8-2 ) -+ * Move rsc_rhel8-2 ( rhel8-4 -> rhel8-2 ) -+ * Restart migrator ( rhel8-5 ) -+ * Start ping-1:3 ( rhel8-2 ) -+ -+Executing Cluster Transition: -+ * Resource action: Fencing monitor on rhel8-2 -+ * Resource action: FencingPass monitor on rhel8-2 -+ * Resource action: FencingFail monitor on rhel8-2 -+ * Resource action: rsc_rhel8-1 stop on rhel8-3 -+ * Resource action: rsc_rhel8-1 monitor on rhel8-2 -+ * Resource action: rsc_rhel8-2 stop on rhel8-4 -+ * Resource action: rsc_rhel8-2 monitor on rhel8-2 -+ * Resource action: rsc_rhel8-3 monitor on rhel8-2 -+ * Resource action: rsc_rhel8-4 monitor on rhel8-2 -+ * Resource action: rsc_rhel8-5 monitor on rhel8-2 -+ * Resource action: migrator stop on rhel8-2 -+ * Resource action: migrator stop on rhel8-5 -+ * Resource action: migrator monitor on rhel8-2 -+ * Resource action: ping-1 monitor on rhel8-2 -+ * Pseudo action: Connectivity_start_0 -+ * Resource action: stateful-1 monitor on rhel8-2 -+ * Resource action: r192.168.122.207 monitor on rhel8-2 -+ * Resource action: petulant monitor on rhel8-2 -+ * Resource action: r192.168.122.208 monitor on rhel8-2 -+ * Resource action: lsb-dummy monitor on rhel8-2 -+ * Resource action: rsc_rhel8-1 start on rhel8-2 -+ * Resource action: rsc_rhel8-2 start on rhel8-2 -+ * Resource action: migrator start on rhel8-5 -+ * Resource action: migrator monitor=10000 on rhel8-5 -+ * Resource action: ping-1 start on rhel8-2 -+ * Pseudo action: Connectivity_running_0 -+ * Resource action: rsc_rhel8-1 monitor=5000 on rhel8-2 -+ * Resource action: rsc_rhel8-2 monitor=5000 on rhel8-2 -+ * Resource action: ping-1 monitor=60000 on rhel8-2 -+Using the original execution date of: 2023-01-19 21:05:59Z -+ -+Revised Cluster Status: -+ * Node List: -+ * Online: [ rhel8-2 rhel8-3 rhel8-4 rhel8-5 ] -+ * OFFLINE: [ rhel8-1 ] -+ -+ * Full List of Resources: -+ * Fencing (stonith:fence_xvm): Started rhel8-3 -+ * FencingPass (stonith:fence_dummy): Started rhel8-4 -+ * FencingFail (stonith:fence_dummy): Started rhel8-5 -+ * rsc_rhel8-1 (ocf:heartbeat:IPaddr2): Started rhel8-2 -+ * rsc_rhel8-2 (ocf:heartbeat:IPaddr2): Started rhel8-2 -+ * rsc_rhel8-3 (ocf:heartbeat:IPaddr2): Started rhel8-3 -+ * rsc_rhel8-4 (ocf:heartbeat:IPaddr2): Started rhel8-4 -+ * rsc_rhel8-5 (ocf:heartbeat:IPaddr2): Started rhel8-5 -+ * migrator (ocf:pacemaker:Dummy): Started [ rhel8-2 rhel8-5 ] -+ * Clone Set: Connectivity [ping-1]: -+ * Started: [ rhel8-2 rhel8-3 rhel8-4 rhel8-5 ] -+ * Stopped: [ rhel8-1 ] -+ * Clone Set: promotable-1 [stateful-1] (promotable): -+ * Promoted: [ rhel8-3 ] -+ * Unpromoted: [ rhel8-4 rhel8-5 ] -+ * Stopped: [ rhel8-1 rhel8-2 ] -+ * Resource Group: group-1: -+ * r192.168.122.207 (ocf:heartbeat:IPaddr2): Started rhel8-3 -+ * petulant (service:pacemaker-cts-dummyd@10): Started rhel8-3 -+ * r192.168.122.208 (ocf:heartbeat:IPaddr2): Started rhel8-3 -+ * lsb-dummy (lsb:LSBDummy): Started rhel8-3 -diff --git a/cts/scheduler/xml/migration-intermediary-cleaned.xml b/cts/scheduler/xml/migration-intermediary-cleaned.xml -new file mode 100644 -index 0000000..bec7888 ---- /dev/null -+++ b/cts/scheduler/xml/migration-intermediary-cleaned.xml -@@ -0,0 +1,513 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -2.31.1 - -From 1f9fadbb06baded3fc393cfe30a0cb620aca0829 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 1 Feb 2023 17:12:13 -0600 -Subject: [PATCH 13/14] Fix: scheduler: handle cleaned migrate_from history - correctly - -Fixes T623 ---- - lib/pengine/unpack.c | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 14dc202..9c99183 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -2990,6 +2990,15 @@ unpack_migrate_to_success(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op) - - // The migrate_from is pending, complete but erased, or to be scheduled - -+ /* If there is no history at all for the resource on an online target, then -+ * it was likely cleaned. Just return, and we'll schedule a probe. Once we -+ * have the probe result, it will be reflected in target_newer_state. -+ */ -+ if ((target_node != NULL) && target_node->details->online -+ && unknown_on_node(rsc, target)) { -+ return; -+ } -+ - if (active_on_target) { - pe_node_t *source_node = pe_find_node(rsc->cluster->nodes, source); - --- -2.31.1 - -From d9d1bf19e8522ea29c87f0c39b05828947bc5b0f Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 2 Feb 2023 15:48:01 -0600 -Subject: [PATCH 14/14] Test: scheduler: update expected output for migration - fix - ---- - .../dot/migration-intermediary-cleaned.dot | 8 -- - .../exp/migration-intermediary-cleaned.exp | 88 ++++--------------- - .../migration-intermediary-cleaned.scores | 2 +- - .../migration-intermediary-cleaned.summary | 9 +- - 4 files changed, 22 insertions(+), 85 deletions(-) - -diff --git a/cts/scheduler/dot/migration-intermediary-cleaned.dot b/cts/scheduler/dot/migration-intermediary-cleaned.dot -index 09568d0..f6eabba 100644 ---- a/cts/scheduler/dot/migration-intermediary-cleaned.dot -+++ b/cts/scheduler/dot/migration-intermediary-cleaned.dot -@@ -7,15 +7,7 @@ - "FencingPass_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] - "Fencing_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] - "lsb-dummy_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] --"migrator_monitor_0 rhel8-2" -> "migrator_start_0 rhel8-5" [ style = bold] - "migrator_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] --"migrator_monitor_10000 rhel8-5" [ style=bold color="green" fontcolor="black"] --"migrator_start_0 rhel8-5" -> "migrator_monitor_10000 rhel8-5" [ style = bold] --"migrator_start_0 rhel8-5" [ style=bold color="green" fontcolor="black"] --"migrator_stop_0 rhel8-2" -> "migrator_start_0 rhel8-5" [ style = bold] --"migrator_stop_0 rhel8-2" [ style=bold color="green" fontcolor="black"] --"migrator_stop_0 rhel8-5" -> "migrator_start_0 rhel8-5" [ style = bold] --"migrator_stop_0 rhel8-5" [ style=bold color="green" fontcolor="black"] - "petulant_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] - "ping-1_monitor_0 rhel8-2" -> "Connectivity_start_0" [ style = bold] - "ping-1_monitor_0 rhel8-2" [ style=bold color="green" fontcolor="black"] -diff --git a/cts/scheduler/exp/migration-intermediary-cleaned.exp b/cts/scheduler/exp/migration-intermediary-cleaned.exp -index 28fa776..8b9bb39 100644 ---- a/cts/scheduler/exp/migration-intermediary-cleaned.exp -+++ b/cts/scheduler/exp/migration-intermediary-cleaned.exp -@@ -148,91 +148,41 @@ - - - -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- - - - -- -+ - - - - -- -- -- -- -- -- -- -- -- -- -- -- -- -- -+ - -- -+ - - - - - - -- -+ - - - -- -+ - -- -+ - - - - - - -- -+ - - - -- -+ - - - -@@ -241,24 +191,24 @@ - - - -- -+ - -- -+ - - - - - -- -+ - - -- -+ - - - -- -+ - -- -+ - - - -@@ -268,7 +218,7 @@ - - - -- -+ - - - -@@ -277,7 +227,7 @@ - - - -- -+ - - - -@@ -286,7 +236,7 @@ - - - -- -+ - - - -@@ -295,7 +245,7 @@ - - - -- -+ - - - -@@ -304,7 +254,7 @@ - - - -- -+ - - - -diff --git a/cts/scheduler/scores/migration-intermediary-cleaned.scores b/cts/scheduler/scores/migration-intermediary-cleaned.scores -index b3b8dff..09f05d1 100644 ---- a/cts/scheduler/scores/migration-intermediary-cleaned.scores -+++ b/cts/scheduler/scores/migration-intermediary-cleaned.scores -@@ -103,7 +103,7 @@ pcmk__primitive_assign: migrator allocation score on rhel8-1: 0 - pcmk__primitive_assign: migrator allocation score on rhel8-2: 0 - pcmk__primitive_assign: migrator allocation score on rhel8-3: 0 - pcmk__primitive_assign: migrator allocation score on rhel8-4: 0 --pcmk__primitive_assign: migrator allocation score on rhel8-5: 0 -+pcmk__primitive_assign: migrator allocation score on rhel8-5: 1 - pcmk__primitive_assign: petulant allocation score on rhel8-1: -INFINITY - pcmk__primitive_assign: petulant allocation score on rhel8-2: -INFINITY - pcmk__primitive_assign: petulant allocation score on rhel8-3: 0 -diff --git a/cts/scheduler/summary/migration-intermediary-cleaned.summary b/cts/scheduler/summary/migration-intermediary-cleaned.summary -index 5de1355..dd127a8 100644 ---- a/cts/scheduler/summary/migration-intermediary-cleaned.summary -+++ b/cts/scheduler/summary/migration-intermediary-cleaned.summary -@@ -13,7 +13,7 @@ Current cluster status: - * rsc_rhel8-3 (ocf:heartbeat:IPaddr2): Started rhel8-3 - * rsc_rhel8-4 (ocf:heartbeat:IPaddr2): Started rhel8-4 - * rsc_rhel8-5 (ocf:heartbeat:IPaddr2): Started rhel8-5 -- * migrator (ocf:pacemaker:Dummy): Started [ rhel8-5 rhel8-2 ] -+ * migrator (ocf:pacemaker:Dummy): Started rhel8-5 - * Clone Set: Connectivity [ping-1]: - * Started: [ rhel8-3 rhel8-4 rhel8-5 ] - * Stopped: [ rhel8-1 rhel8-2 ] -@@ -30,7 +30,6 @@ Current cluster status: - Transition Summary: - * Move rsc_rhel8-1 ( rhel8-3 -> rhel8-2 ) - * Move rsc_rhel8-2 ( rhel8-4 -> rhel8-2 ) -- * Restart migrator ( rhel8-5 ) - * Start ping-1:3 ( rhel8-2 ) - - Executing Cluster Transition: -@@ -44,8 +43,6 @@ Executing Cluster Transition: - * Resource action: rsc_rhel8-3 monitor on rhel8-2 - * Resource action: rsc_rhel8-4 monitor on rhel8-2 - * Resource action: rsc_rhel8-5 monitor on rhel8-2 -- * Resource action: migrator stop on rhel8-2 -- * Resource action: migrator stop on rhel8-5 - * Resource action: migrator monitor on rhel8-2 - * Resource action: ping-1 monitor on rhel8-2 - * Pseudo action: Connectivity_start_0 -@@ -56,8 +53,6 @@ Executing Cluster Transition: - * Resource action: lsb-dummy monitor on rhel8-2 - * Resource action: rsc_rhel8-1 start on rhel8-2 - * Resource action: rsc_rhel8-2 start on rhel8-2 -- * Resource action: migrator start on rhel8-5 -- * Resource action: migrator monitor=10000 on rhel8-5 - * Resource action: ping-1 start on rhel8-2 - * Pseudo action: Connectivity_running_0 - * Resource action: rsc_rhel8-1 monitor=5000 on rhel8-2 -@@ -79,7 +74,7 @@ Revised Cluster Status: - * rsc_rhel8-3 (ocf:heartbeat:IPaddr2): Started rhel8-3 - * rsc_rhel8-4 (ocf:heartbeat:IPaddr2): Started rhel8-4 - * rsc_rhel8-5 (ocf:heartbeat:IPaddr2): Started rhel8-5 -- * migrator (ocf:pacemaker:Dummy): Started [ rhel8-2 rhel8-5 ] -+ * migrator (ocf:pacemaker:Dummy): Started rhel8-5 - * Clone Set: Connectivity [ping-1]: - * Started: [ rhel8-2 rhel8-3 rhel8-4 rhel8-5 ] - * Stopped: [ rhel8-1 ] --- -2.31.1 - diff --git a/004-g_source_remove.patch b/004-g_source_remove.patch deleted file mode 100644 index 2af0f47..0000000 --- a/004-g_source_remove.patch +++ /dev/null @@ -1,107 +0,0 @@ -From 45617b727e280cac384a28ae3d96145e066e6197 Mon Sep 17 00:00:00 2001 -From: Reid Wahl -Date: Fri, 3 Feb 2023 12:08:57 -0800 -Subject: [PATCH 01/02] Fix: fencer: Prevent double g_source_remove of op_timer_one - -QE observed a rarely reproducible core dump in the fencer during -Pacemaker shutdown, in which we try to g_source_remove() an op timer -that's already been removed. - -free_stonith_remote_op_list() --> g_hash_table_destroy() --> g_hash_table_remove_all_nodes() --> clear_remote_op_timers() --> g_source_remove() --> crm_glib_handler() --> "Source ID 190 was not found when attempting to remove it" - -The likely cause is that request_peer_fencing() doesn't set -op->op_timer_one to 0 after calling g_source_remove() on it, so if that -op is still in the stonith_remote_op_list at shutdown with the same -timer, clear_remote_op_timers() tries to remove the source for -op_timer_one again. - -There are only five locations that call g_source_remove() on a -remote_fencing_op_t timer. -* Three of them are in clear_remote_op_timers(), which first 0-checks - the timer and then sets it to 0 after g_source_remove(). -* One is in remote_op_query_timeout(), which does the same. -* The last is the one we fix here in request_peer_fencing(). - -I don't know all the conditions of QE's test scenario at this point. -What I do know: -* have-watchdog=true -* stonith-watchdog-timeout=10 -* no explicit topology -* fence agent script is missing for the configured fence device -* requested fencing of one node -* cluster shutdown - -Fixes RHBZ2166967 - -Signed-off-by: Reid Wahl ---- - daemons/fenced/fenced_remote.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index d61b5bd..b7426ff 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -1825,6 +1825,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer) - op->state = st_exec; - if (op->op_timer_one) { - g_source_remove(op->op_timer_one); -+ op->op_timer_one = 0; - } - - if (!((stonith_watchdog_timeout_ms > 0) --- -2.31.1 - -From 0291db4750322ec7f01ae6a4a2a30abca9d8e19e Mon Sep 17 00:00:00 2001 -From: Reid Wahl -Date: Wed, 15 Feb 2023 22:30:27 -0800 -Subject: [PATCH 02/02] Fix: fencer: Avoid double source remove of op_timer_total - -remote_op_timeout() returns G_SOURCE_REMOVE, which tells GLib to remove -the source from the main loop after returning. Currently this function -is used as the callback only when creating op->op_timer_total. - -If we don't set op->op_timer_total to 0 before returning from -remote_op_timeout(), then we can get an assertion and core dump from -GLib when the op's timers are being cleared (either during op -finalization or during fencer shutdown). This is because -clear_remote_op_timers() sees that op->op_timer_total != 0 and tries to -remove the source, but the source has already been removed. - -Note that we're already (correctly) zeroing op->op_timer_one and -op->query_timeout as appropriate in their respective callback functions. - -Fortunately, GLib doesn't care whether the source has already been -removed before we return G_SOURCE_REMOVE from a callback. So it's safe -to call finalize_op() (which removes all the op's timer sources) from -within a callback. - -Fixes RHBZ#2166967 - -Signed-off-by: Reid Wahl ---- - daemons/fenced/fenced_remote.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index b7426ff88..adea3d7d8 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -718,6 +718,8 @@ remote_op_timeout(gpointer userdata) - { - remote_fencing_op_t *op = userdata; - -+ op->op_timer_total = 0; -+ - if (op->state == st_done) { - crm_debug("Action '%s' targeting %s for client %s already completed " - CRM_XS " id=%.8s", --- -2.39.0 diff --git a/005-query-null.patch b/005-query-null.patch deleted file mode 100644 index 194cd33..0000000 --- a/005-query-null.patch +++ /dev/null @@ -1,151 +0,0 @@ -From 0d15568a538349ac41028db6b506d13dd23e8732 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 14 Feb 2023 14:00:37 -0500 -Subject: [PATCH] High: libcrmcommon: Fix handling node=NULL in - pcmk__attrd_api_query. - -According to the header file, if node is NULL, pcmk__attrd_api_query -should query the value of the given attribute on all cluster nodes. -This is also what the server expects and how attrd_updater is supposed -to work. - -However, pcmk__attrd_api_query has no way of letting callers decide -whether they want to query all nodes or whether they want to use the -local node. We were passing NULL for the node name, which it took to -mean it should look up the local node name. This calls -pcmk__node_attr_target, which probes the local cluster name and returns -that to pcmk__attrd_api_query. If it returns non-NULL, that value will -then be put into the XML IPC call which means the server will only -return the value for that node. - -In testing this was usually fine. However, in pratice, the methods -pcmk__node_attr_target uses to figure out the local cluster node name -involves checking the OCF_RESKEY_CRM_meta_on_node environment variable -among others. - -This variable was never set in testing, but can be set in the real -world. This leads to circumstances where the user did "attrd_updater -QA" -expecting to get the values on all nodes, but instead only got the value -on the local cluster node. - -In pacemaker-2.1.4 and prior, pcmk__node_attr_target was simply never -called if the node was NULL but was called otherwise. - -The fix is to modify pcmk__attrd_api_query to take an option for -querying all nodes. If that's present, we'll query all nodes. If it's -not present, we'll look at the given node name - NULL means look it up, -anything else means just that node. - -Regression in 2.1.5 introduced by eb20a65577 ---- - include/crm/common/attrd_internal.h | 6 +++++- - include/crm/common/ipc_attrd_internal.h | 7 +++++-- - lib/common/ipc_attrd.c | 12 ++++++++---- - tools/attrd_updater.c | 5 +++-- - 4 files changed, 21 insertions(+), 9 deletions(-) - -diff --git a/include/crm/common/attrd_internal.h b/include/crm/common/attrd_internal.h -index 389be48..7337c38 100644 ---- a/include/crm/common/attrd_internal.h -+++ b/include/crm/common/attrd_internal.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2022 the Pacemaker project contributors -+ * Copyright 2004-2023 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -25,6 +25,10 @@ enum pcmk__node_attr_opts { - pcmk__node_attr_perm = (1 << 5), - pcmk__node_attr_sync_local = (1 << 6), - pcmk__node_attr_sync_cluster = (1 << 7), -+ // pcmk__node_attr_utilization is 8, but that has not been backported. -+ // I'm leaving the gap here in case we backport that in the future and -+ // also to avoid problems on mixed-version clusters. -+ pcmk__node_attr_query_all = (1 << 9), - }; - - #define pcmk__set_node_attr_flags(node_attr_flags, flags_to_set) do { \ -diff --git a/include/crm/common/ipc_attrd_internal.h b/include/crm/common/ipc_attrd_internal.h -index 2c6713f..b1b7584 100644 ---- a/include/crm/common/ipc_attrd_internal.h -+++ b/include/crm/common/ipc_attrd_internal.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2022 the Pacemaker project contributors -+ * Copyright 2022-2023 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -110,10 +110,13 @@ int pcmk__attrd_api_purge(pcmk_ipc_api_t *api, const char *node); - * - * \param[in,out] api Connection to pacemaker-attrd - * \param[in] node Look up the attribute for this node -- * (or NULL for all nodes) -+ * (or NULL for the local node) - * \param[in] name Attribute name - * \param[in] options Bitmask of pcmk__node_attr_opts - * -+ * \note Passing pcmk__node_attr_query_all will cause the function to query -+ * the value of \p name on all nodes, regardless of the value of \p node. -+ * - * \return Standard Pacemaker return code - */ - int pcmk__attrd_api_query(pcmk_ipc_api_t *api, const char *node, const char *name, -diff --git a/lib/common/ipc_attrd.c b/lib/common/ipc_attrd.c -index 4606509..dece49b 100644 ---- a/lib/common/ipc_attrd.c -+++ b/lib/common/ipc_attrd.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2011-2022 the Pacemaker project contributors -+ * Copyright 2011-2023 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -332,10 +332,14 @@ pcmk__attrd_api_query(pcmk_ipc_api_t *api, const char *node, const char *name, - return EINVAL; - } - -- target = pcmk__node_attr_target(node); -+ if (pcmk_is_set(options, pcmk__node_attr_query_all)) { -+ node = NULL; -+ } else { -+ target = pcmk__node_attr_target(node); - -- if (target != NULL) { -- node = target; -+ if (target != NULL) { -+ node = target; -+ } - } - - request = create_attrd_op(NULL); -diff --git a/tools/attrd_updater.c b/tools/attrd_updater.c -index 3cd766d..cbd341d 100644 ---- a/tools/attrd_updater.c -+++ b/tools/attrd_updater.c -@@ -376,6 +376,7 @@ attrd_event_cb(pcmk_ipc_api_t *attrd_api, enum pcmk_ipc_event event_type, - static int - send_attrd_query(pcmk__output_t *out, const char *attr_name, const char *attr_node, gboolean query_all) - { -+ uint32_t options = pcmk__node_attr_none; - pcmk_ipc_api_t *attrd_api = NULL; - int rc = pcmk_rc_ok; - -@@ -400,10 +401,10 @@ send_attrd_query(pcmk__output_t *out, const char *attr_name, const char *attr_no - - /* Decide which node(s) to query */ - if (query_all == TRUE) { -- attr_node = NULL; -+ options |= pcmk__node_attr_query_all; - } - -- rc = pcmk__attrd_api_query(attrd_api, attr_node, attr_name, 0); -+ rc = pcmk__attrd_api_query(attrd_api, attr_node, attr_name, options); - - if (rc != pcmk_rc_ok) { - g_set_error(&error, PCMK__RC_ERROR, rc, "Could not query value of %s: %s (%d)", --- -2.31.1 - diff --git a/006-watchdog-fencing-topology.patch b/006-watchdog-fencing-topology.patch deleted file mode 100644 index 7651584..0000000 --- a/006-watchdog-fencing-topology.patch +++ /dev/null @@ -1,142 +0,0 @@ -From 17cc49e1564b0ae55cc8212d14c5c055f88040da Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Tue, 14 Feb 2023 15:35:37 +0100 -Subject: [PATCH] Fix: watchdog-fencing: terminate dangling timer before - watchdog-waiting - ---- - daemons/fenced/fenced_remote.c | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 5c3fe25e3..aab185adb 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2009-2022 the Pacemaker project contributors -+ * Copyright 2009-2023 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -1702,6 +1702,10 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) - "client %s " CRM_XS " id=%.8s", - (stonith_watchdog_timeout_ms / 1000), - op->target, op->action, op->client_name, op->id); -+ -+ if (op->op_timer_one) { -+ g_source_remove(op->op_timer_one); -+ } - op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, - remote_op_watchdog_done, op); - return TRUE; --- -2.39.0 - -From f2cc2a4277124230903a18713e50604a8f1842cd Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Wed, 1 Mar 2023 15:00:15 +0100 -Subject: [PATCH] Refactor: watchdog-fencing: convenience function - pcmk__is_fencing_action - -for consistency and add comment making clear why this block exits -with new timer set in any case ---- - daemons/fenced/fenced_remote.c | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index aab185adb..e0f8de057 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -1834,7 +1834,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer) - if (!((stonith_watchdog_timeout_ms > 0) - && (pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none) - || (pcmk__str_eq(peer->host, op->target, pcmk__str_casei) -- && !pcmk__str_eq(op->action, "on", pcmk__str_none))) -+ && pcmk__is_fencing_action(op->action))) - && check_watchdog_fencing_and_wait(op))) { - - /* Some thoughts about self-fencing cases reaching this point: -@@ -1854,6 +1854,9 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer) - Otherwise the selection of stonith-watchdog-timeout at - least is questionable. - */ -+ -+ /* coming here we're not waiting for watchdog timeout - -+ thus engage timer with timout evaluated before */ - op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op); - } - --- -2.39.0 - -From c4eb45a986f8865fc5e69350fd5b9f4b056d9d69 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Tue, 14 Feb 2023 11:57:17 +0100 -Subject: [PATCH] Fix: watchdog-fencing: correctly derive timeout with topology - -up to now the timeout for watchdog-fencing was just added to -the overall timeout if the node to be fenced was visible and -reported back to the query. ---- - daemons/fenced/fenced_remote.c | 28 +++++++++++++++++++++++++--- - 1 file changed, 25 insertions(+), 3 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index e0f8de057..3b7ab05e9 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -969,8 +969,9 @@ advance_topology_level(remote_fencing_op_t *op, bool empty_ok) - return pcmk_rc_ok; - } - -- crm_info("All fencing options targeting %s for client %s@%s failed " -+ crm_info("All %sfencing options targeting %s for client %s@%s failed " - CRM_XS " id=%.8s", -+ (stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"", - op->target, op->client_name, op->originator, op->id); - return ENODEV; - } -@@ -1434,8 +1435,17 @@ stonith_choose_peer(remote_fencing_op_t * op) - && pcmk_is_set(op->call_options, st_opt_topology) - && (advance_topology_level(op, false) == pcmk_rc_ok)); - -- crm_notice("Couldn't find anyone to fence (%s) %s using %s", -- op->action, op->target, (device? device : "any device")); -+ if ((stonith_watchdog_timeout_ms > 0) -+ && pcmk__is_fencing_action(op->action) -+ && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none) -+ && node_does_watchdog_fencing(op->target)) { -+ crm_info("Couldn't contact watchdog-fencing target-node (%s)", -+ op->target); -+ /* check_watchdog_fencing_and_wait will log additional info */ -+ } else { -+ crm_notice("Couldn't find anyone to fence (%s) %s using %s", -+ op->action, op->target, (device? device : "any device")); -+ } - return NULL; - } - -@@ -1531,6 +1541,18 @@ get_op_total_timeout(const remote_fencing_op_t *op, - continue; - } - for (device_list = tp->levels[i]; device_list; device_list = device_list->next) { -+ /* in case of watchdog-device we add the timeout to the budget -+ regardless of if we got a reply or not -+ */ -+ if ((stonith_watchdog_timeout_ms > 0) -+ && pcmk__is_fencing_action(op->action) -+ && pcmk__str_eq(device_list->data, STONITH_WATCHDOG_ID, -+ pcmk__str_none) -+ && node_does_watchdog_fencing(op->target)) { -+ total_timeout += stonith_watchdog_timeout_ms / 1000; -+ continue; -+ } -+ - for (iter = op->query_results; iter != NULL; iter = iter->next) { - const peer_device_info_t *peer = iter->data; - --- -2.39.0 - diff --git a/pacemaker.spec b/pacemaker.spec index 7da7a5d..022a526 100644 --- a/pacemaker.spec +++ b/pacemaker.spec @@ -35,11 +35,11 @@ ## Upstream pacemaker version, and its package version (specversion ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) -%global pcmkversion 2.1.5 -%global specversion 9 +%global pcmkversion 2.1.6 +%global specversion 1 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build -%global commit a3f44794f94e1571c6ba0042915ade369b4ce4b1 +%global commit 802a72226be8f90747e9c897c626b060512d6fe6 ## Since git v2.11, the extent of abbreviation is autoscaled by default ## (used to be constant of 7), so we need to convey it for non-tags, too. @@ -233,7 +233,7 @@ Name: pacemaker Summary: Scalable High-Availability cluster resource manager Version: %{pcmkversion} Release: %{pcmk_release}%{?dist} -License: GPLv2+ and LGPLv2+ +License: GPL-2.0-or-later AND LGPL-2.1-or-later Url: https://www.clusterlabs.org/ # Example: https://codeload.github.com/ClusterLabs/pacemaker/tar.gz/e91769e @@ -248,17 +248,15 @@ Source0: https://codeload.github.com/%{github_owner}/%{name}/tar.gz/%{arch Source1: https://codeload.github.com/%{github_owner}/%{nagios_name}/tar.gz/%{nagios_archive_github_url} # upstream commits -Patch001: 001-sync-points.patch -Patch002: 002-remote-regression.patch -Patch003: 003-history-cleanup.patch -Patch004: 004-g_source_remove.patch -Patch005: 005-query-null.patch -Patch006: 006-watchdog-fencing-topology.patch +#Patch001: 001-xxxx.patch Requires: resource-agents Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} Requires: %{name}-cluster-libs%{?_isa} = %{version}-%{release} Requires: %{name}-cli = %{version}-%{release} +%if %{with stonithd} +Requires: %{python_name}-%{name} = %{version}-%{release} +%endif %{?systemd_requires} %if %{defined centos} @@ -271,6 +269,7 @@ ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 Requires: %{python_path} BuildRequires: %{python_name}-devel +BuildRequires: %{python_name}-setuptools # Pacemaker requires a minimum libqb functionality # RHEL requires a higher version than upstream, for qb_ipcc_connect_async() @@ -360,7 +359,7 @@ Available rpmbuild rebuild options: stonithd %package cli -License: GPLv2+ and LGPLv2+ +License: GPL-2.0-or-later AND LGPL-2.1-or-later Summary: Command line tools for controlling Pacemaker clusters Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} %if 0%{?supports_recommends} @@ -383,7 +382,7 @@ to query and control the cluster from machines that may, or may not, be part of the cluster. %package -n %{pkgname_pcmk_libs} -License: GPLv2+ and LGPLv2+ +License: GPL-2.0-or-later AND LGPL-2.1-or-later Summary: Core Pacemaker libraries Requires(pre): %{pkgname_shadow_utils} Requires: %{name}-schemas = %{version}-%{release} @@ -400,7 +399,7 @@ The %{pkgname_pcmk_libs} package contains shared libraries needed for cluster nodes and those just running the CLI tools. %package cluster-libs -License: GPLv2+ and LGPLv2+ +License: GPL-2.0-or-later AND LGPL-2.1-or-later Summary: Cluster Libraries used by Pacemaker Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} @@ -411,8 +410,22 @@ manager. The %{name}-cluster-libs package contains cluster-aware shared libraries needed for nodes that will form part of the cluster nodes. +%package -n %{python_name}-%{name} +License: LGPL-2.1-or-later +Summary: Python libraries for Pacemaker +Requires: %{python_path} +Requires: %{pkgname_pcmk_libs} = %{version}-%{release} +BuildArch: noarch + +%description -n %{python_name}-%{name} +Pacemaker is an advanced, scalable High-Availability cluster resource +manager. + +The %{python_name}-%{name} package contains a Python library that can be used +to interface with Pacemaker. + %package remote -License: GPLv2+ and LGPLv2+ +License: GPL-2.0-or-later AND LGPL-2.1-or-later Summary: Pacemaker remote executor daemon for non-cluster nodes Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} Requires: %{name}-cli = %{version}-%{release} @@ -431,7 +444,7 @@ which is capable of extending pacemaker functionality to remote nodes not running the full corosync/cluster stack. %package -n %{pkgname_pcmk_libs}-devel -License: GPLv2+ and LGPLv2+ +License: GPL-2.0-or-later AND LGPL-2.1-or-later Summary: Pacemaker development package Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} Requires: %{name}-cluster-libs%{?_isa} = %{version}-%{release} @@ -454,11 +467,12 @@ The %{pkgname_pcmk_libs}-devel package contains headers and shared libraries for developing tools for Pacemaker. %package cts -License: GPLv2+ and LGPLv2+ +License: GPL-2.0-or-later AND LGPL-2.1-or-later Summary: Test framework for cluster-related technologies like Pacemaker Requires: %{python_path} Requires: %{pkgname_pcmk_libs} = %{version}-%{release} Requires: %{name}-cli = %{version}-%{release} +Requires: %{python_name}-%{name} = %{version}-%{release} Requires: %{pkgname_procps} Requires: psmisc Requires: %{python_name}-psutil @@ -486,7 +500,7 @@ Pacemaker is an advanced, scalable High-Availability cluster resource manager. %package schemas -License: GPLv2+ +License: GPL-2.0-or-later Summary: Schemas and upgrade stylesheets for Pacemaker BuildArch: noarch @@ -558,6 +572,10 @@ export LDFLAGS_HARDENED_LIB="%{?_hardening_ldflags}" make %{_smp_mflags} V=1 +pushd python +%py3_build +popd + %check make %{_smp_mflags} check { cts/cts-scheduler --run load-stopped-loop \ @@ -575,6 +593,10 @@ make install \ DESTDIR=%{buildroot} V=1 docdir=%{pcmk_docdir} \ %{?_python_bytecompile_extra:%{?py_byte_compile:am__py_compile=true}} +pushd python +%py3_install +popd + mkdir -p %{buildroot}%{_datadir}/pacemaker/nagios/plugins-metadata for file in $(find %{nagios_name}-%{nagios_hash}/metadata -type f); do install -m 644 $file %{buildroot}%{_datadir}/pacemaker/nagios/plugins-metadata @@ -759,19 +781,22 @@ exit 0 %dir %{ocf_root}/resource.d %{ocf_root}/resource.d/pacemaker -%doc %{_mandir}/man7/* +%doc %{_mandir}/man7/*pacemaker* %exclude %{_mandir}/man7/pacemaker-controld.* %exclude %{_mandir}/man7/pacemaker-schedulerd.* %exclude %{_mandir}/man7/pacemaker-fenced.* %exclude %{_mandir}/man7/ocf_pacemaker_controld.* %exclude %{_mandir}/man7/ocf_pacemaker_o2cb.* %exclude %{_mandir}/man7/ocf_pacemaker_remote.* -%doc %{_mandir}/man8/* +%doc %{_mandir}/man8/crm*.8.gz %exclude %{_mandir}/man8/crm_master.* -%exclude %{_mandir}/man8/fence_legacy.* -%exclude %{_mandir}/man8/fence_watchdog.* -%exclude %{_mandir}/man8/pacemakerd.* -%exclude %{_mandir}/man8/pacemaker-remoted.* +%doc %{_mandir}/man8/attrd_updater.* +%doc %{_mandir}/man8/cibadmin.* +%if %{with cibsecrets} + %doc %{_mandir}/man8/cibsecret.* +%endif +%doc %{_mandir}/man8/iso8601.* +%doc %{_mandir}/man8/stonith_admin.* %license licenses/GPLv2 %doc COPYING @@ -802,6 +827,14 @@ exit 0 %doc COPYING %doc ChangeLog +%files -n %{python_name}-%{name} +%{python3_sitelib}/pacemaker/ +%{python3_sitelib}/pacemaker-*.egg-info +%exclude %{python3_sitelib}/pacemaker/_cts/ +%license licenses/LGPLv2.1 +%doc COPYING +%doc ChangeLog + %files remote %config(noreplace) %{_sysconfdir}/sysconfig/pacemaker # state directory is shared between the subpackets @@ -823,6 +856,7 @@ exit 0 %files cts %{python_site}/cts +%{python3_sitelib}/pacemaker/_cts/ %{_datadir}/pacemaker/tests %{_libexecdir}/pacemaker/cts-log-watcher @@ -834,8 +868,16 @@ exit 0 %files -n %{pkgname_pcmk_libs}-devel %{_includedir}/pacemaker -%{_libdir}/*.so -%{_libdir}/pkgconfig/*.pc +%{_libdir}/libcib.so +%{_libdir}/liblrmd.so +%{_libdir}/libcrmservice.so +%{_libdir}/libcrmcommon.so +%{_libdir}/libpe_status.so +%{_libdir}/libpe_rules.so +%{_libdir}/libpacemaker.so +%{_libdir}/libstonithd.so +%{_libdir}/libcrmcluster.so +%{_libdir}/pkgconfig/*pacemaker*.pc %license licenses/LGPLv2.1 %doc COPYING %doc ChangeLog @@ -856,6 +898,10 @@ exit 0 %license %{nagios_name}-%{nagios_hash}/COPYING %changelog +* Tue May 23 2023 Chris Lumens - 2.1.6-1 +- Rebase on upstream 2.1.6-rc2 release +- Resolves: rhbz2182482 + * Wed May 17 2023 Klaus Wenninger - 2.1.5-9 - Rebuild with incremented release to allow a safe upgrade from c8s/rhel-8 diff --git a/sources b/sources index 70aed1f..ae42c9c 100644 --- a/sources +++ b/sources @@ -1,2 +1,2 @@ SHA512 (nagios-agents-metadata-105ab8a7b2c16b9a29cf1c1596b80136eeef332b.tar.gz) = 11ddeb48a4929e7642b6dfa9c7962aa1d7a1af1c569830f55ed6cd6773abac13377317327bc1db8411c8077884f83f81cc54d746c834b63a99fa6dc219b5caad -SHA512 (pacemaker-a3f44794f.tar.gz) = 554f4c47fcf2f5be61afe9b1485a6bb19de1bdfc0ec8d9be44eba31d77d0df1c6f07b29590299c4664e676a5c90e9c5fdb8a84921665151211df92d8f549dbc9 +SHA512 (pacemaker-802a72226.tar.gz) = fa4259c1f44de38ced1488fa827441ad101f4760e7cb1962c94610a9a3ed5a5a8227cd307636525a14396311ce3c5ae13919a8111f734a1a59905f99f79081e4