diff --git a/.gitignore b/.gitignore index 3ddb934..e6eb4fe 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1 @@ -/ClusterLabs-pacemaker-*.tar.gz -/[Pp]acemaker-*.tar.gz -/nagios-agents-metadata-*.tar.gz +pacemaker-9a5e54bae.tar.gz diff --git a/001-econnrefused.patch b/001-econnrefused.patch new file mode 100644 index 0000000..3b26b84 --- /dev/null +++ b/001-econnrefused.patch @@ -0,0 +1,40 @@ +From 125b434943f57778816135ad147fc827fa706e99 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 4 Aug 2025 10:38:00 -0400 +Subject: [PATCH] Med: libpacemaker: Do not retry on ECONNREFUSED in tools. + +This is a regression introduced by e438946787. In that patch, what +we're trying to do is retry IPC connections between daemons. If a +daemon gets ECONNREFUSED when it initiates an IPC connection, the most +likely reason is that another daemon has been killed and is restarting +but is not yet ready to accept connections. Waiting and retrying +repeatedly is an acceptable way to deal with this. + +However, if a command line tool gets ECONNREFUSED, it's more likely that +the problem is the cluster isn't running at all. In this case, waiting +and retrying just introduces a delay for a situation that will never be +resolved. Reverting just the part in pcmk_cluster_queries.c should fix +this problem without affecting any of the daemons - they don't call this +code. + +Fixes RHEL-106594 +--- + lib/pacemaker/pcmk_cluster_queries.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/pacemaker/pcmk_cluster_queries.c b/lib/pacemaker/pcmk_cluster_queries.c +index 8a08d99180..2f91a68738 100644 +--- a/lib/pacemaker/pcmk_cluster_queries.c ++++ b/lib/pacemaker/pcmk_cluster_queries.c +@@ -360,7 +360,7 @@ ipc_connect(data_t *data, enum pcmk_ipc_server server, pcmk_ipc_callback_t cb, + pcmk_register_ipc_callback(api, cb, data); + } + +- rc = pcmk__connect_ipc_retry_conrefused(api, dispatch_type, 5); ++ rc = pcmk__connect_ipc(api, dispatch_type, 5); + if (rc != pcmk_rc_ok) { + if (rc == EREMOTEIO) { + data->pcmkd_state = pcmk_pacemakerd_state_remote; +-- +2.49.0 + diff --git a/001-reset-error-warning-flags.patch b/001-reset-error-warning-flags.patch deleted file mode 100644 index 4c6c12b..0000000 --- a/001-reset-error-warning-flags.patch +++ /dev/null @@ -1,26 +0,0 @@ -From 78e6b46a2bbb80af24a804045313370a6404a251 Mon Sep 17 00:00:00 2001 -From: Hideo Yamauchi -Date: Thu, 9 Jan 2025 08:32:48 +0900 -Subject: [PATCH] Mid: schedulerd: Resetting error and warning flags. - ---- - lib/pengine/status.c | 3 +++ - 1 file changed, 3 insertions(+), 0 deletions(-) - -diff --git a/lib/pengine/status.c b/lib/pengine/status.c -index a97d6f7..87b5f6c 100644 ---- a/lib/pengine/status.c -+++ b/lib/pengine/status.c -@@ -447,6 +447,9 @@ set_working_set_defaults(pcmk_scheduler_t *scheduler) - |pcmk__sched_stop_removed_resources - |pcmk__sched_cancel_removed_actions); - #endif -+ -+ pcmk__config_has_error = false; -+ pcmk__config_has_warning = false; - } - - pcmk_resource_t * --- -2.47.0 - diff --git a/002-corosync.patch b/002-corosync.patch new file mode 100644 index 0000000..3f048ce --- /dev/null +++ b/002-corosync.patch @@ -0,0 +1,75 @@ +From b1fd6ccea9083826c1c2fb40418651704989a904 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Wed, 13 Aug 2025 17:33:16 -0700 +Subject: [PATCH] Fix: cts: Start corosync using systemd if available + +As of corosync upstream commit ae859515, in systemd builds, +StateDirectory is set in the systemd corosync.service file. The corosync +state directory defaults to this value if not set in the corosync config +file. Corosync falls back to using /var/lib/corosync only if the systemd +StateDirectory is not set. + +The same commit removes /var/lib/corosync from RPM builds with systemd. + +As a result, if corosync was built with systemd, then starting corosync +outside of systemd fails unless /var/lib/corosync has been created +manually or through some other means. Starting corosync directly from +the command line fails with the following error, because the +STATE_DIRECTORY environment variable was not set by systemd: + +Cannot chdir to state directory /var/lib/corosync. No such file or +directory + +This causes Pacemaker's cts-fencing script to fail. + +This seems like a bug in corosync, as it now assumes that corosync will +always be started by systemd if available. Here, we work around it in +cts by doing exactly that. + +Signed-off-by: Reid Wahl +--- + python/pacemaker/_cts/corosync.py | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/python/pacemaker/_cts/corosync.py b/python/pacemaker/_cts/corosync.py +index 0a55dd7c96..beb574d2b8 100644 +--- a/python/pacemaker/_cts/corosync.py ++++ b/python/pacemaker/_cts/corosync.py +@@ -11,6 +11,7 @@ import tempfile + import time + + from pacemaker.buildoptions import BuildOptions ++from pacemaker._cts.environment import EnvFactory + from pacemaker._cts.process import killall, stdout_from_command + + +@@ -112,6 +113,9 @@ class Corosync: + self.logdir = logdir + self.cluster_name = cluster_name + ++ # The Corosync class doesn't use self._env._nodes, but the ++ # "--nodes" argument is required to be present and nonempty ++ self._env = EnvFactory().getInstance(args=["--nodes", "localhost"]) + self._existing_cfg_file = None + + def _ready(self, logfile, timeout=10): +@@ -149,10 +153,15 @@ class Corosync: + self.cluster_name, localname()) + logfile = corosync_log_file(BuildOptions.COROSYNC_CONFIG_FILE) + ++ if self._env["have_systemd"]: ++ cmd = ["systemctl", "start", "corosync.service"] ++ else: ++ cmd = ["corosync"] ++ + if self.verbose: + print("Starting corosync") + +- with subprocess.Popen("corosync", stdout=subprocess.PIPE, stderr=subprocess.PIPE) as test: ++ with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as test: + test.wait() + + # Wait for corosync to be ready before returning +-- +2.50.1 + diff --git a/002-remote-fencing.patch b/002-remote-fencing.patch deleted file mode 100644 index 739b63d..0000000 --- a/002-remote-fencing.patch +++ /dev/null @@ -1,353 +0,0 @@ -From a8065dbd5b5e5c56ce05830b2a8bafb40d5a57d4 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 28 Mar 2025 15:04:24 -0400 -Subject: [PATCH] Refactor: scheduler: Fix formatting in pe_can_fence. - ---- - lib/pengine/utils.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index 8a2e946c92..acd825b468 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -62,10 +62,10 @@ pe_can_fence(const pcmk_scheduler_t *scheduler, const pcmk_node_t *node) - } else if (scheduler->no_quorum_policy == pcmk_no_quorum_ignore) { - return true; - -- } else if(node == NULL) { -+ } else if (node == NULL) { - return false; - -- } else if(node->details->online) { -+ } else if (node->details->online) { - crm_notice("We can fence %s without quorum because they're in our membership", - pcmk__node_name(node)); - return true; --- -2.31.1 - -From 0912256460730ac5e64c41c72543253518370255 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 28 Mar 2025 15:08:56 -0400 -Subject: [PATCH] Med: scheduler: Don't always fence online remote nodes. - -Let's assume you have a cluster configured as follows: - -* Three nodes, plus one Pacemaker Remote node. -* At least two NICs on each node. -* Multiple layers of fencing, including fence_kdump. -* The timeout for fence_kdump is set higher on the real nodes than it is - on the remote node. -* A resource is configured that can only be run on the remote node. - -Now, let's assume that the node running the connection resource for the -remote node is disconnect from the rest of the cluster. In testing, -this disconnection was done by bringing one network interface down. - -Due to the fence timeouts, the following things will occur: - -* The node whose interface was brought down will split off into its own - cluster partition without quorum, while the other two nodes maintain - quorum. -* The partition with quorum will restart the remote node resource on - another real node in the partition. -* The node by itself will be fenced. However, due to the long - fence_kdump timeout, it will continue to make decisions regarding - resources. -* The node by itself will re-assign resources, including the remote - connection resource. This resource will be assigned back to the same - node again. -* The node by itself will decide to fence the remote node, which will - hit the "in our membership" clause of pe_can_fence. This is because - remote nodes are marked as online when they are assigned, not when - they are actually running. -* When the fence_kdump timeout expires, the node by itself will fence - the remote node. This succeeds because there is still a secondary - network connection it can use. This fencing will succeed, causing the - remote node to reboot and then causing a loss of service. -* The node by itself will then be fenced. - -The bug to me seems to be that the remote resource is marked as online -when it isn't yet. I think with that changed, all the other remote -fencing related code would then work as intended. However, it probably -has to remain as-is in order to schedule resources on the remote node - -resources probably can't be assigned to an offline node. Making changes -in pe_can_fence seems like the least invasive way to deal with this -problem. - -I also think this probably has probably been here for a very long time - -perhaps always - but we just haven't seen it due to the number of things -that have to be configured before it can show up. In particular, the -fencing timeouts and secondary network connection are what allow this -behavior to happen. - -I can't think of a good reason why a node without quorum would ever want -to fence a remote node, especially if the connection resource has been -moved to the quorate node. - -My fix here therefore is just to test whether there is another node it -could have been moved to and if so, don't fence it. ---- - lib/pengine/utils.c | 33 +++++++++++++++++++++++++++++++++ - 1 file changed, 33 insertions(+) - -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index acd825b468..1822acaa54 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -66,6 +66,39 @@ pe_can_fence(const pcmk_scheduler_t *scheduler, const pcmk_node_t *node) - return false; - - } else if (node->details->online) { -+ /* Remote nodes are marked online when we assign their resource to a -+ * node, not when they are actually started (see remote_connection_assigned) -+ * so the above test by itself isn't good enough. -+ */ -+ if (pcmk__is_pacemaker_remote_node(node)) { -+ /* If we're on a system without quorum, it's entirely possible that -+ * the remote resource was automatically moved to a node on the -+ * partition with quorum. We can't tell that from this node - the -+ * best we can do is check if it's possible for the resource to run -+ * on another node in the partition with quorum. If so, it has -+ * likely been moved and we shouldn't fence it. -+ * -+ * NOTE: This condition appears to only come up in very limited -+ * circumstances. It at least requires some very lengthy fencing -+ * timeouts set, some way for fencing to still take place (a second -+ * NIC is how I've reproduced it in testing, but fence_scsi or -+ * sbd could work too), and a resource that runs on the remote node. -+ */ -+ pcmk_resource_t *rsc = node->priv->remote; -+ pcmk_node_t *n = NULL; -+ GHashTableIter iter; -+ -+ g_hash_table_iter_init(&iter, rsc->priv->allowed_nodes); -+ while (g_hash_table_iter_next(&iter, NULL, (void **) &n)) { -+ /* A node that's not online according to this non-quorum node -+ * is a node that's in another partition. -+ */ -+ if (!n->details->online) { -+ return false; -+ } -+ } -+ } -+ - crm_notice("We can fence %s without quorum because they're in our membership", - pcmk__node_name(node)); - return true; --- -2.31.1 - -From b0e6544bbf578285918b69ff9c9b35d2c9f54713 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 9 Jun 2025 14:23:53 -0400 -Subject: [PATCH] Med: scheduler: Require a cluster option for old remote - fencing behavior. - -If the user wants to preserve the old fencing behavior, where a node -without quorum is allowed to fence remote nodes in the same partition -even if they've been restarted, they need to add -fence-remote-without-quorum="true" to their CIB. Omitting this option -or setting it to false will get the new remote fencing behavior. ---- - cts/cli/regression.crm_attribute.exp | 14 ++++++++++++++ - cts/cli/regression.daemons.exp | 9 +++++++++ - include/crm/common/options.h | 3 ++- - include/crm/common/scheduler_internal.h | 5 +++++ - lib/common/options.c | 12 +++++++++++- - lib/pengine/unpack.c | 8 ++++++++ - lib/pengine/utils.c | 3 ++- - 7 files changed, 51 insertions(+), 3 deletions(-) - -diff --git a/cts/cli/regression.crm_attribute.exp b/cts/cli/regression.crm_attribute.exp -index c84860490b..0fff171721 100644 ---- a/cts/cli/regression.crm_attribute.exp -+++ b/cts/cli/regression.crm_attribute.exp -@@ -272,6 +272,11 @@ Also known as properties, these are options that affect behavior across the enti - Whether the cluster should check for active resources during start-up - - -+ -+ By default, an inquorate node can not fence Pacemaker Remote nodes that are part of its partition as long as the cluster thinks they can be restarted. If true, inquorate nodes will be able to fence remote nodes regardless. -+ Whether remote nodes can be fenced without quorum -+ -+ - - If false, unresponsive nodes are immediately assumed to be harmless, and resources that were active on them may be recovered elsewhere. This can result in a "split-brain" situation, potentially leading to data loss and/or service unavailability. - Whether nodes may be fenced as part of recovery -@@ -598,6 +603,10 @@ Also known as properties, these are options that affect behavior across the enti - * Delay cluster recovery for this much time to allow for additional events to occur. Useful if your configuration is sensitive to the order in which ping updates arrive. - * Possible values: duration (default: ) - -+ * fence-remote-without-quorum: Whether remote nodes can be fenced without quorum -+ * By default, an inquorate node can not fence Pacemaker Remote nodes that are part of its partition as long as the cluster thinks they can be restarted. If true, inquorate nodes will be able to fence remote nodes regardless. -+ * Possible values: boolean (default: ) -+ - * stonith-enabled: Whether nodes may be fenced as part of recovery - * If false, unresponsive nodes are immediately assumed to be harmless, and resources that were active on them may be recovered elsewhere. This can result in a "split-brain" situation, potentially leading to data loss and/or service unavailability. - * Possible values: boolean (default: ) -@@ -724,6 +733,11 @@ Also known as properties, these are options that affect behavior across the enti - Whether the cluster should check for active resources during start-up - - -+ -+ By default, an inquorate node can not fence Pacemaker Remote nodes that are part of its partition as long as the cluster thinks they can be restarted. If true, inquorate nodes will be able to fence remote nodes regardless. -+ Whether remote nodes can be fenced without quorum -+ -+ - - If false, unresponsive nodes are immediately assumed to be harmless, and resources that were active on them may be recovered elsewhere. This can result in a "split-brain" situation, potentially leading to data loss and/or service unavailability. - Whether nodes may be fenced as part of recovery -diff --git a/cts/cli/regression.daemons.exp b/cts/cli/regression.daemons.exp -index 26e9286d58..09c7941fa8 100644 ---- a/cts/cli/regression.daemons.exp -+++ b/cts/cli/regression.daemons.exp -@@ -514,6 +514,15 @@ - - - -+ -+ -+ By default, an inquorate node can not fence Pacemaker Remote nodes that are part of its partition as long as the cluster thinks they can be restarted. If true, inquorate nodes will be able to fence remote nodes regardless. -+ -+ -+ *** Advanced Use Only *** Whether remote nodes can be fenced without quorum -+ -+ -+ - - - If false, unresponsive nodes are immediately assumed to be harmless, and resources that were active on them may be recovered elsewhere. This can result in a "split-brain" situation, potentially leading to data loss and/or service unavailability. -diff --git a/include/crm/common/options.h b/include/crm/common/options.h -index 91016315af..e425aa03d9 100644 ---- a/include/crm/common/options.h -+++ b/include/crm/common/options.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2024 the Pacemaker project contributors -+ * Copyright 2024-2025 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -37,6 +37,7 @@ extern "C" { - #define PCMK_OPT_ENABLE_ACL "enable-acl" - #define PCMK_OPT_ENABLE_STARTUP_PROBES "enable-startup-probes" - #define PCMK_OPT_FENCE_REACTION "fence-reaction" -+#define PCMK_OPT_FENCE_REMOTE_WITHOUT_QUORUM "fence-remote-without-quorum" - #define PCMK_OPT_HAVE_WATCHDOG "have-watchdog" - #define PCMK_OPT_JOIN_FINALIZATION_TIMEOUT "join-finalization-timeout" - #define PCMK_OPT_JOIN_INTEGRATION_TIMEOUT "join-integration-timeout" -diff --git a/include/crm/common/scheduler_internal.h b/include/crm/common/scheduler_internal.h -index 82805ac4ac..3fa2812b66 100644 ---- a/include/crm/common/scheduler_internal.h -+++ b/include/crm/common/scheduler_internal.h -@@ -154,6 +154,11 @@ enum pcmk__scheduler_flags { - * applying node-specific location criteria, assignment, etc.) - */ - pcmk__sched_validate_only = (1ULL << 27), -+ -+ /* Can Pacemaker Remote nodes be fenced even from a node that doesn't -+ * have quorum? -+ */ -+ pcmk__sched_fence_remote_no_quorum = (1ULL << 28), - }; - - // Implementation of pcmk__scheduler_private_t -diff --git a/lib/common/options.c b/lib/common/options.c -index 7ed6bd9990..b8f4943fda 100644 ---- a/lib/common/options.c -+++ b/lib/common/options.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2024 the Pacemaker project contributors -+ * Copyright 2004-2025 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -229,6 +229,16 @@ static const pcmk__cluster_option_t cluster_options[] = { - }, - - // Fencing-related options -+ { -+ PCMK_OPT_FENCE_REMOTE_WITHOUT_QUORUM, NULL, PCMK_VALUE_BOOLEAN, NULL, -+ PCMK_VALUE_FALSE, pcmk__valid_boolean, -+ pcmk__opt_schedulerd|pcmk__opt_advanced, -+ N_("Whether remote nodes can be fenced without quorum"), -+ N_("By default, an inquorate node can not fence Pacemaker Remote nodes " -+ "that are part of its partition as long as the cluster thinks they " -+ "can be restarted. If true, inquorate nodes will be able to fence " -+ "remote nodes regardless."), -+ }, - { - PCMK_OPT_STONITH_ENABLED, NULL, PCMK_VALUE_BOOLEAN, NULL, - PCMK_VALUE_TRUE, pcmk__valid_boolean, -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 83ecb2d838..2141fca6d8 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -423,6 +423,14 @@ unpack_config(xmlNode *config, pcmk_scheduler_t *scheduler) - pcmk__readable_interval(scheduler->priv->node_pending_ms)); - } - -+ set_config_flag(scheduler, PCMK_OPT_FENCE_REMOTE_WITHOUT_QUORUM, -+ pcmk__sched_fence_remote_no_quorum); -+ if (pcmk_is_set(scheduler->flags, pcmk__sched_fence_remote_no_quorum)) { -+ crm_trace("Pacemaker Remote nodes may be fenced without quorum"); -+ } else { -+ crm_trace("Pacemaker Remote nodes require quorum to be fenced"); -+ } -+ - return TRUE; - } - -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index 1822acaa54..664b91994e 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -70,7 +70,8 @@ pe_can_fence(const pcmk_scheduler_t *scheduler, const pcmk_node_t *node) - * node, not when they are actually started (see remote_connection_assigned) - * so the above test by itself isn't good enough. - */ -- if (pcmk__is_pacemaker_remote_node(node)) { -+ if (pcmk__is_pacemaker_remote_node(node) -+ && !pcmk_is_set(scheduler->flags, pcmk__sched_fence_remote_no_quorum)) { - /* If we're on a system without quorum, it's entirely possible that - * the remote resource was automatically moved to a node on the - * partition with quorum. We can't tell that from this node - the --- -2.31.1 - -From 168f0df263f739dc1e558b97aae6b49d5b7aa2c2 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 18 Jun 2025 08:41:18 -0400 -Subject: [PATCH] Feature: libcrmcommon: bump feature set to 3.20.1 - -...for Pacemaker Remote node fencing changes. ---- - include/crm/crm.h | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/include/crm/crm.h b/include/crm/crm.h -index 4d70c7d278..b8a213cb4d 100644 ---- a/include/crm/crm.h -+++ b/include/crm/crm.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2024 the Pacemaker project contributors -+ * Copyright 2004-2025 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -63,7 +63,7 @@ extern "C" { - * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED - * >=3.19.0: DC supports PCMK__CIB_REQUEST_COMMIT_TRANSACT - */ --#define CRM_FEATURE_SET "3.20.0" -+#define CRM_FEATURE_SET "3.20.1" - - /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and - * recipient of a CPG message. This imposes an arbitrary limit on cluster node --- -2.31.1 - diff --git a/003-transient_attrs.patch b/003-transient_attrs.patch new file mode 100644 index 0000000..59b34e2 --- /dev/null +++ b/003-transient_attrs.patch @@ -0,0 +1,1262 @@ +From 26c022d2a3b6061ff9a60f86e50834a08e8360d4 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Thu, 13 Nov 2025 02:14:45 -0800 +Subject: [PATCH 01/10] Fix: pacemaker-attrd: Wipe CIB along with memory + +Previously, when the attribute manager purged a node, it would purge the +node's transient attributes only from memory, and assumed the controller +would purge them from the CIB. Now, the writer will purge them from the +CIB as well. + +This fixes a variety of timing issues when multiple nodes including the +attribute writer are shutting down. If the writer leaves before some +other node, the DC wipes that other node's attributes from the CIB when +that other node leaves the controller process group (or all other nodes +do if the DC is the leaving node). If a new writer (possibly even the +node itself) is elected before the node's attribute manager leaves the +cluster layer, it will write the attributes back to the CIB. Once the +other node leaves the cluster layer, all attribute managers remove its +attributes from memory, but they are now "stuck" in the CIB. + +As of this commit, the controller still erases the attributes from the +CIB when the node leaves the controller process group, which is +redundant but doesn't cause any new problems. This will be corrected in +an upcoming commit. + +Note: This will cause an insignificant regression if backported to +Pacemaker 2. The Pacemaker 2 controller purges attributes from the CIB +for leaving DCs only if they are at version 1.1.13 or later, because +earlier DCs will otherwise get fenced after a clean shutdown. Since the +attribute manager doesn't know the DC or its version, the attributes +would now always be wiped, so old leaving DCs will get fenced. The +fencing would occur only in the highly unlikely situation of a rolling +upgrade from Pacemaker 2-supported versions 1.1.11 or 1.1.12, and the +upgrade would still succeed without any negative impact on resources. + +Fixes T138 + +Co-Authored-By: Ken Gaillot +Co-Authored-By: Chris Lumens + +Signed-off-by: Reid Wahl +--- + daemons/attrd/attrd_corosync.c | 93 +++++++++++++++++++++++++++++++++ + daemons/attrd/attrd_elections.c | 2 + + daemons/attrd/pacemaker-attrd.c | 1 + + daemons/attrd/pacemaker-attrd.h | 3 ++ + 4 files changed, 99 insertions(+) + +diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c +index 94fc85f..8497f34 100644 +--- a/daemons/attrd/attrd_corosync.c ++++ b/daemons/attrd/attrd_corosync.c +@@ -23,6 +23,43 @@ + + #include "pacemaker-attrd.h" + ++/*! ++ * \internal ++ * \brief Nodes removed by \c attrd_peer_remove() ++ * ++ * This table is to be used as a set. It contains nodes that have been removed ++ * by \c attrd_peer_remove() and whose transient attributes should be erased ++ * from the CIB. ++ * ++ * Setting an attribute value for a node via \c update_attr_on_host() removes ++ * the node from the table. At that point, we have transient attributes in ++ * memory for the node, so it should no longer be erased from the CIB. ++ * ++ * If another node erases a removed node's transient attributes from the CIB, ++ * the removed node remains in this table until an attribute value is set for ++ * it. This is for convenience: it avoids the need to monitor for CIB updates ++ * that erase a node's \c node_state or \c transient attributes element, just to ++ * remove the node from the table. ++ * ++ * Leaving a removed node in the table after erasure should be harmless. If a ++ * node is in this table, then we have no transient attributes for it in memory. ++ * If for some reason we erase its transient attributes from the CIB twice, its ++ * state in the CIB will still be correct. ++ */ ++static GHashTable *removed_peers = NULL; ++ ++/*! ++ * \internal ++ * \brief Free the removed nodes table ++ */ ++void ++attrd_free_removed_peers(void) ++{ ++ if (removed_peers != NULL) { ++ g_hash_table_destroy(removed_peers); ++ } ++} ++ + static xmlNode * + attrd_confirmation(int callid) + { +@@ -236,6 +273,10 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer, + const char *prev_xml_id = NULL; + const char *node_xml_id = crm_element_value(xml, PCMK__XA_ATTR_HOST_ID); + ++ if (removed_peers != NULL) { ++ g_hash_table_remove(removed_peers, host); ++ } ++ + // Create entry for value if not already existing + v = g_hash_table_lookup(a->values, host); + if (v == NULL) { +@@ -530,6 +571,29 @@ attrd_peer_sync_response(const pcmk__node_status_t *peer, bool peer_won, + } + } + ++/*! ++ * \internal ++ * \brief Erase all removed nodes' transient attributes from the CIB ++ * ++ * This should be called by a newly elected writer upon winning the election. ++ */ ++void ++attrd_erase_removed_peer_attributes(void) ++{ ++ const char *host = NULL; ++ GHashTableIter iter; ++ ++ if (!attrd_election_won() || (removed_peers == NULL)) { ++ return; ++ } ++ ++ g_hash_table_iter_init(&iter, removed_peers); ++ while (g_hash_table_iter_next(&iter, (gpointer *) &host, NULL)) { ++ attrd_cib_erase_transient_attrs(host); ++ g_hash_table_iter_remove(&iter); ++ } ++} ++ + /*! + * \internal + * \brief Remove all attributes and optionally peer cache entries for a node +@@ -556,6 +620,35 @@ attrd_peer_remove(const char *host, bool uncache, const char *source) + } + } + ++ if (attrd_election_won()) { ++ // We are the writer. Wipe node's transient attributes from CIB now. ++ attrd_cib_erase_transient_attrs(host); ++ ++ } else { ++ /* Make sure the attributes get erased from the CIB eventually. ++ * - If there's already a writer, it will call this function and enter ++ * the "if" block above, requesting the erasure (unless it leaves ++ * before sending the request -- see below). ++ * attrd_start_election_if_needed() will do nothing here. ++ * - Otherwise, we ensure an election is happening (unless we're ++ * shutting down). The winner will erase transient attributes from the ++ * CIB for all removed nodes in attrd_election_cb(). ++ * ++ * We add the node to the removed_peers table in case we win an election ++ * and need to request CIB erasures based on the table contents. This ++ * could happen for either of two reasons: ++ * - There is no current writer and we're not shutting down. An election ++ * either is already in progress or will be triggered here. ++ * - The current writer leaves before sending the CIB update request. A ++ * new election will be triggered. ++ */ ++ if (removed_peers == NULL) { ++ removed_peers = pcmk__strikey_table(free, NULL); ++ } ++ g_hash_table_add(removed_peers, pcmk__str_copy(host)); ++ attrd_start_election_if_needed(); ++ } ++ + if (uncache) { + pcmk__purge_node_from_cache(host, 0); + attrd_forget_node_xml_id(host); +diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c +index 281ec12..e75a1d3 100644 +--- a/daemons/attrd/attrd_elections.c ++++ b/daemons/attrd/attrd_elections.c +@@ -24,6 +24,8 @@ attrd_election_cb(pcmk_cluster_t *cluster) + /* Update the peers after an election */ + attrd_peer_sync(NULL); + ++ attrd_erase_removed_peer_attributes(); ++ + /* After winning an election, update the CIB with the values of all + * attributes as the winner knows them. + */ +diff --git a/daemons/attrd/pacemaker-attrd.c b/daemons/attrd/pacemaker-attrd.c +index 7711fd2..3fa099b 100644 +--- a/daemons/attrd/pacemaker-attrd.c ++++ b/daemons/attrd/pacemaker-attrd.c +@@ -201,6 +201,7 @@ main(int argc, char **argv) + attrd_cib_disconnect(); + } + ++ attrd_free_removed_peers(); + attrd_free_waitlist(); + pcmk_cluster_disconnect(attrd_cluster); + pcmk_cluster_free(attrd_cluster); +diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h +index d9423c8..80ae0d9 100644 +--- a/daemons/attrd/pacemaker-attrd.h ++++ b/daemons/attrd/pacemaker-attrd.h +@@ -184,6 +184,9 @@ extern GHashTable *peer_protocol_vers; + + #define CIB_OP_TIMEOUT_S 120 + ++void attrd_free_removed_peers(void); ++void attrd_erase_removed_peer_attributes(void); ++ + int attrd_cluster_connect(void); + void attrd_broadcast_value(const attribute_t *a, const attribute_value_t *v); + void attrd_peer_update(const pcmk__node_status_t *peer, xmlNode *xml, +-- +2.47.1 + +From 9db7cad74c9c051761c9d8a099a235cc2320f35d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 14 Dec 2023 14:56:11 -0600 +Subject: [PATCH 02/10] Low: pacemaker-attrd: Drop "requesting shutdown" code + +The requesting_shutdown variable was checked only by +attrd_shutting_down(), when the if_requested argument was set to true. +In that case, it returned true if either the shutting_down variable was +true or both the if_requested argument and the requesting_shutdown +variable were true. + +The only caller that passed if_requested=true was +attrd_cib_updated_cb(). It did this if: +a. the alerts section was changed, or +b. the status section or nodes section was changed by an untrusted + client. + +Details: +a. Prior to f42e170, we didn't pass if_requested=true for an alerts + section change. We started doing so as of that commit mostly for + convenience. We decided that it seemed reasonable to ignore alert + changes when there was a shutdown pending. + + This commit reverts to NOT ignoring alert changes due to pending + shutdown. That seems like it might be better. I'm not sure if it's + possible for us to land in attrd_send_attribute_alert() while a + shutdown is requested but has not begun. If so, it would be good to + send the correct alerts. + +b. The other call with true is to avoid writing out all attributes when + the status or nodes section changes. It's probably okay to drop the + true there too. It was added by a1a9c54, to resolve a race condition + where: + * node2 left. + * node1's controller deleted node2's transient attributes from the + CIB. + * node1 took over as DC and replaced the CIB. + * node2's attribute manager was not yet actually shutting down, and + it responded to the CIB replacement by writing out all of the + attributes that were in its memory, including its own "shutdown" + attribute. + + Now (as of the previous commit), node1's attribute manager would + delete this "shutdown" attribute as part of its shutdown process. (Or + more accurately, I think the attribute writer node will do that.) + + So if we understand correctly, the attrd_shutting_down(true) + workaround is no longer needed. + +With no more callers needing to pass true, the supporting code can go +away. + +Co-Authored-By: Reid Wahl +--- + daemons/attrd/attrd_cib.c | 6 +++--- + daemons/attrd/attrd_corosync.c | 15 ++----------- + daemons/attrd/attrd_elections.c | 4 ++-- + daemons/attrd/attrd_ipc.c | 2 +- + daemons/attrd/attrd_utils.c | 37 ++++----------------------------- + daemons/attrd/pacemaker-attrd.h | 4 +--- + 6 files changed, 13 insertions(+), 55 deletions(-) + +diff --git a/daemons/attrd/attrd_cib.c b/daemons/attrd/attrd_cib.c +index 4231e4a..acd4621 100644 +--- a/daemons/attrd/attrd_cib.c ++++ b/daemons/attrd/attrd_cib.c +@@ -34,7 +34,7 @@ attrd_cib_destroy_cb(gpointer user_data) + + cib->cmds->signoff(cib); + +- if (attrd_shutting_down(false)) { ++ if (attrd_shutting_down()) { + crm_info("Disconnected from the CIB manager"); + + } else { +@@ -57,7 +57,7 @@ attrd_cib_updated_cb(const char *event, xmlNode *msg) + } + + if (pcmk__cib_element_in_patchset(patchset, PCMK_XE_ALERTS)) { +- if (attrd_shutting_down(true)) { ++ if (attrd_shutting_down()) { + crm_debug("Ignoring alerts change in CIB during shutdown"); + } else { + mainloop_set_trigger(attrd_config_read); +@@ -82,7 +82,7 @@ attrd_cib_updated_cb(const char *event, xmlNode *msg) + if (status_changed + || pcmk__cib_element_in_patchset(patchset, PCMK_XE_NODES)) { + +- if (attrd_shutting_down(true)) { ++ if (attrd_shutting_down()) { + crm_debug("Ignoring node change in CIB during shutdown"); + return; + } +diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c +index 8497f34..02ddec6 100644 +--- a/daemons/attrd/attrd_corosync.c ++++ b/daemons/attrd/attrd_corosync.c +@@ -83,7 +83,7 @@ attrd_peer_message(pcmk__node_status_t *peer, xmlNode *xml) + return; + } + +- if (attrd_shutting_down(false)) { ++ if (attrd_shutting_down()) { + /* If we're shutting down, we want to continue responding to election + * ops as long as we're a cluster member (because our vote may be + * needed). Ignore all other messages. +@@ -166,7 +166,7 @@ attrd_cpg_dispatch(cpg_handle_t handle, + static void + attrd_cpg_destroy(gpointer unused) + { +- if (attrd_shutting_down(false)) { ++ if (attrd_shutting_down()) { + crm_info("Disconnected from Corosync process group"); + + } else { +@@ -328,17 +328,6 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer, + pcmk__str_update(&v->current, value); + attrd_set_attr_flags(a, attrd_attr_changed); + +- if (pcmk__str_eq(host, attrd_cluster->priv->node_name, pcmk__str_casei) +- && pcmk__str_eq(attr, PCMK__NODE_ATTR_SHUTDOWN, pcmk__str_none)) { +- +- if (!pcmk__str_eq(value, "0", pcmk__str_null_matches)) { +- attrd_set_requesting_shutdown(); +- +- } else { +- attrd_clear_requesting_shutdown(); +- } +- } +- + // Write out new value or start dampening timer + if (a->timeout_ms && a->timer) { + crm_trace("Delaying write of %s %s for dampening", +diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c +index e75a1d3..eb9ef8c 100644 +--- a/daemons/attrd/attrd_elections.c ++++ b/daemons/attrd/attrd_elections.c +@@ -43,7 +43,7 @@ attrd_start_election_if_needed(void) + { + if ((peer_writer == NULL) + && (election_state(attrd_cluster) != election_in_progress) +- && !attrd_shutting_down(false)) { ++ && !attrd_shutting_down()) { + + crm_info("Starting an election to determine the writer"); + election_vote(attrd_cluster); +@@ -65,7 +65,7 @@ attrd_handle_election_op(const pcmk__node_status_t *peer, xmlNode *xml) + crm_xml_add(xml, PCMK__XA_SRC, peer->name); + + // Don't become writer if we're shutting down +- rc = election_count_vote(attrd_cluster, xml, !attrd_shutting_down(false)); ++ rc = election_count_vote(attrd_cluster, xml, !attrd_shutting_down()); + + switch(rc) { + case election_start: +diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c +index 43e0f41..8a3bb36 100644 +--- a/daemons/attrd/attrd_ipc.c ++++ b/daemons/attrd/attrd_ipc.c +@@ -492,7 +492,7 @@ static int32_t + attrd_ipc_accept(qb_ipcs_connection_t *c, uid_t uid, gid_t gid) + { + crm_trace("New client connection %p", c); +- if (attrd_shutting_down(false)) { ++ if (attrd_shutting_down()) { + crm_info("Ignoring new connection from pid %d during shutdown", + pcmk__client_pid(c)); + return -ECONNREFUSED; +diff --git a/daemons/attrd/attrd_utils.c b/daemons/attrd/attrd_utils.c +index f219b88..e3e814d 100644 +--- a/daemons/attrd/attrd_utils.c ++++ b/daemons/attrd/attrd_utils.c +@@ -25,7 +25,6 @@ + + cib_t *the_cib = NULL; + +-static bool requesting_shutdown = false; + static bool shutting_down = false; + static GMainLoop *mloop = NULL; + +@@ -34,45 +33,17 @@ static GMainLoop *mloop = NULL; + */ + GHashTable *peer_protocol_vers = NULL; + +-/*! +- * \internal +- * \brief Set requesting_shutdown state +- */ +-void +-attrd_set_requesting_shutdown(void) +-{ +- requesting_shutdown = true; +-} +- +-/*! +- * \internal +- * \brief Clear requesting_shutdown state +- */ +-void +-attrd_clear_requesting_shutdown(void) +-{ +- requesting_shutdown = false; +-} +- + /*! + * \internal + * \brief Check whether local attribute manager is shutting down + * +- * \param[in] if_requested If \c true, also consider presence of +- * \c PCMK__NODE_ATTR_SHUTDOWN attribute +- * +- * \return \c true if local attribute manager has begun shutdown sequence +- * or (if \p if_requested is \c true) whether local node has a nonzero +- * \c PCMK__NODE_ATTR_SHUTDOWN attribute set, otherwise \c false +- * \note Most callers should pass \c false for \p if_requested, because the +- * attribute manager needs to continue performing while the controller is +- * shutting down, and even needs to be eligible for election in case all +- * nodes are shutting down. ++ * \return \c true if local attribute manager has begun shutdown sequence, ++ * otherwise \c false + */ + bool +-attrd_shutting_down(bool if_requested) ++attrd_shutting_down(void) + { +- return shutting_down || (if_requested && requesting_shutdown); ++ return shutting_down; + } + + /*! +diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h +index 80ae0d9..d3e5765 100644 +--- a/daemons/attrd/pacemaker-attrd.h ++++ b/daemons/attrd/pacemaker-attrd.h +@@ -56,10 +56,8 @@ + void attrd_init_mainloop(void); + void attrd_run_mainloop(void); + +-void attrd_set_requesting_shutdown(void); +-void attrd_clear_requesting_shutdown(void); + void attrd_free_waitlist(void); +-bool attrd_shutting_down(bool if_requested); ++bool attrd_shutting_down(void); + void attrd_shutdown(int nsig); + void attrd_init_ipc(void); + void attrd_ipc_fini(void); +-- +2.47.1 + +From 19a157cb90466aaa5d929573edeabded3ba047ef Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 18 Dec 2023 11:38:00 -0600 +Subject: [PATCH 03/10] Low: controller: don't need to erase node attributes + for remote nodes + +Now that the attribute manager will erase transient attributes from the +CIB when purging a node, we don't need to do that separately in the +controller. + +Co-Authored-By: Chris Lumens +--- + daemons/controld/controld_remote_ra.c | 41 +++++++-------------------- + 1 file changed, 11 insertions(+), 30 deletions(-) + +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 1cc4ae0..c9adf97 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -237,35 +237,19 @@ should_purge_attributes(pcmk__node_status_t *node) + return true; + } + +-static enum controld_section_e +-section_to_delete(bool purge) +-{ +- if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { +- if (purge) { +- return controld_section_all_unlocked; +- } else { +- return controld_section_lrm_unlocked; +- } +- } else { +- if (purge) { +- return controld_section_all; +- } else { +- return controld_section_lrm; +- } +- } +-} +- + static void + purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node) + { +- bool purge = should_purge_attributes(node); +- enum controld_section_e section = section_to_delete(purge); ++ enum controld_section_e section = controld_section_lrm; + +- /* Purge node from attrd's memory */ +- if (purge) { ++ // Purge node's transient attributes (from attribute manager and CIB) ++ if (should_purge_attributes(node)) { + update_attrd_remote_node_removed(node->name, NULL); + } + ++ if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { ++ section = controld_section_lrm_unlocked; ++ } + controld_delete_node_state(node->name, section, call_opt); + } + +@@ -367,18 +351,15 @@ remote_node_down(const char *node_name, const enum down_opts opts) + int call_opt = crmd_cib_smart_opt(); + pcmk__node_status_t *node = NULL; + +- /* Purge node from attrd's memory */ ++ // Purge node's transient attributes (from attribute manager and CIB) + update_attrd_remote_node_removed(node_name, NULL); + +- /* Normally, only node attributes should be erased, and the resource history +- * should be kept until the node comes back up. However, after a successful +- * fence, we want to clear the history as well, so we don't think resources +- * are still running on the node. ++ /* Normally, the resource history should be kept until the node comes back ++ * up. However, after a successful fence, clear the history so we don't ++ * think resources are still running on the node. + */ + if (opts == DOWN_ERASE_LRM) { +- controld_delete_node_state(node_name, controld_section_all, call_opt); +- } else { +- controld_delete_node_state(node_name, controld_section_attrs, call_opt); ++ controld_delete_node_state(node_name, controld_section_lrm, call_opt); + } + + /* Ensure node is in the remote peer cache with lost state */ +-- +2.47.1 + +From d49965412a5433a9a92463178d69074da9b3c349 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 14 Dec 2023 15:42:39 -0600 +Subject: [PATCH 04/10] Refactor: controller: Allow purging node attrs without + cache removal + +Nothing uses the new capability yet. +--- + daemons/controld/controld_attrd.c | 22 +++++++++++++++------- + daemons/controld/controld_remote_ra.c | 4 ++-- + daemons/controld/controld_utils.h | 2 +- + 3 files changed, 18 insertions(+), 10 deletions(-) + +diff --git a/daemons/controld/controld_attrd.c b/daemons/controld/controld_attrd.c +index eff8070..c8591ef 100644 +--- a/daemons/controld/controld_attrd.c ++++ b/daemons/controld/controld_attrd.c +@@ -106,8 +106,15 @@ update_attrd_list(GList *attrs, uint32_t opts) + } + } + ++/*! ++ * \internal ++ * \brief Ask attribute manager to purge a node and its transient attributes ++ * ++ * \param[in] node_name Node to purge ++ * \param[in] from_cache If true, purge from node caches as well ++ */ + void +-update_attrd_remote_node_removed(const char *host, const char *user_name) ++controld_purge_node_attrs(const char *node_name, bool from_cache) + { + int rc = pcmk_rc_ok; + +@@ -115,14 +122,15 @@ update_attrd_remote_node_removed(const char *host, const char *user_name) + rc = pcmk_new_ipc_api(&attrd_api, pcmk_ipc_attrd); + } + if (rc == pcmk_rc_ok) { +- crm_trace("Asking attribute manager to purge Pacemaker Remote node %s", +- host); +- rc = pcmk__attrd_api_purge(attrd_api, host, true); ++ crm_debug("Asking %s to purge transient attributes%s for %s", ++ pcmk_ipc_name(attrd_api, true), ++ (from_cache? " and node cache" : ""), node_name); ++ rc = pcmk__attrd_api_purge(attrd_api, node_name, from_cache); + } + if (rc != pcmk_rc_ok) { +- crm_err("Could not purge Pacemaker Remote node %s " +- "in attribute manager%s: %s " QB_XS " rc=%d", +- host, when(), pcmk_rc_str(rc), rc); ++ crm_err("Could not purge node %s from %s%s: %s " ++ QB_XS " rc=%d", node_name, pcmk_ipc_name(attrd_api, true), ++ when(), pcmk_rc_str(rc), rc); + } + } + +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index c9adf97..3136180 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -244,7 +244,7 @@ purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node) + + // Purge node's transient attributes (from attribute manager and CIB) + if (should_purge_attributes(node)) { +- update_attrd_remote_node_removed(node->name, NULL); ++ controld_purge_node_attrs(node->name, true); + } + + if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { +@@ -352,7 +352,7 @@ remote_node_down(const char *node_name, const enum down_opts opts) + pcmk__node_status_t *node = NULL; + + // Purge node's transient attributes (from attribute manager and CIB) +- update_attrd_remote_node_removed(node_name, NULL); ++ controld_purge_node_attrs(node_name, true); + + /* Normally, the resource history should be kept until the node comes back + * up. However, after a successful fence, clear the history so we don't +diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h +index e633888..262e0d1 100644 +--- a/daemons/controld/controld_utils.h ++++ b/daemons/controld/controld_utils.h +@@ -69,7 +69,7 @@ void crm_update_quorum(gboolean quorum, gboolean force_update); + void controld_close_attrd_ipc(void); + void update_attrd(const char *host, const char *name, const char *value, const char *user_name, gboolean is_remote_node); + void update_attrd_list(GList *attrs, uint32_t opts); +-void update_attrd_remote_node_removed(const char *host, const char *user_name); ++void controld_purge_node_attrs(const char *node_name, bool from_cache); + void update_attrd_clear_failures(const char *host, const char *rsc, + const char *op, const char *interval_spec, + gboolean is_remote_node); +-- +2.47.1 + +From 5fb8fdc72f457c7e9a691c10a99d54d0e03bd77d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 14 Dec 2023 16:09:40 -0600 +Subject: [PATCH 05/10] Fix: controller: Don't purge transient attributes on + node loss + +With recent changes, the attribute manager now handles it when the node +leaves the cluster, so the controller purge is redundant. + +This does alter the timing somewhat, since the controller's purge +occurred when the node left the controller process group, while the +attribute manager's purge occurs when it leaves the cluster, but that +shouldn't make a significant difference. + +This fixes a problem when a node's controller crashes and is respawned +while fencing is disabled. Previously, another node's controller would +remove that node's transient attributes from the CIB, but they would +remain in the attribute managers' memory. Now, the attributes are +correctly retained in the CIB in this situation. + +Fixes T137 +Fixes T139 + +Co-Authored-By: Chris Lumens +--- + daemons/controld/controld_callbacks.c | 14 +------------- + 1 file changed, 1 insertion(+), 13 deletions(-) + +diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c +index 48c255e..57e5183 100644 +--- a/daemons/controld/controld_callbacks.c ++++ b/daemons/controld/controld_callbacks.c +@@ -233,19 +233,11 @@ peer_update_callback(enum pcmk__node_update type, pcmk__node_status_t *node, + pcmk__str_casei) + && !pcmk__cluster_is_node_active(node)) { + +- /* The DC has left, so delete its transient attributes and +- * trigger a new election. +- * +- * A DC sends its shutdown request to all peers, who update the +- * DC's expected state to down. This avoids fencing upon +- * deletion of its transient attributes. +- */ ++ // The DC has left, so trigger a new election + crm_notice("Our peer on the DC (%s) is dead", + controld_globals.dc_name); + + register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); +- controld_delete_node_state(node->name, controld_section_attrs, +- cib_none); + + } else if (AM_I_DC + || pcmk_is_set(controld_globals.flags, controld_dc_left) +@@ -256,10 +248,6 @@ peer_update_callback(enum pcmk__node_update type, pcmk__node_status_t *node, + */ + if (appeared) { + te_trigger_stonith_history_sync(FALSE); +- } else { +- controld_delete_node_state(node->name, +- controld_section_attrs, +- cib_none); + } + } + break; +-- +2.47.1 + +From c40026fb77a6f7ee804979293e3019943a34e06b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 18 Dec 2023 13:05:35 -0600 +Subject: [PATCH 06/10] Low: controller: Ask attribute manager to purge fenced + nodes' attributes + +...instead of wiping from the CIB directly. + +Co-Authored-By: Chris Lumens +--- + daemons/controld/controld_fencing.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 51367ca..de074aa 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -267,7 +267,13 @@ update_node_state_after_fencing(const char *target, const char *target_xml_id) + crm_debug("Updating node state for %s after fencing (call %d)", target, rc); + fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated); + +- controld_delete_node_state(peer->name, controld_section_all, cib_none); ++ // Delete node's resource history from CIB ++ controld_delete_node_state(peer->name, controld_section_lrm, cib_none); ++ ++ // Ask attribute manager to delete node's transient attributes ++ // @TODO: This is the only call to controld_purge_node_attrs that doesn't ++ // want to also purge the node from the caches. Why? ++ controld_purge_node_attrs(peer->name, false); + } + + /*! +-- +2.47.1 + +From d9d19827d93f2394a831a9651aae064ea5a04fa4 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 18 Dec 2023 13:14:53 -0600 +Subject: [PATCH 07/10] Refactor: controller: Drop no-longer-used section enum + values + +--- + daemons/controld/controld_cib.c | 24 ------------------------ + daemons/controld/controld_cib.h | 5 +---- + 2 files changed, 1 insertion(+), 28 deletions(-) + +diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c +index e2a0d50..39c2b06 100644 +--- a/daemons/controld/controld_cib.c ++++ b/daemons/controld/controld_cib.c +@@ -279,17 +279,6 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, + "[not(@" PCMK_OPT_SHUTDOWN_LOCK ") " \ + "or " PCMK_OPT_SHUTDOWN_LOCK "<%lld]" + +-// Node's PCMK__XE_TRANSIENT_ATTRIBUTES section (name 1x) +-#define XPATH_NODE_ATTRS XPATH_NODE_STATE "/" PCMK__XE_TRANSIENT_ATTRIBUTES +- +-// Everything under PCMK__XE_NODE_STATE (name 1x) +-#define XPATH_NODE_ALL XPATH_NODE_STATE "/*" +- +-/* Unlocked history + transient attributes +- * (name 2x, (seconds_since_epoch - PCMK_OPT_SHUTDOWN_LOCK_LIMIT) 1x, name 1x) +- */ +-#define XPATH_NODE_ALL_UNLOCKED XPATH_NODE_LRM_UNLOCKED "|" XPATH_NODE_ATTRS +- + /*! + * \internal + * \brief Get the XPath and description of a node state section to be deleted +@@ -320,19 +309,6 @@ controld_node_state_deletion_strings(const char *uname, + uname, uname, expire); + desc_pre = "resource history (other than shutdown locks)"; + break; +- case controld_section_attrs: +- *xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname); +- desc_pre = "transient attributes"; +- break; +- case controld_section_all: +- *xpath = crm_strdup_printf(XPATH_NODE_ALL, uname); +- desc_pre = "all state"; +- break; +- case controld_section_all_unlocked: +- *xpath = crm_strdup_printf(XPATH_NODE_ALL_UNLOCKED, +- uname, uname, expire, uname); +- desc_pre = "all state (other than shutdown locks)"; +- break; + default: + // We called this function incorrectly + pcmk__assert(false); +diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h +index b8622d5..25277e7 100644 +--- a/daemons/controld/controld_cib.h ++++ b/daemons/controld/controld_cib.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2024 the Pacemaker project contributors ++ * Copyright 2004-2025 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -50,9 +50,6 @@ unsigned int cib_op_timeout(void); + enum controld_section_e { + controld_section_lrm, + controld_section_lrm_unlocked, +- controld_section_attrs, +- controld_section_all, +- controld_section_all_unlocked + }; + + void controld_node_state_deletion_strings(const char *uname, +-- +2.47.1 + +From 1056a0e3f6b618c23eb5a73d7e4a600619713a0c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 18 Dec 2023 13:39:49 -0600 +Subject: [PATCH 08/10] Refactor: controller: Drop node state section enum + +It now boils down to a bool for whether we want only unlocked resources. +--- + daemons/controld/controld_cib.c | 48 +++++++++++---------------- + daemons/controld/controld_cib.h | 13 ++------ + daemons/controld/controld_execd.c | 3 +- + daemons/controld/controld_fencing.c | 2 +- + daemons/controld/controld_join_dc.c | 9 +++-- + daemons/controld/controld_remote_ra.c | 10 +++--- + 6 files changed, 32 insertions(+), 53 deletions(-) + +diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c +index 39c2b06..298c321 100644 +--- a/daemons/controld/controld_cib.c ++++ b/daemons/controld/controld_cib.c +@@ -281,16 +281,15 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, + + /*! + * \internal +- * \brief Get the XPath and description of a node state section to be deleted ++ * \brief Get the XPath and description of resource history to be deleted + * +- * \param[in] uname Desired node +- * \param[in] section Subsection of \c PCMK__XE_NODE_STATE to be deleted +- * \param[out] xpath Where to store XPath of \p section +- * \param[out] desc If not \c NULL, where to store description of \p section ++ * \param[in] uname Name of node to delete resource history for ++ * \param[in] unlocked_only If true, delete history of only unlocked resources ++ * \param[out] xpath Where to store XPath for history deletion ++ * \param[out] desc If not NULL, where to store loggable description + */ + void +-controld_node_state_deletion_strings(const char *uname, +- enum controld_section_e section, ++controld_node_state_deletion_strings(const char *uname, bool unlocked_only, + char **xpath, char **desc) + { + const char *desc_pre = NULL; +@@ -299,20 +298,13 @@ controld_node_state_deletion_strings(const char *uname, + long long expire = (long long) time(NULL) + - controld_globals.shutdown_lock_limit; + +- switch (section) { +- case controld_section_lrm: +- *xpath = crm_strdup_printf(XPATH_NODE_LRM, uname); +- desc_pre = "resource history"; +- break; +- case controld_section_lrm_unlocked: +- *xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED, +- uname, uname, expire); +- desc_pre = "resource history (other than shutdown locks)"; +- break; +- default: +- // We called this function incorrectly +- pcmk__assert(false); +- break; ++ if (unlocked_only) { ++ *xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED, ++ uname, uname, expire); ++ desc_pre = "resource history (other than shutdown locks)"; ++ } else { ++ *xpath = crm_strdup_printf(XPATH_NODE_LRM, uname); ++ desc_pre = "resource history"; + } + + if (desc != NULL) { +@@ -322,15 +314,14 @@ controld_node_state_deletion_strings(const char *uname, + + /*! + * \internal +- * \brief Delete subsection of a node's CIB \c PCMK__XE_NODE_STATE ++ * \brief Delete a node's resource history from the CIB + * +- * \param[in] uname Desired node +- * \param[in] section Subsection of \c PCMK__XE_NODE_STATE to delete +- * \param[in] options CIB call options to use ++ * \param[in] uname Name of node to delete resource history for ++ * \param[in] unlocked_only If true, delete history of only unlocked resources ++ * \param[in] options CIB call options to use + */ + void +-controld_delete_node_state(const char *uname, enum controld_section_e section, +- int options) ++controld_delete_node_state(const char *uname, bool unlocked_only, int options) + { + cib_t *cib = controld_globals.cib_conn; + char *xpath = NULL; +@@ -339,8 +330,7 @@ controld_delete_node_state(const char *uname, enum controld_section_e section, + + pcmk__assert((uname != NULL) && (cib != NULL)); + +- controld_node_state_deletion_strings(uname, section, &xpath, &desc); +- ++ controld_node_state_deletion_strings(uname, unlocked_only, &xpath, &desc); + cib__set_call_options(options, "node state deletion", + cib_xpath|cib_multiple); + cib_rc = cib->cmds->remove(cib, xpath, NULL, options); +diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h +index 25277e7..f423f93 100644 +--- a/daemons/controld/controld_cib.h ++++ b/daemons/controld/controld_cib.h +@@ -46,17 +46,10 @@ int controld_update_cib(const char *section, xmlNode *data, int options, + void *)); + unsigned int cib_op_timeout(void); + +-// Subsections of PCMK__XE_NODE_STATE +-enum controld_section_e { +- controld_section_lrm, +- controld_section_lrm_unlocked, +-}; +- +-void controld_node_state_deletion_strings(const char *uname, +- enum controld_section_e section, ++void controld_node_state_deletion_strings(const char *uname, bool unlocked_only, + char **xpath, char **desc); +-void controld_delete_node_state(const char *uname, +- enum controld_section_e section, int options); ++void controld_delete_node_state(const char *uname, bool unlocked_only, ++ int options); + int controld_delete_resource_history(const char *rsc_id, const char *node, + const char *user_name, int call_options); + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 2ec6893..801a5db 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -1074,8 +1074,7 @@ force_reprobe(lrm_state_t *lrm_state, const char *from_sys, + } + + /* Now delete the copy in the CIB */ +- controld_delete_node_state(lrm_state->node_name, controld_section_lrm, +- cib_none); ++ controld_delete_node_state(lrm_state->node_name, false, cib_none); + } + + /*! +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index de074aa..6270dcd 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -268,7 +268,7 @@ update_node_state_after_fencing(const char *target, const char *target_xml_id) + fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated); + + // Delete node's resource history from CIB +- controld_delete_node_state(peer->name, controld_section_lrm, cib_none); ++ controld_delete_node_state(peer->name, false, cib_none); + + // Ask attribute manager to delete node's transient attributes + // @TODO: This is the only call to controld_purge_node_attrs that doesn't +diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c +index a91fbfa..f88cc47 100644 +--- a/daemons/controld/controld_join_dc.c ++++ b/daemons/controld/controld_join_dc.c +@@ -771,7 +771,8 @@ do_dc_join_ack(long long action, + pcmk__node_status_t *peer = NULL; + enum controld_join_phase phase = controld_join_none; + +- enum controld_section_e section = controld_section_lrm; ++ const bool unlocked_only = pcmk_is_set(controld_globals.flags, ++ controld_shutdown_lock_enabled); + char *xpath = NULL; + xmlNode *state = join_ack->xml; + xmlNode *execd_state = NULL; +@@ -832,10 +833,8 @@ do_dc_join_ack(long long action, + } + + // Delete relevant parts of node's current executor state from CIB +- if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { +- section = controld_section_lrm_unlocked; +- } +- controld_node_state_deletion_strings(join_from, section, &xpath, NULL); ++ controld_node_state_deletion_strings(join_from, unlocked_only, &xpath, ++ NULL); + + rc = cib->cmds->remove(cib, xpath, NULL, + cib_xpath|cib_multiple|cib_transaction); +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 3136180..86a3544 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -240,17 +240,15 @@ should_purge_attributes(pcmk__node_status_t *node) + static void + purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node) + { +- enum controld_section_e section = controld_section_lrm; ++ const bool unlocked_only = pcmk_is_set(controld_globals.flags, ++ controld_shutdown_lock_enabled); + + // Purge node's transient attributes (from attribute manager and CIB) + if (should_purge_attributes(node)) { + controld_purge_node_attrs(node->name, true); + } + +- if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { +- section = controld_section_lrm_unlocked; +- } +- controld_delete_node_state(node->name, section, call_opt); ++ controld_delete_node_state(node->name, unlocked_only, call_opt); + } + + /*! +@@ -359,7 +357,7 @@ remote_node_down(const char *node_name, const enum down_opts opts) + * think resources are still running on the node. + */ + if (opts == DOWN_ERASE_LRM) { +- controld_delete_node_state(node_name, controld_section_lrm, call_opt); ++ controld_delete_node_state(node_name, false, call_opt); + } + + /* Ensure node is in the remote peer cache with lost state */ +-- +2.47.1 + +From 050a3caad4989cc1c958420dff47b04be9a1cd55 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 18 Dec 2023 15:45:00 -0600 +Subject: [PATCH 09/10] Refactor: controller: Rename + controld_delete_node_state() + +...to controld_delete_node_history(), and +controld_node_state_deletion_strings() to +controld_node_history_deletion_strings(), since they delete only history +now. +--- + daemons/controld/controld_cib.c | 8 ++++---- + daemons/controld/controld_cib.h | 9 +++++---- + daemons/controld/controld_execd.c | 2 +- + daemons/controld/controld_fencing.c | 2 +- + daemons/controld/controld_join_dc.c | 4 ++-- + daemons/controld/controld_remote_ra.c | 4 ++-- + 6 files changed, 15 insertions(+), 14 deletions(-) + +diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c +index 298c321..fb06f22 100644 +--- a/daemons/controld/controld_cib.c ++++ b/daemons/controld/controld_cib.c +@@ -289,8 +289,8 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, + * \param[out] desc If not NULL, where to store loggable description + */ + void +-controld_node_state_deletion_strings(const char *uname, bool unlocked_only, +- char **xpath, char **desc) ++controld_node_history_deletion_strings(const char *uname, bool unlocked_only, ++ char **xpath, char **desc) + { + const char *desc_pre = NULL; + +@@ -321,7 +321,7 @@ controld_node_state_deletion_strings(const char *uname, bool unlocked_only, + * \param[in] options CIB call options to use + */ + void +-controld_delete_node_state(const char *uname, bool unlocked_only, int options) ++controld_delete_node_history(const char *uname, bool unlocked_only, int options) + { + cib_t *cib = controld_globals.cib_conn; + char *xpath = NULL; +@@ -330,7 +330,7 @@ controld_delete_node_state(const char *uname, bool unlocked_only, int options) + + pcmk__assert((uname != NULL) && (cib != NULL)); + +- controld_node_state_deletion_strings(uname, unlocked_only, &xpath, &desc); ++ controld_node_history_deletion_strings(uname, unlocked_only, &xpath, &desc); + cib__set_call_options(options, "node state deletion", + cib_xpath|cib_multiple); + cib_rc = cib->cmds->remove(cib, xpath, NULL, options); +diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h +index f423f93..116db64 100644 +--- a/daemons/controld/controld_cib.h ++++ b/daemons/controld/controld_cib.h +@@ -46,10 +46,11 @@ int controld_update_cib(const char *section, xmlNode *data, int options, + void *)); + unsigned int cib_op_timeout(void); + +-void controld_node_state_deletion_strings(const char *uname, bool unlocked_only, +- char **xpath, char **desc); +-void controld_delete_node_state(const char *uname, bool unlocked_only, +- int options); ++void controld_node_history_deletion_strings(const char *uname, ++ bool unlocked_only, ++ char **xpath, char **desc); ++void controld_delete_node_history(const char *uname, bool unlocked_only, ++ int options); + int controld_delete_resource_history(const char *rsc_id, const char *node, + const char *user_name, int call_options); + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 801a5db..977acf0 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -1074,7 +1074,7 @@ force_reprobe(lrm_state_t *lrm_state, const char *from_sys, + } + + /* Now delete the copy in the CIB */ +- controld_delete_node_state(lrm_state->node_name, false, cib_none); ++ controld_delete_node_history(lrm_state->node_name, false, cib_none); + } + + /*! +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 6270dcd..026b240 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -268,7 +268,7 @@ update_node_state_after_fencing(const char *target, const char *target_xml_id) + fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated); + + // Delete node's resource history from CIB +- controld_delete_node_state(peer->name, false, cib_none); ++ controld_delete_node_history(peer->name, false, cib_none); + + // Ask attribute manager to delete node's transient attributes + // @TODO: This is the only call to controld_purge_node_attrs that doesn't +diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c +index f88cc47..90d1bc0 100644 +--- a/daemons/controld/controld_join_dc.c ++++ b/daemons/controld/controld_join_dc.c +@@ -833,8 +833,8 @@ do_dc_join_ack(long long action, + } + + // Delete relevant parts of node's current executor state from CIB +- controld_node_state_deletion_strings(join_from, unlocked_only, &xpath, +- NULL); ++ controld_node_history_deletion_strings(join_from, unlocked_only, &xpath, ++ NULL); + + rc = cib->cmds->remove(cib, xpath, NULL, + cib_xpath|cib_multiple|cib_transaction); +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 86a3544..1c52477 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -248,7 +248,7 @@ purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node) + controld_purge_node_attrs(node->name, true); + } + +- controld_delete_node_state(node->name, unlocked_only, call_opt); ++ controld_delete_node_history(node->name, unlocked_only, call_opt); + } + + /*! +@@ -357,7 +357,7 @@ remote_node_down(const char *node_name, const enum down_opts opts) + * think resources are still running on the node. + */ + if (opts == DOWN_ERASE_LRM) { +- controld_delete_node_state(node_name, false, call_opt); ++ controld_delete_node_history(node_name, false, call_opt); + } + + /* Ensure node is in the remote peer cache with lost state */ +-- +2.47.1 + +From 97dfc11f6c9d1a90ef744e5de2fe7678f3518bba Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 10 Sep 2025 14:59:38 -0400 +Subject: [PATCH 10/10] Refactor: daemons: Remove the down_opts enum + +This has only ever had two values, which basically just means it's a +bool. +--- + daemons/controld/controld_remote_ra.c | 21 ++++++++------------- + 1 file changed, 8 insertions(+), 13 deletions(-) + +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 1c52477..eb1bc55 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -330,20 +330,15 @@ remote_node_up(const char *node_name) + pcmk__xml_free(update); + } + +-enum down_opts { +- DOWN_KEEP_LRM, +- DOWN_ERASE_LRM +-}; +- + /*! + * \internal + * \brief Handle cluster communication related to pacemaker_remote node leaving + * + * \param[in] node_name Name of lost node +- * \param[in] opts Whether to keep or erase LRM history ++ * \param[in] erase_lrm If \c true, erase the LRM history + */ + static void +-remote_node_down(const char *node_name, const enum down_opts opts) ++remote_node_down(const char *node_name, bool erase_lrm) + { + xmlNode *update; + int call_opt = crmd_cib_smart_opt(); +@@ -356,7 +351,7 @@ remote_node_down(const char *node_name, const enum down_opts opts) + * up. However, after a successful fence, clear the history so we don't + * think resources are still running on the node. + */ +- if (opts == DOWN_ERASE_LRM) { ++ if (erase_lrm) { + controld_delete_node_history(node_name, false, call_opt); + } + +@@ -416,7 +411,7 @@ check_remote_node_state(const remote_ra_cmd_t *cmd) + if (ra_data) { + if (!pcmk_is_set(ra_data->status, takeover_complete)) { + /* Stop means down if we didn't successfully migrate elsewhere */ +- remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM); ++ remote_node_down(cmd->rsc_id, false); + } else if (AM_I_DC == FALSE) { + /* Only the connection host and DC track node state, + * so if the connection migrated elsewhere and we aren't DC, +@@ -692,7 +687,7 @@ remote_lrm_op_callback(lrmd_event_data_t * op) + lrm_state->node_name); + /* Do roughly what a 'stop' on the remote-resource would do */ + handle_remote_ra_stop(lrm_state, NULL); +- remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM); ++ remote_node_down(lrm_state->node_name, false); + /* now fake the reply of a successful 'stop' */ + synthesize_lrmd_success(NULL, lrm_state->node_name, + PCMK_ACTION_STOP); +@@ -1366,11 +1361,11 @@ remote_ra_process_pseudo(xmlNode *xml) + * peer cache state will be incorrect unless and until the guest is + * recovered. + */ +- if (result) { ++ if (result != NULL) { + const char *remote = pcmk__xe_id(result); + +- if (remote) { +- remote_node_down(remote, DOWN_ERASE_LRM); ++ if (remote != NULL) { ++ remote_node_down(remote, true); + } + } + } +-- +2.47.1 diff --git a/gating.yaml b/gating.yaml deleted file mode 100644 index e8aadb9..0000000 --- a/gating.yaml +++ /dev/null @@ -1,22 +0,0 @@ ---- !Policy -product_versions: - - fedora-* -decision_context: bodhi_update_push_testing -subject_type: koji_build -rules: - - !PassingTestCaseRule {test_case_name: fedora-ci.koji-build.tier0.functional} - ---- !Policy -product_versions: - - fedora-* -decision_context: bodhi_update_push_stable -subject_type: koji_build -rules: - - !PassingTestCaseRule {test_case_name: fedora-ci.koji-build.tier0.functional} - ---- !Policy -product_versions: - - rhel-* -decision_context: osci_compose_gate -rules: - - !PassingTestCaseRule {test_case_name: osci.brew-build.tier0.functional} diff --git a/pacemaker.spec b/pacemaker.spec index 97d1115..c888895 100644 --- a/pacemaker.spec +++ b/pacemaker.spec @@ -1,5 +1,5 @@ # -# Copyright 2008-2024 the Pacemaker project contributors +# Copyright 2008-2026 the Pacemaker project contributors # # The version control history for this file may have further details. # @@ -27,7 +27,7 @@ ## Where bug reports should be submitted ## Leave bug_url undefined to use ClusterLabs default, others define it here %if 0%{?rhel} -%global bug_url https://issues.redhat.com/ +%global bug_url https://github.com/oracle/oracle-linux/ %else %if 0%{?fedora} %global bug_url https://bugz.fedoraproject.org/%{name} @@ -40,11 +40,11 @@ ## Upstream pacemaker version, and its package version (specversion ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) -%global pcmkversion 3.0.0 -%global specversion 5 +%global pcmkversion 3.0.1 +%global specversion 3 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build -%global commit d8340737c46ccb265bd82c5493b58e9c14ba67e5 +%global commit 9a5e54bae85847c4bb6ed7c7fb06103ebebbc64a ## Since git v2.11, the extent of abbreviation is autoscaled by default ## (used to be constant of 7), so we need to convey it for non-tags, too. @@ -184,7 +184,7 @@ Name: pacemaker Summary: Scalable High-Availability cluster resource manager Version: %{pcmkversion} -Release: %{pcmk_release}.1%{?dist} +Release: %{pcmk_release}.1.0.1%{?dist} License: GPL-2.0-or-later AND LGPL-2.1-or-later Url: https://www.clusterlabs.org/ @@ -199,8 +199,9 @@ Url: https://www.clusterlabs.org/ Source0: https://codeload.github.com/%{github_owner}/%{name}/tar.gz/%{archive_github_url} Source1: pacemaker.sysusers # upstream commits -Patch001: 001-reset-error-warning-flags.patch -Patch002: 002-remote-fencing.patch +Patch001: 001-econnrefused.patch +Patch002: 002-corosync.patch +Patch003: 003-transient_attrs.patch Requires: resource-agents Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} @@ -212,10 +213,10 @@ Requires: %{python_name}-%{name} = %{version}-%{release} %{?systemd_requires} %if %{defined centos} -ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 %{arm} +ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 %{arm} riscv64 %else %if 0%{?rhel} -ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 +ExclusiveArch: aarch64 i686 ppc64le s390x x86_64 riscv64 %endif %endif @@ -274,7 +275,6 @@ BuildRequires: %{pkgname_glue_libs}-devel %endif %if %{with doc} -BuildRequires: asciidoc BuildRequires: %{python_name}-sphinx %endif @@ -500,7 +500,7 @@ popd %check make %{_smp_mflags} check { cts/cts-scheduler --run load-stopped-loop \ - && cts/cts-cli \ + && cts/cts-cli -V \ && touch .CHECKED } 2>&1 | sed 's/[fF]ail/faiil/g' # prevent false positives in rpmlint [ -f .CHECKED ] && rm -f -- .CHECKED @@ -793,9 +793,34 @@ exit 0 %{_datadir}/pkgconfig/pacemaker-schemas.pc %changelog -* Fri Aug 08 2025 Darren Archibald - 3.0.0-5.1 -- Add option for controlling remote node fencing behavior -- Resolves: RHEL-101072 +* Tue Feb 24 2026 EL Errata - 3.0.1-3.1.0.1 +- Replace bug url [Orabug: 34202300] +- Upstream reference in pacemaker crm_report binary [Orabug: 32825154] + +* Mon Jan 19 2026 Chris Lumens - 3.0.1-3.1 +- Fix a race condition between daemons when erasing transient attrs +- Resolves: RHEL-135091 + +* Wed Aug 13 2025 Reid Wahl - 3.0.1-3 +- CTS launches Corosync using systemd if available. +- Resolves: RHEL-110075 + +* Mon Aug 11 2025 Chris Lumens - 3.0.1-2 +- Do not retry on ECONNREFUSED in command line tools. +- Resolves: RHEL-106594 + +* Tue Jun 24 2025 Chris Lumens - 3.0.1-1 +- Rebase on upstream 3.0.1-rc1 +- Use dbus to detect completion of systemd resource start/stop actions +- Add an option for controlling remote node fencing behavior +- Split large IPC messages into multiple smaller ones +- Related: RHEL-86085 +- Resolves: RHEL-71181 +- Resolves: RHEL-86146 +- Resolves: RHEL-86144 + +* Wed Apr 02 2025 Kashyap Chamarthy - 3.0.0-6 +- Add riscv64 into ExclusiveArch (thanks, Zhengyu He) * Fri Jan 10 2025 Chris Lumens - 3.0.0-5 - Rebase on upstream 3.0.0 final release diff --git a/sources b/sources index 2484111..9f16bb4 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (pacemaker-d8340737c.tar.gz) = d91c7aa0cd5c607061e43b9b9403cfa806b86eb2730187e701a40c2b245ce8fbf34361af7a5ac9a4985b34f0dbdc6436815dcba5b6b177a7d058692599bd46ec +SHA512 (pacemaker-9a5e54bae.tar.gz) = 7c90f7cb985933ba3e0254118bab5c2af050e61c22ab683255c06282df196dcca439ecdc016e22fa7751a4744092abd6801451babfb8f4d03d4d67c1fee56ed9 diff --git a/tests/inventory b/tests/inventory deleted file mode 100755 index 52687db..0000000 --- a/tests/inventory +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -export TEST_DOCKER_EXTRA_ARGS="--network host" -exec merge-standard-inventory "$@" - diff --git a/tests/tests.yml b/tests/tests.yml deleted file mode 100644 index bb4c63b..0000000 --- a/tests/tests.yml +++ /dev/null @@ -1,14 +0,0 @@ ---- -- hosts: localhost - roles: - - role: standard-test-basic - tags: - - classic - - container - tests: - - cts-regression: - dir: . - run: /usr/share/pacemaker/tests/cts-regression cli scheduler fencing - required_packages: - - pacemaker - - pacemaker-cts