From 58b7f8c03b3cd8143f62aae796dbc0fad26e9690 Mon Sep 17 00:00:00 2001 From: Chris Lumens Date: Mon, 8 Dec 2025 15:30:42 -0500 Subject: [PATCH] Fix a race condition between daemons when erasing transient attrs - Resolves: RHEL-23082 --- 007-transient_attrs.patch | 1262 +++++++++++++++++++++++++++++++++++++ pacemaker.spec | 7 +- 2 files changed, 1268 insertions(+), 1 deletion(-) create mode 100644 007-transient_attrs.patch diff --git a/007-transient_attrs.patch b/007-transient_attrs.patch new file mode 100644 index 0000000..59b34e2 --- /dev/null +++ b/007-transient_attrs.patch @@ -0,0 +1,1262 @@ +From 26c022d2a3b6061ff9a60f86e50834a08e8360d4 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Thu, 13 Nov 2025 02:14:45 -0800 +Subject: [PATCH 01/10] Fix: pacemaker-attrd: Wipe CIB along with memory + +Previously, when the attribute manager purged a node, it would purge the +node's transient attributes only from memory, and assumed the controller +would purge them from the CIB. Now, the writer will purge them from the +CIB as well. + +This fixes a variety of timing issues when multiple nodes including the +attribute writer are shutting down. If the writer leaves before some +other node, the DC wipes that other node's attributes from the CIB when +that other node leaves the controller process group (or all other nodes +do if the DC is the leaving node). If a new writer (possibly even the +node itself) is elected before the node's attribute manager leaves the +cluster layer, it will write the attributes back to the CIB. Once the +other node leaves the cluster layer, all attribute managers remove its +attributes from memory, but they are now "stuck" in the CIB. + +As of this commit, the controller still erases the attributes from the +CIB when the node leaves the controller process group, which is +redundant but doesn't cause any new problems. This will be corrected in +an upcoming commit. + +Note: This will cause an insignificant regression if backported to +Pacemaker 2. The Pacemaker 2 controller purges attributes from the CIB +for leaving DCs only if they are at version 1.1.13 or later, because +earlier DCs will otherwise get fenced after a clean shutdown. Since the +attribute manager doesn't know the DC or its version, the attributes +would now always be wiped, so old leaving DCs will get fenced. The +fencing would occur only in the highly unlikely situation of a rolling +upgrade from Pacemaker 2-supported versions 1.1.11 or 1.1.12, and the +upgrade would still succeed without any negative impact on resources. + +Fixes T138 + +Co-Authored-By: Ken Gaillot +Co-Authored-By: Chris Lumens + +Signed-off-by: Reid Wahl +--- + daemons/attrd/attrd_corosync.c | 93 +++++++++++++++++++++++++++++++++ + daemons/attrd/attrd_elections.c | 2 + + daemons/attrd/pacemaker-attrd.c | 1 + + daemons/attrd/pacemaker-attrd.h | 3 ++ + 4 files changed, 99 insertions(+) + +diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c +index 94fc85f..8497f34 100644 +--- a/daemons/attrd/attrd_corosync.c ++++ b/daemons/attrd/attrd_corosync.c +@@ -23,6 +23,43 @@ + + #include "pacemaker-attrd.h" + ++/*! ++ * \internal ++ * \brief Nodes removed by \c attrd_peer_remove() ++ * ++ * This table is to be used as a set. It contains nodes that have been removed ++ * by \c attrd_peer_remove() and whose transient attributes should be erased ++ * from the CIB. ++ * ++ * Setting an attribute value for a node via \c update_attr_on_host() removes ++ * the node from the table. At that point, we have transient attributes in ++ * memory for the node, so it should no longer be erased from the CIB. ++ * ++ * If another node erases a removed node's transient attributes from the CIB, ++ * the removed node remains in this table until an attribute value is set for ++ * it. This is for convenience: it avoids the need to monitor for CIB updates ++ * that erase a node's \c node_state or \c transient attributes element, just to ++ * remove the node from the table. ++ * ++ * Leaving a removed node in the table after erasure should be harmless. If a ++ * node is in this table, then we have no transient attributes for it in memory. ++ * If for some reason we erase its transient attributes from the CIB twice, its ++ * state in the CIB will still be correct. ++ */ ++static GHashTable *removed_peers = NULL; ++ ++/*! ++ * \internal ++ * \brief Free the removed nodes table ++ */ ++void ++attrd_free_removed_peers(void) ++{ ++ if (removed_peers != NULL) { ++ g_hash_table_destroy(removed_peers); ++ } ++} ++ + static xmlNode * + attrd_confirmation(int callid) + { +@@ -236,6 +273,10 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer, + const char *prev_xml_id = NULL; + const char *node_xml_id = crm_element_value(xml, PCMK__XA_ATTR_HOST_ID); + ++ if (removed_peers != NULL) { ++ g_hash_table_remove(removed_peers, host); ++ } ++ + // Create entry for value if not already existing + v = g_hash_table_lookup(a->values, host); + if (v == NULL) { +@@ -530,6 +571,29 @@ attrd_peer_sync_response(const pcmk__node_status_t *peer, bool peer_won, + } + } + ++/*! ++ * \internal ++ * \brief Erase all removed nodes' transient attributes from the CIB ++ * ++ * This should be called by a newly elected writer upon winning the election. ++ */ ++void ++attrd_erase_removed_peer_attributes(void) ++{ ++ const char *host = NULL; ++ GHashTableIter iter; ++ ++ if (!attrd_election_won() || (removed_peers == NULL)) { ++ return; ++ } ++ ++ g_hash_table_iter_init(&iter, removed_peers); ++ while (g_hash_table_iter_next(&iter, (gpointer *) &host, NULL)) { ++ attrd_cib_erase_transient_attrs(host); ++ g_hash_table_iter_remove(&iter); ++ } ++} ++ + /*! + * \internal + * \brief Remove all attributes and optionally peer cache entries for a node +@@ -556,6 +620,35 @@ attrd_peer_remove(const char *host, bool uncache, const char *source) + } + } + ++ if (attrd_election_won()) { ++ // We are the writer. Wipe node's transient attributes from CIB now. ++ attrd_cib_erase_transient_attrs(host); ++ ++ } else { ++ /* Make sure the attributes get erased from the CIB eventually. ++ * - If there's already a writer, it will call this function and enter ++ * the "if" block above, requesting the erasure (unless it leaves ++ * before sending the request -- see below). ++ * attrd_start_election_if_needed() will do nothing here. ++ * - Otherwise, we ensure an election is happening (unless we're ++ * shutting down). The winner will erase transient attributes from the ++ * CIB for all removed nodes in attrd_election_cb(). ++ * ++ * We add the node to the removed_peers table in case we win an election ++ * and need to request CIB erasures based on the table contents. This ++ * could happen for either of two reasons: ++ * - There is no current writer and we're not shutting down. An election ++ * either is already in progress or will be triggered here. ++ * - The current writer leaves before sending the CIB update request. A ++ * new election will be triggered. ++ */ ++ if (removed_peers == NULL) { ++ removed_peers = pcmk__strikey_table(free, NULL); ++ } ++ g_hash_table_add(removed_peers, pcmk__str_copy(host)); ++ attrd_start_election_if_needed(); ++ } ++ + if (uncache) { + pcmk__purge_node_from_cache(host, 0); + attrd_forget_node_xml_id(host); +diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c +index 281ec12..e75a1d3 100644 +--- a/daemons/attrd/attrd_elections.c ++++ b/daemons/attrd/attrd_elections.c +@@ -24,6 +24,8 @@ attrd_election_cb(pcmk_cluster_t *cluster) + /* Update the peers after an election */ + attrd_peer_sync(NULL); + ++ attrd_erase_removed_peer_attributes(); ++ + /* After winning an election, update the CIB with the values of all + * attributes as the winner knows them. + */ +diff --git a/daemons/attrd/pacemaker-attrd.c b/daemons/attrd/pacemaker-attrd.c +index 7711fd2..3fa099b 100644 +--- a/daemons/attrd/pacemaker-attrd.c ++++ b/daemons/attrd/pacemaker-attrd.c +@@ -201,6 +201,7 @@ main(int argc, char **argv) + attrd_cib_disconnect(); + } + ++ attrd_free_removed_peers(); + attrd_free_waitlist(); + pcmk_cluster_disconnect(attrd_cluster); + pcmk_cluster_free(attrd_cluster); +diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h +index d9423c8..80ae0d9 100644 +--- a/daemons/attrd/pacemaker-attrd.h ++++ b/daemons/attrd/pacemaker-attrd.h +@@ -184,6 +184,9 @@ extern GHashTable *peer_protocol_vers; + + #define CIB_OP_TIMEOUT_S 120 + ++void attrd_free_removed_peers(void); ++void attrd_erase_removed_peer_attributes(void); ++ + int attrd_cluster_connect(void); + void attrd_broadcast_value(const attribute_t *a, const attribute_value_t *v); + void attrd_peer_update(const pcmk__node_status_t *peer, xmlNode *xml, +-- +2.47.1 + +From 9db7cad74c9c051761c9d8a099a235cc2320f35d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 14 Dec 2023 14:56:11 -0600 +Subject: [PATCH 02/10] Low: pacemaker-attrd: Drop "requesting shutdown" code + +The requesting_shutdown variable was checked only by +attrd_shutting_down(), when the if_requested argument was set to true. +In that case, it returned true if either the shutting_down variable was +true or both the if_requested argument and the requesting_shutdown +variable were true. + +The only caller that passed if_requested=true was +attrd_cib_updated_cb(). It did this if: +a. the alerts section was changed, or +b. the status section or nodes section was changed by an untrusted + client. + +Details: +a. Prior to f42e170, we didn't pass if_requested=true for an alerts + section change. We started doing so as of that commit mostly for + convenience. We decided that it seemed reasonable to ignore alert + changes when there was a shutdown pending. + + This commit reverts to NOT ignoring alert changes due to pending + shutdown. That seems like it might be better. I'm not sure if it's + possible for us to land in attrd_send_attribute_alert() while a + shutdown is requested but has not begun. If so, it would be good to + send the correct alerts. + +b. The other call with true is to avoid writing out all attributes when + the status or nodes section changes. It's probably okay to drop the + true there too. It was added by a1a9c54, to resolve a race condition + where: + * node2 left. + * node1's controller deleted node2's transient attributes from the + CIB. + * node1 took over as DC and replaced the CIB. + * node2's attribute manager was not yet actually shutting down, and + it responded to the CIB replacement by writing out all of the + attributes that were in its memory, including its own "shutdown" + attribute. + + Now (as of the previous commit), node1's attribute manager would + delete this "shutdown" attribute as part of its shutdown process. (Or + more accurately, I think the attribute writer node will do that.) + + So if we understand correctly, the attrd_shutting_down(true) + workaround is no longer needed. + +With no more callers needing to pass true, the supporting code can go +away. + +Co-Authored-By: Reid Wahl +--- + daemons/attrd/attrd_cib.c | 6 +++--- + daemons/attrd/attrd_corosync.c | 15 ++----------- + daemons/attrd/attrd_elections.c | 4 ++-- + daemons/attrd/attrd_ipc.c | 2 +- + daemons/attrd/attrd_utils.c | 37 ++++----------------------------- + daemons/attrd/pacemaker-attrd.h | 4 +--- + 6 files changed, 13 insertions(+), 55 deletions(-) + +diff --git a/daemons/attrd/attrd_cib.c b/daemons/attrd/attrd_cib.c +index 4231e4a..acd4621 100644 +--- a/daemons/attrd/attrd_cib.c ++++ b/daemons/attrd/attrd_cib.c +@@ -34,7 +34,7 @@ attrd_cib_destroy_cb(gpointer user_data) + + cib->cmds->signoff(cib); + +- if (attrd_shutting_down(false)) { ++ if (attrd_shutting_down()) { + crm_info("Disconnected from the CIB manager"); + + } else { +@@ -57,7 +57,7 @@ attrd_cib_updated_cb(const char *event, xmlNode *msg) + } + + if (pcmk__cib_element_in_patchset(patchset, PCMK_XE_ALERTS)) { +- if (attrd_shutting_down(true)) { ++ if (attrd_shutting_down()) { + crm_debug("Ignoring alerts change in CIB during shutdown"); + } else { + mainloop_set_trigger(attrd_config_read); +@@ -82,7 +82,7 @@ attrd_cib_updated_cb(const char *event, xmlNode *msg) + if (status_changed + || pcmk__cib_element_in_patchset(patchset, PCMK_XE_NODES)) { + +- if (attrd_shutting_down(true)) { ++ if (attrd_shutting_down()) { + crm_debug("Ignoring node change in CIB during shutdown"); + return; + } +diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c +index 8497f34..02ddec6 100644 +--- a/daemons/attrd/attrd_corosync.c ++++ b/daemons/attrd/attrd_corosync.c +@@ -83,7 +83,7 @@ attrd_peer_message(pcmk__node_status_t *peer, xmlNode *xml) + return; + } + +- if (attrd_shutting_down(false)) { ++ if (attrd_shutting_down()) { + /* If we're shutting down, we want to continue responding to election + * ops as long as we're a cluster member (because our vote may be + * needed). Ignore all other messages. +@@ -166,7 +166,7 @@ attrd_cpg_dispatch(cpg_handle_t handle, + static void + attrd_cpg_destroy(gpointer unused) + { +- if (attrd_shutting_down(false)) { ++ if (attrd_shutting_down()) { + crm_info("Disconnected from Corosync process group"); + + } else { +@@ -328,17 +328,6 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer, + pcmk__str_update(&v->current, value); + attrd_set_attr_flags(a, attrd_attr_changed); + +- if (pcmk__str_eq(host, attrd_cluster->priv->node_name, pcmk__str_casei) +- && pcmk__str_eq(attr, PCMK__NODE_ATTR_SHUTDOWN, pcmk__str_none)) { +- +- if (!pcmk__str_eq(value, "0", pcmk__str_null_matches)) { +- attrd_set_requesting_shutdown(); +- +- } else { +- attrd_clear_requesting_shutdown(); +- } +- } +- + // Write out new value or start dampening timer + if (a->timeout_ms && a->timer) { + crm_trace("Delaying write of %s %s for dampening", +diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c +index e75a1d3..eb9ef8c 100644 +--- a/daemons/attrd/attrd_elections.c ++++ b/daemons/attrd/attrd_elections.c +@@ -43,7 +43,7 @@ attrd_start_election_if_needed(void) + { + if ((peer_writer == NULL) + && (election_state(attrd_cluster) != election_in_progress) +- && !attrd_shutting_down(false)) { ++ && !attrd_shutting_down()) { + + crm_info("Starting an election to determine the writer"); + election_vote(attrd_cluster); +@@ -65,7 +65,7 @@ attrd_handle_election_op(const pcmk__node_status_t *peer, xmlNode *xml) + crm_xml_add(xml, PCMK__XA_SRC, peer->name); + + // Don't become writer if we're shutting down +- rc = election_count_vote(attrd_cluster, xml, !attrd_shutting_down(false)); ++ rc = election_count_vote(attrd_cluster, xml, !attrd_shutting_down()); + + switch(rc) { + case election_start: +diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c +index 43e0f41..8a3bb36 100644 +--- a/daemons/attrd/attrd_ipc.c ++++ b/daemons/attrd/attrd_ipc.c +@@ -492,7 +492,7 @@ static int32_t + attrd_ipc_accept(qb_ipcs_connection_t *c, uid_t uid, gid_t gid) + { + crm_trace("New client connection %p", c); +- if (attrd_shutting_down(false)) { ++ if (attrd_shutting_down()) { + crm_info("Ignoring new connection from pid %d during shutdown", + pcmk__client_pid(c)); + return -ECONNREFUSED; +diff --git a/daemons/attrd/attrd_utils.c b/daemons/attrd/attrd_utils.c +index f219b88..e3e814d 100644 +--- a/daemons/attrd/attrd_utils.c ++++ b/daemons/attrd/attrd_utils.c +@@ -25,7 +25,6 @@ + + cib_t *the_cib = NULL; + +-static bool requesting_shutdown = false; + static bool shutting_down = false; + static GMainLoop *mloop = NULL; + +@@ -34,45 +33,17 @@ static GMainLoop *mloop = NULL; + */ + GHashTable *peer_protocol_vers = NULL; + +-/*! +- * \internal +- * \brief Set requesting_shutdown state +- */ +-void +-attrd_set_requesting_shutdown(void) +-{ +- requesting_shutdown = true; +-} +- +-/*! +- * \internal +- * \brief Clear requesting_shutdown state +- */ +-void +-attrd_clear_requesting_shutdown(void) +-{ +- requesting_shutdown = false; +-} +- + /*! + * \internal + * \brief Check whether local attribute manager is shutting down + * +- * \param[in] if_requested If \c true, also consider presence of +- * \c PCMK__NODE_ATTR_SHUTDOWN attribute +- * +- * \return \c true if local attribute manager has begun shutdown sequence +- * or (if \p if_requested is \c true) whether local node has a nonzero +- * \c PCMK__NODE_ATTR_SHUTDOWN attribute set, otherwise \c false +- * \note Most callers should pass \c false for \p if_requested, because the +- * attribute manager needs to continue performing while the controller is +- * shutting down, and even needs to be eligible for election in case all +- * nodes are shutting down. ++ * \return \c true if local attribute manager has begun shutdown sequence, ++ * otherwise \c false + */ + bool +-attrd_shutting_down(bool if_requested) ++attrd_shutting_down(void) + { +- return shutting_down || (if_requested && requesting_shutdown); ++ return shutting_down; + } + + /*! +diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h +index 80ae0d9..d3e5765 100644 +--- a/daemons/attrd/pacemaker-attrd.h ++++ b/daemons/attrd/pacemaker-attrd.h +@@ -56,10 +56,8 @@ + void attrd_init_mainloop(void); + void attrd_run_mainloop(void); + +-void attrd_set_requesting_shutdown(void); +-void attrd_clear_requesting_shutdown(void); + void attrd_free_waitlist(void); +-bool attrd_shutting_down(bool if_requested); ++bool attrd_shutting_down(void); + void attrd_shutdown(int nsig); + void attrd_init_ipc(void); + void attrd_ipc_fini(void); +-- +2.47.1 + +From 19a157cb90466aaa5d929573edeabded3ba047ef Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 18 Dec 2023 11:38:00 -0600 +Subject: [PATCH 03/10] Low: controller: don't need to erase node attributes + for remote nodes + +Now that the attribute manager will erase transient attributes from the +CIB when purging a node, we don't need to do that separately in the +controller. + +Co-Authored-By: Chris Lumens +--- + daemons/controld/controld_remote_ra.c | 41 +++++++-------------------- + 1 file changed, 11 insertions(+), 30 deletions(-) + +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 1cc4ae0..c9adf97 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -237,35 +237,19 @@ should_purge_attributes(pcmk__node_status_t *node) + return true; + } + +-static enum controld_section_e +-section_to_delete(bool purge) +-{ +- if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { +- if (purge) { +- return controld_section_all_unlocked; +- } else { +- return controld_section_lrm_unlocked; +- } +- } else { +- if (purge) { +- return controld_section_all; +- } else { +- return controld_section_lrm; +- } +- } +-} +- + static void + purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node) + { +- bool purge = should_purge_attributes(node); +- enum controld_section_e section = section_to_delete(purge); ++ enum controld_section_e section = controld_section_lrm; + +- /* Purge node from attrd's memory */ +- if (purge) { ++ // Purge node's transient attributes (from attribute manager and CIB) ++ if (should_purge_attributes(node)) { + update_attrd_remote_node_removed(node->name, NULL); + } + ++ if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { ++ section = controld_section_lrm_unlocked; ++ } + controld_delete_node_state(node->name, section, call_opt); + } + +@@ -367,18 +351,15 @@ remote_node_down(const char *node_name, const enum down_opts opts) + int call_opt = crmd_cib_smart_opt(); + pcmk__node_status_t *node = NULL; + +- /* Purge node from attrd's memory */ ++ // Purge node's transient attributes (from attribute manager and CIB) + update_attrd_remote_node_removed(node_name, NULL); + +- /* Normally, only node attributes should be erased, and the resource history +- * should be kept until the node comes back up. However, after a successful +- * fence, we want to clear the history as well, so we don't think resources +- * are still running on the node. ++ /* Normally, the resource history should be kept until the node comes back ++ * up. However, after a successful fence, clear the history so we don't ++ * think resources are still running on the node. + */ + if (opts == DOWN_ERASE_LRM) { +- controld_delete_node_state(node_name, controld_section_all, call_opt); +- } else { +- controld_delete_node_state(node_name, controld_section_attrs, call_opt); ++ controld_delete_node_state(node_name, controld_section_lrm, call_opt); + } + + /* Ensure node is in the remote peer cache with lost state */ +-- +2.47.1 + +From d49965412a5433a9a92463178d69074da9b3c349 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 14 Dec 2023 15:42:39 -0600 +Subject: [PATCH 04/10] Refactor: controller: Allow purging node attrs without + cache removal + +Nothing uses the new capability yet. +--- + daemons/controld/controld_attrd.c | 22 +++++++++++++++------- + daemons/controld/controld_remote_ra.c | 4 ++-- + daemons/controld/controld_utils.h | 2 +- + 3 files changed, 18 insertions(+), 10 deletions(-) + +diff --git a/daemons/controld/controld_attrd.c b/daemons/controld/controld_attrd.c +index eff8070..c8591ef 100644 +--- a/daemons/controld/controld_attrd.c ++++ b/daemons/controld/controld_attrd.c +@@ -106,8 +106,15 @@ update_attrd_list(GList *attrs, uint32_t opts) + } + } + ++/*! ++ * \internal ++ * \brief Ask attribute manager to purge a node and its transient attributes ++ * ++ * \param[in] node_name Node to purge ++ * \param[in] from_cache If true, purge from node caches as well ++ */ + void +-update_attrd_remote_node_removed(const char *host, const char *user_name) ++controld_purge_node_attrs(const char *node_name, bool from_cache) + { + int rc = pcmk_rc_ok; + +@@ -115,14 +122,15 @@ update_attrd_remote_node_removed(const char *host, const char *user_name) + rc = pcmk_new_ipc_api(&attrd_api, pcmk_ipc_attrd); + } + if (rc == pcmk_rc_ok) { +- crm_trace("Asking attribute manager to purge Pacemaker Remote node %s", +- host); +- rc = pcmk__attrd_api_purge(attrd_api, host, true); ++ crm_debug("Asking %s to purge transient attributes%s for %s", ++ pcmk_ipc_name(attrd_api, true), ++ (from_cache? " and node cache" : ""), node_name); ++ rc = pcmk__attrd_api_purge(attrd_api, node_name, from_cache); + } + if (rc != pcmk_rc_ok) { +- crm_err("Could not purge Pacemaker Remote node %s " +- "in attribute manager%s: %s " QB_XS " rc=%d", +- host, when(), pcmk_rc_str(rc), rc); ++ crm_err("Could not purge node %s from %s%s: %s " ++ QB_XS " rc=%d", node_name, pcmk_ipc_name(attrd_api, true), ++ when(), pcmk_rc_str(rc), rc); + } + } + +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index c9adf97..3136180 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -244,7 +244,7 @@ purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node) + + // Purge node's transient attributes (from attribute manager and CIB) + if (should_purge_attributes(node)) { +- update_attrd_remote_node_removed(node->name, NULL); ++ controld_purge_node_attrs(node->name, true); + } + + if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { +@@ -352,7 +352,7 @@ remote_node_down(const char *node_name, const enum down_opts opts) + pcmk__node_status_t *node = NULL; + + // Purge node's transient attributes (from attribute manager and CIB) +- update_attrd_remote_node_removed(node_name, NULL); ++ controld_purge_node_attrs(node_name, true); + + /* Normally, the resource history should be kept until the node comes back + * up. However, after a successful fence, clear the history so we don't +diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h +index e633888..262e0d1 100644 +--- a/daemons/controld/controld_utils.h ++++ b/daemons/controld/controld_utils.h +@@ -69,7 +69,7 @@ void crm_update_quorum(gboolean quorum, gboolean force_update); + void controld_close_attrd_ipc(void); + void update_attrd(const char *host, const char *name, const char *value, const char *user_name, gboolean is_remote_node); + void update_attrd_list(GList *attrs, uint32_t opts); +-void update_attrd_remote_node_removed(const char *host, const char *user_name); ++void controld_purge_node_attrs(const char *node_name, bool from_cache); + void update_attrd_clear_failures(const char *host, const char *rsc, + const char *op, const char *interval_spec, + gboolean is_remote_node); +-- +2.47.1 + +From 5fb8fdc72f457c7e9a691c10a99d54d0e03bd77d Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Thu, 14 Dec 2023 16:09:40 -0600 +Subject: [PATCH 05/10] Fix: controller: Don't purge transient attributes on + node loss + +With recent changes, the attribute manager now handles it when the node +leaves the cluster, so the controller purge is redundant. + +This does alter the timing somewhat, since the controller's purge +occurred when the node left the controller process group, while the +attribute manager's purge occurs when it leaves the cluster, but that +shouldn't make a significant difference. + +This fixes a problem when a node's controller crashes and is respawned +while fencing is disabled. Previously, another node's controller would +remove that node's transient attributes from the CIB, but they would +remain in the attribute managers' memory. Now, the attributes are +correctly retained in the CIB in this situation. + +Fixes T137 +Fixes T139 + +Co-Authored-By: Chris Lumens +--- + daemons/controld/controld_callbacks.c | 14 +------------- + 1 file changed, 1 insertion(+), 13 deletions(-) + +diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c +index 48c255e..57e5183 100644 +--- a/daemons/controld/controld_callbacks.c ++++ b/daemons/controld/controld_callbacks.c +@@ -233,19 +233,11 @@ peer_update_callback(enum pcmk__node_update type, pcmk__node_status_t *node, + pcmk__str_casei) + && !pcmk__cluster_is_node_active(node)) { + +- /* The DC has left, so delete its transient attributes and +- * trigger a new election. +- * +- * A DC sends its shutdown request to all peers, who update the +- * DC's expected state to down. This avoids fencing upon +- * deletion of its transient attributes. +- */ ++ // The DC has left, so trigger a new election + crm_notice("Our peer on the DC (%s) is dead", + controld_globals.dc_name); + + register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); +- controld_delete_node_state(node->name, controld_section_attrs, +- cib_none); + + } else if (AM_I_DC + || pcmk_is_set(controld_globals.flags, controld_dc_left) +@@ -256,10 +248,6 @@ peer_update_callback(enum pcmk__node_update type, pcmk__node_status_t *node, + */ + if (appeared) { + te_trigger_stonith_history_sync(FALSE); +- } else { +- controld_delete_node_state(node->name, +- controld_section_attrs, +- cib_none); + } + } + break; +-- +2.47.1 + +From c40026fb77a6f7ee804979293e3019943a34e06b Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 18 Dec 2023 13:05:35 -0600 +Subject: [PATCH 06/10] Low: controller: Ask attribute manager to purge fenced + nodes' attributes + +...instead of wiping from the CIB directly. + +Co-Authored-By: Chris Lumens +--- + daemons/controld/controld_fencing.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 51367ca..de074aa 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -267,7 +267,13 @@ update_node_state_after_fencing(const char *target, const char *target_xml_id) + crm_debug("Updating node state for %s after fencing (call %d)", target, rc); + fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated); + +- controld_delete_node_state(peer->name, controld_section_all, cib_none); ++ // Delete node's resource history from CIB ++ controld_delete_node_state(peer->name, controld_section_lrm, cib_none); ++ ++ // Ask attribute manager to delete node's transient attributes ++ // @TODO: This is the only call to controld_purge_node_attrs that doesn't ++ // want to also purge the node from the caches. Why? ++ controld_purge_node_attrs(peer->name, false); + } + + /*! +-- +2.47.1 + +From d9d19827d93f2394a831a9651aae064ea5a04fa4 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 18 Dec 2023 13:14:53 -0600 +Subject: [PATCH 07/10] Refactor: controller: Drop no-longer-used section enum + values + +--- + daemons/controld/controld_cib.c | 24 ------------------------ + daemons/controld/controld_cib.h | 5 +---- + 2 files changed, 1 insertion(+), 28 deletions(-) + +diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c +index e2a0d50..39c2b06 100644 +--- a/daemons/controld/controld_cib.c ++++ b/daemons/controld/controld_cib.c +@@ -279,17 +279,6 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, + "[not(@" PCMK_OPT_SHUTDOWN_LOCK ") " \ + "or " PCMK_OPT_SHUTDOWN_LOCK "<%lld]" + +-// Node's PCMK__XE_TRANSIENT_ATTRIBUTES section (name 1x) +-#define XPATH_NODE_ATTRS XPATH_NODE_STATE "/" PCMK__XE_TRANSIENT_ATTRIBUTES +- +-// Everything under PCMK__XE_NODE_STATE (name 1x) +-#define XPATH_NODE_ALL XPATH_NODE_STATE "/*" +- +-/* Unlocked history + transient attributes +- * (name 2x, (seconds_since_epoch - PCMK_OPT_SHUTDOWN_LOCK_LIMIT) 1x, name 1x) +- */ +-#define XPATH_NODE_ALL_UNLOCKED XPATH_NODE_LRM_UNLOCKED "|" XPATH_NODE_ATTRS +- + /*! + * \internal + * \brief Get the XPath and description of a node state section to be deleted +@@ -320,19 +309,6 @@ controld_node_state_deletion_strings(const char *uname, + uname, uname, expire); + desc_pre = "resource history (other than shutdown locks)"; + break; +- case controld_section_attrs: +- *xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname); +- desc_pre = "transient attributes"; +- break; +- case controld_section_all: +- *xpath = crm_strdup_printf(XPATH_NODE_ALL, uname); +- desc_pre = "all state"; +- break; +- case controld_section_all_unlocked: +- *xpath = crm_strdup_printf(XPATH_NODE_ALL_UNLOCKED, +- uname, uname, expire, uname); +- desc_pre = "all state (other than shutdown locks)"; +- break; + default: + // We called this function incorrectly + pcmk__assert(false); +diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h +index b8622d5..25277e7 100644 +--- a/daemons/controld/controld_cib.h ++++ b/daemons/controld/controld_cib.h +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2024 the Pacemaker project contributors ++ * Copyright 2004-2025 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -50,9 +50,6 @@ unsigned int cib_op_timeout(void); + enum controld_section_e { + controld_section_lrm, + controld_section_lrm_unlocked, +- controld_section_attrs, +- controld_section_all, +- controld_section_all_unlocked + }; + + void controld_node_state_deletion_strings(const char *uname, +-- +2.47.1 + +From 1056a0e3f6b618c23eb5a73d7e4a600619713a0c Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 18 Dec 2023 13:39:49 -0600 +Subject: [PATCH 08/10] Refactor: controller: Drop node state section enum + +It now boils down to a bool for whether we want only unlocked resources. +--- + daemons/controld/controld_cib.c | 48 +++++++++++---------------- + daemons/controld/controld_cib.h | 13 ++------ + daemons/controld/controld_execd.c | 3 +- + daemons/controld/controld_fencing.c | 2 +- + daemons/controld/controld_join_dc.c | 9 +++-- + daemons/controld/controld_remote_ra.c | 10 +++--- + 6 files changed, 32 insertions(+), 53 deletions(-) + +diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c +index 39c2b06..298c321 100644 +--- a/daemons/controld/controld_cib.c ++++ b/daemons/controld/controld_cib.c +@@ -281,16 +281,15 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, + + /*! + * \internal +- * \brief Get the XPath and description of a node state section to be deleted ++ * \brief Get the XPath and description of resource history to be deleted + * +- * \param[in] uname Desired node +- * \param[in] section Subsection of \c PCMK__XE_NODE_STATE to be deleted +- * \param[out] xpath Where to store XPath of \p section +- * \param[out] desc If not \c NULL, where to store description of \p section ++ * \param[in] uname Name of node to delete resource history for ++ * \param[in] unlocked_only If true, delete history of only unlocked resources ++ * \param[out] xpath Where to store XPath for history deletion ++ * \param[out] desc If not NULL, where to store loggable description + */ + void +-controld_node_state_deletion_strings(const char *uname, +- enum controld_section_e section, ++controld_node_state_deletion_strings(const char *uname, bool unlocked_only, + char **xpath, char **desc) + { + const char *desc_pre = NULL; +@@ -299,20 +298,13 @@ controld_node_state_deletion_strings(const char *uname, + long long expire = (long long) time(NULL) + - controld_globals.shutdown_lock_limit; + +- switch (section) { +- case controld_section_lrm: +- *xpath = crm_strdup_printf(XPATH_NODE_LRM, uname); +- desc_pre = "resource history"; +- break; +- case controld_section_lrm_unlocked: +- *xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED, +- uname, uname, expire); +- desc_pre = "resource history (other than shutdown locks)"; +- break; +- default: +- // We called this function incorrectly +- pcmk__assert(false); +- break; ++ if (unlocked_only) { ++ *xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED, ++ uname, uname, expire); ++ desc_pre = "resource history (other than shutdown locks)"; ++ } else { ++ *xpath = crm_strdup_printf(XPATH_NODE_LRM, uname); ++ desc_pre = "resource history"; + } + + if (desc != NULL) { +@@ -322,15 +314,14 @@ controld_node_state_deletion_strings(const char *uname, + + /*! + * \internal +- * \brief Delete subsection of a node's CIB \c PCMK__XE_NODE_STATE ++ * \brief Delete a node's resource history from the CIB + * +- * \param[in] uname Desired node +- * \param[in] section Subsection of \c PCMK__XE_NODE_STATE to delete +- * \param[in] options CIB call options to use ++ * \param[in] uname Name of node to delete resource history for ++ * \param[in] unlocked_only If true, delete history of only unlocked resources ++ * \param[in] options CIB call options to use + */ + void +-controld_delete_node_state(const char *uname, enum controld_section_e section, +- int options) ++controld_delete_node_state(const char *uname, bool unlocked_only, int options) + { + cib_t *cib = controld_globals.cib_conn; + char *xpath = NULL; +@@ -339,8 +330,7 @@ controld_delete_node_state(const char *uname, enum controld_section_e section, + + pcmk__assert((uname != NULL) && (cib != NULL)); + +- controld_node_state_deletion_strings(uname, section, &xpath, &desc); +- ++ controld_node_state_deletion_strings(uname, unlocked_only, &xpath, &desc); + cib__set_call_options(options, "node state deletion", + cib_xpath|cib_multiple); + cib_rc = cib->cmds->remove(cib, xpath, NULL, options); +diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h +index 25277e7..f423f93 100644 +--- a/daemons/controld/controld_cib.h ++++ b/daemons/controld/controld_cib.h +@@ -46,17 +46,10 @@ int controld_update_cib(const char *section, xmlNode *data, int options, + void *)); + unsigned int cib_op_timeout(void); + +-// Subsections of PCMK__XE_NODE_STATE +-enum controld_section_e { +- controld_section_lrm, +- controld_section_lrm_unlocked, +-}; +- +-void controld_node_state_deletion_strings(const char *uname, +- enum controld_section_e section, ++void controld_node_state_deletion_strings(const char *uname, bool unlocked_only, + char **xpath, char **desc); +-void controld_delete_node_state(const char *uname, +- enum controld_section_e section, int options); ++void controld_delete_node_state(const char *uname, bool unlocked_only, ++ int options); + int controld_delete_resource_history(const char *rsc_id, const char *node, + const char *user_name, int call_options); + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 2ec6893..801a5db 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -1074,8 +1074,7 @@ force_reprobe(lrm_state_t *lrm_state, const char *from_sys, + } + + /* Now delete the copy in the CIB */ +- controld_delete_node_state(lrm_state->node_name, controld_section_lrm, +- cib_none); ++ controld_delete_node_state(lrm_state->node_name, false, cib_none); + } + + /*! +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index de074aa..6270dcd 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -268,7 +268,7 @@ update_node_state_after_fencing(const char *target, const char *target_xml_id) + fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated); + + // Delete node's resource history from CIB +- controld_delete_node_state(peer->name, controld_section_lrm, cib_none); ++ controld_delete_node_state(peer->name, false, cib_none); + + // Ask attribute manager to delete node's transient attributes + // @TODO: This is the only call to controld_purge_node_attrs that doesn't +diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c +index a91fbfa..f88cc47 100644 +--- a/daemons/controld/controld_join_dc.c ++++ b/daemons/controld/controld_join_dc.c +@@ -771,7 +771,8 @@ do_dc_join_ack(long long action, + pcmk__node_status_t *peer = NULL; + enum controld_join_phase phase = controld_join_none; + +- enum controld_section_e section = controld_section_lrm; ++ const bool unlocked_only = pcmk_is_set(controld_globals.flags, ++ controld_shutdown_lock_enabled); + char *xpath = NULL; + xmlNode *state = join_ack->xml; + xmlNode *execd_state = NULL; +@@ -832,10 +833,8 @@ do_dc_join_ack(long long action, + } + + // Delete relevant parts of node's current executor state from CIB +- if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { +- section = controld_section_lrm_unlocked; +- } +- controld_node_state_deletion_strings(join_from, section, &xpath, NULL); ++ controld_node_state_deletion_strings(join_from, unlocked_only, &xpath, ++ NULL); + + rc = cib->cmds->remove(cib, xpath, NULL, + cib_xpath|cib_multiple|cib_transaction); +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 3136180..86a3544 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -240,17 +240,15 @@ should_purge_attributes(pcmk__node_status_t *node) + static void + purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node) + { +- enum controld_section_e section = controld_section_lrm; ++ const bool unlocked_only = pcmk_is_set(controld_globals.flags, ++ controld_shutdown_lock_enabled); + + // Purge node's transient attributes (from attribute manager and CIB) + if (should_purge_attributes(node)) { + controld_purge_node_attrs(node->name, true); + } + +- if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { +- section = controld_section_lrm_unlocked; +- } +- controld_delete_node_state(node->name, section, call_opt); ++ controld_delete_node_state(node->name, unlocked_only, call_opt); + } + + /*! +@@ -359,7 +357,7 @@ remote_node_down(const char *node_name, const enum down_opts opts) + * think resources are still running on the node. + */ + if (opts == DOWN_ERASE_LRM) { +- controld_delete_node_state(node_name, controld_section_lrm, call_opt); ++ controld_delete_node_state(node_name, false, call_opt); + } + + /* Ensure node is in the remote peer cache with lost state */ +-- +2.47.1 + +From 050a3caad4989cc1c958420dff47b04be9a1cd55 Mon Sep 17 00:00:00 2001 +From: Ken Gaillot +Date: Mon, 18 Dec 2023 15:45:00 -0600 +Subject: [PATCH 09/10] Refactor: controller: Rename + controld_delete_node_state() + +...to controld_delete_node_history(), and +controld_node_state_deletion_strings() to +controld_node_history_deletion_strings(), since they delete only history +now. +--- + daemons/controld/controld_cib.c | 8 ++++---- + daemons/controld/controld_cib.h | 9 +++++---- + daemons/controld/controld_execd.c | 2 +- + daemons/controld/controld_fencing.c | 2 +- + daemons/controld/controld_join_dc.c | 4 ++-- + daemons/controld/controld_remote_ra.c | 4 ++-- + 6 files changed, 15 insertions(+), 14 deletions(-) + +diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c +index 298c321..fb06f22 100644 +--- a/daemons/controld/controld_cib.c ++++ b/daemons/controld/controld_cib.c +@@ -289,8 +289,8 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, + * \param[out] desc If not NULL, where to store loggable description + */ + void +-controld_node_state_deletion_strings(const char *uname, bool unlocked_only, +- char **xpath, char **desc) ++controld_node_history_deletion_strings(const char *uname, bool unlocked_only, ++ char **xpath, char **desc) + { + const char *desc_pre = NULL; + +@@ -321,7 +321,7 @@ controld_node_state_deletion_strings(const char *uname, bool unlocked_only, + * \param[in] options CIB call options to use + */ + void +-controld_delete_node_state(const char *uname, bool unlocked_only, int options) ++controld_delete_node_history(const char *uname, bool unlocked_only, int options) + { + cib_t *cib = controld_globals.cib_conn; + char *xpath = NULL; +@@ -330,7 +330,7 @@ controld_delete_node_state(const char *uname, bool unlocked_only, int options) + + pcmk__assert((uname != NULL) && (cib != NULL)); + +- controld_node_state_deletion_strings(uname, unlocked_only, &xpath, &desc); ++ controld_node_history_deletion_strings(uname, unlocked_only, &xpath, &desc); + cib__set_call_options(options, "node state deletion", + cib_xpath|cib_multiple); + cib_rc = cib->cmds->remove(cib, xpath, NULL, options); +diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h +index f423f93..116db64 100644 +--- a/daemons/controld/controld_cib.h ++++ b/daemons/controld/controld_cib.h +@@ -46,10 +46,11 @@ int controld_update_cib(const char *section, xmlNode *data, int options, + void *)); + unsigned int cib_op_timeout(void); + +-void controld_node_state_deletion_strings(const char *uname, bool unlocked_only, +- char **xpath, char **desc); +-void controld_delete_node_state(const char *uname, bool unlocked_only, +- int options); ++void controld_node_history_deletion_strings(const char *uname, ++ bool unlocked_only, ++ char **xpath, char **desc); ++void controld_delete_node_history(const char *uname, bool unlocked_only, ++ int options); + int controld_delete_resource_history(const char *rsc_id, const char *node, + const char *user_name, int call_options); + +diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c +index 801a5db..977acf0 100644 +--- a/daemons/controld/controld_execd.c ++++ b/daemons/controld/controld_execd.c +@@ -1074,7 +1074,7 @@ force_reprobe(lrm_state_t *lrm_state, const char *from_sys, + } + + /* Now delete the copy in the CIB */ +- controld_delete_node_state(lrm_state->node_name, false, cib_none); ++ controld_delete_node_history(lrm_state->node_name, false, cib_none); + } + + /*! +diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c +index 6270dcd..026b240 100644 +--- a/daemons/controld/controld_fencing.c ++++ b/daemons/controld/controld_fencing.c +@@ -268,7 +268,7 @@ update_node_state_after_fencing(const char *target, const char *target_xml_id) + fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated); + + // Delete node's resource history from CIB +- controld_delete_node_state(peer->name, false, cib_none); ++ controld_delete_node_history(peer->name, false, cib_none); + + // Ask attribute manager to delete node's transient attributes + // @TODO: This is the only call to controld_purge_node_attrs that doesn't +diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c +index f88cc47..90d1bc0 100644 +--- a/daemons/controld/controld_join_dc.c ++++ b/daemons/controld/controld_join_dc.c +@@ -833,8 +833,8 @@ do_dc_join_ack(long long action, + } + + // Delete relevant parts of node's current executor state from CIB +- controld_node_state_deletion_strings(join_from, unlocked_only, &xpath, +- NULL); ++ controld_node_history_deletion_strings(join_from, unlocked_only, &xpath, ++ NULL); + + rc = cib->cmds->remove(cib, xpath, NULL, + cib_xpath|cib_multiple|cib_transaction); +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 86a3544..1c52477 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -248,7 +248,7 @@ purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node) + controld_purge_node_attrs(node->name, true); + } + +- controld_delete_node_state(node->name, unlocked_only, call_opt); ++ controld_delete_node_history(node->name, unlocked_only, call_opt); + } + + /*! +@@ -357,7 +357,7 @@ remote_node_down(const char *node_name, const enum down_opts opts) + * think resources are still running on the node. + */ + if (opts == DOWN_ERASE_LRM) { +- controld_delete_node_state(node_name, false, call_opt); ++ controld_delete_node_history(node_name, false, call_opt); + } + + /* Ensure node is in the remote peer cache with lost state */ +-- +2.47.1 + +From 97dfc11f6c9d1a90ef744e5de2fe7678f3518bba Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 10 Sep 2025 14:59:38 -0400 +Subject: [PATCH 10/10] Refactor: daemons: Remove the down_opts enum + +This has only ever had two values, which basically just means it's a +bool. +--- + daemons/controld/controld_remote_ra.c | 21 ++++++++------------- + 1 file changed, 8 insertions(+), 13 deletions(-) + +diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c +index 1c52477..eb1bc55 100644 +--- a/daemons/controld/controld_remote_ra.c ++++ b/daemons/controld/controld_remote_ra.c +@@ -330,20 +330,15 @@ remote_node_up(const char *node_name) + pcmk__xml_free(update); + } + +-enum down_opts { +- DOWN_KEEP_LRM, +- DOWN_ERASE_LRM +-}; +- + /*! + * \internal + * \brief Handle cluster communication related to pacemaker_remote node leaving + * + * \param[in] node_name Name of lost node +- * \param[in] opts Whether to keep or erase LRM history ++ * \param[in] erase_lrm If \c true, erase the LRM history + */ + static void +-remote_node_down(const char *node_name, const enum down_opts opts) ++remote_node_down(const char *node_name, bool erase_lrm) + { + xmlNode *update; + int call_opt = crmd_cib_smart_opt(); +@@ -356,7 +351,7 @@ remote_node_down(const char *node_name, const enum down_opts opts) + * up. However, after a successful fence, clear the history so we don't + * think resources are still running on the node. + */ +- if (opts == DOWN_ERASE_LRM) { ++ if (erase_lrm) { + controld_delete_node_history(node_name, false, call_opt); + } + +@@ -416,7 +411,7 @@ check_remote_node_state(const remote_ra_cmd_t *cmd) + if (ra_data) { + if (!pcmk_is_set(ra_data->status, takeover_complete)) { + /* Stop means down if we didn't successfully migrate elsewhere */ +- remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM); ++ remote_node_down(cmd->rsc_id, false); + } else if (AM_I_DC == FALSE) { + /* Only the connection host and DC track node state, + * so if the connection migrated elsewhere and we aren't DC, +@@ -692,7 +687,7 @@ remote_lrm_op_callback(lrmd_event_data_t * op) + lrm_state->node_name); + /* Do roughly what a 'stop' on the remote-resource would do */ + handle_remote_ra_stop(lrm_state, NULL); +- remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM); ++ remote_node_down(lrm_state->node_name, false); + /* now fake the reply of a successful 'stop' */ + synthesize_lrmd_success(NULL, lrm_state->node_name, + PCMK_ACTION_STOP); +@@ -1366,11 +1361,11 @@ remote_ra_process_pseudo(xmlNode *xml) + * peer cache state will be incorrect unless and until the guest is + * recovered. + */ +- if (result) { ++ if (result != NULL) { + const char *remote = pcmk__xe_id(result); + +- if (remote) { +- remote_node_down(remote, DOWN_ERASE_LRM); ++ if (remote != NULL) { ++ remote_node_down(remote, true); + } + } + } +-- +2.47.1 diff --git a/pacemaker.spec b/pacemaker.spec index 83b50ac..896ed56 100644 --- a/pacemaker.spec +++ b/pacemaker.spec @@ -41,7 +41,7 @@ ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) %global pcmkversion 3.0.1 -%global specversion 4 +%global specversion 5 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build %global commit 9a5e54bae85847c4bb6ed7c7fb06103ebebbc64a @@ -205,6 +205,7 @@ Patch003: 003-promotable-follows.patch Patch004: 004-crm_resource_wait.patch Patch005: 005-ipc_evict.patch Patch006: 006-fewer_messages.patch +Patch007: 007-transient_attrs.patch Requires: resource-agents Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} @@ -796,6 +797,10 @@ exit 0 %{_datadir}/pkgconfig/pacemaker-schemas.pc %changelog +* Mon Dec 8 2025 Chris Lumens - 3.0.1-5 +- Fix a race condition between daemons when erasing transient attrs +- Resolves: RHEL-23082 + * Thu Nov 13 2025 Chris Lumens - 3.0.1-4 - Fix promoting instances of a cloned resource - Handle large timeouts correctly in crm_resource --wait