From 6e69eb11db0bfcd317859cbcec190ee173133e0c Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Tue, 12 May 2026 11:39:54 -0400
Subject: [PATCH] Rebase on upstream 3.0.2-rc2

- Don't hang waiting on certain pending monitor actions
- Resolves: RHEL-78393
---
 001-econnrefused.patch       |   40 --
 002-corosync.patch           |   75 --
 003-promotable-follows.patch |  801 ---------------------
 004-crm_resource_wait.patch  |   76 --
 005-ipc_evict.patch          |  400 -----------
 006-fewer_messages.patch     |   88 ---
 007-transient_attrs.patch    | 1262 ----------------------------------
 pacemaker.spec               |   19 +-
 sources                      |    2 +-
 9 files changed, 10 insertions(+), 2753 deletions(-)
 delete mode 100644 001-econnrefused.patch
 delete mode 100644 002-corosync.patch
 delete mode 100644 003-promotable-follows.patch
 delete mode 100644 004-crm_resource_wait.patch
 delete mode 100644 005-ipc_evict.patch
 delete mode 100644 006-fewer_messages.patch
 delete mode 100644 007-transient_attrs.patch

diff --git a/001-econnrefused.patch b/001-econnrefused.patch
deleted file mode 100644
index 3b26b84..0000000
--- a/001-econnrefused.patch
+++ /dev/null
@@ -1,40 +0,0 @@
-From 125b434943f57778816135ad147fc827fa706e99 Mon Sep 17 00:00:00 2001
-From: Chris Lumens <clumens@redhat.com>
-Date: Mon, 4 Aug 2025 10:38:00 -0400
-Subject: [PATCH] Med: libpacemaker: Do not retry on ECONNREFUSED in tools.
-
-This is a regression introduced by e438946787.  In that patch, what
-we're trying to do is retry IPC connections between daemons.  If a
-daemon gets ECONNREFUSED when it initiates an IPC connection, the most
-likely reason is that another daemon has been killed and is restarting
-but is not yet ready to accept connections.  Waiting and retrying
-repeatedly is an acceptable way to deal with this.
-
-However, if a command line tool gets ECONNREFUSED, it's more likely that
-the problem is the cluster isn't running at all.  In this case, waiting
-and retrying just introduces a delay for a situation that will never be
-resolved.  Reverting just the part in pcmk_cluster_queries.c should fix
-this problem without affecting any of the daemons - they don't call this
-code.
-
-Fixes RHEL-106594
----
- lib/pacemaker/pcmk_cluster_queries.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/lib/pacemaker/pcmk_cluster_queries.c b/lib/pacemaker/pcmk_cluster_queries.c
-index 8a08d99180..2f91a68738 100644
---- a/lib/pacemaker/pcmk_cluster_queries.c
-+++ b/lib/pacemaker/pcmk_cluster_queries.c
-@@ -360,7 +360,7 @@ ipc_connect(data_t *data, enum pcmk_ipc_server server, pcmk_ipc_callback_t cb,
-         pcmk_register_ipc_callback(api, cb, data);
-     }
- 
--    rc = pcmk__connect_ipc_retry_conrefused(api, dispatch_type, 5);
-+    rc = pcmk__connect_ipc(api, dispatch_type, 5);
-     if (rc != pcmk_rc_ok) {
-         if (rc == EREMOTEIO) {
-             data->pcmkd_state = pcmk_pacemakerd_state_remote;
--- 
-2.49.0
-
diff --git a/002-corosync.patch b/002-corosync.patch
deleted file mode 100644
index 3f048ce..0000000
--- a/002-corosync.patch
+++ /dev/null
@@ -1,75 +0,0 @@
-From b1fd6ccea9083826c1c2fb40418651704989a904 Mon Sep 17 00:00:00 2001
-From: Reid Wahl <nrwahl@protonmail.com>
-Date: Wed, 13 Aug 2025 17:33:16 -0700
-Subject: [PATCH] Fix: cts: Start corosync using systemd if available
-
-As of corosync upstream commit ae859515, in systemd builds,
-StateDirectory is set in the systemd corosync.service file. The corosync
-state directory defaults to this value if not set in the corosync config
-file. Corosync falls back to using /var/lib/corosync only if the systemd
-StateDirectory is not set.
-
-The same commit removes /var/lib/corosync from RPM builds with systemd.
-
-As a result, if corosync was built with systemd, then starting corosync
-outside of systemd fails unless /var/lib/corosync has been created
-manually or through some other means. Starting corosync directly from
-the command line fails with the following error, because the
-STATE_DIRECTORY environment variable was not set by systemd:
-
-Cannot chdir to state directory /var/lib/corosync. No such file or
-directory
-
-This causes Pacemaker's cts-fencing script to fail.
-
-This seems like a bug in corosync, as it now assumes that corosync will
-always be started by systemd if available. Here, we work around it in
-cts by doing exactly that.
-
-Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
----
- python/pacemaker/_cts/corosync.py | 11 ++++++++++-
- 1 file changed, 10 insertions(+), 1 deletion(-)
-
-diff --git a/python/pacemaker/_cts/corosync.py b/python/pacemaker/_cts/corosync.py
-index 0a55dd7c96..beb574d2b8 100644
---- a/python/pacemaker/_cts/corosync.py
-+++ b/python/pacemaker/_cts/corosync.py
-@@ -11,6 +11,7 @@ import tempfile
- import time
- 
- from pacemaker.buildoptions import BuildOptions
-+from pacemaker._cts.environment import EnvFactory
- from pacemaker._cts.process import killall, stdout_from_command
- 
- 
-@@ -112,6 +113,9 @@ class Corosync:
-         self.logdir = logdir
-         self.cluster_name = cluster_name
- 
-+        # The Corosync class doesn't use self._env._nodes, but the
-+        # "--nodes" argument is required to be present and nonempty
-+        self._env = EnvFactory().getInstance(args=["--nodes", "localhost"])
-         self._existing_cfg_file = None
- 
-     def _ready(self, logfile, timeout=10):
-@@ -149,10 +153,15 @@ class Corosync:
-                                                         self.cluster_name, localname())
-         logfile = corosync_log_file(BuildOptions.COROSYNC_CONFIG_FILE)
- 
-+        if self._env["have_systemd"]:
-+            cmd = ["systemctl", "start", "corosync.service"]
-+        else:
-+            cmd = ["corosync"]
-+
-         if self.verbose:
-             print("Starting corosync")
- 
--        with subprocess.Popen("corosync", stdout=subprocess.PIPE, stderr=subprocess.PIPE) as test:
-+        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as test:
-             test.wait()
- 
-         # Wait for corosync to be ready before returning
--- 
-2.50.1
-
diff --git a/003-promotable-follows.patch b/003-promotable-follows.patch
deleted file mode 100644
index 7b413d7..0000000
--- a/003-promotable-follows.patch
+++ /dev/null
@@ -1,801 +0,0 @@
-From 6e5d574de9ad3a131cc0c51f2c5300e2cf4e7db3 Mon Sep 17 00:00:00 2001
-From: Klaus Wenninger <klaus.wenninger@aon.at>
-Date: Tue, 7 Oct 2025 05:07:04 +0200
-Subject: [PATCH 1/2] Test: scheduler: promoted state with promoted state with
- attribute
-
-Add testcase. Previous fix attribute based colocation didn't adhere
-the attribute with promoted state with promoted state.
----
- cts/cts-scheduler.in                          |   1 +
- ...motable-colocation-with-node-attribute.dot |  28 +++
- ...motable-colocation-with-node-attribute.exp | 175 ++++++++++++++++++
- ...able-colocation-with-node-attribute.scores |  81 ++++++++
- ...ble-colocation-with-node-attribute.summary |  45 +++++
- ...motable-colocation-with-node-attribute.xml | 155 ++++++++++++++++
- 6 files changed, 485 insertions(+)
- create mode 100644 cts/scheduler/dot/promotable-colocation-with-node-attribute.dot
- create mode 100644 cts/scheduler/exp/promotable-colocation-with-node-attribute.exp
- create mode 100644 cts/scheduler/scores/promotable-colocation-with-node-attribute.scores
- create mode 100644 cts/scheduler/summary/promotable-colocation-with-node-attribute.summary
- create mode 100644 cts/scheduler/xml/promotable-colocation-with-node-attribute.xml
-
-diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in
-index f5d4ed5..be8af87 100644
---- a/cts/cts-scheduler.in
-+++ b/cts/cts-scheduler.in
-@@ -633,6 +633,7 @@ TESTS = [
-         SchedulerTest("no_quorum_demote", "Promotable demotion and primitive stop with no-quorum-policy=\"demote\""),
-         SchedulerTest("no-promote-on-unrunnable-guest", "Don't select bundle instance for promotion when container can't run"),
-         SchedulerTest("leftover-pending-monitor", "Prevent a leftover pending monitor from causing unexpected stop of other instances"),
-+        SchedulerTest("promotable-colocation-with-node-attribute", "Promote dependent clone on nodes belonging to a site that has a primary clone promoted"),
-     ]),
-     SchedulerTestGroup([
-         SchedulerTest("history-1", "Correctly parse stateful-1 resource state"),
-diff --git a/cts/scheduler/dot/promotable-colocation-with-node-attribute.dot b/cts/scheduler/dot/promotable-colocation-with-node-attribute.dot
-new file mode 100644
-index 0000000..89d066f
---- /dev/null
-+++ b/cts/scheduler/dot/promotable-colocation-with-node-attribute.dot
-@@ -0,0 +1,28 @@
-+ digraph "g" {
-+"dependent-clone_demote_0" -> "dependent-clone_demoted_0" [ style = bold]
-+"dependent-clone_demote_0" -> "dependent-rsc_demote_0 node3" [ style = bold]
-+"dependent-clone_demote_0" [ style=bold color="green" fontcolor="orange"]
-+"dependent-clone_demoted_0" -> "dependent-clone_promote_0" [ style = bold]
-+"dependent-clone_demoted_0" [ style=bold color="green" fontcolor="orange"]
-+"dependent-clone_promote_0" -> "dependent-rsc_promote_0 node1" [ style = bold]
-+"dependent-clone_promote_0" -> "dependent-rsc_promote_0 node2" [ style = bold]
-+"dependent-clone_promote_0" [ style=bold color="green" fontcolor="orange"]
-+"dependent-clone_promoted_0" [ style=bold color="green" fontcolor="orange"]
-+"dependent-rsc_demote_0 node3" -> "dependent-clone_demoted_0" [ style = bold]
-+"dependent-rsc_demote_0 node3" -> "dependent-rsc_monitor_11000 node3" [ style = bold]
-+"dependent-rsc_demote_0 node3" [ style=bold color="green" fontcolor="black"]
-+"dependent-rsc_monitor_10000 node1" [ style=bold color="green" fontcolor="black"]
-+"dependent-rsc_monitor_10000 node2" [ style=bold color="green" fontcolor="black"]
-+"dependent-rsc_monitor_11000 node3" [ style=bold color="green" fontcolor="black"]
-+"dependent-rsc_monitor_11000 node4" [ style=bold color="green" fontcolor="black"]
-+"dependent-rsc_promote_0 node1" -> "dependent-clone_promoted_0" [ style = bold]
-+"dependent-rsc_promote_0 node1" -> "dependent-rsc_monitor_10000 node1" [ style = bold]
-+"dependent-rsc_promote_0 node1" [ style=bold color="green" fontcolor="black"]
-+"dependent-rsc_promote_0 node2" -> "dependent-clone_promoted_0" [ style = bold]
-+"dependent-rsc_promote_0 node2" -> "dependent-rsc_monitor_10000 node2" [ style = bold]
-+"dependent-rsc_promote_0 node2" [ style=bold color="green" fontcolor="black"]
-+"primary-rsc_monitor_10000 node1" [ style=bold color="green" fontcolor="black"]
-+"primary-rsc_monitor_11000 node2" [ style=bold color="green" fontcolor="black"]
-+"primary-rsc_monitor_11000 node3" [ style=bold color="green" fontcolor="black"]
-+"primary-rsc_monitor_11000 node4" [ style=bold color="green" fontcolor="black"]
-+}
-diff --git a/cts/scheduler/exp/promotable-colocation-with-node-attribute.exp b/cts/scheduler/exp/promotable-colocation-with-node-attribute.exp
-new file mode 100644
-index 0000000..76371f1
---- /dev/null
-+++ b/cts/scheduler/exp/promotable-colocation-with-node-attribute.exp
-@@ -0,0 +1,175 @@
-+<transition_graph cluster-delay="60s" stonith-timeout="60s" failed-stop-offset="INFINITY" failed-start-offset="INFINITY"  transition_id="1">
-+  <synapse id="0">
-+    <action_set>
-+      <rsc_op id="5" operation="monitor" operation_key="primary-rsc_monitor_10000" internal_operation_key="primary-rsc:0_monitor_10000" on_node="node1" on_node_uuid="node1">
-+        <primitive id="primary-rsc" long-id="primary-rsc:0" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="0" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="10000" CRM_meta_master_max="1" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node1" CRM_meta_on_node_uuid="node1" CRM_meta_op_target_rc="8" CRM_meta_promoted_max="1" CRM_meta_promoted_node_max="1" CRM_meta_role="Promoted" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs/>
-+  </synapse>
-+  <synapse id="1">
-+    <action_set>
-+      <rsc_op id="8" operation="monitor" operation_key="primary-rsc_monitor_11000" internal_operation_key="primary-rsc:1_monitor_11000" on_node="node2" on_node_uuid="node2">
-+        <primitive id="primary-rsc" long-id="primary-rsc:1" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="1" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="11000" CRM_meta_master_max="1" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node2" CRM_meta_on_node_uuid="node2" CRM_meta_promoted_max="1" CRM_meta_promoted_node_max="1" CRM_meta_role="Unpromoted" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs/>
-+  </synapse>
-+  <synapse id="2">
-+    <action_set>
-+      <rsc_op id="11" operation="monitor" operation_key="primary-rsc_monitor_11000" internal_operation_key="primary-rsc:2_monitor_11000" on_node="node3" on_node_uuid="node3">
-+        <primitive id="primary-rsc" long-id="primary-rsc:2" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="2" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="11000" CRM_meta_master_max="1" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node3" CRM_meta_on_node_uuid="node3" CRM_meta_promoted_max="1" CRM_meta_promoted_node_max="1" CRM_meta_role="Unpromoted" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs/>
-+  </synapse>
-+  <synapse id="3">
-+    <action_set>
-+      <rsc_op id="14" operation="monitor" operation_key="primary-rsc_monitor_11000" internal_operation_key="primary-rsc:3_monitor_11000" on_node="node4" on_node_uuid="node4">
-+        <primitive id="primary-rsc" long-id="primary-rsc:3" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="3" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="11000" CRM_meta_master_max="1" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node4" CRM_meta_on_node_uuid="node4" CRM_meta_promoted_max="1" CRM_meta_promoted_node_max="1" CRM_meta_role="Unpromoted" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs/>
-+  </synapse>
-+  <synapse id="4">
-+    <action_set>
-+      <rsc_op id="26" operation="monitor" operation_key="dependent-rsc_monitor_10000" internal_operation_key="dependent-rsc:0_monitor_10000" on_node="node1" on_node_uuid="node1">
-+        <primitive id="dependent-rsc" long-id="dependent-rsc:0" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="0" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="10000" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node1" CRM_meta_on_node_uuid="node1" CRM_meta_op_target_rc="8" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_role="Promoted" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs>
-+      <trigger>
-+        <rsc_op id="25" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:0_promote_0" on_node="node1" on_node_uuid="node1"/>
-+      </trigger>
-+    </inputs>
-+  </synapse>
-+  <synapse id="5">
-+    <action_set>
-+      <rsc_op id="25" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:0_promote_0" on_node="node1" on_node_uuid="node1">
-+        <primitive id="dependent-rsc" long-id="dependent-rsc:0" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="0" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_on_node="node1" CRM_meta_on_node_uuid="node1" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs>
-+      <trigger>
-+        <pseudo_event id="42" operation="promote" operation_key="dependent-clone_promote_0"/>
-+      </trigger>
-+    </inputs>
-+  </synapse>
-+  <synapse id="6">
-+    <action_set>
-+      <rsc_op id="30" operation="monitor" operation_key="dependent-rsc_monitor_10000" internal_operation_key="dependent-rsc:1_monitor_10000" on_node="node2" on_node_uuid="node2">
-+        <primitive id="dependent-rsc" long-id="dependent-rsc:1" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="1" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="10000" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node2" CRM_meta_on_node_uuid="node2" CRM_meta_op_target_rc="8" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_role="Promoted" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs>
-+      <trigger>
-+        <rsc_op id="29" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:1_promote_0" on_node="node2" on_node_uuid="node2"/>
-+      </trigger>
-+    </inputs>
-+  </synapse>
-+  <synapse id="7">
-+    <action_set>
-+      <rsc_op id="29" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:1_promote_0" on_node="node2" on_node_uuid="node2">
-+        <primitive id="dependent-rsc" long-id="dependent-rsc:1" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="1" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_on_node="node2" CRM_meta_on_node_uuid="node2" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs>
-+      <trigger>
-+        <pseudo_event id="42" operation="promote" operation_key="dependent-clone_promote_0"/>
-+      </trigger>
-+    </inputs>
-+  </synapse>
-+  <synapse id="8">
-+    <action_set>
-+      <rsc_op id="34" operation="monitor" operation_key="dependent-rsc_monitor_11000" internal_operation_key="dependent-rsc:2_monitor_11000" on_node="node3" on_node_uuid="node3">
-+        <primitive id="dependent-rsc" long-id="dependent-rsc:2" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="2" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="11000" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node3" CRM_meta_on_node_uuid="node3" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_role="Unpromoted" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs>
-+      <trigger>
-+        <rsc_op id="32" operation="demote" operation_key="dependent-rsc_demote_0" internal_operation_key="dependent-rsc:2_demote_0" on_node="node3" on_node_uuid="node3"/>
-+      </trigger>
-+    </inputs>
-+  </synapse>
-+  <synapse id="9">
-+    <action_set>
-+      <rsc_op id="32" operation="demote" operation_key="dependent-rsc_demote_0" internal_operation_key="dependent-rsc:2_demote_0" on_node="node3" on_node_uuid="node3">
-+        <primitive id="dependent-rsc" long-id="dependent-rsc:2" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="2" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_on_node="node3" CRM_meta_on_node_uuid="node3" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs>
-+      <trigger>
-+        <pseudo_event id="44" operation="demote" operation_key="dependent-clone_demote_0"/>
-+      </trigger>
-+    </inputs>
-+  </synapse>
-+  <synapse id="10">
-+    <action_set>
-+      <rsc_op id="37" operation="monitor" operation_key="dependent-rsc_monitor_11000" internal_operation_key="dependent-rsc:3_monitor_11000" on_node="node4" on_node_uuid="node4">
-+        <primitive id="dependent-rsc" long-id="dependent-rsc:3" class="ocf" provider="pacemaker" type="Stateful"/>
-+        <attributes CRM_meta_clone="3" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="11000" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node4" CRM_meta_on_node_uuid="node4" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_role="Unpromoted" CRM_meta_timeout="20000" />
-+      </rsc_op>
-+    </action_set>
-+    <inputs/>
-+  </synapse>
-+  <synapse id="11" priority="1000000">
-+    <action_set>
-+      <pseudo_event id="45" operation="demoted" operation_key="dependent-clone_demoted_0">
-+        <attributes CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
-+      </pseudo_event>
-+    </action_set>
-+    <inputs>
-+      <trigger>
-+        <rsc_op id="32" operation="demote" operation_key="dependent-rsc_demote_0" internal_operation_key="dependent-rsc:2_demote_0" on_node="node3" on_node_uuid="node3"/>
-+      </trigger>
-+      <trigger>
-+        <pseudo_event id="44" operation="demote" operation_key="dependent-clone_demote_0"/>
-+      </trigger>
-+    </inputs>
-+  </synapse>
-+  <synapse id="12">
-+    <action_set>
-+      <pseudo_event id="44" operation="demote" operation_key="dependent-clone_demote_0">
-+        <attributes CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
-+      </pseudo_event>
-+    </action_set>
-+    <inputs/>
-+  </synapse>
-+  <synapse id="13" priority="1000000">
-+    <action_set>
-+      <pseudo_event id="43" operation="promoted" operation_key="dependent-clone_promoted_0">
-+        <attributes CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
-+      </pseudo_event>
-+    </action_set>
-+    <inputs>
-+      <trigger>
-+        <rsc_op id="25" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:0_promote_0" on_node="node1" on_node_uuid="node1"/>
-+      </trigger>
-+      <trigger>
-+        <rsc_op id="29" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:1_promote_0" on_node="node2" on_node_uuid="node2"/>
-+      </trigger>
-+    </inputs>
-+  </synapse>
-+  <synapse id="14">
-+    <action_set>
-+      <pseudo_event id="42" operation="promote" operation_key="dependent-clone_promote_0">
-+        <attributes CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
-+      </pseudo_event>
-+    </action_set>
-+    <inputs>
-+      <trigger>
-+        <pseudo_event id="45" operation="demoted" operation_key="dependent-clone_demoted_0"/>
-+      </trigger>
-+    </inputs>
-+  </synapse>
-+</transition_graph>
-diff --git a/cts/scheduler/scores/promotable-colocation-with-node-attribute.scores b/cts/scheduler/scores/promotable-colocation-with-node-attribute.scores
-new file mode 100644
-index 0000000..023ee77
---- /dev/null
-+++ b/cts/scheduler/scores/promotable-colocation-with-node-attribute.scores
-@@ -0,0 +1,81 @@
-+
-+dependent-rsc:0 promotion score on node1: INFINITY
-+dependent-rsc:1 promotion score on node2: INFINITY
-+dependent-rsc:2 promotion score on node3: -INFINITY
-+dependent-rsc:3 promotion score on node4: -INFINITY
-+pcmk__clone_assign: dependent-clone allocation score on node1: 0
-+pcmk__clone_assign: dependent-clone allocation score on node2: 0
-+pcmk__clone_assign: dependent-clone allocation score on node3: 0
-+pcmk__clone_assign: dependent-clone allocation score on node4: 0
-+pcmk__clone_assign: dependent-rsc:0 allocation score on node1: 6
-+pcmk__clone_assign: dependent-rsc:0 allocation score on node2: 0
-+pcmk__clone_assign: dependent-rsc:0 allocation score on node3: 0
-+pcmk__clone_assign: dependent-rsc:0 allocation score on node4: 0
-+pcmk__clone_assign: dependent-rsc:1 allocation score on node1: 0
-+pcmk__clone_assign: dependent-rsc:1 allocation score on node2: 6
-+pcmk__clone_assign: dependent-rsc:1 allocation score on node3: 0
-+pcmk__clone_assign: dependent-rsc:1 allocation score on node4: 0
-+pcmk__clone_assign: dependent-rsc:2 allocation score on node1: 0
-+pcmk__clone_assign: dependent-rsc:2 allocation score on node2: 0
-+pcmk__clone_assign: dependent-rsc:2 allocation score on node3: 11
-+pcmk__clone_assign: dependent-rsc:2 allocation score on node4: 0
-+pcmk__clone_assign: dependent-rsc:3 allocation score on node1: 0
-+pcmk__clone_assign: dependent-rsc:3 allocation score on node2: 0
-+pcmk__clone_assign: dependent-rsc:3 allocation score on node3: 0
-+pcmk__clone_assign: dependent-rsc:3 allocation score on node4: 6
-+pcmk__clone_assign: primary-clone allocation score on node1: 0
-+pcmk__clone_assign: primary-clone allocation score on node2: 0
-+pcmk__clone_assign: primary-clone allocation score on node3: 0
-+pcmk__clone_assign: primary-clone allocation score on node4: 0
-+pcmk__clone_assign: primary-rsc:0 allocation score on node1: 11
-+pcmk__clone_assign: primary-rsc:0 allocation score on node2: 0
-+pcmk__clone_assign: primary-rsc:0 allocation score on node3: 0
-+pcmk__clone_assign: primary-rsc:0 allocation score on node4: 0
-+pcmk__clone_assign: primary-rsc:1 allocation score on node1: 0
-+pcmk__clone_assign: primary-rsc:1 allocation score on node2: 6
-+pcmk__clone_assign: primary-rsc:1 allocation score on node3: 0
-+pcmk__clone_assign: primary-rsc:1 allocation score on node4: 0
-+pcmk__clone_assign: primary-rsc:2 allocation score on node1: 0
-+pcmk__clone_assign: primary-rsc:2 allocation score on node2: 0
-+pcmk__clone_assign: primary-rsc:2 allocation score on node3: 6
-+pcmk__clone_assign: primary-rsc:2 allocation score on node4: 0
-+pcmk__clone_assign: primary-rsc:3 allocation score on node1: 0
-+pcmk__clone_assign: primary-rsc:3 allocation score on node2: 0
-+pcmk__clone_assign: primary-rsc:3 allocation score on node3: 0
-+pcmk__clone_assign: primary-rsc:3 allocation score on node4: 6
-+pcmk__primitive_assign: dependent-rsc:0 allocation score on node1: 6
-+pcmk__primitive_assign: dependent-rsc:0 allocation score on node2: 0
-+pcmk__primitive_assign: dependent-rsc:0 allocation score on node3: -INFINITY
-+pcmk__primitive_assign: dependent-rsc:0 allocation score on node4: 0
-+pcmk__primitive_assign: dependent-rsc:1 allocation score on node1: -INFINITY
-+pcmk__primitive_assign: dependent-rsc:1 allocation score on node2: 6
-+pcmk__primitive_assign: dependent-rsc:1 allocation score on node3: -INFINITY
-+pcmk__primitive_assign: dependent-rsc:1 allocation score on node4: 0
-+pcmk__primitive_assign: dependent-rsc:2 allocation score on node1: 0
-+pcmk__primitive_assign: dependent-rsc:2 allocation score on node2: 0
-+pcmk__primitive_assign: dependent-rsc:2 allocation score on node3: 11
-+pcmk__primitive_assign: dependent-rsc:2 allocation score on node4: 0
-+pcmk__primitive_assign: dependent-rsc:3 allocation score on node1: -INFINITY
-+pcmk__primitive_assign: dependent-rsc:3 allocation score on node2: -INFINITY
-+pcmk__primitive_assign: dependent-rsc:3 allocation score on node3: -INFINITY
-+pcmk__primitive_assign: dependent-rsc:3 allocation score on node4: 6
-+pcmk__primitive_assign: primary-rsc:0 allocation score on node1: 11
-+pcmk__primitive_assign: primary-rsc:0 allocation score on node2: 0
-+pcmk__primitive_assign: primary-rsc:0 allocation score on node3: 0
-+pcmk__primitive_assign: primary-rsc:0 allocation score on node4: 0
-+pcmk__primitive_assign: primary-rsc:1 allocation score on node1: -INFINITY
-+pcmk__primitive_assign: primary-rsc:1 allocation score on node2: 6
-+pcmk__primitive_assign: primary-rsc:1 allocation score on node3: 0
-+pcmk__primitive_assign: primary-rsc:1 allocation score on node4: 0
-+pcmk__primitive_assign: primary-rsc:2 allocation score on node1: -INFINITY
-+pcmk__primitive_assign: primary-rsc:2 allocation score on node2: -INFINITY
-+pcmk__primitive_assign: primary-rsc:2 allocation score on node3: 6
-+pcmk__primitive_assign: primary-rsc:2 allocation score on node4: 0
-+pcmk__primitive_assign: primary-rsc:3 allocation score on node1: -INFINITY
-+pcmk__primitive_assign: primary-rsc:3 allocation score on node2: -INFINITY
-+pcmk__primitive_assign: primary-rsc:3 allocation score on node3: -INFINITY
-+pcmk__primitive_assign: primary-rsc:3 allocation score on node4: 6
-+primary-rsc:0 promotion score on node1: 10
-+primary-rsc:1 promotion score on node2: 5
-+primary-rsc:2 promotion score on node3: 5
-+primary-rsc:3 promotion score on node4: 5
-diff --git a/cts/scheduler/summary/promotable-colocation-with-node-attribute.summary b/cts/scheduler/summary/promotable-colocation-with-node-attribute.summary
-new file mode 100644
-index 0000000..30e81c8
---- /dev/null
-+++ b/cts/scheduler/summary/promotable-colocation-with-node-attribute.summary
-@@ -0,0 +1,45 @@
-+Current cluster status:
-+  * Node List:
-+    * Online: [ node1 node2 node3 node4 ]
-+
-+  * Full List of Resources:
-+    * Clone Set: primary-clone [primary-rsc] (promotable):
-+      * Promoted: [ node1 ]
-+      * Unpromoted: [ node2 node3 node4 ]
-+    * Clone Set: dependent-clone [dependent-rsc] (promotable):
-+      * Promoted: [ node3 ]
-+      * Unpromoted: [ node1 node2 node4 ]
-+
-+Transition Summary:
-+  * Promote    dependent-rsc:0     ( Unpromoted -> Promoted node1 )
-+  * Promote    dependent-rsc:1     ( Unpromoted -> Promoted node2 )
-+  * Demote     dependent-rsc:2     ( Promoted -> Unpromoted node3 )
-+
-+Executing Cluster Transition:
-+  * Resource action: primary-rsc     monitor=10000 on node1
-+  * Resource action: primary-rsc     monitor=11000 on node2
-+  * Resource action: primary-rsc     monitor=11000 on node3
-+  * Resource action: primary-rsc     monitor=11000 on node4
-+  * Resource action: dependent-rsc   monitor=11000 on node4
-+  * Pseudo action:   dependent-clone_demote_0
-+  * Resource action: dependent-rsc   demote on node3
-+  * Pseudo action:   dependent-clone_demoted_0
-+  * Pseudo action:   dependent-clone_promote_0
-+  * Resource action: dependent-rsc   promote on node1
-+  * Resource action: dependent-rsc   promote on node2
-+  * Resource action: dependent-rsc   monitor=11000 on node3
-+  * Pseudo action:   dependent-clone_promoted_0
-+  * Resource action: dependent-rsc   monitor=10000 on node1
-+  * Resource action: dependent-rsc   monitor=10000 on node2
-+
-+Revised Cluster Status:
-+  * Node List:
-+    * Online: [ node1 node2 node3 node4 ]
-+
-+  * Full List of Resources:
-+    * Clone Set: primary-clone [primary-rsc] (promotable):
-+      * Promoted: [ node1 ]
-+      * Unpromoted: [ node2 node3 node4 ]
-+    * Clone Set: dependent-clone [dependent-rsc] (promotable):
-+      * Promoted: [ node1 node2 ]
-+      * Unpromoted: [ node3 node4 ]
-diff --git a/cts/scheduler/xml/promotable-colocation-with-node-attribute.xml b/cts/scheduler/xml/promotable-colocation-with-node-attribute.xml
-new file mode 100644
-index 0000000..5b4ab10
---- /dev/null
-+++ b/cts/scheduler/xml/promotable-colocation-with-node-attribute.xml
-@@ -0,0 +1,155 @@
-+<cib crm_feature_set="3.19.7" validate-with="pacemaker-3.10" epoch="1" num_updates="0" admin_epoch="0" cib-last-written="Mon Jan 1 12:00:00 2024" have-quorum="1" dc-uuid="node1">
-+  <configuration>
-+    <crm_config>
-+      <cluster_property_set id="cib-bootstrap-options">
-+        <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="2.1.8"/>
-+        <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="corosync"/>
-+        <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="false"/>
-+      </cluster_property_set>
-+    </crm_config>
-+    <nodes>
-+      <!-- Site A nodes -->
-+      <node id="node1" uname="node1">
-+        <instance_attributes id="node1-attrs">
-+          <nvpair id="node1-site" name="site" value="siteA"/>
-+        </instance_attributes>
-+      </node>
-+      <node id="node2" uname="node2">
-+        <instance_attributes id="node2-attrs">
-+          <nvpair id="node2-site" name="site" value="siteA"/>
-+        </instance_attributes>
-+      </node>
-+      <!-- Site B nodes -->
-+      <node id="node3" uname="node3">
-+        <instance_attributes id="node3-attrs">
-+          <nvpair id="node3-site" name="site" value="siteB"/>
-+        </instance_attributes>
-+      </node>
-+      <node id="node4" uname="node4">
-+        <instance_attributes id="node4-attrs">
-+          <nvpair id="node4-site" name="site" value="siteB"/>
-+        </instance_attributes>
-+      </node>
-+    </nodes>
-+    <resources>
-+      <!-- Primary promotable clone -->
-+      <clone id="primary-clone">
-+        <meta_attributes id="primary-clone-meta">
-+          <nvpair id="primary-clone-promotable" name="promotable" value="true"/>
-+          <nvpair id="primary-clone-promoted-max" name="promoted-max" value="1"/>
-+          <nvpair id="primary-clone-clone-max" name="clone-max" value="4"/>
-+        </meta_attributes>
-+        <primitive id="primary-rsc" class="ocf" provider="pacemaker" type="Stateful">
-+          <operations>
-+            <op id="primary-rsc-monitor-promoted" name="monitor" interval="10s" role="Promoted"/>
-+            <op id="primary-rsc-monitor-unpromoted" name="monitor" interval="11s" role="Unpromoted"/>
-+          </operations>
-+        </primitive>
-+      </clone>
-+      <!-- Dependent promotable clone -->
-+      <clone id="dependent-clone">
-+        <meta_attributes id="dependent-clone-meta">
-+          <nvpair id="dependent-clone-promotable" name="promotable" value="true"/>
-+          <nvpair id="dependent-clone-promoted-max" name="promoted-max" value="2"/>
-+          <nvpair id="dependent-clone-clone-max" name="clone-max" value="4"/>
-+        </meta_attributes>
-+        <primitive id="dependent-rsc" class="ocf" provider="pacemaker" type="Stateful">
-+          <operations>
-+            <op id="dependent-rsc-monitor-promoted" name="monitor" interval="10s" role="Promoted"/>
-+            <op id="dependent-rsc-monitor-unpromoted" name="monitor" interval="11s" role="Unpromoted"/>
-+          </operations>
-+        </primitive>
-+      </clone>
-+    </resources>
-+    <constraints>
-+      <!--
-+        This constraint should ensure that dependent-clone is promoted
-+        on all nodes in the same site as where primary-clone is promoted.
-+        With the fix, if primary-clone is promoted on nodes in siteA,
-+        dependent-clone should also be promoted on nodes in siteA.
-+      -->
-+      <rsc_colocation id="coloc-dependent-with-primary-promoted"
-+                      rsc="dependent-clone"
-+                      rsc-role="Promoted"
-+                      with-rsc="primary-clone"
-+                      with-rsc-role="Promoted"
-+                      node-attribute="site"
-+                      score="INFINITY"/>
-+    </constraints>
-+  </configuration>
-+  <status>
-+    <!-- All nodes are online and clean -->
-+    <node_state id="node1" uname="node1" in_ccm="true" crmd="online" join="member" expected="member">
-+      <transient_attributes id="node1-transient">
-+        <instance_attributes id="node1-transient-attrs">
-+          <nvpair id="node1-promoted-primary-rsc" name="master-primary-rsc" value="10"/>
-+          <nvpair id="node1-promoted-dependent-rsc" name="master-dependent-rsc" value="5"/>
-+        </instance_attributes>
-+      </transient_attributes>
-+      <lrm id="node1-lrm">
-+        <lrm_resources>
-+          <lrm_resource id="primary-rsc" type="Stateful" class="ocf" provider="pacemaker">
-+            <lrm_rsc_op id="primary-rsc_last_0" operation_key="primary-rsc_promote_0" operation="promote" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node1" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
-+          </lrm_resource>
-+          <lrm_resource id="dependent-rsc" type="Stateful" class="ocf" provider="pacemaker">
-+            <lrm_rsc_op id="dependent-rsc_last_0" operation_key="dependent-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node1" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
-+          </lrm_resource>
-+        </lrm_resources>
-+      </lrm>
-+    </node_state>
-+    <node_state id="node2" uname="node2" in_ccm="true" crmd="online" join="member" expected="member">
-+      <transient_attributes id="node2-transient">
-+        <instance_attributes id="node2-transient-attrs">
-+          <nvpair id="node2-promoted-primary-rsc" name="master-primary-rsc" value="5"/>
-+          <nvpair id="node2-promoted-dependent-rsc" name="master-dependent-rsc" value="5"/>
-+        </instance_attributes>
-+      </transient_attributes>
-+      <lrm id="node2-lrm">
-+        <lrm_resources>
-+          <lrm_resource id="primary-rsc" type="Stateful" class="ocf" provider="pacemaker">
-+            <lrm_rsc_op id="primary-rsc_last_0" operation_key="primary-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node2" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
-+          </lrm_resource>
-+          <lrm_resource id="dependent-rsc" type="Stateful" class="ocf" provider="pacemaker">
-+            <lrm_rsc_op id="dependent-rsc_last_0" operation_key="dependent-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node2" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
-+          </lrm_resource>
-+        </lrm_resources>
-+      </lrm>
-+    </node_state>
-+    <node_state id="node3" uname="node3" in_ccm="true" crmd="online" join="member" expected="member">
-+      <transient_attributes id="node3-transient">
-+        <instance_attributes id="node3-transient-attrs">
-+          <nvpair id="node3-promoted-primary-rsc" name="master-primary-rsc" value="5"/>
-+          <nvpair id="node3-promoted-dependent-rsc" name="master-dependent-rsc" value="10"/>
-+        </instance_attributes>
-+      </transient_attributes>
-+      <lrm id="node3-lrm">
-+        <lrm_resources>
-+          <lrm_resource id="primary-rsc" type="Stateful" class="ocf" provider="pacemaker">
-+            <lrm_rsc_op id="primary-rsc_last_0" operation_key="primary-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node3" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
-+          </lrm_resource>
-+          <lrm_resource id="dependent-rsc" type="Stateful" class="ocf" provider="pacemaker">
-+            <lrm_rsc_op id="dependent-rsc_last_0" operation_key="dependent-rsc_promote_0" operation="promote" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node3" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
-+          </lrm_resource>
-+        </lrm_resources>
-+      </lrm>
-+    </node_state>
-+    <node_state id="node4" uname="node4" in_ccm="true" crmd="online" join="member" expected="member">
-+      <transient_attributes id="node4-transient">
-+        <instance_attributes id="node4-transient-attrs">
-+          <nvpair id="node4-promoted-primary-rsc" name="master-primary-rsc" value="5"/>
-+          <nvpair id="node4-promoted-dependent-rsc" name="master-dependent-rsc" value="5"/>
-+        </instance_attributes>
-+      </transient_attributes>
-+      <lrm id="node4-lrm">
-+        <lrm_resources>
-+          <lrm_resource id="primary-rsc" type="Stateful" class="ocf" provider="pacemaker">
-+            <lrm_rsc_op id="primary-rsc_last_0" operation_key="primary-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node4" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
-+          </lrm_resource>
-+          <lrm_resource id="dependent-rsc" type="Stateful" class="ocf" provider="pacemaker">
-+            <lrm_rsc_op id="dependent-rsc_last_0" operation_key="dependent-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node4" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
-+          </lrm_resource>
-+        </lrm_resources>
-+      </lrm>
-+    </node_state>
-+  </status>
-+</cib>
--- 
-2.47.1
-
-From 31d5785ffc68acb54af76bc55f732117f77ef4b9 Mon Sep 17 00:00:00 2001
-From: Klaus Wenninger <klaus.wenninger@aon.at>
-Date: Tue, 7 Oct 2025 05:11:44 +0200
-Subject: [PATCH 2/2] Fix: scheduler: promoted state with promoted state with
- attribute
-
-Previously, attribute-based colocation did not adhere to the node
-attribute when colocating promoted state with promoted state.
----
- lib/pacemaker/libpacemaker_private.h  |  5 +-
- lib/pacemaker/pcmk_sched_bundle.c     |  2 +-
- lib/pacemaker/pcmk_sched_clone.c      |  2 +-
- lib/pacemaker/pcmk_sched_instances.c  | 76 ++++++++++++++++++---------
- lib/pacemaker/pcmk_sched_probes.c     |  2 +-
- lib/pacemaker/pcmk_sched_promotable.c |  3 +-
- 6 files changed, 60 insertions(+), 30 deletions(-)
-
-diff --git a/lib/pacemaker/libpacemaker_private.h b/lib/pacemaker/libpacemaker_private.h
-index 58435a6..fadfc8b 100644
---- a/lib/pacemaker/libpacemaker_private.h
-+++ b/lib/pacemaker/libpacemaker_private.h
-@@ -941,13 +941,14 @@ void pcmk__create_instance_actions(pcmk_resource_t *rsc, GList *instances);
- G_GNUC_INTERNAL
- bool pcmk__instance_matches(const pcmk_resource_t *instance,
-                             const pcmk_node_t *node, enum rsc_role_e role,
--                            bool current);
-+                            bool current, const char *node_attribute);
- 
- G_GNUC_INTERNAL
- pcmk_resource_t *pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc,
-                                                 const pcmk_resource_t *rsc,
-                                                 enum rsc_role_e role,
--                                                bool current);
-+                                                bool current,
-+                                                const char *node_attribute);
- 
- G_GNUC_INTERNAL
- uint32_t pcmk__instance_update_ordered_actions(pcmk_action_t *first,
-diff --git a/lib/pacemaker/pcmk_sched_bundle.c b/lib/pacemaker/pcmk_sched_bundle.c
-index 14e7be5..2d7e879 100644
---- a/lib/pacemaker/pcmk_sched_bundle.c
-+++ b/lib/pacemaker/pcmk_sched_bundle.c
-@@ -383,7 +383,7 @@ match_replica_container(const pcmk__bundle_replica_t *replica, void *user_data)
-     struct match_data *match_data = user_data;
- 
-     if (pcmk__instance_matches(replica->container, match_data->node,
--                               pcmk_role_unknown, false)) {
-+                               pcmk_role_unknown, false, NULL)) {
-         match_data->container = replica->container;
-         return false; // Match found, don't bother searching further replicas
-     }
-diff --git a/lib/pacemaker/pcmk_sched_clone.c b/lib/pacemaker/pcmk_sched_clone.c
-index 4f86621..99fa8b2 100644
---- a/lib/pacemaker/pcmk_sched_clone.c
-+++ b/lib/pacemaker/pcmk_sched_clone.c
-@@ -301,7 +301,7 @@ pcmk__clone_apply_coloc_score(pcmk_resource_t *dependent,
- 
-         primary_instance = pcmk__find_compatible_instance(dependent, primary,
-                                                           pcmk_role_unknown,
--                                                          false);
-+                                                          false, NULL);
-         if (primary_instance != NULL) {
-             pcmk__rsc_debug(primary, "Interleaving %s with %s",
-                             dependent->id, primary_instance->id);
-diff --git a/lib/pacemaker/pcmk_sched_instances.c b/lib/pacemaker/pcmk_sched_instances.c
-index f2bc1a4..5344234 100644
---- a/lib/pacemaker/pcmk_sched_instances.c
-+++ b/lib/pacemaker/pcmk_sched_instances.c
-@@ -1073,18 +1073,22 @@ free_instance_list(const pcmk_resource_t *rsc, GList *list)
-  * \internal
-  * \brief Check whether an instance is compatible with a role and node
-  *
-- * \param[in] instance  Clone instance or bundle replica container
-- * \param[in] node      Instance must match this node
-- * \param[in] role      If not pcmk_role_unknown, instance must match this role
-- * \param[in] current   If true, compare instance's original node and role,
-- *                      otherwise compare assigned next node and role
-+ * \param[in] instance       Clone instance or bundle replica container
-+ * \param[in] node           Instance must match this node
-+ * \param[in] role           If not pcmk_role_unknown, instance must match this role
-+ * \param[in] current        If true, compare instance's original node and role,
-+ *                           otherwise compare assigned next node and role
-+ * \param[in] node_attribute If not NULL, instance's node must have the same value
-+ *                           for this attribute as \p node (instead of requiring
-+ *                           the exact same node)
-  *
-  * \return true if \p instance is compatible with \p node and \p role,
-  *         otherwise false
-  */
- bool
- pcmk__instance_matches(const pcmk_resource_t *instance, const pcmk_node_t *node,
--                       enum rsc_role_e role, bool current)
-+                       enum rsc_role_e role, bool current,
-+                       const char *node_attribute)
- {
-     pcmk_node_t *instance_node = NULL;
- 
-@@ -1117,7 +1121,25 @@ pcmk__instance_matches(const pcmk_resource_t *instance, const pcmk_node_t *node,
-         return false;
-     }
- 
--    if (!pcmk__same_node(instance_node, node)) {
-+    if (node_attribute != NULL) {
-+        // Compare by node attribute value instead of node identity
-+        const char *instance_value = pcmk__colocation_node_attr(instance_node,
-+                                                                node_attribute,
-+                                                                instance);
-+        const char *target_value = pcmk__colocation_node_attr(node,
-+                                                              node_attribute,
-+                                                              instance);
-+
-+        if (!pcmk__str_eq(instance_value, target_value, pcmk__str_casei)) {
-+            pcmk__rsc_trace(instance,
-+                            "%s is not a compatible instance "
-+                            "(instance has %s=%s, target node has %s=%s)",
-+                            instance->id, node_attribute,
-+                            pcmk__s(instance_value, "<none>"),
-+                            node_attribute, pcmk__s(target_value, "<none>"));
-+            return false;
-+        }
-+    } else if (!pcmk__same_node(instance_node, node)) {
-         pcmk__rsc_trace(instance,
-                         "%s is not a compatible instance "
-                         "(assigned to %s not %s)",
-@@ -1136,12 +1158,14 @@ pcmk__instance_matches(const pcmk_resource_t *instance, const pcmk_node_t *node,
-  * \internal
-  * \brief Find an instance that matches a given resource by node and role
-  *
-- * \param[in] match_rsc  Resource that instance must match (for logging only)
-- * \param[in] rsc        Clone or bundle resource to check for matching instance
-- * \param[in] node       Instance must match this node
-- * \param[in] role       If not pcmk_role_unknown, instance must match this role
-- * \param[in] current    If true, compare instance's original node and role,
-- *                       otherwise compare assigned next node and role
-+ * \param[in] match_rsc      Resource that instance must match (for logging only)
-+ * \param[in] rsc            Clone or bundle resource to check for matching instance
-+ * \param[in] node           Instance must match this node
-+ * \param[in] role           If not pcmk_role_unknown, instance must match this role
-+ * \param[in] current        If true, compare instance's original node and role,
-+ *                           otherwise compare assigned next node and role
-+ * \param[in] node_attribute If not NULL, match instances by this node attribute
-+ *                           instead of by node identity
-  *
-  * \return \p rsc instance matching \p node and \p role if any, otherwise NULL
-  */
-@@ -1149,7 +1173,7 @@ static pcmk_resource_t *
- find_compatible_instance_on_node(const pcmk_resource_t *match_rsc,
-                                  const pcmk_resource_t *rsc,
-                                  const pcmk_node_t *node, enum rsc_role_e role,
--                                 bool current)
-+                                 bool current, const char *node_attribute)
- {
-     GList *instances = NULL;
- 
-@@ -1157,7 +1181,8 @@ find_compatible_instance_on_node(const pcmk_resource_t *match_rsc,
-     for (GList *iter = instances; iter != NULL; iter = iter->next) {
-         pcmk_resource_t *instance = (pcmk_resource_t *) iter->data;
- 
--        if (pcmk__instance_matches(instance, node, role, current)) {
-+        if (pcmk__instance_matches(instance, node, role, current,
-+                                   node_attribute)) {
-             pcmk__rsc_trace(match_rsc,
-                             "Found %s %s instance %s compatible with %s on %s",
-                             display_role(role), rsc->id, instance->id,
-@@ -1179,11 +1204,13 @@ find_compatible_instance_on_node(const pcmk_resource_t *match_rsc,
-  * \internal
-  * \brief Find a clone instance or bundle container compatible with a resource
-  *
-- * \param[in] match_rsc  Resource that instance must match
-- * \param[in] rsc        Clone or bundle resource to check for matching instance
-- * \param[in] role       If not pcmk_role_unknown, instance must match this role
-- * \param[in] current    If true, compare instance's original node and role,
-- *                       otherwise compare assigned next node and role
-+ * \param[in] match_rsc      Resource that instance must match
-+ * \param[in] rsc            Clone or bundle resource to check for matching instance
-+ * \param[in] role           If not pcmk_role_unknown, instance must match this role
-+ * \param[in] current        If true, compare instance's original node and role,
-+ *                           otherwise compare assigned next node and role
-+ * \param[in] node_attribute If not NULL, match instances by this node attribute
-+ *                           instead of by node identity
-  *
-  * \return Compatible (by \p role and \p match_rsc location) instance of \p rsc
-  *         if any, otherwise NULL
-@@ -1191,7 +1218,7 @@ find_compatible_instance_on_node(const pcmk_resource_t *match_rsc,
- pcmk_resource_t *
- pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc,
-                                const pcmk_resource_t *rsc, enum rsc_role_e role,
--                               bool current)
-+                               bool current, const char *node_attribute)
- {
-     pcmk_resource_t *instance = NULL;
-     GList *nodes = NULL;
-@@ -1207,7 +1234,7 @@ pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc,
-     node = match_rsc->priv->fns->location(match_rsc, NULL, target);
-     if (node != NULL) {
-         return find_compatible_instance_on_node(match_rsc, rsc, node, role,
--                                                current);
-+                                                current, node_attribute);
-     }
- 
-     // Otherwise check for an instance matching any of match_rsc's allowed nodes
-@@ -1216,7 +1243,8 @@ pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc,
-          iter = iter->next) {
-         instance = find_compatible_instance_on_node(match_rsc, rsc,
-                                                     (pcmk_node_t *) iter->data,
--                                                    role, current);
-+                                                    role, current,
-+                                                    node_attribute);
-     }
- 
-     if (instance == NULL) {
-@@ -1423,7 +1451,7 @@ update_interleaved_actions(pcmk_action_t *first, pcmk_action_t *then,
-         first_instance = pcmk__find_compatible_instance(then_instance,
-                                                         first->rsc,
-                                                         pcmk_role_unknown,
--                                                        current);
-+                                                        current, NULL);
- 
-         if (first_instance == NULL) { // No instance can be interleaved
-             if (unassign_if_mandatory(first, then, then_instance, type,
-diff --git a/lib/pacemaker/pcmk_sched_probes.c b/lib/pacemaker/pcmk_sched_probes.c
-index bda90ce..a7d2364 100644
---- a/lib/pacemaker/pcmk_sched_probes.c
-+++ b/lib/pacemaker/pcmk_sched_probes.c
-@@ -614,7 +614,7 @@ add_restart_orderings_for_probe(pcmk_action_t *probe, pcmk_action_t *after)
-             compatible_rsc = pcmk__find_compatible_instance(probe->rsc,
-                                                             after->rsc,
-                                                             pcmk_role_unknown,
--                                                            false);
-+                                                            false, NULL);
-         }
-     }
- 
-diff --git a/lib/pacemaker/pcmk_sched_promotable.c b/lib/pacemaker/pcmk_sched_promotable.c
-index cdf276f..0da0d8b 100644
---- a/lib/pacemaker/pcmk_sched_promotable.c
-+++ b/lib/pacemaker/pcmk_sched_promotable.c
-@@ -1333,7 +1333,8 @@ pcmk__update_promotable_dependent_priority(const pcmk_resource_t *primary,
-     // Look for a primary instance where dependent will be
-     primary_instance = pcmk__find_compatible_instance(dependent, primary,
-                                                       colocation->primary_role,
--                                                      false);
-+                                                      false,
-+                                                      colocation->node_attribute);
- 
-     if (primary_instance != NULL) {
-         // Add primary instance's priority to dependent's
--- 
-2.47.1
-
diff --git a/004-crm_resource_wait.patch b/004-crm_resource_wait.patch
deleted file mode 100644
index ffd87d6..0000000
--- a/004-crm_resource_wait.patch
+++ /dev/null
@@ -1,76 +0,0 @@
-From ce1dc488d46b373292569b397c9c765b55654eea Mon Sep 17 00:00:00 2001
-From: Reid Wahl <nrwahl@protonmail.com>
-Date: Fri, 5 Sep 2025 20:35:31 -0700
-Subject: [PATCH] Fix: tools: Handle large timeouts correctly in crm_resource
- --wait
-
-Previously, if the --timeout value parsed to a value greater than
-(UINT_MAX - 999), the wait timeout would overflow. The effective timeout
-would be either 0 seconds or 1 second. This is because 999 was added to
-the guint value before passing it to pcmk__timeout_ms2s().
-
-Now, we simply pass the timeout in milliseconds to
-pcmk__timeout_ms2s(), without adding 999.
-
-This implies a slight behavior change. Previously, timeouts were always
-rounded up to the next greatest second. Now, they're rounded to the
-nearest second. For example, previously:
-* timeout values between 1ms and 500ms => wait timeout of 1 second
-* timeout values between 501ms and 1500ms => wait timeout of 2 seconds
-* timeout values between 1501ms and 2500ms => wait timeout of 3 seconds
-* and so on
-
-Now:
-* timeout values between 1ms and 1499ms => wait timeout of 1 second
-* timeout values between 1500ms and 2499ms => wait timeout of 2 seconds
-* timeout values between 2500ms and 3499ms => wait timeout of 3 seconds
-* and so on
-
-The previous rounding behavior has existed since crm_resource --wait was
-added by 424afcdf.
-
-Update the help text to note the granularity and rounding behavior. The
-exact behavior of the restart command is confusing, and its logic should
-be cleaned up in the future.
-
-Fixes RHEL-45869
-Fixes RHEL-86148
-Closes T841
-
-Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
----
- tools/crm_resource.c         | 4 +++-
- tools/crm_resource_runtime.c | 2 +-
- 2 files changed, 4 insertions(+), 2 deletions(-)
-
-diff --git a/tools/crm_resource.c b/tools/crm_resource.c
-index 162ae40..74f84f0 100644
---- a/tools/crm_resource.c
-+++ b/tools/crm_resource.c
-@@ -831,7 +831,9 @@ static GOptionEntry addl_entries[] = {
-       "ID" },
-     { "timeout", 'T', G_OPTION_FLAG_NONE, G_OPTION_ARG_CALLBACK, timeout_cb,
-       "(Advanced) Abort if command does not finish in this time (with\n"
--      INDENT "--restart, --wait, --force-*)",
-+      INDENT "--restart, --wait, --force-*). The --restart command uses a\n"
-+      INDENT "two-second granularity and the --wait command uses a one-second\n"
-+      INDENT "granularity, with rounding.",
-       "N" },
-     { "all", 0, G_OPTION_FLAG_NONE, G_OPTION_ARG_NONE, &options.all,
-       "List all options, including advanced and deprecated (with\n"
-diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
-index f0a84c0..a44794e 100644
---- a/tools/crm_resource_runtime.c
-+++ b/tools/crm_resource_runtime.c
-@@ -2108,7 +2108,7 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
-     if (timeout_ms == 0) {
-         expire_time += WAIT_DEFAULT_TIMEOUT_S;
-     } else {
--        expire_time += pcmk__timeout_ms2s(timeout_ms + 999);
-+        expire_time += pcmk__timeout_ms2s(timeout_ms);
-     }
- 
-     scheduler = pcmk_new_scheduler();
--- 
-2.47.1
-
diff --git a/005-ipc_evict.patch b/005-ipc_evict.patch
deleted file mode 100644
index 1d82ec4..0000000
--- a/005-ipc_evict.patch
+++ /dev/null
@@ -1,400 +0,0 @@
-From 79f5a67e8242b3e72aa9dcf0dbd286b3fb719baa Mon Sep 17 00:00:00 2001
-From: Chris Lumens <clumens@redhat.com>
-Date: Wed, 27 Aug 2025 10:41:13 -0400
-Subject: [PATCH 1/6] Refactor: libcrmcommon: Rearrange the queue_len check.
-
-Check if the queue length is 0 first and return, which allows everything
-else to be un-indented one level.
----
- lib/common/ipc_server.c | 47 ++++++++++++++++++++---------------------
- 1 file changed, 23 insertions(+), 24 deletions(-)
-
-diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
-index 25c788b..4b33c64 100644
---- a/lib/common/ipc_server.c
-+++ b/lib/common/ipc_server.c
-@@ -541,34 +541,33 @@ no_more_retries:
-                   sent, queue_len, c->ipcs, c->pid, pcmk_rc_str(rc), qb_rc);
-     }
- 
--    if (queue_len) {
--
--        /* Allow clients to briefly fall behind on processing incoming messages,
--         * but drop completely unresponsive clients so the connection doesn't
--         * consume resources indefinitely.
--         */
--        if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
--            if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
--                /* Don't evict for a new or shrinking backlog */
--                crm_warn("Client with process ID %u has a backlog of %u messages "
--                         QB_XS " %p", c->pid, queue_len, c->ipcs);
--            } else {
--                crm_err("Evicting client with process ID %u due to backlog of %u messages "
--                         QB_XS " %p", c->pid, queue_len, c->ipcs);
--                c->queue_backlog = 0;
--                qb_ipcs_disconnect(c->ipcs);
--                return rc;
--            }
--        }
--
--        c->queue_backlog = queue_len;
--        delay_next_flush(c, queue_len);
--
--    } else {
-+    if (queue_len == 0) {
-         /* Event queue is empty, there is no backlog */
-         c->queue_backlog = 0;
-+        return rc;
-     }
- 
-+    /* Allow clients to briefly fall behind on processing incoming messages,
-+     * but drop completely unresponsive clients so the connection doesn't
-+     * consume resources indefinitely.
-+     */
-+    if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
-+        if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
-+            /* Don't evict for a new or shrinking backlog */
-+            crm_warn("Client with process ID %u has a backlog of %u messages "
-+                     QB_XS " %p", c->pid, queue_len, c->ipcs);
-+        } else {
-+            crm_err("Evicting client with process ID %u due to backlog of %u messages "
-+                     QB_XS " %p", c->pid, queue_len, c->ipcs);
-+            c->queue_backlog = 0;
-+            qb_ipcs_disconnect(c->ipcs);
-+            return rc;
-+        }
-+    }
-+
-+    c->queue_backlog = queue_len;
-+    delay_next_flush(c, queue_len);
-+
-     return rc;
- }
- 
--- 
-2.47.1
-
-From 014699003c6506bba8638ed57efea49da403d0e1 Mon Sep 17 00:00:00 2001
-From: Chris Lumens <clumens@redhat.com>
-Date: Wed, 27 Aug 2025 11:31:37 -0400
-Subject: [PATCH 2/6] Refactor: libcrmcommon: Simplify an empty event queue
- check.
-
-I find this just a little bit more straightforward to follow.
----
- lib/common/ipc_server.c | 9 ++++-----
- 1 file changed, 4 insertions(+), 5 deletions(-)
-
-diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
-index 4b33c64..dbd885a 100644
---- a/lib/common/ipc_server.c
-+++ b/lib/common/ipc_server.c
-@@ -491,14 +491,13 @@ crm_ipcs_flush_events(pcmk__client_t *c)
-         pcmk__ipc_header_t *header = NULL;
-         struct iovec *event = NULL;
- 
--        if (c->event_queue) {
--            // We don't pop unless send is successful
--            event = g_queue_peek_head(c->event_queue);
--        }
--        if (event == NULL) { // Queue is empty
-+        if ((c->event_queue == NULL) || g_queue_is_empty(c->event_queue)) {
-             break;
-         }
- 
-+        // We don't pop unless send is successful
-+        event = g_queue_peek_head(c->event_queue);
-+
-         /* Retry sending the event up to five times.  If we get -EAGAIN, sleep
-          * a very short amount of time (too long here is bad) and try again.
-          * If we simply exit the while loop on -EAGAIN, we'll have to wait until
--- 
-2.47.1
-
-From f999ac3d86d8107dee5288497f5f7fff07956d18 Mon Sep 17 00:00:00 2001
-From: Chris Lumens <clumens@redhat.com>
-Date: Wed, 27 Aug 2025 11:35:38 -0400
-Subject: [PATCH 3/6] Refactor: libcrmcommon: Rearrange a few tests in
- crm_ipcs_flush_events.
-
-Again, no important code changes here.  I just find these a little
-easier to follow.
----
- lib/common/ipc_server.c | 6 ++++--
- 1 file changed, 4 insertions(+), 2 deletions(-)
-
-diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
-index dbd885a..b76847b 100644
---- a/lib/common/ipc_server.c
-+++ b/lib/common/ipc_server.c
-@@ -477,16 +477,18 @@ crm_ipcs_flush_events(pcmk__client_t *c)
- 
-     if (c == NULL) {
-         return rc;
-+    }
- 
--    } else if (c->event_timer) {
-+    if (c->event_timer != 0) {
-         /* There is already a timer, wait until it goes off */
-         crm_trace("Timer active for %p - %d", c->ipcs, c->event_timer);
-         return rc;
-     }
- 
--    if (c->event_queue) {
-+    if (c->event_queue != NULL) {
-         queue_len = g_queue_get_length(c->event_queue);
-     }
-+
-     while (sent < 100) {
-         pcmk__ipc_header_t *header = NULL;
-         struct iovec *event = NULL;
--- 
-2.47.1
-
-From 9e76007bb0bc1d4cb5a88dcfaaf96aa8853f42dc Mon Sep 17 00:00:00 2001
-From: Chris Lumens <clumens@redhat.com>
-Date: Wed, 27 Aug 2025 11:48:48 -0400
-Subject: [PATCH 4/6] Refactor: libcrmcommon: Unindent retry code in
- crm_ipcs_flush_events.
-
-If we're breaking or jumping to a label, there's no need to have all
-these nested else blocks.
----
- lib/common/ipc_server.c | 16 ++++++++--------
- 1 file changed, 8 insertions(+), 8 deletions(-)
-
-diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
-index b76847b..73cc58f 100644
---- a/lib/common/ipc_server.c
-+++ b/lib/common/ipc_server.c
-@@ -513,16 +513,16 @@ crm_ipcs_flush_events(pcmk__client_t *c)
-         for (unsigned int retries = 5; retries > 0; retries--) {
-             qb_rc = qb_ipcs_event_sendv(c->ipcs, event, 2);
- 
--            if (qb_rc < 0) {
--                if (retries == 1 || qb_rc != -EAGAIN) {
--                    rc = (int) -qb_rc;
--                    goto no_more_retries;
--                } else {
--                    pcmk__sleep_ms(5);
--                }
--            } else {
-+            if (qb_rc >= 0) {
-                 break;
-             }
-+
-+            if (retries == 1 || qb_rc != -EAGAIN) {
-+                rc = (int) -qb_rc;
-+                goto no_more_retries;
-+            }
-+
-+            pcmk__sleep_ms(5);
-         }
- 
-         event = g_queue_pop_head(c->event_queue);
--- 
-2.47.1
-
-From b73be21a454f795bc747aad1dbeea82f67d8b232 Mon Sep 17 00:00:00 2001
-From: Chris Lumens <clumens@redhat.com>
-Date: Wed, 27 Aug 2025 13:14:54 -0400
-Subject: [PATCH 5/6] Feature: libcrmcommon: Be more lenient in evicting IPC
- clients.
-
-Each IPC connection has a message queue.  If the client is unable to
-process messages faster than the server is sending them, that queue
-start to back up.  pacemaker enforces a cap on the queue size, and
-that's adjustable with the cluster-ipc-limit parameter.  Once the queue
-grows beyond that size, the client is assumed to be dead and is evicted
-so it can be restarted and the queue resources freed.
-
-However, it's possible that the client is not dead.  On clusters with
-very large numbers of resources (I've tried with 300, but fewer might
-also cause problems), certain actions can happen that cause a spike in
-IPC messages.  In RHEL-76276, the action that causes this is moving
-nodes in and out of standby.  This spike in messages causes the server
-to overwhelm the client, which is then evicted.
-
-My multi-part IPC patches made this even worse, as now if the CIB is so
-large that it needs to split an IPC message up, there will be more
-messages than before.
-
-What this fix does is get rid of the cap on the queue size for pacemaker
-daemons.  As long as the server has been able to send messages to the
-client, the client is still doing work and shouldn't be evicted.  It may
-just be processing messages slower than the server is sending them.
-Note that this could lead the queue to grow without bound, eventually
-crashing the server.  For this reason, we're only allowing pacemaker
-daemons to ignore the queue size limit.
-
-Potential problems with this approach:
-
-* If the client is so busy that it can't receive even a single message
-  that crm_ipcs_flush_events tries to send, it will still be evicted.
-  However, the flush operation does retry with a delay several times
-  giving the client time to finish up what it's doing.
-
-* We have timers all over the place with daemons waiting on replies.
-  It's possible that because we are no longer just evicting the clients,
-  we will now see those timers expire which will just lead to different
-  problems.  If so, these fixes would probably need to take place in the
-  client code.
-
-Fixes T38
----
- lib/common/ipc_server.c | 14 ++++++++++++--
- 1 file changed, 12 insertions(+), 2 deletions(-)
-
-diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
-index 73cc58f..4420070 100644
---- a/lib/common/ipc_server.c
-+++ b/lib/common/ipc_server.c
-@@ -553,10 +553,20 @@ no_more_retries:
-      * consume resources indefinitely.
-      */
-     if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
--        if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
--            /* Don't evict for a new or shrinking backlog */
-+        /* Don't evict:
-+         * - Clients with a new backlog.
-+         * - Clients with a shrinking backlog (the client is processing
-+         *   messages faster than the server is sending them).
-+         * - Clients that are pacemaker daemons and have had any messages sent
-+         *   to them in this flush call (the server is sending messages faster
-+         *   than the client is processing them, but the client is not dead).
-+         */
-+        if ((c->queue_backlog <= 1)
-+            || (queue_len < c->queue_backlog)
-+            || ((sent > 0) && (pcmk__parse_server(c->name) != pcmk_ipc_unknown))) {
-             crm_warn("Client with process ID %u has a backlog of %u messages "
-                      QB_XS " %p", c->pid, queue_len, c->ipcs);
-+
-         } else {
-             crm_err("Evicting client with process ID %u due to backlog of %u messages "
-                      QB_XS " %p", c->pid, queue_len, c->ipcs);
--- 
-2.47.1
-
-From 4682953c567e16409d8e7972d9d5891348d4c360 Mon Sep 17 00:00:00 2001
-From: Chris Lumens <clumens@redhat.com>
-Date: Wed, 27 Aug 2025 15:56:27 -0400
-Subject: [PATCH 6/6] Feature: libcrmcommon: Update documentation for
- cluster-ipc-limit.
-
-Clarify that this no longer applies to pacemaker daemons.
----
- cts/cli/regression.crm_attribute.exp             | 16 ++++++++--------
- cts/cli/regression.daemons.exp                   |  4 ++--
- .../Pacemaker_Explained/cluster-options.rst      | 12 +++++++-----
- lib/common/options.c                             |  6 +++---
- 4 files changed, 20 insertions(+), 18 deletions(-)
-
-diff --git a/cts/cli/regression.crm_attribute.exp b/cts/cli/regression.crm_attribute.exp
-index e161f49..36cba76 100644
---- a/cts/cli/regression.crm_attribute.exp
-+++ b/cts/cli/regression.crm_attribute.exp
-@@ -111,8 +111,8 @@ Also known as properties, these are options that affect behavior across the enti
-   * migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit)
-     * Possible values: integer (default: )
- 
--  * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon
--    * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
-+  * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client
-+    * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
-     * Possible values: nonnegative_integer (default: )
- 
-   * stop-all-resources: Whether the cluster should stop all active resources
-@@ -357,8 +357,8 @@ Also known as properties, these are options that affect behavior across the enti
-         <content type="integer" default=""/>
-       </parameter>
-       <parameter name="cluster-ipc-limit" advanced="0" generated="0">
--        <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
--        <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
-+        <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
-+        <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
-         <content type="nonnegative_integer" default=""/>
-       </parameter>
-       <parameter name="stop-all-resources" advanced="0" generated="0">
-@@ -537,8 +537,8 @@ Also known as properties, these are options that affect behavior across the enti
-   * migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit)
-     * Possible values: integer (default: )
- 
--  * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon
--    * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
-+  * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client
-+    * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
-     * Possible values: nonnegative_integer (default: )
- 
-   * stop-all-resources: Whether the cluster should stop all active resources
-@@ -824,8 +824,8 @@ Also known as properties, these are options that affect behavior across the enti
-         <content type="integer" default=""/>
-       </parameter>
-       <parameter name="cluster-ipc-limit" advanced="0" generated="0">
--        <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
--        <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
-+        <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
-+        <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
-         <content type="nonnegative_integer" default=""/>
-       </parameter>
-       <parameter name="stop-all-resources" advanced="0" generated="0">
-diff --git a/cts/cli/regression.daemons.exp b/cts/cli/regression.daemons.exp
-index fc8535a..6274eeb 100644
---- a/cts/cli/regression.daemons.exp
-+++ b/cts/cli/regression.daemons.exp
-@@ -21,10 +21,10 @@
-     </parameter>
-     <parameter name="cluster-ipc-limit">
-       <longdesc lang="en">
--        Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
-+        Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
-       </longdesc>
-       <shortdesc lang="en">
--        Maximum IPC message backlog before disconnecting a cluster daemon
-+        Maximum IPC message backlog before disconnecting a client
-       </shortdesc>
-       <content type="integer" default=""/>
-     </parameter>
-diff --git a/doc/sphinx/Pacemaker_Explained/cluster-options.rst b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
-index 6ebe5f3..22e1a50 100644
---- a/doc/sphinx/Pacemaker_Explained/cluster-options.rst
-+++ b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
-@@ -693,11 +693,13 @@ values, by running the ``man pacemaker-schedulerd`` and
-        cluster-ipc-limit
-      - :ref:`nonnegative integer <nonnegative_integer>`
-      - 500
--     - The maximum IPC message backlog before one cluster daemon will
--       disconnect another. This is of use in large clusters, for which a good
--       value is the number of resources in the cluster multiplied by the number
--       of nodes. The default of 500 is also the minimum. Raise this if you see
--       "Evicting client" log messages for cluster daemon process IDs.
-+     - The maximum IPC message backlog before a cluster daemon will disconnect
-+       a client.  Other cluster daemons are not subject to this limit as long as
-+       they are still processing messages.  This is of use in large clusters,
-+       for which a good value is the number of resources in the cluster
-+       multiplied by the number of nodes. The default of 500 is also the
-+       minimum. Raise this if you see "Evicting client" log messages for
-+       cluster process IDs.
-    * - .. _pe_error_series_max:
-       
-        .. index::
-diff --git a/lib/common/options.c b/lib/common/options.c
-index b8f4943..af1b073 100644
---- a/lib/common/options.c
-+++ b/lib/common/options.c
-@@ -432,10 +432,10 @@ static const pcmk__cluster_option_t cluster_options[] = {
-         PCMK_OPT_CLUSTER_IPC_LIMIT, NULL, PCMK_VALUE_NONNEGATIVE_INTEGER, NULL,
-         "500", pcmk__valid_positive_int,
-         pcmk__opt_based,
--        N_("Maximum IPC message backlog before disconnecting a cluster daemon"),
-+        N_("Maximum IPC message backlog before disconnecting a client"),
-         N_("Raise this if log has \"Evicting client\" messages for cluster "
--            "daemon PIDs (a good value is the number of resources in the "
--            "cluster multiplied by the number of nodes)."),
-+            "PIDs (a good value is the number of resources in the cluster "
-+            "multiplied by the number of nodes)."),
-     },
- 
-     // Orphans and stopping
--- 
-2.47.1
-
diff --git a/006-fewer_messages.patch b/006-fewer_messages.patch
deleted file mode 100644
index 8935d44..0000000
--- a/006-fewer_messages.patch
+++ /dev/null
@@ -1,88 +0,0 @@
-From 8ddaf5330cf7605c7b710061c72dba8112db6cc6 Mon Sep 17 00:00:00 2001
-From: Chris Lumens <clumens@redhat.com>
-Date: Fri, 31 Oct 2025 11:24:14 -0400
-Subject: [PATCH] Med: daemons: Don't add repeated I_PE_CALC messages to the
- fsa queue.
-
-Let's say you have a two node cluster, node1 and node2.  For purposes of
-testing, it's easiest if you use fence_dummy instead of a real fencing
-agent as this will fake fencing happening but without rebooting the node
-so you can see all the log files.
-
-Assume the DC is node1.  Now do the following on node2:
-
-- pcs node standby node1
-- pcs resource defaults update resource-stickiness=1
-- for i in $(seq 1 300); do echo $i; pcs resource create dummy$i ocf:heartbeat:Dummy --group dummy-group; done
-- pcs node unstandby node1
-
-It will take a long time to create that many resources.  After node1
-comes out of standby, it'll take a minute or two but eventually you'll
-see that node1 was fenced.  On node1, you'll see a lot of transition
-abort messages happen.  Each of these transition aborts causes an
-I_PE_CALC message to be generated and added to the fsa queue.  In my
-testing, I've seen the queue grow to ~ 600 messages, all of which are
-exactly the same thing.
-
-The FSA is triggered at G_PRIORITY_HIGH, and once it is triggered, it
-will run until its queue is empty.  With so many messages being added so
-quickly, we've basically ensured it won't be empty any time soon.  While
-controld is processing the FSA messages, it will be unable to read
-anything out of the IPC backlog.
-
-based continues to attempt to send IPC events to controld but is unable
-to do so, so the backlog continues to grow.  Eventually, the backlog
-reaches that 500 message threshold without anything having been read by
-controld, which triggers the eviction process.
-
-There doesn't seem to be any reason for all these I_PE_CALC messages to
-be generated.  They're all exactly the same, they don't appear to be
-tagged with any unique data tying them to a specific query, and their
-presence just slows everything down.
-
-Thus, the fix here is very simple: if the latest message in the queue is
-an I_PE_CALC message, just don't add another one.  We could also make
-sure there's only ever one I_PE_CALC message in the queue, but there
-could potentially be valid reasons for there to be multiple interleaved
-with other message types.  I am erring on the side of caution with this
-minimal fix.
-
-Related: RHEL-76276
----
- daemons/controld/controld_messages.c | 20 ++++++++++++++++++++
- 1 file changed, 20 insertions(+)
-
-diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c
-index df215e6..866fde3 100644
---- a/daemons/controld/controld_messages.c
-+++ b/daemons/controld/controld_messages.c
-@@ -73,6 +73,26 @@ register_fsa_input_adv(enum crmd_fsa_cause cause, enum crmd_fsa_input input,
-         return;
-     }
- 
-+    if (input == I_PE_CALC) {
-+        GList *ele = NULL;
-+
-+        if (prepend) {
-+            ele = g_list_first(controld_globals.fsa_message_queue);
-+        } else {
-+            ele = g_list_last(controld_globals.fsa_message_queue);
-+        }
-+
-+        if (ele != NULL) {
-+            fsa_data_t *message = (fsa_data_t *) ele->data;
-+
-+            if (message->fsa_input == I_PE_CALC) {
-+                crm_debug("%s item in fsa queue is I_PE_CALC, not adding another",
-+                         (prepend ? "First" : "Last"));
-+                return;
-+            }
-+        }
-+    }
-+
-     if (input == I_WAIT_FOR_EVENT) {
-         controld_set_global_flags(controld_fsa_is_stalled);
-         crm_debug("Stalling the FSA pending further input: source=%s cause=%s data=%p queue=%d",
--- 
-2.47.1
-
diff --git a/007-transient_attrs.patch b/007-transient_attrs.patch
deleted file mode 100644
index 59b34e2..0000000
--- a/007-transient_attrs.patch
+++ /dev/null
@@ -1,1262 +0,0 @@
-From 26c022d2a3b6061ff9a60f86e50834a08e8360d4 Mon Sep 17 00:00:00 2001
-From: Reid Wahl <nrwahl@protonmail.com>
-Date: Thu, 13 Nov 2025 02:14:45 -0800
-Subject: [PATCH 01/10] Fix: pacemaker-attrd: Wipe CIB along with memory
-
-Previously, when the attribute manager purged a node, it would purge the
-node's transient attributes only from memory, and assumed the controller
-would purge them from the CIB. Now, the writer will purge them from the
-CIB as well.
-
-This fixes a variety of timing issues when multiple nodes including the
-attribute writer are shutting down. If the writer leaves before some
-other node, the DC wipes that other node's attributes from the CIB when
-that other node leaves the controller process group (or all other nodes
-do if the DC is the leaving node). If a new writer (possibly even the
-node itself) is elected before the node's attribute manager leaves the
-cluster layer, it will write the attributes back to the CIB. Once the
-other node leaves the cluster layer, all attribute managers remove its
-attributes from memory, but they are now "stuck" in the CIB.
-
-As of this commit, the controller still erases the attributes from the
-CIB when the node leaves the controller process group, which is
-redundant but doesn't cause any new problems. This will be corrected in
-an upcoming commit.
-
-Note: This will cause an insignificant regression if backported to
-Pacemaker 2. The Pacemaker 2 controller purges attributes from the CIB
-for leaving DCs only if they are at version 1.1.13 or later, because
-earlier DCs will otherwise get fenced after a clean shutdown. Since the
-attribute manager doesn't know the DC or its version, the attributes
-would now always be wiped, so old leaving DCs will get fenced. The
-fencing would occur only in the highly unlikely situation of a rolling
-upgrade from Pacemaker 2-supported versions 1.1.11 or 1.1.12, and the
-upgrade would still succeed without any negative impact on resources.
-
-Fixes T138
-
-Co-Authored-By: Ken Gaillot <kgaillot@redhat.com>
-Co-Authored-By: Chris Lumens <clumens@redhat.com>
-
-Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
----
- daemons/attrd/attrd_corosync.c  | 93 +++++++++++++++++++++++++++++++++
- daemons/attrd/attrd_elections.c |  2 +
- daemons/attrd/pacemaker-attrd.c |  1 +
- daemons/attrd/pacemaker-attrd.h |  3 ++
- 4 files changed, 99 insertions(+)
-
-diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c
-index 94fc85f..8497f34 100644
---- a/daemons/attrd/attrd_corosync.c
-+++ b/daemons/attrd/attrd_corosync.c
-@@ -23,6 +23,43 @@
- 
- #include "pacemaker-attrd.h"
- 
-+/*!
-+ * \internal
-+ * \brief Nodes removed by \c attrd_peer_remove()
-+ *
-+ * This table is to be used as a set. It contains nodes that have been removed
-+ * by \c attrd_peer_remove() and whose transient attributes should be erased
-+ * from the CIB.
-+ *
-+ * Setting an attribute value for a node via \c update_attr_on_host() removes
-+ * the node from the table. At that point, we have transient attributes in
-+ * memory for the node, so it should no longer be erased from the CIB.
-+ *
-+ * If another node erases a removed node's transient attributes from the CIB,
-+ * the removed node remains in this table until an attribute value is set for
-+ * it. This is for convenience: it avoids the need to monitor for CIB updates
-+ * that erase a node's \c node_state or \c transient attributes element, just to
-+ * remove the node from the table.
-+ *
-+ * Leaving a removed node in the table after erasure should be harmless. If a
-+ * node is in this table, then we have no transient attributes for it in memory.
-+ * If for some reason we erase its transient attributes from the CIB twice, its
-+ * state in the CIB will still be correct.
-+ */
-+static GHashTable *removed_peers = NULL;
-+
-+/*!
-+ * \internal
-+ * \brief Free the removed nodes table
-+ */
-+void
-+attrd_free_removed_peers(void)
-+{
-+    if (removed_peers != NULL) {
-+        g_hash_table_destroy(removed_peers);
-+    }
-+}
-+
- static xmlNode *
- attrd_confirmation(int callid)
- {
-@@ -236,6 +273,10 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer,
-     const char *prev_xml_id = NULL;
-     const char *node_xml_id = crm_element_value(xml, PCMK__XA_ATTR_HOST_ID);
- 
-+    if (removed_peers != NULL) {
-+        g_hash_table_remove(removed_peers, host);
-+    }
-+
-     // Create entry for value if not already existing
-     v = g_hash_table_lookup(a->values, host);
-     if (v == NULL) {
-@@ -530,6 +571,29 @@ attrd_peer_sync_response(const pcmk__node_status_t *peer, bool peer_won,
-     }
- }
- 
-+/*!
-+ * \internal
-+ * \brief Erase all removed nodes' transient attributes from the CIB
-+ *
-+ * This should be called by a newly elected writer upon winning the election.
-+ */
-+void
-+attrd_erase_removed_peer_attributes(void)
-+{
-+    const char *host = NULL;
-+    GHashTableIter iter;
-+
-+    if (!attrd_election_won() || (removed_peers == NULL)) {
-+        return;
-+    }
-+
-+    g_hash_table_iter_init(&iter, removed_peers);
-+    while (g_hash_table_iter_next(&iter, (gpointer *) &host, NULL)) {
-+        attrd_cib_erase_transient_attrs(host);
-+        g_hash_table_iter_remove(&iter);
-+    }
-+}
-+
- /*!
-  * \internal
-  * \brief Remove all attributes and optionally peer cache entries for a node
-@@ -556,6 +620,35 @@ attrd_peer_remove(const char *host, bool uncache, const char *source)
-         }
-     }
- 
-+    if (attrd_election_won()) {
-+        // We are the writer. Wipe node's transient attributes from CIB now.
-+        attrd_cib_erase_transient_attrs(host);
-+
-+    } else {
-+        /* Make sure the attributes get erased from the CIB eventually.
-+         * - If there's already a writer, it will call this function and enter
-+         *   the "if" block above, requesting the erasure (unless it leaves
-+         *   before sending the request -- see below).
-+         *   attrd_start_election_if_needed() will do nothing here.
-+         * - Otherwise, we ensure an election is happening (unless we're
-+         *   shutting down). The winner will erase transient attributes from the
-+         *   CIB for all removed nodes in attrd_election_cb().
-+         *
-+         * We add the node to the removed_peers table in case we win an election
-+         * and need to request CIB erasures based on the table contents. This
-+         * could happen for either of two reasons:
-+         * - There is no current writer and we're not shutting down. An election
-+         *   either is already in progress or will be triggered here.
-+         * - The current writer leaves before sending the CIB update request. A
-+         *   new election will be triggered.
-+         */
-+        if (removed_peers == NULL) {
-+            removed_peers = pcmk__strikey_table(free, NULL);
-+        }
-+        g_hash_table_add(removed_peers, pcmk__str_copy(host));
-+        attrd_start_election_if_needed();
-+    }
-+
-     if (uncache) {
-         pcmk__purge_node_from_cache(host, 0);
-         attrd_forget_node_xml_id(host);
-diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
-index 281ec12..e75a1d3 100644
---- a/daemons/attrd/attrd_elections.c
-+++ b/daemons/attrd/attrd_elections.c
-@@ -24,6 +24,8 @@ attrd_election_cb(pcmk_cluster_t *cluster)
-     /* Update the peers after an election */
-     attrd_peer_sync(NULL);
- 
-+    attrd_erase_removed_peer_attributes();
-+
-     /* After winning an election, update the CIB with the values of all
-      * attributes as the winner knows them.
-      */
-diff --git a/daemons/attrd/pacemaker-attrd.c b/daemons/attrd/pacemaker-attrd.c
-index 7711fd2..3fa099b 100644
---- a/daemons/attrd/pacemaker-attrd.c
-+++ b/daemons/attrd/pacemaker-attrd.c
-@@ -201,6 +201,7 @@ main(int argc, char **argv)
-             attrd_cib_disconnect();
-         }
- 
-+        attrd_free_removed_peers();
-         attrd_free_waitlist();
-         pcmk_cluster_disconnect(attrd_cluster);
-         pcmk_cluster_free(attrd_cluster);
-diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h
-index d9423c8..80ae0d9 100644
---- a/daemons/attrd/pacemaker-attrd.h
-+++ b/daemons/attrd/pacemaker-attrd.h
-@@ -184,6 +184,9 @@ extern GHashTable *peer_protocol_vers;
- 
- #define CIB_OP_TIMEOUT_S 120
- 
-+void attrd_free_removed_peers(void);
-+void attrd_erase_removed_peer_attributes(void);
-+
- int attrd_cluster_connect(void);
- void attrd_broadcast_value(const attribute_t *a, const attribute_value_t *v);
- void attrd_peer_update(const pcmk__node_status_t *peer, xmlNode *xml,
--- 
-2.47.1
-
-From 9db7cad74c9c051761c9d8a099a235cc2320f35d Mon Sep 17 00:00:00 2001
-From: Ken Gaillot <kgaillot@redhat.com>
-Date: Thu, 14 Dec 2023 14:56:11 -0600
-Subject: [PATCH 02/10] Low: pacemaker-attrd: Drop "requesting shutdown" code
-
-The requesting_shutdown variable was checked only by
-attrd_shutting_down(), when the if_requested argument was set to true.
-In that case, it returned true if either the shutting_down variable was
-true or both the if_requested argument and the requesting_shutdown
-variable were true.
-
-The only caller that passed if_requested=true was
-attrd_cib_updated_cb(). It did this if:
-a. the alerts section was changed, or
-b. the status section or nodes section was changed by an untrusted
-  client.
-
-Details:
-a. Prior to f42e170, we didn't pass if_requested=true for an alerts
-   section change. We started doing so as of that commit mostly for
-   convenience. We decided that it seemed reasonable to ignore alert
-   changes when there was a shutdown pending.
-
-   This commit reverts to NOT ignoring alert changes due to pending
-   shutdown. That seems like it might be better. I'm not sure if it's
-   possible for us to land in attrd_send_attribute_alert() while a
-   shutdown is requested but has not begun. If so, it would be good to
-   send the correct alerts.
-
-b. The other call with true is to avoid writing out all attributes when
-   the status or nodes section changes. It's probably okay to drop the
-   true there too. It was added by a1a9c54, to resolve a race condition
-   where:
-   * node2 left.
-   * node1's controller deleted node2's transient attributes from the
-     CIB.
-   * node1 took over as DC and replaced the CIB.
-   * node2's attribute manager was not yet actually shutting down, and
-     it responded to the CIB replacement by writing out all of the
-     attributes that were in its memory, including its own "shutdown"
-     attribute.
-
-   Now (as of the previous commit), node1's attribute manager would
-   delete this "shutdown" attribute as part of its shutdown process. (Or
-   more accurately, I think the attribute writer node will do that.)
-
-   So if we understand correctly, the attrd_shutting_down(true)
-   workaround is no longer needed.
-
-With no more callers needing to pass true, the supporting code can go
-away.
-
-Co-Authored-By: Reid Wahl <nrwahl@protonmail.com>
----
- daemons/attrd/attrd_cib.c       |  6 +++---
- daemons/attrd/attrd_corosync.c  | 15 ++-----------
- daemons/attrd/attrd_elections.c |  4 ++--
- daemons/attrd/attrd_ipc.c       |  2 +-
- daemons/attrd/attrd_utils.c     | 37 ++++-----------------------------
- daemons/attrd/pacemaker-attrd.h |  4 +---
- 6 files changed, 13 insertions(+), 55 deletions(-)
-
-diff --git a/daemons/attrd/attrd_cib.c b/daemons/attrd/attrd_cib.c
-index 4231e4a..acd4621 100644
---- a/daemons/attrd/attrd_cib.c
-+++ b/daemons/attrd/attrd_cib.c
-@@ -34,7 +34,7 @@ attrd_cib_destroy_cb(gpointer user_data)
- 
-     cib->cmds->signoff(cib);
- 
--    if (attrd_shutting_down(false)) {
-+    if (attrd_shutting_down()) {
-         crm_info("Disconnected from the CIB manager");
- 
-     } else {
-@@ -57,7 +57,7 @@ attrd_cib_updated_cb(const char *event, xmlNode *msg)
-     }
- 
-     if (pcmk__cib_element_in_patchset(patchset, PCMK_XE_ALERTS)) {
--        if (attrd_shutting_down(true)) {
-+        if (attrd_shutting_down()) {
-             crm_debug("Ignoring alerts change in CIB during shutdown");
-         } else {
-             mainloop_set_trigger(attrd_config_read);
-@@ -82,7 +82,7 @@ attrd_cib_updated_cb(const char *event, xmlNode *msg)
-     if (status_changed
-         || pcmk__cib_element_in_patchset(patchset, PCMK_XE_NODES)) {
- 
--        if (attrd_shutting_down(true)) {
-+        if (attrd_shutting_down()) {
-             crm_debug("Ignoring node change in CIB during shutdown");
-             return;
-         }
-diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c
-index 8497f34..02ddec6 100644
---- a/daemons/attrd/attrd_corosync.c
-+++ b/daemons/attrd/attrd_corosync.c
-@@ -83,7 +83,7 @@ attrd_peer_message(pcmk__node_status_t *peer, xmlNode *xml)
-         return;
-     }
- 
--    if (attrd_shutting_down(false)) {
-+    if (attrd_shutting_down()) {
-         /* If we're shutting down, we want to continue responding to election
-          * ops as long as we're a cluster member (because our vote may be
-          * needed). Ignore all other messages.
-@@ -166,7 +166,7 @@ attrd_cpg_dispatch(cpg_handle_t handle,
- static void
- attrd_cpg_destroy(gpointer unused)
- {
--    if (attrd_shutting_down(false)) {
-+    if (attrd_shutting_down()) {
-         crm_info("Disconnected from Corosync process group");
- 
-     } else {
-@@ -328,17 +328,6 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer,
-         pcmk__str_update(&v->current, value);
-         attrd_set_attr_flags(a, attrd_attr_changed);
- 
--        if (pcmk__str_eq(host, attrd_cluster->priv->node_name, pcmk__str_casei)
--            && pcmk__str_eq(attr, PCMK__NODE_ATTR_SHUTDOWN, pcmk__str_none)) {
--
--            if (!pcmk__str_eq(value, "0", pcmk__str_null_matches)) {
--                attrd_set_requesting_shutdown();
--
--            } else {
--                attrd_clear_requesting_shutdown();
--            }
--        }
--
-         // Write out new value or start dampening timer
-         if (a->timeout_ms && a->timer) {
-             crm_trace("Delaying write of %s %s for dampening",
-diff --git a/daemons/attrd/attrd_elections.c b/daemons/attrd/attrd_elections.c
-index e75a1d3..eb9ef8c 100644
---- a/daemons/attrd/attrd_elections.c
-+++ b/daemons/attrd/attrd_elections.c
-@@ -43,7 +43,7 @@ attrd_start_election_if_needed(void)
- {
-     if ((peer_writer == NULL)
-         && (election_state(attrd_cluster) != election_in_progress)
--        && !attrd_shutting_down(false)) {
-+        && !attrd_shutting_down()) {
- 
-         crm_info("Starting an election to determine the writer");
-         election_vote(attrd_cluster);
-@@ -65,7 +65,7 @@ attrd_handle_election_op(const pcmk__node_status_t *peer, xmlNode *xml)
-     crm_xml_add(xml, PCMK__XA_SRC, peer->name);
- 
-     // Don't become writer if we're shutting down
--    rc = election_count_vote(attrd_cluster, xml, !attrd_shutting_down(false));
-+    rc = election_count_vote(attrd_cluster, xml, !attrd_shutting_down());
- 
-     switch(rc) {
-         case election_start:
-diff --git a/daemons/attrd/attrd_ipc.c b/daemons/attrd/attrd_ipc.c
-index 43e0f41..8a3bb36 100644
---- a/daemons/attrd/attrd_ipc.c
-+++ b/daemons/attrd/attrd_ipc.c
-@@ -492,7 +492,7 @@ static int32_t
- attrd_ipc_accept(qb_ipcs_connection_t *c, uid_t uid, gid_t gid)
- {
-     crm_trace("New client connection %p", c);
--    if (attrd_shutting_down(false)) {
-+    if (attrd_shutting_down()) {
-         crm_info("Ignoring new connection from pid %d during shutdown",
-                  pcmk__client_pid(c));
-         return -ECONNREFUSED;
-diff --git a/daemons/attrd/attrd_utils.c b/daemons/attrd/attrd_utils.c
-index f219b88..e3e814d 100644
---- a/daemons/attrd/attrd_utils.c
-+++ b/daemons/attrd/attrd_utils.c
-@@ -25,7 +25,6 @@
- 
- cib_t *the_cib = NULL;
- 
--static bool requesting_shutdown = false;
- static bool shutting_down = false;
- static GMainLoop *mloop = NULL;
- 
-@@ -34,45 +33,17 @@ static GMainLoop *mloop = NULL;
-  */
- GHashTable *peer_protocol_vers = NULL;
- 
--/*!
-- * \internal
-- * \brief  Set requesting_shutdown state
-- */
--void
--attrd_set_requesting_shutdown(void)
--{
--    requesting_shutdown = true;
--}
--
--/*!
-- * \internal
-- * \brief  Clear requesting_shutdown state
-- */
--void
--attrd_clear_requesting_shutdown(void)
--{
--    requesting_shutdown = false;
--}
--
- /*!
-  * \internal
-  * \brief Check whether local attribute manager is shutting down
-  *
-- * \param[in] if_requested  If \c true, also consider presence of
-- *                          \c PCMK__NODE_ATTR_SHUTDOWN attribute
-- *
-- * \return \c true if local attribute manager has begun shutdown sequence
-- *         or (if \p if_requested is \c true) whether local node has a nonzero
-- *         \c PCMK__NODE_ATTR_SHUTDOWN attribute set, otherwise \c false
-- * \note Most callers should pass \c false for \p if_requested, because the
-- *       attribute manager needs to continue performing while the controller is
-- *       shutting down, and even needs to be eligible for election in case all
-- *       nodes are shutting down.
-+ * \return \c true if local attribute manager has begun shutdown sequence,
-+ *         otherwise \c false
-  */
- bool
--attrd_shutting_down(bool if_requested)
-+attrd_shutting_down(void)
- {
--    return shutting_down || (if_requested && requesting_shutdown);
-+    return shutting_down;
- }
- 
- /*!
-diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h
-index 80ae0d9..d3e5765 100644
---- a/daemons/attrd/pacemaker-attrd.h
-+++ b/daemons/attrd/pacemaker-attrd.h
-@@ -56,10 +56,8 @@
- void attrd_init_mainloop(void);
- void attrd_run_mainloop(void);
- 
--void attrd_set_requesting_shutdown(void);
--void attrd_clear_requesting_shutdown(void);
- void attrd_free_waitlist(void);
--bool attrd_shutting_down(bool if_requested);
-+bool attrd_shutting_down(void);
- void attrd_shutdown(int nsig);
- void attrd_init_ipc(void);
- void attrd_ipc_fini(void);
--- 
-2.47.1
-
-From 19a157cb90466aaa5d929573edeabded3ba047ef Mon Sep 17 00:00:00 2001
-From: Ken Gaillot <kgaillot@redhat.com>
-Date: Mon, 18 Dec 2023 11:38:00 -0600
-Subject: [PATCH 03/10] Low: controller: don't need to erase node attributes
- for remote nodes
-
-Now that the attribute manager will erase transient attributes from the
-CIB when purging a node, we don't need to do that separately in the
-controller.
-
-Co-Authored-By: Chris Lumens <clumens@redhat.com>
----
- daemons/controld/controld_remote_ra.c | 41 +++++++--------------------
- 1 file changed, 11 insertions(+), 30 deletions(-)
-
-diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c
-index 1cc4ae0..c9adf97 100644
---- a/daemons/controld/controld_remote_ra.c
-+++ b/daemons/controld/controld_remote_ra.c
-@@ -237,35 +237,19 @@ should_purge_attributes(pcmk__node_status_t *node)
-     return true;
- }
- 
--static enum controld_section_e
--section_to_delete(bool purge)
--{
--    if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
--        if (purge) {
--            return controld_section_all_unlocked;
--        } else {
--            return controld_section_lrm_unlocked;
--        }
--    } else {
--        if (purge) {
--            return controld_section_all;
--        } else {
--            return controld_section_lrm;
--        }
--    }
--}
--
- static void
- purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node)
- {
--    bool purge = should_purge_attributes(node);
--    enum controld_section_e section = section_to_delete(purge);
-+    enum controld_section_e section = controld_section_lrm;
- 
--    /* Purge node from attrd's memory */
--    if (purge) {
-+    // Purge node's transient attributes (from attribute manager and CIB)
-+    if (should_purge_attributes(node)) {
-         update_attrd_remote_node_removed(node->name, NULL);
-     }
- 
-+    if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
-+        section = controld_section_lrm_unlocked;
-+    }
-     controld_delete_node_state(node->name, section, call_opt);
- }
- 
-@@ -367,18 +351,15 @@ remote_node_down(const char *node_name, const enum down_opts opts)
-     int call_opt = crmd_cib_smart_opt();
-     pcmk__node_status_t *node = NULL;
- 
--    /* Purge node from attrd's memory */
-+    // Purge node's transient attributes (from attribute manager and CIB)
-     update_attrd_remote_node_removed(node_name, NULL);
- 
--    /* Normally, only node attributes should be erased, and the resource history
--     * should be kept until the node comes back up. However, after a successful
--     * fence, we want to clear the history as well, so we don't think resources
--     * are still running on the node.
-+    /* Normally, the resource history should be kept until the node comes back
-+     * up. However, after a successful fence, clear the history so we don't
-+     * think resources are still running on the node.
-      */
-     if (opts == DOWN_ERASE_LRM) {
--        controld_delete_node_state(node_name, controld_section_all, call_opt);
--    } else {
--        controld_delete_node_state(node_name, controld_section_attrs, call_opt);
-+        controld_delete_node_state(node_name, controld_section_lrm, call_opt);
-     }
- 
-     /* Ensure node is in the remote peer cache with lost state */
--- 
-2.47.1
-
-From d49965412a5433a9a92463178d69074da9b3c349 Mon Sep 17 00:00:00 2001
-From: Ken Gaillot <kgaillot@redhat.com>
-Date: Thu, 14 Dec 2023 15:42:39 -0600
-Subject: [PATCH 04/10] Refactor: controller: Allow purging node attrs without
- cache removal
-
-Nothing uses the new capability yet.
----
- daemons/controld/controld_attrd.c     | 22 +++++++++++++++-------
- daemons/controld/controld_remote_ra.c |  4 ++--
- daemons/controld/controld_utils.h     |  2 +-
- 3 files changed, 18 insertions(+), 10 deletions(-)
-
-diff --git a/daemons/controld/controld_attrd.c b/daemons/controld/controld_attrd.c
-index eff8070..c8591ef 100644
---- a/daemons/controld/controld_attrd.c
-+++ b/daemons/controld/controld_attrd.c
-@@ -106,8 +106,15 @@ update_attrd_list(GList *attrs, uint32_t opts)
-     }
- }
- 
-+/*!
-+ * \internal
-+ * \brief Ask attribute manager to purge a node and its transient attributes
-+ *
-+ * \param[in] node_name   Node to purge
-+ * \param[in] from_cache  If true, purge from node caches as well
-+ */
- void
--update_attrd_remote_node_removed(const char *host, const char *user_name)
-+controld_purge_node_attrs(const char *node_name, bool from_cache)
- {
-     int rc = pcmk_rc_ok;
- 
-@@ -115,14 +122,15 @@ update_attrd_remote_node_removed(const char *host, const char *user_name)
-         rc = pcmk_new_ipc_api(&attrd_api, pcmk_ipc_attrd);
-     }
-     if (rc == pcmk_rc_ok) {
--        crm_trace("Asking attribute manager to purge Pacemaker Remote node %s",
--                  host);
--        rc = pcmk__attrd_api_purge(attrd_api, host, true);
-+        crm_debug("Asking %s to purge transient attributes%s for %s",
-+                  pcmk_ipc_name(attrd_api, true),
-+                  (from_cache? " and node cache" : ""), node_name);
-+        rc = pcmk__attrd_api_purge(attrd_api, node_name, from_cache);
-     }
-     if (rc != pcmk_rc_ok) {
--        crm_err("Could not purge Pacemaker Remote node %s "
--                "in attribute manager%s: %s " QB_XS " rc=%d",
--                host, when(), pcmk_rc_str(rc), rc);
-+        crm_err("Could not purge node %s from %s%s: %s "
-+                QB_XS " rc=%d", node_name, pcmk_ipc_name(attrd_api, true),
-+                when(), pcmk_rc_str(rc), rc);
-     }
- }
- 
-diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c
-index c9adf97..3136180 100644
---- a/daemons/controld/controld_remote_ra.c
-+++ b/daemons/controld/controld_remote_ra.c
-@@ -244,7 +244,7 @@ purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node)
- 
-     // Purge node's transient attributes (from attribute manager and CIB)
-     if (should_purge_attributes(node)) {
--        update_attrd_remote_node_removed(node->name, NULL);
-+        controld_purge_node_attrs(node->name, true);
-     }
- 
-     if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
-@@ -352,7 +352,7 @@ remote_node_down(const char *node_name, const enum down_opts opts)
-     pcmk__node_status_t *node = NULL;
- 
-     // Purge node's transient attributes (from attribute manager and CIB)
--    update_attrd_remote_node_removed(node_name, NULL);
-+    controld_purge_node_attrs(node_name, true);
- 
-     /* Normally, the resource history should be kept until the node comes back
-      * up. However, after a successful fence, clear the history so we don't
-diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h
-index e633888..262e0d1 100644
---- a/daemons/controld/controld_utils.h
-+++ b/daemons/controld/controld_utils.h
-@@ -69,7 +69,7 @@ void crm_update_quorum(gboolean quorum, gboolean force_update);
- void controld_close_attrd_ipc(void);
- void update_attrd(const char *host, const char *name, const char *value, const char *user_name, gboolean is_remote_node);
- void update_attrd_list(GList *attrs, uint32_t opts);
--void update_attrd_remote_node_removed(const char *host, const char *user_name);
-+void controld_purge_node_attrs(const char *node_name, bool from_cache);
- void update_attrd_clear_failures(const char *host, const char *rsc,
-                                  const char *op, const char *interval_spec,
-                                  gboolean is_remote_node);
--- 
-2.47.1
-
-From 5fb8fdc72f457c7e9a691c10a99d54d0e03bd77d Mon Sep 17 00:00:00 2001
-From: Ken Gaillot <kgaillot@redhat.com>
-Date: Thu, 14 Dec 2023 16:09:40 -0600
-Subject: [PATCH 05/10] Fix: controller: Don't purge transient attributes on
- node loss
-
-With recent changes, the attribute manager now handles it when the node
-leaves the cluster, so the controller purge is redundant.
-
-This does alter the timing somewhat, since the controller's purge
-occurred when the node left the controller process group, while the
-attribute manager's purge occurs when it leaves the cluster, but that
-shouldn't make a significant difference.
-
-This fixes a problem when a node's controller crashes and is respawned
-while fencing is disabled. Previously, another node's controller would
-remove that node's transient attributes from the CIB, but they would
-remain in the attribute managers' memory. Now, the attributes are
-correctly retained in the CIB in this situation.
-
-Fixes T137
-Fixes T139
-
-Co-Authored-By: Chris Lumens <clumens@redhat.com>
----
- daemons/controld/controld_callbacks.c | 14 +-------------
- 1 file changed, 1 insertion(+), 13 deletions(-)
-
-diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c
-index 48c255e..57e5183 100644
---- a/daemons/controld/controld_callbacks.c
-+++ b/daemons/controld/controld_callbacks.c
-@@ -233,19 +233,11 @@ peer_update_callback(enum pcmk__node_update type, pcmk__node_status_t *node,
-                                     pcmk__str_casei)
-                        && !pcmk__cluster_is_node_active(node)) {
- 
--                /* The DC has left, so delete its transient attributes and
--                 * trigger a new election.
--                 *
--                 * A DC sends its shutdown request to all peers, who update the
--                 * DC's expected state to down. This avoids fencing upon
--                 * deletion of its transient attributes.
--                 */
-+                // The DC has left, so trigger a new election
-                 crm_notice("Our peer on the DC (%s) is dead",
-                            controld_globals.dc_name);
- 
-                 register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL);
--                controld_delete_node_state(node->name, controld_section_attrs,
--                                           cib_none);
- 
-             } else if (AM_I_DC
-                        || pcmk_is_set(controld_globals.flags, controld_dc_left)
-@@ -256,10 +248,6 @@ peer_update_callback(enum pcmk__node_update type, pcmk__node_status_t *node,
-                  */
-                 if (appeared) {
-                     te_trigger_stonith_history_sync(FALSE);
--                } else {
--                    controld_delete_node_state(node->name,
--                                               controld_section_attrs,
--                                               cib_none);
-                 }
-             }
-             break;
--- 
-2.47.1
-
-From c40026fb77a6f7ee804979293e3019943a34e06b Mon Sep 17 00:00:00 2001
-From: Ken Gaillot <kgaillot@redhat.com>
-Date: Mon, 18 Dec 2023 13:05:35 -0600
-Subject: [PATCH 06/10] Low: controller: Ask attribute manager to purge fenced
- nodes' attributes
-
-...instead of wiping from the CIB directly.
-
-Co-Authored-By: Chris Lumens <clumens@redhat.com>
----
- daemons/controld/controld_fencing.c | 8 +++++++-
- 1 file changed, 7 insertions(+), 1 deletion(-)
-
-diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
-index 51367ca..de074aa 100644
---- a/daemons/controld/controld_fencing.c
-+++ b/daemons/controld/controld_fencing.c
-@@ -267,7 +267,13 @@ update_node_state_after_fencing(const char *target, const char *target_xml_id)
-     crm_debug("Updating node state for %s after fencing (call %d)", target, rc);
-     fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated);
- 
--    controld_delete_node_state(peer->name, controld_section_all, cib_none);
-+    // Delete node's resource history from CIB
-+    controld_delete_node_state(peer->name, controld_section_lrm, cib_none);
-+
-+    // Ask attribute manager to delete node's transient attributes
-+    // @TODO: This is the only call to controld_purge_node_attrs that doesn't
-+    //        want to also purge the node from the caches.  Why?
-+    controld_purge_node_attrs(peer->name, false);
- }
- 
- /*!
--- 
-2.47.1
-
-From d9d19827d93f2394a831a9651aae064ea5a04fa4 Mon Sep 17 00:00:00 2001
-From: Ken Gaillot <kgaillot@redhat.com>
-Date: Mon, 18 Dec 2023 13:14:53 -0600
-Subject: [PATCH 07/10] Refactor: controller: Drop no-longer-used section enum
- values
-
----
- daemons/controld/controld_cib.c | 24 ------------------------
- daemons/controld/controld_cib.h |  5 +----
- 2 files changed, 1 insertion(+), 28 deletions(-)
-
-diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c
-index e2a0d50..39c2b06 100644
---- a/daemons/controld/controld_cib.c
-+++ b/daemons/controld/controld_cib.c
-@@ -279,17 +279,6 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output,
-                                 "[not(@" PCMK_OPT_SHUTDOWN_LOCK ") "        \
-                                     "or " PCMK_OPT_SHUTDOWN_LOCK "<%lld]"
- 
--// Node's PCMK__XE_TRANSIENT_ATTRIBUTES section (name 1x)
--#define XPATH_NODE_ATTRS XPATH_NODE_STATE "/" PCMK__XE_TRANSIENT_ATTRIBUTES
--
--// Everything under PCMK__XE_NODE_STATE (name 1x)
--#define XPATH_NODE_ALL          XPATH_NODE_STATE "/*"
--
--/* Unlocked history + transient attributes
-- * (name 2x, (seconds_since_epoch - PCMK_OPT_SHUTDOWN_LOCK_LIMIT) 1x, name 1x)
-- */
--#define XPATH_NODE_ALL_UNLOCKED XPATH_NODE_LRM_UNLOCKED "|" XPATH_NODE_ATTRS
--
- /*!
-  * \internal
-  * \brief Get the XPath and description of a node state section to be deleted
-@@ -320,19 +309,6 @@ controld_node_state_deletion_strings(const char *uname,
-                                        uname, uname, expire);
-             desc_pre = "resource history (other than shutdown locks)";
-             break;
--        case controld_section_attrs:
--            *xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname);
--            desc_pre = "transient attributes";
--            break;
--        case controld_section_all:
--            *xpath = crm_strdup_printf(XPATH_NODE_ALL, uname);
--            desc_pre = "all state";
--            break;
--        case controld_section_all_unlocked:
--            *xpath = crm_strdup_printf(XPATH_NODE_ALL_UNLOCKED,
--                                       uname, uname, expire, uname);
--            desc_pre = "all state (other than shutdown locks)";
--            break;
-         default:
-             // We called this function incorrectly
-             pcmk__assert(false);
-diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h
-index b8622d5..25277e7 100644
---- a/daemons/controld/controld_cib.h
-+++ b/daemons/controld/controld_cib.h
-@@ -1,5 +1,5 @@
- /*
-- * Copyright 2004-2024 the Pacemaker project contributors
-+ * Copyright 2004-2025 the Pacemaker project contributors
-  *
-  * The version control history for this file may have further details.
-  *
-@@ -50,9 +50,6 @@ unsigned int cib_op_timeout(void);
- enum controld_section_e {
-     controld_section_lrm,
-     controld_section_lrm_unlocked,
--    controld_section_attrs,
--    controld_section_all,
--    controld_section_all_unlocked
- };
- 
- void controld_node_state_deletion_strings(const char *uname,
--- 
-2.47.1
-
-From 1056a0e3f6b618c23eb5a73d7e4a600619713a0c Mon Sep 17 00:00:00 2001
-From: Ken Gaillot <kgaillot@redhat.com>
-Date: Mon, 18 Dec 2023 13:39:49 -0600
-Subject: [PATCH 08/10] Refactor: controller: Drop node state section enum
-
-It now boils down to a bool for whether we want only unlocked resources.
----
- daemons/controld/controld_cib.c       | 48 +++++++++++----------------
- daemons/controld/controld_cib.h       | 13 ++------
- daemons/controld/controld_execd.c     |  3 +-
- daemons/controld/controld_fencing.c   |  2 +-
- daemons/controld/controld_join_dc.c   |  9 +++--
- daemons/controld/controld_remote_ra.c | 10 +++---
- 6 files changed, 32 insertions(+), 53 deletions(-)
-
-diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c
-index 39c2b06..298c321 100644
---- a/daemons/controld/controld_cib.c
-+++ b/daemons/controld/controld_cib.c
-@@ -281,16 +281,15 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output,
- 
- /*!
-  * \internal
-- * \brief Get the XPath and description of a node state section to be deleted
-+ * \brief Get the XPath and description of resource history to be deleted
-  *
-- * \param[in]  uname    Desired node
-- * \param[in]  section  Subsection of \c PCMK__XE_NODE_STATE to be deleted
-- * \param[out] xpath    Where to store XPath of \p section
-- * \param[out] desc     If not \c NULL, where to store description of \p section
-+ * \param[in]  uname          Name of node to delete resource history for
-+ * \param[in]  unlocked_only  If true, delete history of only unlocked resources
-+ * \param[out] xpath          Where to store XPath for history deletion
-+ * \param[out] desc           If not NULL, where to store loggable description
-  */
- void
--controld_node_state_deletion_strings(const char *uname,
--                                     enum controld_section_e section,
-+controld_node_state_deletion_strings(const char *uname, bool unlocked_only,
-                                      char **xpath, char **desc)
- {
-     const char *desc_pre = NULL;
-@@ -299,20 +298,13 @@ controld_node_state_deletion_strings(const char *uname,
-     long long expire = (long long) time(NULL)
-                        - controld_globals.shutdown_lock_limit;
- 
--    switch (section) {
--        case controld_section_lrm:
--            *xpath = crm_strdup_printf(XPATH_NODE_LRM, uname);
--            desc_pre = "resource history";
--            break;
--        case controld_section_lrm_unlocked:
--            *xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED,
--                                       uname, uname, expire);
--            desc_pre = "resource history (other than shutdown locks)";
--            break;
--        default:
--            // We called this function incorrectly
--            pcmk__assert(false);
--            break;
-+    if (unlocked_only) {
-+        *xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED,
-+                                   uname, uname, expire);
-+        desc_pre = "resource history (other than shutdown locks)";
-+    } else {
-+        *xpath = crm_strdup_printf(XPATH_NODE_LRM, uname);
-+        desc_pre = "resource history";
-     }
- 
-     if (desc != NULL) {
-@@ -322,15 +314,14 @@ controld_node_state_deletion_strings(const char *uname,
- 
- /*!
-  * \internal
-- * \brief Delete subsection of a node's CIB \c PCMK__XE_NODE_STATE
-+ * \brief Delete a node's resource history from the CIB
-  *
-- * \param[in] uname    Desired node
-- * \param[in] section  Subsection of \c PCMK__XE_NODE_STATE to delete
-- * \param[in] options  CIB call options to use
-+ * \param[in] uname          Name of node to delete resource history for
-+ * \param[in] unlocked_only  If true, delete history of only unlocked resources
-+ * \param[in] options        CIB call options to use
-  */
- void
--controld_delete_node_state(const char *uname, enum controld_section_e section,
--                           int options)
-+controld_delete_node_state(const char *uname, bool unlocked_only, int options)
- {
-     cib_t *cib = controld_globals.cib_conn;
-     char *xpath = NULL;
-@@ -339,8 +330,7 @@ controld_delete_node_state(const char *uname, enum controld_section_e section,
- 
-     pcmk__assert((uname != NULL) && (cib != NULL));
- 
--    controld_node_state_deletion_strings(uname, section, &xpath, &desc);
--
-+    controld_node_state_deletion_strings(uname, unlocked_only, &xpath, &desc);
-     cib__set_call_options(options, "node state deletion",
-                           cib_xpath|cib_multiple);
-     cib_rc = cib->cmds->remove(cib, xpath, NULL, options);
-diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h
-index 25277e7..f423f93 100644
---- a/daemons/controld/controld_cib.h
-+++ b/daemons/controld/controld_cib.h
-@@ -46,17 +46,10 @@ int controld_update_cib(const char *section, xmlNode *data, int options,
-                                          void *));
- unsigned int cib_op_timeout(void);
- 
--// Subsections of PCMK__XE_NODE_STATE
--enum controld_section_e {
--    controld_section_lrm,
--    controld_section_lrm_unlocked,
--};
--
--void controld_node_state_deletion_strings(const char *uname,
--                                          enum controld_section_e section,
-+void controld_node_state_deletion_strings(const char *uname, bool unlocked_only,
-                                           char **xpath, char **desc);
--void controld_delete_node_state(const char *uname,
--                                enum controld_section_e section, int options);
-+void controld_delete_node_state(const char *uname, bool unlocked_only,
-+                                int options);
- int controld_delete_resource_history(const char *rsc_id, const char *node,
-                                      const char *user_name, int call_options);
- 
-diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
-index 2ec6893..801a5db 100644
---- a/daemons/controld/controld_execd.c
-+++ b/daemons/controld/controld_execd.c
-@@ -1074,8 +1074,7 @@ force_reprobe(lrm_state_t *lrm_state, const char *from_sys,
-     }
- 
-     /* Now delete the copy in the CIB */
--    controld_delete_node_state(lrm_state->node_name, controld_section_lrm,
--                               cib_none);
-+    controld_delete_node_state(lrm_state->node_name, false, cib_none);
- }
- 
- /*!
-diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
-index de074aa..6270dcd 100644
---- a/daemons/controld/controld_fencing.c
-+++ b/daemons/controld/controld_fencing.c
-@@ -268,7 +268,7 @@ update_node_state_after_fencing(const char *target, const char *target_xml_id)
-     fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated);
- 
-     // Delete node's resource history from CIB
--    controld_delete_node_state(peer->name, controld_section_lrm, cib_none);
-+    controld_delete_node_state(peer->name, false, cib_none);
- 
-     // Ask attribute manager to delete node's transient attributes
-     // @TODO: This is the only call to controld_purge_node_attrs that doesn't
-diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c
-index a91fbfa..f88cc47 100644
---- a/daemons/controld/controld_join_dc.c
-+++ b/daemons/controld/controld_join_dc.c
-@@ -771,7 +771,8 @@ do_dc_join_ack(long long action,
-     pcmk__node_status_t *peer = NULL;
-     enum controld_join_phase phase = controld_join_none;
- 
--    enum controld_section_e section = controld_section_lrm;
-+    const bool unlocked_only = pcmk_is_set(controld_globals.flags,
-+                                           controld_shutdown_lock_enabled);
-     char *xpath = NULL;
-     xmlNode *state = join_ack->xml;
-     xmlNode *execd_state = NULL;
-@@ -832,10 +833,8 @@ do_dc_join_ack(long long action,
-     }
- 
-     // Delete relevant parts of node's current executor state from CIB
--    if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
--        section = controld_section_lrm_unlocked;
--    }
--    controld_node_state_deletion_strings(join_from, section, &xpath, NULL);
-+    controld_node_state_deletion_strings(join_from, unlocked_only, &xpath,
-+                                         NULL);
- 
-     rc = cib->cmds->remove(cib, xpath, NULL,
-                            cib_xpath|cib_multiple|cib_transaction);
-diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c
-index 3136180..86a3544 100644
---- a/daemons/controld/controld_remote_ra.c
-+++ b/daemons/controld/controld_remote_ra.c
-@@ -240,17 +240,15 @@ should_purge_attributes(pcmk__node_status_t *node)
- static void
- purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node)
- {
--    enum controld_section_e section = controld_section_lrm;
-+    const bool unlocked_only = pcmk_is_set(controld_globals.flags,
-+                                           controld_shutdown_lock_enabled);
- 
-     // Purge node's transient attributes (from attribute manager and CIB)
-     if (should_purge_attributes(node)) {
-         controld_purge_node_attrs(node->name, true);
-     }
- 
--    if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
--        section = controld_section_lrm_unlocked;
--    }
--    controld_delete_node_state(node->name, section, call_opt);
-+    controld_delete_node_state(node->name, unlocked_only, call_opt);
- }
- 
- /*!
-@@ -359,7 +357,7 @@ remote_node_down(const char *node_name, const enum down_opts opts)
-      * think resources are still running on the node.
-      */
-     if (opts == DOWN_ERASE_LRM) {
--        controld_delete_node_state(node_name, controld_section_lrm, call_opt);
-+        controld_delete_node_state(node_name, false, call_opt);
-     }
- 
-     /* Ensure node is in the remote peer cache with lost state */
--- 
-2.47.1
-
-From 050a3caad4989cc1c958420dff47b04be9a1cd55 Mon Sep 17 00:00:00 2001
-From: Ken Gaillot <kgaillot@redhat.com>
-Date: Mon, 18 Dec 2023 15:45:00 -0600
-Subject: [PATCH 09/10] Refactor: controller: Rename
- controld_delete_node_state()
-
-...to controld_delete_node_history(), and
-controld_node_state_deletion_strings() to
-controld_node_history_deletion_strings(), since they delete only history
-now.
----
- daemons/controld/controld_cib.c       | 8 ++++----
- daemons/controld/controld_cib.h       | 9 +++++----
- daemons/controld/controld_execd.c     | 2 +-
- daemons/controld/controld_fencing.c   | 2 +-
- daemons/controld/controld_join_dc.c   | 4 ++--
- daemons/controld/controld_remote_ra.c | 4 ++--
- 6 files changed, 15 insertions(+), 14 deletions(-)
-
-diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c
-index 298c321..fb06f22 100644
---- a/daemons/controld/controld_cib.c
-+++ b/daemons/controld/controld_cib.c
-@@ -289,8 +289,8 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output,
-  * \param[out] desc           If not NULL, where to store loggable description
-  */
- void
--controld_node_state_deletion_strings(const char *uname, bool unlocked_only,
--                                     char **xpath, char **desc)
-+controld_node_history_deletion_strings(const char *uname, bool unlocked_only,
-+                                       char **xpath, char **desc)
- {
-     const char *desc_pre = NULL;
- 
-@@ -321,7 +321,7 @@ controld_node_state_deletion_strings(const char *uname, bool unlocked_only,
-  * \param[in] options        CIB call options to use
-  */
- void
--controld_delete_node_state(const char *uname, bool unlocked_only, int options)
-+controld_delete_node_history(const char *uname, bool unlocked_only, int options)
- {
-     cib_t *cib = controld_globals.cib_conn;
-     char *xpath = NULL;
-@@ -330,7 +330,7 @@ controld_delete_node_state(const char *uname, bool unlocked_only, int options)
- 
-     pcmk__assert((uname != NULL) && (cib != NULL));
- 
--    controld_node_state_deletion_strings(uname, unlocked_only, &xpath, &desc);
-+    controld_node_history_deletion_strings(uname, unlocked_only, &xpath, &desc);
-     cib__set_call_options(options, "node state deletion",
-                           cib_xpath|cib_multiple);
-     cib_rc = cib->cmds->remove(cib, xpath, NULL, options);
-diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h
-index f423f93..116db64 100644
---- a/daemons/controld/controld_cib.h
-+++ b/daemons/controld/controld_cib.h
-@@ -46,10 +46,11 @@ int controld_update_cib(const char *section, xmlNode *data, int options,
-                                          void *));
- unsigned int cib_op_timeout(void);
- 
--void controld_node_state_deletion_strings(const char *uname, bool unlocked_only,
--                                          char **xpath, char **desc);
--void controld_delete_node_state(const char *uname, bool unlocked_only,
--                                int options);
-+void controld_node_history_deletion_strings(const char *uname,
-+                                            bool unlocked_only,
-+                                            char **xpath, char **desc);
-+void controld_delete_node_history(const char *uname, bool unlocked_only,
-+                                  int options);
- int controld_delete_resource_history(const char *rsc_id, const char *node,
-                                      const char *user_name, int call_options);
- 
-diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
-index 801a5db..977acf0 100644
---- a/daemons/controld/controld_execd.c
-+++ b/daemons/controld/controld_execd.c
-@@ -1074,7 +1074,7 @@ force_reprobe(lrm_state_t *lrm_state, const char *from_sys,
-     }
- 
-     /* Now delete the copy in the CIB */
--    controld_delete_node_state(lrm_state->node_name, false, cib_none);
-+    controld_delete_node_history(lrm_state->node_name, false, cib_none);
- }
- 
- /*!
-diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
-index 6270dcd..026b240 100644
---- a/daemons/controld/controld_fencing.c
-+++ b/daemons/controld/controld_fencing.c
-@@ -268,7 +268,7 @@ update_node_state_after_fencing(const char *target, const char *target_xml_id)
-     fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated);
- 
-     // Delete node's resource history from CIB
--    controld_delete_node_state(peer->name, false, cib_none);
-+    controld_delete_node_history(peer->name, false, cib_none);
- 
-     // Ask attribute manager to delete node's transient attributes
-     // @TODO: This is the only call to controld_purge_node_attrs that doesn't
-diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c
-index f88cc47..90d1bc0 100644
---- a/daemons/controld/controld_join_dc.c
-+++ b/daemons/controld/controld_join_dc.c
-@@ -833,8 +833,8 @@ do_dc_join_ack(long long action,
-     }
- 
-     // Delete relevant parts of node's current executor state from CIB
--    controld_node_state_deletion_strings(join_from, unlocked_only, &xpath,
--                                         NULL);
-+    controld_node_history_deletion_strings(join_from, unlocked_only, &xpath,
-+                                           NULL);
- 
-     rc = cib->cmds->remove(cib, xpath, NULL,
-                            cib_xpath|cib_multiple|cib_transaction);
-diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c
-index 86a3544..1c52477 100644
---- a/daemons/controld/controld_remote_ra.c
-+++ b/daemons/controld/controld_remote_ra.c
-@@ -248,7 +248,7 @@ purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node)
-         controld_purge_node_attrs(node->name, true);
-     }
- 
--    controld_delete_node_state(node->name, unlocked_only, call_opt);
-+    controld_delete_node_history(node->name, unlocked_only, call_opt);
- }
- 
- /*!
-@@ -357,7 +357,7 @@ remote_node_down(const char *node_name, const enum down_opts opts)
-      * think resources are still running on the node.
-      */
-     if (opts == DOWN_ERASE_LRM) {
--        controld_delete_node_state(node_name, false, call_opt);
-+        controld_delete_node_history(node_name, false, call_opt);
-     }
- 
-     /* Ensure node is in the remote peer cache with lost state */
--- 
-2.47.1
-
-From 97dfc11f6c9d1a90ef744e5de2fe7678f3518bba Mon Sep 17 00:00:00 2001
-From: Chris Lumens <clumens@redhat.com>
-Date: Wed, 10 Sep 2025 14:59:38 -0400
-Subject: [PATCH 10/10] Refactor: daemons: Remove the down_opts enum
-
-This has only ever had two values, which basically just means it's a
-bool.
----
- daemons/controld/controld_remote_ra.c | 21 ++++++++-------------
- 1 file changed, 8 insertions(+), 13 deletions(-)
-
-diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c
-index 1c52477..eb1bc55 100644
---- a/daemons/controld/controld_remote_ra.c
-+++ b/daemons/controld/controld_remote_ra.c
-@@ -330,20 +330,15 @@ remote_node_up(const char *node_name)
-     pcmk__xml_free(update);
- }
- 
--enum down_opts {
--    DOWN_KEEP_LRM,
--    DOWN_ERASE_LRM
--};
--
- /*!
-  * \internal
-  * \brief Handle cluster communication related to pacemaker_remote node leaving
-  *
-  * \param[in] node_name  Name of lost node
-- * \param[in] opts       Whether to keep or erase LRM history
-+ * \param[in] erase_lrm  If \c true, erase the LRM history
-  */
- static void
--remote_node_down(const char *node_name, const enum down_opts opts)
-+remote_node_down(const char *node_name, bool erase_lrm)
- {
-     xmlNode *update;
-     int call_opt = crmd_cib_smart_opt();
-@@ -356,7 +351,7 @@ remote_node_down(const char *node_name, const enum down_opts opts)
-      * up. However, after a successful fence, clear the history so we don't
-      * think resources are still running on the node.
-      */
--    if (opts == DOWN_ERASE_LRM) {
-+    if (erase_lrm) {
-         controld_delete_node_history(node_name, false, call_opt);
-     }
- 
-@@ -416,7 +411,7 @@ check_remote_node_state(const remote_ra_cmd_t *cmd)
-         if (ra_data) {
-             if (!pcmk_is_set(ra_data->status, takeover_complete)) {
-                 /* Stop means down if we didn't successfully migrate elsewhere */
--                remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
-+                remote_node_down(cmd->rsc_id, false);
-             } else if (AM_I_DC == FALSE) {
-                 /* Only the connection host and DC track node state,
-                  * so if the connection migrated elsewhere and we aren't DC,
-@@ -692,7 +687,7 @@ remote_lrm_op_callback(lrmd_event_data_t * op)
-                        lrm_state->node_name);
-             /* Do roughly what a 'stop' on the remote-resource would do */
-             handle_remote_ra_stop(lrm_state, NULL);
--            remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
-+            remote_node_down(lrm_state->node_name, false);
-             /* now fake the reply of a successful 'stop' */
-             synthesize_lrmd_success(NULL, lrm_state->node_name,
-                                     PCMK_ACTION_STOP);
-@@ -1366,11 +1361,11 @@ remote_ra_process_pseudo(xmlNode *xml)
-          * peer cache state will be incorrect unless and until the guest is
-          * recovered.
-          */
--        if (result) {
-+        if (result != NULL) {
-             const char *remote = pcmk__xe_id(result);
- 
--            if (remote) {
--                remote_node_down(remote, DOWN_ERASE_LRM);
-+            if (remote != NULL) {
-+                remote_node_down(remote, true);
-             }
-         }
-     }
--- 
-2.47.1
diff --git a/pacemaker.spec b/pacemaker.spec
index 896ed56..97087aa 100644
--- a/pacemaker.spec
+++ b/pacemaker.spec
@@ -40,11 +40,11 @@
 ## Upstream pacemaker version, and its package version (specversion
 ## can be incremented to build packages reliably considered "newer"
 ## than previously built packages with the same pcmkversion)
-%global pcmkversion 3.0.1
-%global specversion 5
+%global pcmkversion 3.0.2
+%global specversion 1
 
 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
-%global commit 9a5e54bae85847c4bb6ed7c7fb06103ebebbc64a
+%global commit 6629f2e0e672280ca765324858f245fdcd85f22d
 
 ## Since git v2.11, the extent of abbreviation is autoscaled by default
 ## (used to be constant of 7), so we need to convey it for non-tags, too.
@@ -199,13 +199,7 @@ Url:           https://www.clusterlabs.org/
 Source0:       https://codeload.github.com/%{github_owner}/%{name}/tar.gz/%{archive_github_url}
 Source1:       pacemaker.sysusers
 # upstream commits
-Patch001:      001-econnrefused.patch
-Patch002:      002-corosync.patch
-Patch003:      003-promotable-follows.patch
-Patch004:      004-crm_resource_wait.patch
-Patch005:      005-ipc_evict.patch
-Patch006:      006-fewer_messages.patch
-Patch007:      007-transient_attrs.patch
+#Patch001:      001-xxxx.patch
 
 Requires:      resource-agents
 Requires:      %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
@@ -797,6 +791,11 @@ exit 0
 %{_datadir}/pkgconfig/pacemaker-schemas.pc
 
 %changelog
+* Tue May 12 2026 Chris Lumens <clumens@redhat.com> - 3.0.2-1
+- Rebase on upstream 3.0.2-rc2
+- Don't hang waiting on certain pending monitor actions
+- Resolves: RHEL-78393
+
 * Mon Dec 8 2025 Chris Lumens <clumens@redhat.com> - 3.0.1-5
 - Fix a race condition between daemons when erasing transient attrs
 - Resolves: RHEL-23082
diff --git a/sources b/sources
index 9f16bb4..9b4a289 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-SHA512 (pacemaker-9a5e54bae.tar.gz) = 7c90f7cb985933ba3e0254118bab5c2af050e61c22ab683255c06282df196dcca439ecdc016e22fa7751a4744092abd6801451babfb8f4d03d4d67c1fee56ed9
+SHA512 (pacemaker-6629f2e0e.tar.gz) = 242ad2aefcd4f0f435b39a0806a862b2888ee5b875977a1358338036bb6e7d8180692fb4ece30a69ce63d03d51acefde812f86169a28f06b9c05a1464454b588