Backport fixes from main.

- Fix promoting instances of a cloned resource
- Handle large timeouts correctly in crm_resource --wait
- Don't evict IPC clients as long as they're still processing messages
- Don't overwhelm the FSA queue with repeated CIB queries
- Resolves: RHEL-120932
- Resolves: RHEL-86148
- Resolves: RHEL-114895
Chris Lumens 2025-11-13 10:43:07 -05:00
parent 5738c06925
commit 0f1dfa8d71
5 changed files with 1379 additions and 1 deletions

From 6e5d574de9ad3a131cc0c51f2c5300e2cf4e7db3 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 7 Oct 2025 05:07:04 +0200
Subject: [PATCH 1/2] Test: scheduler: colocation of promoted state with
promoted state by node attribute
Add a test case. Previously, attribute-based colocation did not adhere to
the attribute when colocating promoted state with promoted state.
---
cts/cts-scheduler.in | 1 +
...motable-colocation-with-node-attribute.dot | 28 +++
...motable-colocation-with-node-attribute.exp | 175 ++++++++++++++++++
...able-colocation-with-node-attribute.scores | 81 ++++++++
...ble-colocation-with-node-attribute.summary | 45 +++++
...motable-colocation-with-node-attribute.xml | 155 ++++++++++++++++
6 files changed, 485 insertions(+)
create mode 100644 cts/scheduler/dot/promotable-colocation-with-node-attribute.dot
create mode 100644 cts/scheduler/exp/promotable-colocation-with-node-attribute.exp
create mode 100644 cts/scheduler/scores/promotable-colocation-with-node-attribute.scores
create mode 100644 cts/scheduler/summary/promotable-colocation-with-node-attribute.summary
create mode 100644 cts/scheduler/xml/promotable-colocation-with-node-attribute.xml
diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in
index f5d4ed5..be8af87 100644
--- a/cts/cts-scheduler.in
+++ b/cts/cts-scheduler.in
@@ -633,6 +633,7 @@ TESTS = [
SchedulerTest("no_quorum_demote", "Promotable demotion and primitive stop with no-quorum-policy=\"demote\""),
SchedulerTest("no-promote-on-unrunnable-guest", "Don't select bundle instance for promotion when container can't run"),
SchedulerTest("leftover-pending-monitor", "Prevent a leftover pending monitor from causing unexpected stop of other instances"),
+ SchedulerTest("promotable-colocation-with-node-attribute", "Promote dependent clone on nodes belonging to a site that has a primary clone promoted"),
]),
SchedulerTestGroup([
SchedulerTest("history-1", "Correctly parse stateful-1 resource state"),
diff --git a/cts/scheduler/dot/promotable-colocation-with-node-attribute.dot b/cts/scheduler/dot/promotable-colocation-with-node-attribute.dot
new file mode 100644
index 0000000..89d066f
--- /dev/null
+++ b/cts/scheduler/dot/promotable-colocation-with-node-attribute.dot
@@ -0,0 +1,28 @@
+ digraph "g" {
+"dependent-clone_demote_0" -> "dependent-clone_demoted_0" [ style = bold]
+"dependent-clone_demote_0" -> "dependent-rsc_demote_0 node3" [ style = bold]
+"dependent-clone_demote_0" [ style=bold color="green" fontcolor="orange"]
+"dependent-clone_demoted_0" -> "dependent-clone_promote_0" [ style = bold]
+"dependent-clone_demoted_0" [ style=bold color="green" fontcolor="orange"]
+"dependent-clone_promote_0" -> "dependent-rsc_promote_0 node1" [ style = bold]
+"dependent-clone_promote_0" -> "dependent-rsc_promote_0 node2" [ style = bold]
+"dependent-clone_promote_0" [ style=bold color="green" fontcolor="orange"]
+"dependent-clone_promoted_0" [ style=bold color="green" fontcolor="orange"]
+"dependent-rsc_demote_0 node3" -> "dependent-clone_demoted_0" [ style = bold]
+"dependent-rsc_demote_0 node3" -> "dependent-rsc_monitor_11000 node3" [ style = bold]
+"dependent-rsc_demote_0 node3" [ style=bold color="green" fontcolor="black"]
+"dependent-rsc_monitor_10000 node1" [ style=bold color="green" fontcolor="black"]
+"dependent-rsc_monitor_10000 node2" [ style=bold color="green" fontcolor="black"]
+"dependent-rsc_monitor_11000 node3" [ style=bold color="green" fontcolor="black"]
+"dependent-rsc_monitor_11000 node4" [ style=bold color="green" fontcolor="black"]
+"dependent-rsc_promote_0 node1" -> "dependent-clone_promoted_0" [ style = bold]
+"dependent-rsc_promote_0 node1" -> "dependent-rsc_monitor_10000 node1" [ style = bold]
+"dependent-rsc_promote_0 node1" [ style=bold color="green" fontcolor="black"]
+"dependent-rsc_promote_0 node2" -> "dependent-clone_promoted_0" [ style = bold]
+"dependent-rsc_promote_0 node2" -> "dependent-rsc_monitor_10000 node2" [ style = bold]
+"dependent-rsc_promote_0 node2" [ style=bold color="green" fontcolor="black"]
+"primary-rsc_monitor_10000 node1" [ style=bold color="green" fontcolor="black"]
+"primary-rsc_monitor_11000 node2" [ style=bold color="green" fontcolor="black"]
+"primary-rsc_monitor_11000 node3" [ style=bold color="green" fontcolor="black"]
+"primary-rsc_monitor_11000 node4" [ style=bold color="green" fontcolor="black"]
+}
diff --git a/cts/scheduler/exp/promotable-colocation-with-node-attribute.exp b/cts/scheduler/exp/promotable-colocation-with-node-attribute.exp
new file mode 100644
index 0000000..76371f1
--- /dev/null
+++ b/cts/scheduler/exp/promotable-colocation-with-node-attribute.exp
@@ -0,0 +1,175 @@
+<transition_graph cluster-delay="60s" stonith-timeout="60s" failed-stop-offset="INFINITY" failed-start-offset="INFINITY" transition_id="1">
+ <synapse id="0">
+ <action_set>
+ <rsc_op id="5" operation="monitor" operation_key="primary-rsc_monitor_10000" internal_operation_key="primary-rsc:0_monitor_10000" on_node="node1" on_node_uuid="node1">
+ <primitive id="primary-rsc" long-id="primary-rsc:0" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="0" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="10000" CRM_meta_master_max="1" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node1" CRM_meta_on_node_uuid="node1" CRM_meta_op_target_rc="8" CRM_meta_promoted_max="1" CRM_meta_promoted_node_max="1" CRM_meta_role="Promoted" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs/>
+ </synapse>
+ <synapse id="1">
+ <action_set>
+ <rsc_op id="8" operation="monitor" operation_key="primary-rsc_monitor_11000" internal_operation_key="primary-rsc:1_monitor_11000" on_node="node2" on_node_uuid="node2">
+ <primitive id="primary-rsc" long-id="primary-rsc:1" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="1" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="11000" CRM_meta_master_max="1" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node2" CRM_meta_on_node_uuid="node2" CRM_meta_promoted_max="1" CRM_meta_promoted_node_max="1" CRM_meta_role="Unpromoted" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs/>
+ </synapse>
+ <synapse id="2">
+ <action_set>
+ <rsc_op id="11" operation="monitor" operation_key="primary-rsc_monitor_11000" internal_operation_key="primary-rsc:2_monitor_11000" on_node="node3" on_node_uuid="node3">
+ <primitive id="primary-rsc" long-id="primary-rsc:2" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="2" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="11000" CRM_meta_master_max="1" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node3" CRM_meta_on_node_uuid="node3" CRM_meta_promoted_max="1" CRM_meta_promoted_node_max="1" CRM_meta_role="Unpromoted" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs/>
+ </synapse>
+ <synapse id="3">
+ <action_set>
+ <rsc_op id="14" operation="monitor" operation_key="primary-rsc_monitor_11000" internal_operation_key="primary-rsc:3_monitor_11000" on_node="node4" on_node_uuid="node4">
+ <primitive id="primary-rsc" long-id="primary-rsc:3" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="3" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="11000" CRM_meta_master_max="1" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node4" CRM_meta_on_node_uuid="node4" CRM_meta_promoted_max="1" CRM_meta_promoted_node_max="1" CRM_meta_role="Unpromoted" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs/>
+ </synapse>
+ <synapse id="4">
+ <action_set>
+ <rsc_op id="26" operation="monitor" operation_key="dependent-rsc_monitor_10000" internal_operation_key="dependent-rsc:0_monitor_10000" on_node="node1" on_node_uuid="node1">
+ <primitive id="dependent-rsc" long-id="dependent-rsc:0" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="0" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="10000" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node1" CRM_meta_on_node_uuid="node1" CRM_meta_op_target_rc="8" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_role="Promoted" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs>
+ <trigger>
+ <rsc_op id="25" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:0_promote_0" on_node="node1" on_node_uuid="node1"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="5">
+ <action_set>
+ <rsc_op id="25" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:0_promote_0" on_node="node1" on_node_uuid="node1">
+ <primitive id="dependent-rsc" long-id="dependent-rsc:0" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="0" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_on_node="node1" CRM_meta_on_node_uuid="node1" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs>
+ <trigger>
+ <pseudo_event id="42" operation="promote" operation_key="dependent-clone_promote_0"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="6">
+ <action_set>
+ <rsc_op id="30" operation="monitor" operation_key="dependent-rsc_monitor_10000" internal_operation_key="dependent-rsc:1_monitor_10000" on_node="node2" on_node_uuid="node2">
+ <primitive id="dependent-rsc" long-id="dependent-rsc:1" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="1" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="10000" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node2" CRM_meta_on_node_uuid="node2" CRM_meta_op_target_rc="8" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_role="Promoted" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs>
+ <trigger>
+ <rsc_op id="29" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:1_promote_0" on_node="node2" on_node_uuid="node2"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="7">
+ <action_set>
+ <rsc_op id="29" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:1_promote_0" on_node="node2" on_node_uuid="node2">
+ <primitive id="dependent-rsc" long-id="dependent-rsc:1" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="1" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_on_node="node2" CRM_meta_on_node_uuid="node2" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs>
+ <trigger>
+ <pseudo_event id="42" operation="promote" operation_key="dependent-clone_promote_0"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="8">
+ <action_set>
+ <rsc_op id="34" operation="monitor" operation_key="dependent-rsc_monitor_11000" internal_operation_key="dependent-rsc:2_monitor_11000" on_node="node3" on_node_uuid="node3">
+ <primitive id="dependent-rsc" long-id="dependent-rsc:2" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="2" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="11000" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node3" CRM_meta_on_node_uuid="node3" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_role="Unpromoted" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs>
+ <trigger>
+ <rsc_op id="32" operation="demote" operation_key="dependent-rsc_demote_0" internal_operation_key="dependent-rsc:2_demote_0" on_node="node3" on_node_uuid="node3"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="9">
+ <action_set>
+ <rsc_op id="32" operation="demote" operation_key="dependent-rsc_demote_0" internal_operation_key="dependent-rsc:2_demote_0" on_node="node3" on_node_uuid="node3">
+ <primitive id="dependent-rsc" long-id="dependent-rsc:2" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="2" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_on_node="node3" CRM_meta_on_node_uuid="node3" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs>
+ <trigger>
+ <pseudo_event id="44" operation="demote" operation_key="dependent-clone_demote_0"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="10">
+ <action_set>
+ <rsc_op id="37" operation="monitor" operation_key="dependent-rsc_monitor_11000" internal_operation_key="dependent-rsc:3_monitor_11000" on_node="node4" on_node_uuid="node4">
+ <primitive id="dependent-rsc" long-id="dependent-rsc:3" class="ocf" provider="pacemaker" type="Stateful"/>
+ <attributes CRM_meta_clone="3" CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_interval="11000" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_name="monitor" CRM_meta_notify="false" CRM_meta_on_node="node4" CRM_meta_on_node_uuid="node4" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_role="Unpromoted" CRM_meta_timeout="20000" />
+ </rsc_op>
+ </action_set>
+ <inputs/>
+ </synapse>
+ <synapse id="11" priority="1000000">
+ <action_set>
+ <pseudo_event id="45" operation="demoted" operation_key="dependent-clone_demoted_0">
+ <attributes CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
+ </pseudo_event>
+ </action_set>
+ <inputs>
+ <trigger>
+ <rsc_op id="32" operation="demote" operation_key="dependent-rsc_demote_0" internal_operation_key="dependent-rsc:2_demote_0" on_node="node3" on_node_uuid="node3"/>
+ </trigger>
+ <trigger>
+ <pseudo_event id="44" operation="demote" operation_key="dependent-clone_demote_0"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="12">
+ <action_set>
+ <pseudo_event id="44" operation="demote" operation_key="dependent-clone_demote_0">
+ <attributes CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
+ </pseudo_event>
+ </action_set>
+ <inputs/>
+ </synapse>
+ <synapse id="13" priority="1000000">
+ <action_set>
+ <pseudo_event id="43" operation="promoted" operation_key="dependent-clone_promoted_0">
+ <attributes CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
+ </pseudo_event>
+ </action_set>
+ <inputs>
+ <trigger>
+ <rsc_op id="25" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:0_promote_0" on_node="node1" on_node_uuid="node1"/>
+ </trigger>
+ <trigger>
+ <rsc_op id="29" operation="promote" operation_key="dependent-rsc_promote_0" internal_operation_key="dependent-rsc:1_promote_0" on_node="node2" on_node_uuid="node2"/>
+ </trigger>
+ </inputs>
+ </synapse>
+ <synapse id="14">
+ <action_set>
+ <pseudo_event id="42" operation="promote" operation_key="dependent-clone_promote_0">
+ <attributes CRM_meta_clone_max="4" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_master_max="2" CRM_meta_master_node_max="1" CRM_meta_notify="false" CRM_meta_promoted_max="2" CRM_meta_promoted_node_max="1" CRM_meta_timeout="20000" />
+ </pseudo_event>
+ </action_set>
+ <inputs>
+ <trigger>
+ <pseudo_event id="45" operation="demoted" operation_key="dependent-clone_demoted_0"/>
+ </trigger>
+ </inputs>
+ </synapse>
+</transition_graph>
diff --git a/cts/scheduler/scores/promotable-colocation-with-node-attribute.scores b/cts/scheduler/scores/promotable-colocation-with-node-attribute.scores
new file mode 100644
index 0000000..023ee77
--- /dev/null
+++ b/cts/scheduler/scores/promotable-colocation-with-node-attribute.scores
@@ -0,0 +1,81 @@
+
+dependent-rsc:0 promotion score on node1: INFINITY
+dependent-rsc:1 promotion score on node2: INFINITY
+dependent-rsc:2 promotion score on node3: -INFINITY
+dependent-rsc:3 promotion score on node4: -INFINITY
+pcmk__clone_assign: dependent-clone allocation score on node1: 0
+pcmk__clone_assign: dependent-clone allocation score on node2: 0
+pcmk__clone_assign: dependent-clone allocation score on node3: 0
+pcmk__clone_assign: dependent-clone allocation score on node4: 0
+pcmk__clone_assign: dependent-rsc:0 allocation score on node1: 6
+pcmk__clone_assign: dependent-rsc:0 allocation score on node2: 0
+pcmk__clone_assign: dependent-rsc:0 allocation score on node3: 0
+pcmk__clone_assign: dependent-rsc:0 allocation score on node4: 0
+pcmk__clone_assign: dependent-rsc:1 allocation score on node1: 0
+pcmk__clone_assign: dependent-rsc:1 allocation score on node2: 6
+pcmk__clone_assign: dependent-rsc:1 allocation score on node3: 0
+pcmk__clone_assign: dependent-rsc:1 allocation score on node4: 0
+pcmk__clone_assign: dependent-rsc:2 allocation score on node1: 0
+pcmk__clone_assign: dependent-rsc:2 allocation score on node2: 0
+pcmk__clone_assign: dependent-rsc:2 allocation score on node3: 11
+pcmk__clone_assign: dependent-rsc:2 allocation score on node4: 0
+pcmk__clone_assign: dependent-rsc:3 allocation score on node1: 0
+pcmk__clone_assign: dependent-rsc:3 allocation score on node2: 0
+pcmk__clone_assign: dependent-rsc:3 allocation score on node3: 0
+pcmk__clone_assign: dependent-rsc:3 allocation score on node4: 6
+pcmk__clone_assign: primary-clone allocation score on node1: 0
+pcmk__clone_assign: primary-clone allocation score on node2: 0
+pcmk__clone_assign: primary-clone allocation score on node3: 0
+pcmk__clone_assign: primary-clone allocation score on node4: 0
+pcmk__clone_assign: primary-rsc:0 allocation score on node1: 11
+pcmk__clone_assign: primary-rsc:0 allocation score on node2: 0
+pcmk__clone_assign: primary-rsc:0 allocation score on node3: 0
+pcmk__clone_assign: primary-rsc:0 allocation score on node4: 0
+pcmk__clone_assign: primary-rsc:1 allocation score on node1: 0
+pcmk__clone_assign: primary-rsc:1 allocation score on node2: 6
+pcmk__clone_assign: primary-rsc:1 allocation score on node3: 0
+pcmk__clone_assign: primary-rsc:1 allocation score on node4: 0
+pcmk__clone_assign: primary-rsc:2 allocation score on node1: 0
+pcmk__clone_assign: primary-rsc:2 allocation score on node2: 0
+pcmk__clone_assign: primary-rsc:2 allocation score on node3: 6
+pcmk__clone_assign: primary-rsc:2 allocation score on node4: 0
+pcmk__clone_assign: primary-rsc:3 allocation score on node1: 0
+pcmk__clone_assign: primary-rsc:3 allocation score on node2: 0
+pcmk__clone_assign: primary-rsc:3 allocation score on node3: 0
+pcmk__clone_assign: primary-rsc:3 allocation score on node4: 6
+pcmk__primitive_assign: dependent-rsc:0 allocation score on node1: 6
+pcmk__primitive_assign: dependent-rsc:0 allocation score on node2: 0
+pcmk__primitive_assign: dependent-rsc:0 allocation score on node3: -INFINITY
+pcmk__primitive_assign: dependent-rsc:0 allocation score on node4: 0
+pcmk__primitive_assign: dependent-rsc:1 allocation score on node1: -INFINITY
+pcmk__primitive_assign: dependent-rsc:1 allocation score on node2: 6
+pcmk__primitive_assign: dependent-rsc:1 allocation score on node3: -INFINITY
+pcmk__primitive_assign: dependent-rsc:1 allocation score on node4: 0
+pcmk__primitive_assign: dependent-rsc:2 allocation score on node1: 0
+pcmk__primitive_assign: dependent-rsc:2 allocation score on node2: 0
+pcmk__primitive_assign: dependent-rsc:2 allocation score on node3: 11
+pcmk__primitive_assign: dependent-rsc:2 allocation score on node4: 0
+pcmk__primitive_assign: dependent-rsc:3 allocation score on node1: -INFINITY
+pcmk__primitive_assign: dependent-rsc:3 allocation score on node2: -INFINITY
+pcmk__primitive_assign: dependent-rsc:3 allocation score on node3: -INFINITY
+pcmk__primitive_assign: dependent-rsc:3 allocation score on node4: 6
+pcmk__primitive_assign: primary-rsc:0 allocation score on node1: 11
+pcmk__primitive_assign: primary-rsc:0 allocation score on node2: 0
+pcmk__primitive_assign: primary-rsc:0 allocation score on node3: 0
+pcmk__primitive_assign: primary-rsc:0 allocation score on node4: 0
+pcmk__primitive_assign: primary-rsc:1 allocation score on node1: -INFINITY
+pcmk__primitive_assign: primary-rsc:1 allocation score on node2: 6
+pcmk__primitive_assign: primary-rsc:1 allocation score on node3: 0
+pcmk__primitive_assign: primary-rsc:1 allocation score on node4: 0
+pcmk__primitive_assign: primary-rsc:2 allocation score on node1: -INFINITY
+pcmk__primitive_assign: primary-rsc:2 allocation score on node2: -INFINITY
+pcmk__primitive_assign: primary-rsc:2 allocation score on node3: 6
+pcmk__primitive_assign: primary-rsc:2 allocation score on node4: 0
+pcmk__primitive_assign: primary-rsc:3 allocation score on node1: -INFINITY
+pcmk__primitive_assign: primary-rsc:3 allocation score on node2: -INFINITY
+pcmk__primitive_assign: primary-rsc:3 allocation score on node3: -INFINITY
+pcmk__primitive_assign: primary-rsc:3 allocation score on node4: 6
+primary-rsc:0 promotion score on node1: 10
+primary-rsc:1 promotion score on node2: 5
+primary-rsc:2 promotion score on node3: 5
+primary-rsc:3 promotion score on node4: 5
diff --git a/cts/scheduler/summary/promotable-colocation-with-node-attribute.summary b/cts/scheduler/summary/promotable-colocation-with-node-attribute.summary
new file mode 100644
index 0000000..30e81c8
--- /dev/null
+++ b/cts/scheduler/summary/promotable-colocation-with-node-attribute.summary
@@ -0,0 +1,45 @@
+Current cluster status:
+ * Node List:
+ * Online: [ node1 node2 node3 node4 ]
+
+ * Full List of Resources:
+ * Clone Set: primary-clone [primary-rsc] (promotable):
+ * Promoted: [ node1 ]
+ * Unpromoted: [ node2 node3 node4 ]
+ * Clone Set: dependent-clone [dependent-rsc] (promotable):
+ * Promoted: [ node3 ]
+ * Unpromoted: [ node1 node2 node4 ]
+
+Transition Summary:
+ * Promote dependent-rsc:0 ( Unpromoted -> Promoted node1 )
+ * Promote dependent-rsc:1 ( Unpromoted -> Promoted node2 )
+ * Demote dependent-rsc:2 ( Promoted -> Unpromoted node3 )
+
+Executing Cluster Transition:
+ * Resource action: primary-rsc monitor=10000 on node1
+ * Resource action: primary-rsc monitor=11000 on node2
+ * Resource action: primary-rsc monitor=11000 on node3
+ * Resource action: primary-rsc monitor=11000 on node4
+ * Resource action: dependent-rsc monitor=11000 on node4
+ * Pseudo action: dependent-clone_demote_0
+ * Resource action: dependent-rsc demote on node3
+ * Pseudo action: dependent-clone_demoted_0
+ * Pseudo action: dependent-clone_promote_0
+ * Resource action: dependent-rsc promote on node1
+ * Resource action: dependent-rsc promote on node2
+ * Resource action: dependent-rsc monitor=11000 on node3
+ * Pseudo action: dependent-clone_promoted_0
+ * Resource action: dependent-rsc monitor=10000 on node1
+ * Resource action: dependent-rsc monitor=10000 on node2
+
+Revised Cluster Status:
+ * Node List:
+ * Online: [ node1 node2 node3 node4 ]
+
+ * Full List of Resources:
+ * Clone Set: primary-clone [primary-rsc] (promotable):
+ * Promoted: [ node1 ]
+ * Unpromoted: [ node2 node3 node4 ]
+ * Clone Set: dependent-clone [dependent-rsc] (promotable):
+ * Promoted: [ node1 node2 ]
+ * Unpromoted: [ node3 node4 ]
diff --git a/cts/scheduler/xml/promotable-colocation-with-node-attribute.xml b/cts/scheduler/xml/promotable-colocation-with-node-attribute.xml
new file mode 100644
index 0000000..5b4ab10
--- /dev/null
+++ b/cts/scheduler/xml/promotable-colocation-with-node-attribute.xml
@@ -0,0 +1,155 @@
+<cib crm_feature_set="3.19.7" validate-with="pacemaker-3.10" epoch="1" num_updates="0" admin_epoch="0" cib-last-written="Mon Jan 1 12:00:00 2024" have-quorum="1" dc-uuid="node1">
+ <configuration>
+ <crm_config>
+ <cluster_property_set id="cib-bootstrap-options">
+ <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="2.1.8"/>
+ <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="corosync"/>
+ <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="false"/>
+ </cluster_property_set>
+ </crm_config>
+ <nodes>
+ <!-- Site A nodes -->
+ <node id="node1" uname="node1">
+ <instance_attributes id="node1-attrs">
+ <nvpair id="node1-site" name="site" value="siteA"/>
+ </instance_attributes>
+ </node>
+ <node id="node2" uname="node2">
+ <instance_attributes id="node2-attrs">
+ <nvpair id="node2-site" name="site" value="siteA"/>
+ </instance_attributes>
+ </node>
+ <!-- Site B nodes -->
+ <node id="node3" uname="node3">
+ <instance_attributes id="node3-attrs">
+ <nvpair id="node3-site" name="site" value="siteB"/>
+ </instance_attributes>
+ </node>
+ <node id="node4" uname="node4">
+ <instance_attributes id="node4-attrs">
+ <nvpair id="node4-site" name="site" value="siteB"/>
+ </instance_attributes>
+ </node>
+ </nodes>
+ <resources>
+ <!-- Primary promotable clone -->
+ <clone id="primary-clone">
+ <meta_attributes id="primary-clone-meta">
+ <nvpair id="primary-clone-promotable" name="promotable" value="true"/>
+ <nvpair id="primary-clone-promoted-max" name="promoted-max" value="1"/>
+ <nvpair id="primary-clone-clone-max" name="clone-max" value="4"/>
+ </meta_attributes>
+ <primitive id="primary-rsc" class="ocf" provider="pacemaker" type="Stateful">
+ <operations>
+ <op id="primary-rsc-monitor-promoted" name="monitor" interval="10s" role="Promoted"/>
+ <op id="primary-rsc-monitor-unpromoted" name="monitor" interval="11s" role="Unpromoted"/>
+ </operations>
+ </primitive>
+ </clone>
+ <!-- Dependent promotable clone -->
+ <clone id="dependent-clone">
+ <meta_attributes id="dependent-clone-meta">
+ <nvpair id="dependent-clone-promotable" name="promotable" value="true"/>
+ <nvpair id="dependent-clone-promoted-max" name="promoted-max" value="2"/>
+ <nvpair id="dependent-clone-clone-max" name="clone-max" value="4"/>
+ </meta_attributes>
+ <primitive id="dependent-rsc" class="ocf" provider="pacemaker" type="Stateful">
+ <operations>
+ <op id="dependent-rsc-monitor-promoted" name="monitor" interval="10s" role="Promoted"/>
+ <op id="dependent-rsc-monitor-unpromoted" name="monitor" interval="11s" role="Unpromoted"/>
+ </operations>
+ </primitive>
+ </clone>
+ </resources>
+ <constraints>
+ <!--
+ This constraint should ensure that dependent-clone is promoted
+ on all nodes in the same site as where primary-clone is promoted.
+ With the fix, if primary-clone is promoted on nodes in siteA,
+ dependent-clone should also be promoted on nodes in siteA.
+ -->
+ <rsc_colocation id="coloc-dependent-with-primary-promoted"
+ rsc="dependent-clone"
+ rsc-role="Promoted"
+ with-rsc="primary-clone"
+ with-rsc-role="Promoted"
+ node-attribute="site"
+ score="INFINITY"/>
+ </constraints>
+ </configuration>
+ <status>
+ <!-- All nodes are online and clean -->
+ <node_state id="node1" uname="node1" in_ccm="true" crmd="online" join="member" expected="member">
+ <transient_attributes id="node1-transient">
+ <instance_attributes id="node1-transient-attrs">
+ <nvpair id="node1-promoted-primary-rsc" name="master-primary-rsc" value="10"/>
+ <nvpair id="node1-promoted-dependent-rsc" name="master-dependent-rsc" value="5"/>
+ </instance_attributes>
+ </transient_attributes>
+ <lrm id="node1-lrm">
+ <lrm_resources>
+ <lrm_resource id="primary-rsc" type="Stateful" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="primary-rsc_last_0" operation_key="primary-rsc_promote_0" operation="promote" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node1" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ <lrm_resource id="dependent-rsc" type="Stateful" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="dependent-rsc_last_0" operation_key="dependent-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node1" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ </lrm_resources>
+ </lrm>
+ </node_state>
+ <node_state id="node2" uname="node2" in_ccm="true" crmd="online" join="member" expected="member">
+ <transient_attributes id="node2-transient">
+ <instance_attributes id="node2-transient-attrs">
+ <nvpair id="node2-promoted-primary-rsc" name="master-primary-rsc" value="5"/>
+ <nvpair id="node2-promoted-dependent-rsc" name="master-dependent-rsc" value="5"/>
+ </instance_attributes>
+ </transient_attributes>
+ <lrm id="node2-lrm">
+ <lrm_resources>
+ <lrm_resource id="primary-rsc" type="Stateful" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="primary-rsc_last_0" operation_key="primary-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node2" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ <lrm_resource id="dependent-rsc" type="Stateful" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="dependent-rsc_last_0" operation_key="dependent-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node2" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ </lrm_resources>
+ </lrm>
+ </node_state>
+ <node_state id="node3" uname="node3" in_ccm="true" crmd="online" join="member" expected="member">
+ <transient_attributes id="node3-transient">
+ <instance_attributes id="node3-transient-attrs">
+ <nvpair id="node3-promoted-primary-rsc" name="master-primary-rsc" value="5"/>
+ <nvpair id="node3-promoted-dependent-rsc" name="master-dependent-rsc" value="10"/>
+ </instance_attributes>
+ </transient_attributes>
+ <lrm id="node3-lrm">
+ <lrm_resources>
+ <lrm_resource id="primary-rsc" type="Stateful" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="primary-rsc_last_0" operation_key="primary-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node3" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ <lrm_resource id="dependent-rsc" type="Stateful" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="dependent-rsc_last_0" operation_key="dependent-rsc_promote_0" operation="promote" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node3" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ </lrm_resources>
+ </lrm>
+ </node_state>
+ <node_state id="node4" uname="node4" in_ccm="true" crmd="online" join="member" expected="member">
+ <transient_attributes id="node4-transient">
+ <instance_attributes id="node4-transient-attrs">
+ <nvpair id="node4-promoted-primary-rsc" name="master-primary-rsc" value="5"/>
+ <nvpair id="node4-promoted-dependent-rsc" name="master-dependent-rsc" value="5"/>
+ </instance_attributes>
+ </transient_attributes>
+ <lrm id="node4-lrm">
+ <lrm_resources>
+ <lrm_resource id="primary-rsc" type="Stateful" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="primary-rsc_last_0" operation_key="primary-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node4" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ <lrm_resource id="dependent-rsc" type="Stateful" class="ocf" provider="pacemaker">
+ <lrm_rsc_op id="dependent-rsc_last_0" operation_key="dependent-rsc_start_0" operation="start" crm-debug-origin="do_update_resource" transition-key="1:0:0:test" on_node="node4" call-id="10" rc-code="0" op-status="0" interval="0" last-rc-change="1609459200" exec-time="100" queue-time="0" op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
+ </lrm_resource>
+ </lrm_resources>
+ </lrm>
+ </node_state>
+ </status>
+</cib>
--
2.47.1
From 31d5785ffc68acb54af76bc55f732117f77ef4b9 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Tue, 7 Oct 2025 05:11:44 +0200
Subject: [PATCH 2/2] Fix: scheduler: colocation of promoted state with
promoted state by node attribute
Previously, attribute-based colocation did not adhere to the attribute when
colocating promoted state with promoted state.
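For orientation, here is a minimal standalone sketch (not Pacemaker code; the
struct and names are illustrative) of the matching rule the diff below threads
through pcmk__instance_matches(): when a colocation names a node attribute,
two nodes count as compatible if they carry the same value for that attribute,
instead of having to be the very same node.

#include <stdio.h>
#include <string.h>

struct node {
    const char *name;
    const char *site;   /* stands in for whatever attribute the colocation names */
};

/* attr == NULL keeps the old behavior: match by node identity.
 * Otherwise compare attribute values (only "site" exists in this toy struct,
 * so attr is effectively just a flag here). */
static int nodes_compatible(const struct node *a, const struct node *b,
                            const char *attr)
{
    if (attr == NULL) {
        return strcmp(a->name, b->name) == 0;
    }
    return (a->site != NULL) && (b->site != NULL)
           && (strcmp(a->site, b->site) == 0);
}

int main(void)
{
    struct node node1 = { "node1", "siteA" };
    struct node node2 = { "node2", "siteA" };
    struct node node3 = { "node3", "siteB" };

    printf("node1/node2 by identity: %d\n", nodes_compatible(&node1, &node2, NULL));   /* 0 */
    printf("node1/node2 by 'site':   %d\n", nodes_compatible(&node1, &node2, "site")); /* 1 */
    printf("node1/node3 by 'site':   %d\n", nodes_compatible(&node1, &node3, "site")); /* 0 */
    return 0;
}

This mirrors the new node_attribute parameter: a NULL value preserves the
existing identity-based matching, which is why the pre-existing callers in the
diff below pass NULL.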
---
lib/pacemaker/libpacemaker_private.h | 5 +-
lib/pacemaker/pcmk_sched_bundle.c | 2 +-
lib/pacemaker/pcmk_sched_clone.c | 2 +-
lib/pacemaker/pcmk_sched_instances.c | 76 ++++++++++++++++++---------
lib/pacemaker/pcmk_sched_probes.c | 2 +-
lib/pacemaker/pcmk_sched_promotable.c | 3 +-
6 files changed, 60 insertions(+), 30 deletions(-)
diff --git a/lib/pacemaker/libpacemaker_private.h b/lib/pacemaker/libpacemaker_private.h
index 58435a6..fadfc8b 100644
--- a/lib/pacemaker/libpacemaker_private.h
+++ b/lib/pacemaker/libpacemaker_private.h
@@ -941,13 +941,14 @@ void pcmk__create_instance_actions(pcmk_resource_t *rsc, GList *instances);
G_GNUC_INTERNAL
bool pcmk__instance_matches(const pcmk_resource_t *instance,
const pcmk_node_t *node, enum rsc_role_e role,
- bool current);
+ bool current, const char *node_attribute);
G_GNUC_INTERNAL
pcmk_resource_t *pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc,
const pcmk_resource_t *rsc,
enum rsc_role_e role,
- bool current);
+ bool current,
+ const char *node_attribute);
G_GNUC_INTERNAL
uint32_t pcmk__instance_update_ordered_actions(pcmk_action_t *first,
diff --git a/lib/pacemaker/pcmk_sched_bundle.c b/lib/pacemaker/pcmk_sched_bundle.c
index 14e7be5..2d7e879 100644
--- a/lib/pacemaker/pcmk_sched_bundle.c
+++ b/lib/pacemaker/pcmk_sched_bundle.c
@@ -383,7 +383,7 @@ match_replica_container(const pcmk__bundle_replica_t *replica, void *user_data)
struct match_data *match_data = user_data;
if (pcmk__instance_matches(replica->container, match_data->node,
- pcmk_role_unknown, false)) {
+ pcmk_role_unknown, false, NULL)) {
match_data->container = replica->container;
return false; // Match found, don't bother searching further replicas
}
diff --git a/lib/pacemaker/pcmk_sched_clone.c b/lib/pacemaker/pcmk_sched_clone.c
index 4f86621..99fa8b2 100644
--- a/lib/pacemaker/pcmk_sched_clone.c
+++ b/lib/pacemaker/pcmk_sched_clone.c
@@ -301,7 +301,7 @@ pcmk__clone_apply_coloc_score(pcmk_resource_t *dependent,
primary_instance = pcmk__find_compatible_instance(dependent, primary,
pcmk_role_unknown,
- false);
+ false, NULL);
if (primary_instance != NULL) {
pcmk__rsc_debug(primary, "Interleaving %s with %s",
dependent->id, primary_instance->id);
diff --git a/lib/pacemaker/pcmk_sched_instances.c b/lib/pacemaker/pcmk_sched_instances.c
index f2bc1a4..5344234 100644
--- a/lib/pacemaker/pcmk_sched_instances.c
+++ b/lib/pacemaker/pcmk_sched_instances.c
@@ -1073,18 +1073,22 @@ free_instance_list(const pcmk_resource_t *rsc, GList *list)
* \internal
* \brief Check whether an instance is compatible with a role and node
*
- * \param[in] instance Clone instance or bundle replica container
- * \param[in] node Instance must match this node
- * \param[in] role If not pcmk_role_unknown, instance must match this role
- * \param[in] current If true, compare instance's original node and role,
- * otherwise compare assigned next node and role
+ * \param[in] instance Clone instance or bundle replica container
+ * \param[in] node Instance must match this node
+ * \param[in] role If not pcmk_role_unknown, instance must match this role
+ * \param[in] current If true, compare instance's original node and role,
+ * otherwise compare assigned next node and role
+ * \param[in] node_attribute If not NULL, instance's node must have the same value
+ * for this attribute as \p node (instead of requiring
+ * the exact same node)
*
* \return true if \p instance is compatible with \p node and \p role,
* otherwise false
*/
bool
pcmk__instance_matches(const pcmk_resource_t *instance, const pcmk_node_t *node,
- enum rsc_role_e role, bool current)
+ enum rsc_role_e role, bool current,
+ const char *node_attribute)
{
pcmk_node_t *instance_node = NULL;
@@ -1117,7 +1121,25 @@ pcmk__instance_matches(const pcmk_resource_t *instance, const pcmk_node_t *node,
return false;
}
- if (!pcmk__same_node(instance_node, node)) {
+ if (node_attribute != NULL) {
+ // Compare by node attribute value instead of node identity
+ const char *instance_value = pcmk__colocation_node_attr(instance_node,
+ node_attribute,
+ instance);
+ const char *target_value = pcmk__colocation_node_attr(node,
+ node_attribute,
+ instance);
+
+ if (!pcmk__str_eq(instance_value, target_value, pcmk__str_casei)) {
+ pcmk__rsc_trace(instance,
+ "%s is not a compatible instance "
+ "(instance has %s=%s, target node has %s=%s)",
+ instance->id, node_attribute,
+ pcmk__s(instance_value, "<none>"),
+ node_attribute, pcmk__s(target_value, "<none>"));
+ return false;
+ }
+ } else if (!pcmk__same_node(instance_node, node)) {
pcmk__rsc_trace(instance,
"%s is not a compatible instance "
"(assigned to %s not %s)",
@@ -1136,12 +1158,14 @@ pcmk__instance_matches(const pcmk_resource_t *instance, const pcmk_node_t *node,
* \internal
* \brief Find an instance that matches a given resource by node and role
*
- * \param[in] match_rsc Resource that instance must match (for logging only)
- * \param[in] rsc Clone or bundle resource to check for matching instance
- * \param[in] node Instance must match this node
- * \param[in] role If not pcmk_role_unknown, instance must match this role
- * \param[in] current If true, compare instance's original node and role,
- * otherwise compare assigned next node and role
+ * \param[in] match_rsc Resource that instance must match (for logging only)
+ * \param[in] rsc Clone or bundle resource to check for matching instance
+ * \param[in] node Instance must match this node
+ * \param[in] role If not pcmk_role_unknown, instance must match this role
+ * \param[in] current If true, compare instance's original node and role,
+ * otherwise compare assigned next node and role
+ * \param[in] node_attribute If not NULL, match instances by this node attribute
+ * instead of by node identity
*
* \return \p rsc instance matching \p node and \p role if any, otherwise NULL
*/
@@ -1149,7 +1173,7 @@ static pcmk_resource_t *
find_compatible_instance_on_node(const pcmk_resource_t *match_rsc,
const pcmk_resource_t *rsc,
const pcmk_node_t *node, enum rsc_role_e role,
- bool current)
+ bool current, const char *node_attribute)
{
GList *instances = NULL;
@@ -1157,7 +1181,8 @@ find_compatible_instance_on_node(const pcmk_resource_t *match_rsc,
for (GList *iter = instances; iter != NULL; iter = iter->next) {
pcmk_resource_t *instance = (pcmk_resource_t *) iter->data;
- if (pcmk__instance_matches(instance, node, role, current)) {
+ if (pcmk__instance_matches(instance, node, role, current,
+ node_attribute)) {
pcmk__rsc_trace(match_rsc,
"Found %s %s instance %s compatible with %s on %s",
display_role(role), rsc->id, instance->id,
@@ -1179,11 +1204,13 @@ find_compatible_instance_on_node(const pcmk_resource_t *match_rsc,
* \internal
* \brief Find a clone instance or bundle container compatible with a resource
*
- * \param[in] match_rsc Resource that instance must match
- * \param[in] rsc Clone or bundle resource to check for matching instance
- * \param[in] role If not pcmk_role_unknown, instance must match this role
- * \param[in] current If true, compare instance's original node and role,
- * otherwise compare assigned next node and role
+ * \param[in] match_rsc Resource that instance must match
+ * \param[in] rsc Clone or bundle resource to check for matching instance
+ * \param[in] role If not pcmk_role_unknown, instance must match this role
+ * \param[in] current If true, compare instance's original node and role,
+ * otherwise compare assigned next node and role
+ * \param[in] node_attribute If not NULL, match instances by this node attribute
+ * instead of by node identity
*
* \return Compatible (by \p role and \p match_rsc location) instance of \p rsc
* if any, otherwise NULL
@@ -1191,7 +1218,7 @@ find_compatible_instance_on_node(const pcmk_resource_t *match_rsc,
pcmk_resource_t *
pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc,
const pcmk_resource_t *rsc, enum rsc_role_e role,
- bool current)
+ bool current, const char *node_attribute)
{
pcmk_resource_t *instance = NULL;
GList *nodes = NULL;
@@ -1207,7 +1234,7 @@ pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc,
node = match_rsc->priv->fns->location(match_rsc, NULL, target);
if (node != NULL) {
return find_compatible_instance_on_node(match_rsc, rsc, node, role,
- current);
+ current, node_attribute);
}
// Otherwise check for an instance matching any of match_rsc's allowed nodes
@@ -1216,7 +1243,8 @@ pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc,
iter = iter->next) {
instance = find_compatible_instance_on_node(match_rsc, rsc,
(pcmk_node_t *) iter->data,
- role, current);
+ role, current,
+ node_attribute);
}
if (instance == NULL) {
@@ -1423,7 +1451,7 @@ update_interleaved_actions(pcmk_action_t *first, pcmk_action_t *then,
first_instance = pcmk__find_compatible_instance(then_instance,
first->rsc,
pcmk_role_unknown,
- current);
+ current, NULL);
if (first_instance == NULL) { // No instance can be interleaved
if (unassign_if_mandatory(first, then, then_instance, type,
diff --git a/lib/pacemaker/pcmk_sched_probes.c b/lib/pacemaker/pcmk_sched_probes.c
index bda90ce..a7d2364 100644
--- a/lib/pacemaker/pcmk_sched_probes.c
+++ b/lib/pacemaker/pcmk_sched_probes.c
@@ -614,7 +614,7 @@ add_restart_orderings_for_probe(pcmk_action_t *probe, pcmk_action_t *after)
compatible_rsc = pcmk__find_compatible_instance(probe->rsc,
after->rsc,
pcmk_role_unknown,
- false);
+ false, NULL);
}
}
diff --git a/lib/pacemaker/pcmk_sched_promotable.c b/lib/pacemaker/pcmk_sched_promotable.c
index cdf276f..0da0d8b 100644
--- a/lib/pacemaker/pcmk_sched_promotable.c
+++ b/lib/pacemaker/pcmk_sched_promotable.c
@@ -1333,7 +1333,8 @@ pcmk__update_promotable_dependent_priority(const pcmk_resource_t *primary,
// Look for a primary instance where dependent will be
primary_instance = pcmk__find_compatible_instance(dependent, primary,
colocation->primary_role,
- false);
+ false,
+ colocation->node_attribute);
if (primary_instance != NULL) {
// Add primary instance's priority to dependent's
--
2.47.1

From ce1dc488d46b373292569b397c9c765b55654eea Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Fri, 5 Sep 2025 20:35:31 -0700
Subject: [PATCH] Fix: tools: Handle large timeouts correctly in crm_resource
--wait
Previously, if the --timeout value parsed to a value greater than
(UINT_MAX - 999), the wait timeout would overflow. The effective timeout
would be either 0 seconds or 1 second. This is because 999 was added to
the guint value before passing it to pcmk__timeout_ms2s().
Now, we simply pass the timeout in milliseconds to
pcmk__timeout_ms2s(), without adding 999.
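To make the wrap-around concrete, here is a tiny standalone illustration
(assuming only that guint is GLib's unsigned int type; nothing else about
Pacemaker internals):

#include <glib.h>   /* guint, G_MAXUINT */
#include <stdio.h>

int main(void)
{
    guint timeout_ms = G_MAXUINT - 500;  /* a --timeout above UINT_MAX - 999 */

    guint old_arg = timeout_ms + 999;    /* unsigned wrap-around */
    guint new_arg = timeout_ms;          /* passed through unchanged */

    printf("old argument: %u ms -> roughly %u s\n", old_arg, old_arg / 1000);
    printf("new argument: %u ms -> roughly %u s\n", new_arg, new_arg / 1000);
    return 0;
}

With a 32-bit guint the old argument collapses to 498 ms, so the effective
wait timeout ends up near zero, matching the failure described above.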
This implies a slight behavior change. Previously, timeouts were always
rounded up to the next greatest second. Now, they're rounded to the
nearest second. For example, previously:
* timeout values between 1ms and 500ms => wait timeout of 1 second
* timeout values between 501ms and 1500ms => wait timeout of 2 seconds
* timeout values between 1501ms and 2500ms => wait timeout of 3 seconds
* and so on
Now:
* timeout values between 1ms and 1499ms => wait timeout of 1 second
* timeout values between 1500ms and 2499ms => wait timeout of 2 seconds
* timeout values between 2500ms and 3499ms => wait timeout of 3 seconds
* and so on
The previous rounding behavior has existed since crm_resource --wait was
added by 424afcdf.
Update the help text to note the granularity and rounding behavior. The
exact behavior of the restart command is confusing, and its logic should
be cleaned up in the future.
Fixes RHEL-45869
Fixes RHEL-86148
Closes T841
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
tools/crm_resource.c | 4 +++-
tools/crm_resource_runtime.c | 2 +-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/tools/crm_resource.c b/tools/crm_resource.c
index 162ae40..74f84f0 100644
--- a/tools/crm_resource.c
+++ b/tools/crm_resource.c
@@ -831,7 +831,9 @@ static GOptionEntry addl_entries[] = {
"ID" },
{ "timeout", 'T', G_OPTION_FLAG_NONE, G_OPTION_ARG_CALLBACK, timeout_cb,
"(Advanced) Abort if command does not finish in this time (with\n"
- INDENT "--restart, --wait, --force-*)",
+ INDENT "--restart, --wait, --force-*). The --restart command uses a\n"
+ INDENT "two-second granularity and the --wait command uses a one-second\n"
+ INDENT "granularity, with rounding.",
"N" },
{ "all", 0, G_OPTION_FLAG_NONE, G_OPTION_ARG_NONE, &options.all,
"List all options, including advanced and deprecated (with\n"
diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
index f0a84c0..a44794e 100644
--- a/tools/crm_resource_runtime.c
+++ b/tools/crm_resource_runtime.c
@@ -2108,7 +2108,7 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
if (timeout_ms == 0) {
expire_time += WAIT_DEFAULT_TIMEOUT_S;
} else {
- expire_time += pcmk__timeout_ms2s(timeout_ms + 999);
+ expire_time += pcmk__timeout_ms2s(timeout_ms);
}
scheduler = pcmk_new_scheduler();
--
2.47.1

005-ipc_evict.patch
From 79f5a67e8242b3e72aa9dcf0dbd286b3fb719baa Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 10:41:13 -0400
Subject: [PATCH 1/6] Refactor: libcrmcommon: Rearrange the queue_len check.
Check if the queue length is 0 first and return, which allows everything
else to be un-indented one level.
---
lib/common/ipc_server.c | 47 ++++++++++++++++++++---------------------
1 file changed, 23 insertions(+), 24 deletions(-)
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index 25c788b..4b33c64 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -541,34 +541,33 @@ no_more_retries:
sent, queue_len, c->ipcs, c->pid, pcmk_rc_str(rc), qb_rc);
}
- if (queue_len) {
-
- /* Allow clients to briefly fall behind on processing incoming messages,
- * but drop completely unresponsive clients so the connection doesn't
- * consume resources indefinitely.
- */
- if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
- if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
- /* Don't evict for a new or shrinking backlog */
- crm_warn("Client with process ID %u has a backlog of %u messages "
- QB_XS " %p", c->pid, queue_len, c->ipcs);
- } else {
- crm_err("Evicting client with process ID %u due to backlog of %u messages "
- QB_XS " %p", c->pid, queue_len, c->ipcs);
- c->queue_backlog = 0;
- qb_ipcs_disconnect(c->ipcs);
- return rc;
- }
- }
-
- c->queue_backlog = queue_len;
- delay_next_flush(c, queue_len);
-
- } else {
+ if (queue_len == 0) {
/* Event queue is empty, there is no backlog */
c->queue_backlog = 0;
+ return rc;
}
+ /* Allow clients to briefly fall behind on processing incoming messages,
+ * but drop completely unresponsive clients so the connection doesn't
+ * consume resources indefinitely.
+ */
+ if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
+ if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
+ /* Don't evict for a new or shrinking backlog */
+ crm_warn("Client with process ID %u has a backlog of %u messages "
+ QB_XS " %p", c->pid, queue_len, c->ipcs);
+ } else {
+ crm_err("Evicting client with process ID %u due to backlog of %u messages "
+ QB_XS " %p", c->pid, queue_len, c->ipcs);
+ c->queue_backlog = 0;
+ qb_ipcs_disconnect(c->ipcs);
+ return rc;
+ }
+ }
+
+ c->queue_backlog = queue_len;
+ delay_next_flush(c, queue_len);
+
return rc;
}
--
2.47.1
From 014699003c6506bba8638ed57efea49da403d0e1 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 11:31:37 -0400
Subject: [PATCH 2/6] Refactor: libcrmcommon: Simplify an empty event queue
check.
I find this just a little bit more straightforward to follow.
---
lib/common/ipc_server.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index 4b33c64..dbd885a 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -491,14 +491,13 @@ crm_ipcs_flush_events(pcmk__client_t *c)
pcmk__ipc_header_t *header = NULL;
struct iovec *event = NULL;
- if (c->event_queue) {
- // We don't pop unless send is successful
- event = g_queue_peek_head(c->event_queue);
- }
- if (event == NULL) { // Queue is empty
+ if ((c->event_queue == NULL) || g_queue_is_empty(c->event_queue)) {
break;
}
+ // We don't pop unless send is successful
+ event = g_queue_peek_head(c->event_queue);
+
/* Retry sending the event up to five times. If we get -EAGAIN, sleep
* a very short amount of time (too long here is bad) and try again.
* If we simply exit the while loop on -EAGAIN, we'll have to wait until
--
2.47.1
From f999ac3d86d8107dee5288497f5f7fff07956d18 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 11:35:38 -0400
Subject: [PATCH 3/6] Refactor: libcrmcommon: Rearrange a few tests in
crm_ipcs_flush_events.
Again, no important code changes here. I just find these a little
easier to follow.
---
lib/common/ipc_server.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index dbd885a..b76847b 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -477,16 +477,18 @@ crm_ipcs_flush_events(pcmk__client_t *c)
if (c == NULL) {
return rc;
+ }
- } else if (c->event_timer) {
+ if (c->event_timer != 0) {
/* There is already a timer, wait until it goes off */
crm_trace("Timer active for %p - %d", c->ipcs, c->event_timer);
return rc;
}
- if (c->event_queue) {
+ if (c->event_queue != NULL) {
queue_len = g_queue_get_length(c->event_queue);
}
+
while (sent < 100) {
pcmk__ipc_header_t *header = NULL;
struct iovec *event = NULL;
--
2.47.1
From 9e76007bb0bc1d4cb5a88dcfaaf96aa8853f42dc Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 11:48:48 -0400
Subject: [PATCH 4/6] Refactor: libcrmcommon: Unindent retry code in
crm_ipcs_flush_events.
If we're breaking or jumping to a label, there's no need to have all
these nested else blocks.
---
lib/common/ipc_server.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index b76847b..73cc58f 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -513,16 +513,16 @@ crm_ipcs_flush_events(pcmk__client_t *c)
for (unsigned int retries = 5; retries > 0; retries--) {
qb_rc = qb_ipcs_event_sendv(c->ipcs, event, 2);
- if (qb_rc < 0) {
- if (retries == 1 || qb_rc != -EAGAIN) {
- rc = (int) -qb_rc;
- goto no_more_retries;
- } else {
- pcmk__sleep_ms(5);
- }
- } else {
+ if (qb_rc >= 0) {
break;
}
+
+ if (retries == 1 || qb_rc != -EAGAIN) {
+ rc = (int) -qb_rc;
+ goto no_more_retries;
+ }
+
+ pcmk__sleep_ms(5);
}
event = g_queue_pop_head(c->event_queue);
--
2.47.1
From b73be21a454f795bc747aad1dbeea82f67d8b232 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 13:14:54 -0400
Subject: [PATCH 5/6] Feature: libcrmcommon: Be more lenient in evicting IPC
clients.
Each IPC connection has a message queue. If the client is unable to
process messages faster than the server is sending them, that queue
starts to back up. Pacemaker enforces a cap on the queue size, and
that's adjustable with the cluster-ipc-limit parameter. Once the queue
grows beyond that size, the client is assumed to be dead and is evicted
so it can be restarted and the queue resources freed.
However, it's possible that the client is not dead. On clusters with
very large numbers of resources (I've tried with 300, but fewer might
also cause problems), certain actions can happen that cause a spike in
IPC messages. In RHEL-76276, the action that causes this is moving
nodes in and out of standby. This spike in messages causes the server
to overwhelm the client, which is then evicted.
My multi-part IPC patches made this even worse, as now if the CIB is so
large that it needs to split an IPC message up, there will be more
messages than before.
What this fix does is get rid of the cap on the queue size for pacemaker
daemons. As long as the server has been able to send messages to the
client, the client is still doing work and shouldn't be evicted. It may
just be processing messages slower than the server is sending them.
Note that this could lead the queue to grow without bound, eventually
crashing the server. For this reason, we're only allowing pacemaker
daemons to ignore the queue size limit.
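Condensed into a standalone predicate (simplified; the real change is in the
diff below, and is_daemon stands in for the pcmk__parse_server(c->name) check),
the new eviction decision looks roughly like this:

#include <stdbool.h>
#include <stdio.h>

static bool should_evict(unsigned int queue_len, unsigned int queue_max,
                         unsigned int prev_backlog, unsigned int sent_this_flush,
                         bool is_daemon)
{
    if (queue_len <= queue_max) {
        return false;                   /* backlog still within the limit */
    }
    if (prev_backlog <= 1) {
        return false;                   /* new backlog: give the client a chance */
    }
    if (queue_len < prev_backlog) {
        return false;                   /* shrinking: client is catching up */
    }
    if (is_daemon && (sent_this_flush > 0)) {
        return false;                   /* daemon still accepting messages */
    }
    return true;                        /* treat the client as unresponsive */
}

int main(void)
{
    /* daemon with a growing backlog but messages still flowing: kept */
    printf("%d\n", should_evict(700, 500, 650, 3, true));
    /* non-daemon client with a growing backlog and nothing sent: evicted */
    printf("%d\n", should_evict(700, 500, 650, 0, false));
    return 0;
}

Only the last condition is new; the first three reproduce the existing
"new or shrinking backlog" leniency.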
Potential problems with this approach:
* If the client is so busy that it can't receive even a single message
that crm_ipcs_flush_events tries to send, it will still be evicted.
However, the flush operation does retry with a delay several times
giving the client time to finish up what it's doing.
* We have timers all over the place with daemons waiting on replies.
It's possible that because we are no longer just evicting the clients,
we will now see those timers expire which will just lead to different
problems. If so, these fixes would probably need to take place in the
client code.
Fixes T38
---
lib/common/ipc_server.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c
index 73cc58f..4420070 100644
--- a/lib/common/ipc_server.c
+++ b/lib/common/ipc_server.c
@@ -553,10 +553,20 @@ no_more_retries:
* consume resources indefinitely.
*/
if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) {
- if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) {
- /* Don't evict for a new or shrinking backlog */
+ /* Don't evict:
+ * - Clients with a new backlog.
+ * - Clients with a shrinking backlog (the client is processing
+ * messages faster than the server is sending them).
+ * - Clients that are pacemaker daemons and have had any messages sent
+ * to them in this flush call (the server is sending messages faster
+ * than the client is processing them, but the client is not dead).
+ */
+ if ((c->queue_backlog <= 1)
+ || (queue_len < c->queue_backlog)
+ || ((sent > 0) && (pcmk__parse_server(c->name) != pcmk_ipc_unknown))) {
crm_warn("Client with process ID %u has a backlog of %u messages "
QB_XS " %p", c->pid, queue_len, c->ipcs);
+
} else {
crm_err("Evicting client with process ID %u due to backlog of %u messages "
QB_XS " %p", c->pid, queue_len, c->ipcs);
--
2.47.1
From 4682953c567e16409d8e7972d9d5891348d4c360 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Wed, 27 Aug 2025 15:56:27 -0400
Subject: [PATCH 6/6] Feature: libcrmcommon: Update documentation for
cluster-ipc-limit.
Clarify that this no longer applies to pacemaker daemons.
---
cts/cli/regression.crm_attribute.exp | 16 ++++++++--------
cts/cli/regression.daemons.exp | 4 ++--
.../Pacemaker_Explained/cluster-options.rst | 12 +++++++-----
lib/common/options.c | 6 +++---
4 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/cts/cli/regression.crm_attribute.exp b/cts/cli/regression.crm_attribute.exp
index e161f49..36cba76 100644
--- a/cts/cli/regression.crm_attribute.exp
+++ b/cts/cli/regression.crm_attribute.exp
@@ -111,8 +111,8 @@ Also known as properties, these are options that affect behavior across the enti
* migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit)
* Possible values: integer (default: )
- * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon
- * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
+ * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client
+ * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
* Possible values: nonnegative_integer (default: )
* stop-all-resources: Whether the cluster should stop all active resources
@@ -357,8 +357,8 @@ Also known as properties, these are options that affect behavior across the enti
<content type="integer" default=""/>
</parameter>
<parameter name="cluster-ipc-limit" advanced="0" generated="0">
- <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
- <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
+ <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
+ <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
<content type="nonnegative_integer" default=""/>
</parameter>
<parameter name="stop-all-resources" advanced="0" generated="0">
@@ -537,8 +537,8 @@ Also known as properties, these are options that affect behavior across the enti
* migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit)
* Possible values: integer (default: )
- * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon
- * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
+ * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client
+ * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
* Possible values: nonnegative_integer (default: )
* stop-all-resources: Whether the cluster should stop all active resources
@@ -824,8 +824,8 @@ Also known as properties, these are options that affect behavior across the enti
<content type="integer" default=""/>
</parameter>
<parameter name="cluster-ipc-limit" advanced="0" generated="0">
- <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
- <shortdesc lang="en">Maximum IPC message backlog before disconnecting a cluster daemon</shortdesc>
+ <longdesc lang="en">Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).</longdesc>
+ <shortdesc lang="en">Maximum IPC message backlog before disconnecting a client</shortdesc>
<content type="nonnegative_integer" default=""/>
</parameter>
<parameter name="stop-all-resources" advanced="0" generated="0">
diff --git a/cts/cli/regression.daemons.exp b/cts/cli/regression.daemons.exp
index fc8535a..6274eeb 100644
--- a/cts/cli/regression.daemons.exp
+++ b/cts/cli/regression.daemons.exp
@@ -21,10 +21,10 @@
</parameter>
<parameter name="cluster-ipc-limit">
<longdesc lang="en">
- Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
+ Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes).
</longdesc>
<shortdesc lang="en">
- Maximum IPC message backlog before disconnecting a cluster daemon
+ Maximum IPC message backlog before disconnecting a client
</shortdesc>
<content type="integer" default=""/>
</parameter>
diff --git a/doc/sphinx/Pacemaker_Explained/cluster-options.rst b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
index 6ebe5f3..22e1a50 100644
--- a/doc/sphinx/Pacemaker_Explained/cluster-options.rst
+++ b/doc/sphinx/Pacemaker_Explained/cluster-options.rst
@@ -693,11 +693,13 @@ values, by running the ``man pacemaker-schedulerd`` and
cluster-ipc-limit
- :ref:`nonnegative integer <nonnegative_integer>`
- 500
- - The maximum IPC message backlog before one cluster daemon will
- disconnect another. This is of use in large clusters, for which a good
- value is the number of resources in the cluster multiplied by the number
- of nodes. The default of 500 is also the minimum. Raise this if you see
- "Evicting client" log messages for cluster daemon process IDs.
+ - The maximum IPC message backlog before a cluster daemon will disconnect
+ a client. Other cluster daemons are not subject to this limit as long as
+ they are still processing messages. This is of use in large clusters,
+ for which a good value is the number of resources in the cluster
+ multiplied by the number of nodes. The default of 500 is also the
+ minimum. Raise this if you see "Evicting client" log messages for
+ cluster process IDs.
* - .. _pe_error_series_max:
.. index::
diff --git a/lib/common/options.c b/lib/common/options.c
index b8f4943..af1b073 100644
--- a/lib/common/options.c
+++ b/lib/common/options.c
@@ -432,10 +432,10 @@ static const pcmk__cluster_option_t cluster_options[] = {
PCMK_OPT_CLUSTER_IPC_LIMIT, NULL, PCMK_VALUE_NONNEGATIVE_INTEGER, NULL,
"500", pcmk__valid_positive_int,
pcmk__opt_based,
- N_("Maximum IPC message backlog before disconnecting a cluster daemon"),
+ N_("Maximum IPC message backlog before disconnecting a client"),
N_("Raise this if log has \"Evicting client\" messages for cluster "
- "daemon PIDs (a good value is the number of resources in the "
- "cluster multiplied by the number of nodes)."),
+ "PIDs (a good value is the number of resources in the cluster "
+ "multiplied by the number of nodes)."),
},
// Orphans and stopping
--
2.47.1

006-fewer_messages.patch

@@ -0,0 +1,88 @@
From 8ddaf5330cf7605c7b710061c72dba8112db6cc6 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Fri, 31 Oct 2025 11:24:14 -0400
Subject: [PATCH] Med: daemons: Don't add repeated I_PE_CALC messages to the
fsa queue.
Let's say you have a two-node cluster, node1 and node2. For testing
purposes, it's easiest to use fence_dummy instead of a real fencing
agent, as it fakes the fencing without actually rebooting the node, so
you can still see all the log files.
Assume the DC is node1. Now do the following on node2:
- pcs node standby node1
- pcs resource defaults update resource-stickiness=1
- for i in $(seq 1 300); do echo $i; pcs resource create dummy$i ocf:heartbeat:Dummy --group dummy-group; done
- pcs node unstandby node1
It will take a long time to create that many resources. After node1
comes out of standby, it'll take a minute or two, but eventually you'll
see that node1 was fenced. On node1, you'll see a lot of transition
abort messages. Each of these transition aborts causes an I_PE_CALC
message to be generated and added to the fsa queue. In my testing, I've
seen the queue grow to ~600 messages, all of which are exactly the same.
The FSA is triggered at G_PRIORITY_HIGH, and once it is triggered, it
will run until its queue is empty. With so many messages being added so
quickly, we've basically ensured it won't be empty any time soon. While
controld is processing the FSA messages, it will be unable to read
anything out of the IPC backlog.
based keeps trying to send IPC events to controld but can't, so the
backlog continues to grow. Eventually, the backlog reaches the
500-message threshold without controld having read anything, which
triggers the eviction process.
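To illustrate the starvation effect, here is a toy GLib program (not
Pacemaker code; the function names are made up): a source that stays
ready at G_PRIORITY_HIGH is dispatched on every main-loop iteration, so
the default-priority source standing in for the IPC reader never runs
until the high-priority work is gone.

    /* Toy demonstration only -- not Pacemaker code. */
    #include <glib.h>

    static GMainLoop *loop = NULL;
    static int remaining = 600;     /* pretend 600 queued I_PE_CALC inputs */

    static gboolean
    drain_fsa_queue(gpointer data)
    {
        remaining--;
        /* Stay scheduled until the pretend FSA queue is empty */
        return (remaining > 0)? G_SOURCE_CONTINUE : G_SOURCE_REMOVE;
    }

    static gboolean
    read_ipc_backlog(gpointer data)
    {
        /* Only reached once the high-priority source has removed itself */
        g_print("IPC backlog finally read after %d FSA iterations\n",
                600 - remaining);
        g_main_loop_quit(loop);
        return G_SOURCE_REMOVE;
    }

    int
    main(void)
    {
        loop = g_main_loop_new(NULL, FALSE);
        g_idle_add_full(G_PRIORITY_HIGH, drain_fsa_queue, NULL, NULL);
        g_idle_add_full(G_PRIORITY_DEFAULT, read_ipc_backlog, NULL, NULL);
        g_main_loop_run(loop);
        g_main_loop_free(loop);
        return 0;
    }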
There doesn't seem to be any reason for all these I_PE_CALC messages to
be generated. They're all exactly the same, they don't appear to be
tagged with any unique data tying them to a specific query, and their
presence just slows everything down.
Thus, the fix here is very simple: if the latest message in the queue is
an I_PE_CALC message, just don't add another one. We could also make
sure there's only ever one I_PE_CALC message in the queue, but there
could potentially be valid reasons for there to be multiple interleaved
with other message types. I am erring on the side of caution with this
minimal fix.
Related: RHEL-76276
---
daemons/controld/controld_messages.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c
index df215e6..866fde3 100644
--- a/daemons/controld/controld_messages.c
+++ b/daemons/controld/controld_messages.c
@@ -73,6 +73,26 @@ register_fsa_input_adv(enum crmd_fsa_cause cause, enum crmd_fsa_input input,
return;
}
+ if (input == I_PE_CALC) {
+ GList *ele = NULL;
+
+ if (prepend) {
+ ele = g_list_first(controld_globals.fsa_message_queue);
+ } else {
+ ele = g_list_last(controld_globals.fsa_message_queue);
+ }
+
+ if (ele != NULL) {
+ fsa_data_t *message = (fsa_data_t *) ele->data;
+
+ if (message->fsa_input == I_PE_CALC) {
+ crm_debug("%s item in fsa queue is I_PE_CALC, not adding another",
+ (prepend ? "First" : "Last"));
+ return;
+ }
+ }
+ }
+
if (input == I_WAIT_FOR_EVENT) {
controld_set_global_flags(controld_fsa_is_stalled);
crm_debug("Stalling the FSA pending further input: source=%s cause=%s data=%p queue=%d",
--
2.47.1


@@ -41,7 +41,7 @@
## can be incremented to build packages reliably considered "newer"
## than previously built packages with the same pcmkversion)
%global pcmkversion 3.0.1
%global specversion 3
%global specversion 4
## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
%global commit 9a5e54bae85847c4bb6ed7c7fb06103ebebbc64a
@@ -201,6 +201,10 @@ Source1: pacemaker.sysusers
# upstream commits
Patch001: 001-econnrefused.patch
Patch002: 002-corosync.patch
Patch003: 003-promotable-follows.patch
Patch004: 004-crm_resource_wait.patch
Patch005: 005-ipc_evict.patch
Patch006: 006-fewer_messages.patch
Requires: resource-agents
Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
@@ -792,6 +796,15 @@ exit 0
%{_datadir}/pkgconfig/pacemaker-schemas.pc
%changelog
* Thu Nov 13 2025 Chris Lumens <clumens@redhat.com> - 3.0.1-4
- Fix promoting instances of a cloned resource
- Handle large timeouts correctly in crm_resource --wait
- Don't evict IPC clients as long as they're still processing messages
- Don't overwhelm the FSA queue with repeated CIB queries
- Resolves: RHEL-120932
- Resolves: RHEL-86148
- Resolves: RHEL-114895
* Wed Aug 13 2025 Reid Wahl <nwahl@redhat.com> - 3.0.1-3
- CTS launches Corosync using systemd if available.
- Resolves: RHEL-110075