diff --git a/003-promotable-follows.patch b/003-promotable-follows.patch new file mode 100644 index 0000000..7b413d7 --- /dev/null +++ b/003-promotable-follows.patch @@ -0,0 +1,801 @@ +From 6e5d574de9ad3a131cc0c51f2c5300e2cf4e7db3 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Tue, 7 Oct 2025 05:07:04 +0200 +Subject: [PATCH 1/2] Test: scheduler: promoted state with promoted state with + attribute + +Add a test case. Before the accompanying fix, attribute-based colocation did not +respect the node attribute when colocating promoted state with promoted state. +--- + cts/cts-scheduler.in | 1 + + ...motable-colocation-with-node-attribute.dot | 28 +++ + ...motable-colocation-with-node-attribute.exp | 175 ++++++++++++++++++ + ...able-colocation-with-node-attribute.scores | 81 ++++++++ + ...ble-colocation-with-node-attribute.summary | 45 +++++ + ...motable-colocation-with-node-attribute.xml | 155 ++++++++++++++++ + 6 files changed, 485 insertions(+) + create mode 100644 cts/scheduler/dot/promotable-colocation-with-node-attribute.dot + create mode 100644 cts/scheduler/exp/promotable-colocation-with-node-attribute.exp + create mode 100644 cts/scheduler/scores/promotable-colocation-with-node-attribute.scores + create mode 100644 cts/scheduler/summary/promotable-colocation-with-node-attribute.summary + create mode 100644 cts/scheduler/xml/promotable-colocation-with-node-attribute.xml + +diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in +index f5d4ed5..be8af87 100644 +--- a/cts/cts-scheduler.in ++++ b/cts/cts-scheduler.in +@@ -633,6 +633,7 @@ TESTS = [ + SchedulerTest("no_quorum_demote", "Promotable demotion and primitive stop with no-quorum-policy=\"demote\""), + SchedulerTest("no-promote-on-unrunnable-guest", "Don't select bundle instance for promotion when container can't run"), + SchedulerTest("leftover-pending-monitor", "Prevent a leftover pending monitor from causing unexpected stop of other instances"), ++ SchedulerTest("promotable-colocation-with-node-attribute", "Promote dependent clone on nodes belonging to a site that has a primary clone promoted"), + ]), + SchedulerTestGroup([ + SchedulerTest("history-1", "Correctly parse stateful-1 resource state"), +diff --git a/cts/scheduler/dot/promotable-colocation-with-node-attribute.dot b/cts/scheduler/dot/promotable-colocation-with-node-attribute.dot +new file mode 100644 +index 0000000..89d066f +--- /dev/null ++++ b/cts/scheduler/dot/promotable-colocation-with-node-attribute.dot +@@ -0,0 +1,28 @@ ++ digraph "g" { ++"dependent-clone_demote_0" -> "dependent-clone_demoted_0" [ style = bold] ++"dependent-clone_demote_0" -> "dependent-rsc_demote_0 node3" [ style = bold] ++"dependent-clone_demote_0" [ style=bold color="green" fontcolor="orange"] ++"dependent-clone_demoted_0" -> "dependent-clone_promote_0" [ style = bold] ++"dependent-clone_demoted_0" [ style=bold color="green" fontcolor="orange"] ++"dependent-clone_promote_0" -> "dependent-rsc_promote_0 node1" [ style = bold] ++"dependent-clone_promote_0" -> "dependent-rsc_promote_0 node2" [ style = bold] ++"dependent-clone_promote_0" [ style=bold color="green" fontcolor="orange"] ++"dependent-clone_promoted_0" [ style=bold color="green" fontcolor="orange"] ++"dependent-rsc_demote_0 node3" -> "dependent-clone_demoted_0" [ style = bold] ++"dependent-rsc_demote_0 node3" -> "dependent-rsc_monitor_11000 node3" [ style = bold] ++"dependent-rsc_demote_0 node3" [ style=bold color="green" fontcolor="black"] ++"dependent-rsc_monitor_10000 node1" [ style=bold color="green" fontcolor="black"] ++"dependent-rsc_monitor_10000 node2" [ 
style=bold color="green" fontcolor="black"] ++"dependent-rsc_monitor_11000 node3" [ style=bold color="green" fontcolor="black"] ++"dependent-rsc_monitor_11000 node4" [ style=bold color="green" fontcolor="black"] ++"dependent-rsc_promote_0 node1" -> "dependent-clone_promoted_0" [ style = bold] ++"dependent-rsc_promote_0 node1" -> "dependent-rsc_monitor_10000 node1" [ style = bold] ++"dependent-rsc_promote_0 node1" [ style=bold color="green" fontcolor="black"] ++"dependent-rsc_promote_0 node2" -> "dependent-clone_promoted_0" [ style = bold] ++"dependent-rsc_promote_0 node2" -> "dependent-rsc_monitor_10000 node2" [ style = bold] ++"dependent-rsc_promote_0 node2" [ style=bold color="green" fontcolor="black"] ++"primary-rsc_monitor_10000 node1" [ style=bold color="green" fontcolor="black"] ++"primary-rsc_monitor_11000 node2" [ style=bold color="green" fontcolor="black"] ++"primary-rsc_monitor_11000 node3" [ style=bold color="green" fontcolor="black"] ++"primary-rsc_monitor_11000 node4" [ style=bold color="green" fontcolor="black"] ++} +diff --git a/cts/scheduler/exp/promotable-colocation-with-node-attribute.exp b/cts/scheduler/exp/promotable-colocation-with-node-attribute.exp +new file mode 100644 +index 0000000..76371f1 +--- /dev/null ++++ b/cts/scheduler/exp/promotable-colocation-with-node-attribute.exp +@@ -0,0 +1,175 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +diff --git a/cts/scheduler/scores/promotable-colocation-with-node-attribute.scores b/cts/scheduler/scores/promotable-colocation-with-node-attribute.scores +new file mode 100644 +index 0000000..023ee77 +--- /dev/null ++++ b/cts/scheduler/scores/promotable-colocation-with-node-attribute.scores +@@ -0,0 +1,81 @@ ++ ++dependent-rsc:0 promotion score on node1: INFINITY ++dependent-rsc:1 promotion score on node2: INFINITY ++dependent-rsc:2 promotion score on node3: -INFINITY ++dependent-rsc:3 promotion score on node4: -INFINITY ++pcmk__clone_assign: dependent-clone allocation score on node1: 0 ++pcmk__clone_assign: dependent-clone allocation score on node2: 0 ++pcmk__clone_assign: dependent-clone allocation score on node3: 0 ++pcmk__clone_assign: dependent-clone allocation score on node4: 0 ++pcmk__clone_assign: dependent-rsc:0 allocation score on node1: 6 ++pcmk__clone_assign: dependent-rsc:0 allocation score on node2: 0 ++pcmk__clone_assign: dependent-rsc:0 allocation score on node3: 0 ++pcmk__clone_assign: dependent-rsc:0 allocation score on node4: 0 ++pcmk__clone_assign: dependent-rsc:1 allocation score on node1: 0 ++pcmk__clone_assign: dependent-rsc:1 allocation score on node2: 6 ++pcmk__clone_assign: dependent-rsc:1 allocation score on node3: 0 ++pcmk__clone_assign: dependent-rsc:1 allocation score on node4: 0 ++pcmk__clone_assign: dependent-rsc:2 allocation score on node1: 0 ++pcmk__clone_assign: dependent-rsc:2 allocation score on node2: 0 ++pcmk__clone_assign: dependent-rsc:2 allocation score on node3: 11 ++pcmk__clone_assign: dependent-rsc:2 allocation score on node4: 0 ++pcmk__clone_assign: dependent-rsc:3 allocation score on node1: 0 
++pcmk__clone_assign: dependent-rsc:3 allocation score on node2: 0 ++pcmk__clone_assign: dependent-rsc:3 allocation score on node3: 0 ++pcmk__clone_assign: dependent-rsc:3 allocation score on node4: 6 ++pcmk__clone_assign: primary-clone allocation score on node1: 0 ++pcmk__clone_assign: primary-clone allocation score on node2: 0 ++pcmk__clone_assign: primary-clone allocation score on node3: 0 ++pcmk__clone_assign: primary-clone allocation score on node4: 0 ++pcmk__clone_assign: primary-rsc:0 allocation score on node1: 11 ++pcmk__clone_assign: primary-rsc:0 allocation score on node2: 0 ++pcmk__clone_assign: primary-rsc:0 allocation score on node3: 0 ++pcmk__clone_assign: primary-rsc:0 allocation score on node4: 0 ++pcmk__clone_assign: primary-rsc:1 allocation score on node1: 0 ++pcmk__clone_assign: primary-rsc:1 allocation score on node2: 6 ++pcmk__clone_assign: primary-rsc:1 allocation score on node3: 0 ++pcmk__clone_assign: primary-rsc:1 allocation score on node4: 0 ++pcmk__clone_assign: primary-rsc:2 allocation score on node1: 0 ++pcmk__clone_assign: primary-rsc:2 allocation score on node2: 0 ++pcmk__clone_assign: primary-rsc:2 allocation score on node3: 6 ++pcmk__clone_assign: primary-rsc:2 allocation score on node4: 0 ++pcmk__clone_assign: primary-rsc:3 allocation score on node1: 0 ++pcmk__clone_assign: primary-rsc:3 allocation score on node2: 0 ++pcmk__clone_assign: primary-rsc:3 allocation score on node3: 0 ++pcmk__clone_assign: primary-rsc:3 allocation score on node4: 6 ++pcmk__primitive_assign: dependent-rsc:0 allocation score on node1: 6 ++pcmk__primitive_assign: dependent-rsc:0 allocation score on node2: 0 ++pcmk__primitive_assign: dependent-rsc:0 allocation score on node3: -INFINITY ++pcmk__primitive_assign: dependent-rsc:0 allocation score on node4: 0 ++pcmk__primitive_assign: dependent-rsc:1 allocation score on node1: -INFINITY ++pcmk__primitive_assign: dependent-rsc:1 allocation score on node2: 6 ++pcmk__primitive_assign: dependent-rsc:1 allocation score on node3: -INFINITY ++pcmk__primitive_assign: dependent-rsc:1 allocation score on node4: 0 ++pcmk__primitive_assign: dependent-rsc:2 allocation score on node1: 0 ++pcmk__primitive_assign: dependent-rsc:2 allocation score on node2: 0 ++pcmk__primitive_assign: dependent-rsc:2 allocation score on node3: 11 ++pcmk__primitive_assign: dependent-rsc:2 allocation score on node4: 0 ++pcmk__primitive_assign: dependent-rsc:3 allocation score on node1: -INFINITY ++pcmk__primitive_assign: dependent-rsc:3 allocation score on node2: -INFINITY ++pcmk__primitive_assign: dependent-rsc:3 allocation score on node3: -INFINITY ++pcmk__primitive_assign: dependent-rsc:3 allocation score on node4: 6 ++pcmk__primitive_assign: primary-rsc:0 allocation score on node1: 11 ++pcmk__primitive_assign: primary-rsc:0 allocation score on node2: 0 ++pcmk__primitive_assign: primary-rsc:0 allocation score on node3: 0 ++pcmk__primitive_assign: primary-rsc:0 allocation score on node4: 0 ++pcmk__primitive_assign: primary-rsc:1 allocation score on node1: -INFINITY ++pcmk__primitive_assign: primary-rsc:1 allocation score on node2: 6 ++pcmk__primitive_assign: primary-rsc:1 allocation score on node3: 0 ++pcmk__primitive_assign: primary-rsc:1 allocation score on node4: 0 ++pcmk__primitive_assign: primary-rsc:2 allocation score on node1: -INFINITY ++pcmk__primitive_assign: primary-rsc:2 allocation score on node2: -INFINITY ++pcmk__primitive_assign: primary-rsc:2 allocation score on node3: 6 ++pcmk__primitive_assign: primary-rsc:2 allocation score on node4: 0 
++pcmk__primitive_assign: primary-rsc:3 allocation score on node1: -INFINITY ++pcmk__primitive_assign: primary-rsc:3 allocation score on node2: -INFINITY ++pcmk__primitive_assign: primary-rsc:3 allocation score on node3: -INFINITY ++pcmk__primitive_assign: primary-rsc:3 allocation score on node4: 6 ++primary-rsc:0 promotion score on node1: 10 ++primary-rsc:1 promotion score on node2: 5 ++primary-rsc:2 promotion score on node3: 5 ++primary-rsc:3 promotion score on node4: 5 +diff --git a/cts/scheduler/summary/promotable-colocation-with-node-attribute.summary b/cts/scheduler/summary/promotable-colocation-with-node-attribute.summary +new file mode 100644 +index 0000000..30e81c8 +--- /dev/null ++++ b/cts/scheduler/summary/promotable-colocation-with-node-attribute.summary +@@ -0,0 +1,45 @@ ++Current cluster status: ++ * Node List: ++ * Online: [ node1 node2 node3 node4 ] ++ ++ * Full List of Resources: ++ * Clone Set: primary-clone [primary-rsc] (promotable): ++ * Promoted: [ node1 ] ++ * Unpromoted: [ node2 node3 node4 ] ++ * Clone Set: dependent-clone [dependent-rsc] (promotable): ++ * Promoted: [ node3 ] ++ * Unpromoted: [ node1 node2 node4 ] ++ ++Transition Summary: ++ * Promote dependent-rsc:0 ( Unpromoted -> Promoted node1 ) ++ * Promote dependent-rsc:1 ( Unpromoted -> Promoted node2 ) ++ * Demote dependent-rsc:2 ( Promoted -> Unpromoted node3 ) ++ ++Executing Cluster Transition: ++ * Resource action: primary-rsc monitor=10000 on node1 ++ * Resource action: primary-rsc monitor=11000 on node2 ++ * Resource action: primary-rsc monitor=11000 on node3 ++ * Resource action: primary-rsc monitor=11000 on node4 ++ * Resource action: dependent-rsc monitor=11000 on node4 ++ * Pseudo action: dependent-clone_demote_0 ++ * Resource action: dependent-rsc demote on node3 ++ * Pseudo action: dependent-clone_demoted_0 ++ * Pseudo action: dependent-clone_promote_0 ++ * Resource action: dependent-rsc promote on node1 ++ * Resource action: dependent-rsc promote on node2 ++ * Resource action: dependent-rsc monitor=11000 on node3 ++ * Pseudo action: dependent-clone_promoted_0 ++ * Resource action: dependent-rsc monitor=10000 on node1 ++ * Resource action: dependent-rsc monitor=10000 on node2 ++ ++Revised Cluster Status: ++ * Node List: ++ * Online: [ node1 node2 node3 node4 ] ++ ++ * Full List of Resources: ++ * Clone Set: primary-clone [primary-rsc] (promotable): ++ * Promoted: [ node1 ] ++ * Unpromoted: [ node2 node3 node4 ] ++ * Clone Set: dependent-clone [dependent-rsc] (promotable): ++ * Promoted: [ node1 node2 ] ++ * Unpromoted: [ node3 node4 ] +diff --git a/cts/scheduler/xml/promotable-colocation-with-node-attribute.xml b/cts/scheduler/xml/promotable-colocation-with-node-attribute.xml +new file mode 100644 +index 0000000..5b4ab10 +--- /dev/null ++++ b/cts/scheduler/xml/promotable-colocation-with-node-attribute.xml +@@ -0,0 +1,155 @@ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ +-- +2.47.1 + +From 31d5785ffc68acb54af76bc55f732117f77ef4b9 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Tue, 7 Oct 2025 05:11:44 +0200 +Subject: [PATCH 2/2] Fix: scheduler: promoted state with promoted state with + attribute + +Previously 
attribute-based colocation did not respect the node attribute when +colocating promoted state with promoted state. +--- + lib/pacemaker/libpacemaker_private.h | 5 +- + lib/pacemaker/pcmk_sched_bundle.c | 2 +- + lib/pacemaker/pcmk_sched_clone.c | 2 +- + lib/pacemaker/pcmk_sched_instances.c | 76 ++++++++++++++++++--------- + lib/pacemaker/pcmk_sched_probes.c | 2 +- + lib/pacemaker/pcmk_sched_promotable.c | 3 +- + 6 files changed, 60 insertions(+), 30 deletions(-) + +diff --git a/lib/pacemaker/libpacemaker_private.h b/lib/pacemaker/libpacemaker_private.h +index 58435a6..fadfc8b 100644 +--- a/lib/pacemaker/libpacemaker_private.h ++++ b/lib/pacemaker/libpacemaker_private.h +@@ -941,13 +941,14 @@ void pcmk__create_instance_actions(pcmk_resource_t *rsc, GList *instances); + G_GNUC_INTERNAL + bool pcmk__instance_matches(const pcmk_resource_t *instance, + const pcmk_node_t *node, enum rsc_role_e role, +- bool current); ++ bool current, const char *node_attribute); + + G_GNUC_INTERNAL + pcmk_resource_t *pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc, + const pcmk_resource_t *rsc, + enum rsc_role_e role, +- bool current); ++ bool current, ++ const char *node_attribute); + + G_GNUC_INTERNAL + uint32_t pcmk__instance_update_ordered_actions(pcmk_action_t *first, +diff --git a/lib/pacemaker/pcmk_sched_bundle.c b/lib/pacemaker/pcmk_sched_bundle.c +index 14e7be5..2d7e879 100644 +--- a/lib/pacemaker/pcmk_sched_bundle.c ++++ b/lib/pacemaker/pcmk_sched_bundle.c +@@ -383,7 +383,7 @@ match_replica_container(const pcmk__bundle_replica_t *replica, void *user_data) + struct match_data *match_data = user_data; + + if (pcmk__instance_matches(replica->container, match_data->node, +- pcmk_role_unknown, false)) { ++ pcmk_role_unknown, false, NULL)) { + match_data->container = replica->container; + return false; // Match found, don't bother searching further replicas + } +diff --git a/lib/pacemaker/pcmk_sched_clone.c b/lib/pacemaker/pcmk_sched_clone.c +index 4f86621..99fa8b2 100644 +--- a/lib/pacemaker/pcmk_sched_clone.c ++++ b/lib/pacemaker/pcmk_sched_clone.c +@@ -301,7 +301,7 @@ pcmk__clone_apply_coloc_score(pcmk_resource_t *dependent, + + primary_instance = pcmk__find_compatible_instance(dependent, primary, + pcmk_role_unknown, +- false); ++ false, NULL); + if (primary_instance != NULL) { + pcmk__rsc_debug(primary, "Interleaving %s with %s", + dependent->id, primary_instance->id); +diff --git a/lib/pacemaker/pcmk_sched_instances.c b/lib/pacemaker/pcmk_sched_instances.c +index f2bc1a4..5344234 100644 +--- a/lib/pacemaker/pcmk_sched_instances.c ++++ b/lib/pacemaker/pcmk_sched_instances.c +@@ -1073,18 +1073,22 @@ free_instance_list(const pcmk_resource_t *rsc, GList *list) + * \internal + * \brief Check whether an instance is compatible with a role and node + * +- * \param[in] instance Clone instance or bundle replica container +- * \param[in] node Instance must match this node +- * \param[in] role If not pcmk_role_unknown, instance must match this role +- * \param[in] current If true, compare instance's original node and role, +- * otherwise compare assigned next node and role ++ * \param[in] instance Clone instance or bundle replica container ++ * \param[in] node Instance must match this node ++ * \param[in] role If not pcmk_role_unknown, instance must match this role ++ * \param[in] current If true, compare instance's original node and role, ++ * otherwise compare assigned next node and role ++ * \param[in] node_attribute If not NULL, instance's node must have the same value ++ * for this attribute as \p node (instead 
of requiring ++ * the exact same node) + * + * \return true if \p instance is compatible with \p node and \p role, + * otherwise false + */ + bool + pcmk__instance_matches(const pcmk_resource_t *instance, const pcmk_node_t *node, +- enum rsc_role_e role, bool current) ++ enum rsc_role_e role, bool current, ++ const char *node_attribute) + { + pcmk_node_t *instance_node = NULL; + +@@ -1117,7 +1121,25 @@ pcmk__instance_matches(const pcmk_resource_t *instance, const pcmk_node_t *node, + return false; + } + +- if (!pcmk__same_node(instance_node, node)) { ++ if (node_attribute != NULL) { ++ // Compare by node attribute value instead of node identity ++ const char *instance_value = pcmk__colocation_node_attr(instance_node, ++ node_attribute, ++ instance); ++ const char *target_value = pcmk__colocation_node_attr(node, ++ node_attribute, ++ instance); ++ ++ if (!pcmk__str_eq(instance_value, target_value, pcmk__str_casei)) { ++ pcmk__rsc_trace(instance, ++ "%s is not a compatible instance " ++ "(instance has %s=%s, target node has %s=%s)", ++ instance->id, node_attribute, ++ pcmk__s(instance_value, ""), ++ node_attribute, pcmk__s(target_value, "")); ++ return false; ++ } ++ } else if (!pcmk__same_node(instance_node, node)) { + pcmk__rsc_trace(instance, + "%s is not a compatible instance " + "(assigned to %s not %s)", +@@ -1136,12 +1158,14 @@ pcmk__instance_matches(const pcmk_resource_t *instance, const pcmk_node_t *node, + * \internal + * \brief Find an instance that matches a given resource by node and role + * +- * \param[in] match_rsc Resource that instance must match (for logging only) +- * \param[in] rsc Clone or bundle resource to check for matching instance +- * \param[in] node Instance must match this node +- * \param[in] role If not pcmk_role_unknown, instance must match this role +- * \param[in] current If true, compare instance's original node and role, +- * otherwise compare assigned next node and role ++ * \param[in] match_rsc Resource that instance must match (for logging only) ++ * \param[in] rsc Clone or bundle resource to check for matching instance ++ * \param[in] node Instance must match this node ++ * \param[in] role If not pcmk_role_unknown, instance must match this role ++ * \param[in] current If true, compare instance's original node and role, ++ * otherwise compare assigned next node and role ++ * \param[in] node_attribute If not NULL, match instances by this node attribute ++ * instead of by node identity + * + * \return \p rsc instance matching \p node and \p role if any, otherwise NULL + */ +@@ -1149,7 +1173,7 @@ static pcmk_resource_t * + find_compatible_instance_on_node(const pcmk_resource_t *match_rsc, + const pcmk_resource_t *rsc, + const pcmk_node_t *node, enum rsc_role_e role, +- bool current) ++ bool current, const char *node_attribute) + { + GList *instances = NULL; + +@@ -1157,7 +1181,8 @@ find_compatible_instance_on_node(const pcmk_resource_t *match_rsc, + for (GList *iter = instances; iter != NULL; iter = iter->next) { + pcmk_resource_t *instance = (pcmk_resource_t *) iter->data; + +- if (pcmk__instance_matches(instance, node, role, current)) { ++ if (pcmk__instance_matches(instance, node, role, current, ++ node_attribute)) { + pcmk__rsc_trace(match_rsc, + "Found %s %s instance %s compatible with %s on %s", + display_role(role), rsc->id, instance->id, +@@ -1179,11 +1204,13 @@ find_compatible_instance_on_node(const pcmk_resource_t *match_rsc, + * \internal + * \brief Find a clone instance or bundle container compatible with a resource + * +- * \param[in] 
match_rsc Resource that instance must match +- * \param[in] rsc Clone or bundle resource to check for matching instance +- * \param[in] role If not pcmk_role_unknown, instance must match this role +- * \param[in] current If true, compare instance's original node and role, +- * otherwise compare assigned next node and role ++ * \param[in] match_rsc Resource that instance must match ++ * \param[in] rsc Clone or bundle resource to check for matching instance ++ * \param[in] role If not pcmk_role_unknown, instance must match this role ++ * \param[in] current If true, compare instance's original node and role, ++ * otherwise compare assigned next node and role ++ * \param[in] node_attribute If not NULL, match instances by this node attribute ++ * instead of by node identity + * + * \return Compatible (by \p role and \p match_rsc location) instance of \p rsc + * if any, otherwise NULL +@@ -1191,7 +1218,7 @@ find_compatible_instance_on_node(const pcmk_resource_t *match_rsc, + pcmk_resource_t * + pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc, + const pcmk_resource_t *rsc, enum rsc_role_e role, +- bool current) ++ bool current, const char *node_attribute) + { + pcmk_resource_t *instance = NULL; + GList *nodes = NULL; +@@ -1207,7 +1234,7 @@ pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc, + node = match_rsc->priv->fns->location(match_rsc, NULL, target); + if (node != NULL) { + return find_compatible_instance_on_node(match_rsc, rsc, node, role, +- current); ++ current, node_attribute); + } + + // Otherwise check for an instance matching any of match_rsc's allowed nodes +@@ -1216,7 +1243,8 @@ pcmk__find_compatible_instance(const pcmk_resource_t *match_rsc, + iter = iter->next) { + instance = find_compatible_instance_on_node(match_rsc, rsc, + (pcmk_node_t *) iter->data, +- role, current); ++ role, current, ++ node_attribute); + } + + if (instance == NULL) { +@@ -1423,7 +1451,7 @@ update_interleaved_actions(pcmk_action_t *first, pcmk_action_t *then, + first_instance = pcmk__find_compatible_instance(then_instance, + first->rsc, + pcmk_role_unknown, +- current); ++ current, NULL); + + if (first_instance == NULL) { // No instance can be interleaved + if (unassign_if_mandatory(first, then, then_instance, type, +diff --git a/lib/pacemaker/pcmk_sched_probes.c b/lib/pacemaker/pcmk_sched_probes.c +index bda90ce..a7d2364 100644 +--- a/lib/pacemaker/pcmk_sched_probes.c ++++ b/lib/pacemaker/pcmk_sched_probes.c +@@ -614,7 +614,7 @@ add_restart_orderings_for_probe(pcmk_action_t *probe, pcmk_action_t *after) + compatible_rsc = pcmk__find_compatible_instance(probe->rsc, + after->rsc, + pcmk_role_unknown, +- false); ++ false, NULL); + } + } + +diff --git a/lib/pacemaker/pcmk_sched_promotable.c b/lib/pacemaker/pcmk_sched_promotable.c +index cdf276f..0da0d8b 100644 +--- a/lib/pacemaker/pcmk_sched_promotable.c ++++ b/lib/pacemaker/pcmk_sched_promotable.c +@@ -1333,7 +1333,8 @@ pcmk__update_promotable_dependent_priority(const pcmk_resource_t *primary, + // Look for a primary instance where dependent will be + primary_instance = pcmk__find_compatible_instance(dependent, primary, + colocation->primary_role, +- false); ++ false, ++ colocation->node_attribute); + + if (primary_instance != NULL) { + // Add primary instance's priority to dependent's +-- +2.47.1 + diff --git a/004-crm_resource_wait.patch b/004-crm_resource_wait.patch new file mode 100644 index 0000000..ffd87d6 --- /dev/null +++ b/004-crm_resource_wait.patch @@ -0,0 +1,76 @@ +From ce1dc488d46b373292569b397c9c765b55654eea Mon Sep 
17 00:00:00 2001 +From: Reid Wahl +Date: Fri, 5 Sep 2025 20:35:31 -0700 +Subject: [PATCH] Fix: tools: Handle large timeouts correctly in crm_resource + --wait + +Previously, if the --timeout value parsed to a value greater than +(UINT_MAX - 999), the wait timeout would overflow. The effective timeout +would be either 0 seconds or 1 second. This is because 999 was added to +the guint value before passing it to pcmk__timeout_ms2s(). + +Now, we simply pass the timeout in milliseconds to +pcmk__timeout_ms2s(), without adding 999. + +This implies a slight behavior change. Previously, timeouts were always +rounded up to the next greatest second. Now, they're rounded to the +nearest second. For example, previously: +* timeout values between 1ms and 500ms => wait timeout of 1 second +* timeout values between 501ms and 1500ms => wait timeout of 2 seconds +* timeout values between 1501ms and 2500ms => wait timeout of 3 seconds +* and so on + +Now: +* timeout values between 1ms and 1499ms => wait timeout of 1 second +* timeout values between 1500ms and 2499ms => wait timeout of 2 seconds +* timeout values between 2500ms and 3499ms => wait timeout of 3 seconds +* and so on + +The previous rounding behavior has existed since crm_resource --wait was +added by 424afcdf. + +Update the help text to note the granularity and rounding behavior. The +exact behavior of the restart command is confusing, and its logic should +be cleaned up in the future. + +Fixes RHEL-45869 +Fixes RHEL-86148 +Closes T841 + +Signed-off-by: Reid Wahl +--- + tools/crm_resource.c | 4 +++- + tools/crm_resource_runtime.c | 2 +- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/tools/crm_resource.c b/tools/crm_resource.c +index 162ae40..74f84f0 100644 +--- a/tools/crm_resource.c ++++ b/tools/crm_resource.c +@@ -831,7 +831,9 @@ static GOptionEntry addl_entries[] = { + "ID" }, + { "timeout", 'T', G_OPTION_FLAG_NONE, G_OPTION_ARG_CALLBACK, timeout_cb, + "(Advanced) Abort if command does not finish in this time (with\n" +- INDENT "--restart, --wait, --force-*)", ++ INDENT "--restart, --wait, --force-*). The --restart command uses a\n" ++ INDENT "two-second granularity and the --wait command uses a one-second\n" ++ INDENT "granularity, with rounding.", + "N" }, + { "all", 0, G_OPTION_FLAG_NONE, G_OPTION_ARG_NONE, &options.all, + "List all options, including advanced and deprecated (with\n" +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index f0a84c0..a44794e 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -2108,7 +2108,7 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib) + if (timeout_ms == 0) { + expire_time += WAIT_DEFAULT_TIMEOUT_S; + } else { +- expire_time += pcmk__timeout_ms2s(timeout_ms + 999); ++ expire_time += pcmk__timeout_ms2s(timeout_ms); + } + + scheduler = pcmk_new_scheduler(); +-- +2.47.1 + diff --git a/005-ipc_evict.patch b/005-ipc_evict.patch new file mode 100644 index 0000000..1d82ec4 --- /dev/null +++ b/005-ipc_evict.patch @@ -0,0 +1,400 @@ +From 79f5a67e8242b3e72aa9dcf0dbd286b3fb719baa Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 27 Aug 2025 10:41:13 -0400 +Subject: [PATCH 1/6] Refactor: libcrmcommon: Rearrange the queue_len check. + +Check if the queue length is 0 first and return, which allows everything +else to be un-indented one level. 
+--- + lib/common/ipc_server.c | 47 ++++++++++++++++++++--------------------- + 1 file changed, 23 insertions(+), 24 deletions(-) + +diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c +index 25c788b..4b33c64 100644 +--- a/lib/common/ipc_server.c ++++ b/lib/common/ipc_server.c +@@ -541,34 +541,33 @@ no_more_retries: + sent, queue_len, c->ipcs, c->pid, pcmk_rc_str(rc), qb_rc); + } + +- if (queue_len) { +- +- /* Allow clients to briefly fall behind on processing incoming messages, +- * but drop completely unresponsive clients so the connection doesn't +- * consume resources indefinitely. +- */ +- if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) { +- if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) { +- /* Don't evict for a new or shrinking backlog */ +- crm_warn("Client with process ID %u has a backlog of %u messages " +- QB_XS " %p", c->pid, queue_len, c->ipcs); +- } else { +- crm_err("Evicting client with process ID %u due to backlog of %u messages " +- QB_XS " %p", c->pid, queue_len, c->ipcs); +- c->queue_backlog = 0; +- qb_ipcs_disconnect(c->ipcs); +- return rc; +- } +- } +- +- c->queue_backlog = queue_len; +- delay_next_flush(c, queue_len); +- +- } else { ++ if (queue_len == 0) { + /* Event queue is empty, there is no backlog */ + c->queue_backlog = 0; ++ return rc; + } + ++ /* Allow clients to briefly fall behind on processing incoming messages, ++ * but drop completely unresponsive clients so the connection doesn't ++ * consume resources indefinitely. ++ */ ++ if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) { ++ if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) { ++ /* Don't evict for a new or shrinking backlog */ ++ crm_warn("Client with process ID %u has a backlog of %u messages " ++ QB_XS " %p", c->pid, queue_len, c->ipcs); ++ } else { ++ crm_err("Evicting client with process ID %u due to backlog of %u messages " ++ QB_XS " %p", c->pid, queue_len, c->ipcs); ++ c->queue_backlog = 0; ++ qb_ipcs_disconnect(c->ipcs); ++ return rc; ++ } ++ } ++ ++ c->queue_backlog = queue_len; ++ delay_next_flush(c, queue_len); ++ + return rc; + } + +-- +2.47.1 + +From 014699003c6506bba8638ed57efea49da403d0e1 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 27 Aug 2025 11:31:37 -0400 +Subject: [PATCH 2/6] Refactor: libcrmcommon: Simplify an empty event queue + check. + +I find this just a little bit more straightforward to follow. +--- + lib/common/ipc_server.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c +index 4b33c64..dbd885a 100644 +--- a/lib/common/ipc_server.c ++++ b/lib/common/ipc_server.c +@@ -491,14 +491,13 @@ crm_ipcs_flush_events(pcmk__client_t *c) + pcmk__ipc_header_t *header = NULL; + struct iovec *event = NULL; + +- if (c->event_queue) { +- // We don't pop unless send is successful +- event = g_queue_peek_head(c->event_queue); +- } +- if (event == NULL) { // Queue is empty ++ if ((c->event_queue == NULL) || g_queue_is_empty(c->event_queue)) { + break; + } + ++ // We don't pop unless send is successful ++ event = g_queue_peek_head(c->event_queue); ++ + /* Retry sending the event up to five times. If we get -EAGAIN, sleep + * a very short amount of time (too long here is bad) and try again. 
+ * If we simply exit the while loop on -EAGAIN, we'll have to wait until +-- +2.47.1 + +From f999ac3d86d8107dee5288497f5f7fff07956d18 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 27 Aug 2025 11:35:38 -0400 +Subject: [PATCH 3/6] Refactor: libcrmcommon: Rearrange a few tests in + crm_ipcs_flush_events. + +Again, no important code changes here. I just find these a little +easier to follow. +--- + lib/common/ipc_server.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c +index dbd885a..b76847b 100644 +--- a/lib/common/ipc_server.c ++++ b/lib/common/ipc_server.c +@@ -477,16 +477,18 @@ crm_ipcs_flush_events(pcmk__client_t *c) + + if (c == NULL) { + return rc; ++ } + +- } else if (c->event_timer) { ++ if (c->event_timer != 0) { + /* There is already a timer, wait until it goes off */ + crm_trace("Timer active for %p - %d", c->ipcs, c->event_timer); + return rc; + } + +- if (c->event_queue) { ++ if (c->event_queue != NULL) { + queue_len = g_queue_get_length(c->event_queue); + } ++ + while (sent < 100) { + pcmk__ipc_header_t *header = NULL; + struct iovec *event = NULL; +-- +2.47.1 + +From 9e76007bb0bc1d4cb5a88dcfaaf96aa8853f42dc Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 27 Aug 2025 11:48:48 -0400 +Subject: [PATCH 4/6] Refactor: libcrmcommon: Unindent retry code in + crm_ipcs_flush_events. + +If we're breaking or jumping to a label, there's no need to have all +these nested else blocks. +--- + lib/common/ipc_server.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c +index b76847b..73cc58f 100644 +--- a/lib/common/ipc_server.c ++++ b/lib/common/ipc_server.c +@@ -513,16 +513,16 @@ crm_ipcs_flush_events(pcmk__client_t *c) + for (unsigned int retries = 5; retries > 0; retries--) { + qb_rc = qb_ipcs_event_sendv(c->ipcs, event, 2); + +- if (qb_rc < 0) { +- if (retries == 1 || qb_rc != -EAGAIN) { +- rc = (int) -qb_rc; +- goto no_more_retries; +- } else { +- pcmk__sleep_ms(5); +- } +- } else { ++ if (qb_rc >= 0) { + break; + } ++ ++ if (retries == 1 || qb_rc != -EAGAIN) { ++ rc = (int) -qb_rc; ++ goto no_more_retries; ++ } ++ ++ pcmk__sleep_ms(5); + } + + event = g_queue_pop_head(c->event_queue); +-- +2.47.1 + +From b73be21a454f795bc747aad1dbeea82f67d8b232 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 27 Aug 2025 13:14:54 -0400 +Subject: [PATCH 5/6] Feature: libcrmcommon: Be more lenient in evicting IPC + clients. + +Each IPC connection has a message queue. If the client is unable to +process messages faster than the server is sending them, that queue +start to back up. pacemaker enforces a cap on the queue size, and +that's adjustable with the cluster-ipc-limit parameter. Once the queue +grows beyond that size, the client is assumed to be dead and is evicted +so it can be restarted and the queue resources freed. + +However, it's possible that the client is not dead. On clusters with +very large numbers of resources (I've tried with 300, but fewer might +also cause problems), certain actions can happen that cause a spike in +IPC messages. In RHEL-76276, the action that causes this is moving +nodes in and out of standby. This spike in messages causes the server +to overwhelm the client, which is then evicted. + +My multi-part IPC patches made this even worse, as now if the CIB is so +large that it needs to split an IPC message up, there will be more +messages than before. 
+ +What this fix does is get rid of the cap on the queue size for pacemaker +daemons. As long as the server has been able to send messages to the +client, the client is still doing work and shouldn't be evicted. It may +just be processing messages slower than the server is sending them. +Note that this could lead the queue to grow without bound, eventually +crashing the server. For this reason, we're only allowing pacemaker +daemons to ignore the queue size limit. + +Potential problems with this approach: + +* If the client is so busy that it can't receive even a single message + that crm_ipcs_flush_events tries to send, it will still be evicted. + However, the flush operation does retry with a delay several times + giving the client time to finish up what it's doing. + +* We have timers all over the place with daemons waiting on replies. + It's possible that because we are no longer just evicting the clients, + we will now see those timers expire which will just lead to different + problems. If so, these fixes would probably need to take place in the + client code. + +Fixes T38 +--- + lib/common/ipc_server.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/lib/common/ipc_server.c b/lib/common/ipc_server.c +index 73cc58f..4420070 100644 +--- a/lib/common/ipc_server.c ++++ b/lib/common/ipc_server.c +@@ -553,10 +553,20 @@ no_more_retries: + * consume resources indefinitely. + */ + if (queue_len > QB_MAX(c->queue_max, PCMK_IPC_DEFAULT_QUEUE_MAX)) { +- if ((c->queue_backlog <= 1) || (queue_len < c->queue_backlog)) { +- /* Don't evict for a new or shrinking backlog */ ++ /* Don't evict: ++ * - Clients with a new backlog. ++ * - Clients with a shrinking backlog (the client is processing ++ * messages faster than the server is sending them). ++ * - Clients that are pacemaker daemons and have had any messages sent ++ * to them in this flush call (the server is sending messages faster ++ * than the client is processing them, but the client is not dead). ++ */ ++ if ((c->queue_backlog <= 1) ++ || (queue_len < c->queue_backlog) ++ || ((sent > 0) && (pcmk__parse_server(c->name) != pcmk_ipc_unknown))) { + crm_warn("Client with process ID %u has a backlog of %u messages " + QB_XS " %p", c->pid, queue_len, c->ipcs); ++ + } else { + crm_err("Evicting client with process ID %u due to backlog of %u messages " + QB_XS " %p", c->pid, queue_len, c->ipcs); +-- +2.47.1 + +From 4682953c567e16409d8e7972d9d5891348d4c360 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Wed, 27 Aug 2025 15:56:27 -0400 +Subject: [PATCH 6/6] Feature: libcrmcommon: Update documentation for + cluster-ipc-limit. + +Clarify that this no longer applies to pacemaker daemons. 
+--- + cts/cli/regression.crm_attribute.exp | 16 ++++++++-------- + cts/cli/regression.daemons.exp | 4 ++-- + .../Pacemaker_Explained/cluster-options.rst | 12 +++++++----- + lib/common/options.c | 6 +++--- + 4 files changed, 20 insertions(+), 18 deletions(-) + +diff --git a/cts/cli/regression.crm_attribute.exp b/cts/cli/regression.crm_attribute.exp +index e161f49..36cba76 100644 +--- a/cts/cli/regression.crm_attribute.exp ++++ b/cts/cli/regression.crm_attribute.exp +@@ -111,8 +111,8 @@ Also known as properties, these are options that affect behavior across the enti + * migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit) + * Possible values: integer (default: ) + +- * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon +- * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes). ++ * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client ++ * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes). + * Possible values: nonnegative_integer (default: ) + + * stop-all-resources: Whether the cluster should stop all active resources +@@ -357,8 +357,8 @@ Also known as properties, these are options that affect behavior across the enti + + + +- Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes). +- Maximum IPC message backlog before disconnecting a cluster daemon ++ Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes). ++ Maximum IPC message backlog before disconnecting a client + + + +@@ -537,8 +537,8 @@ Also known as properties, these are options that affect behavior across the enti + * migration-limit: The number of live migration actions that the cluster is allowed to execute in parallel on a node (-1 means no limit) + * Possible values: integer (default: ) + +- * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a cluster daemon +- * Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes). ++ * cluster-ipc-limit: Maximum IPC message backlog before disconnecting a client ++ * Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes). + * Possible values: nonnegative_integer (default: ) + + * stop-all-resources: Whether the cluster should stop all active resources +@@ -824,8 +824,8 @@ Also known as properties, these are options that affect behavior across the enti + + + +- Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes). +- Maximum IPC message backlog before disconnecting a cluster daemon ++ Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes). 
++ Maximum IPC message backlog before disconnecting a client + + + +diff --git a/cts/cli/regression.daemons.exp b/cts/cli/regression.daemons.exp +index fc8535a..6274eeb 100644 +--- a/cts/cli/regression.daemons.exp ++++ b/cts/cli/regression.daemons.exp +@@ -21,10 +21,10 @@ + + + +- Raise this if log has "Evicting client" messages for cluster daemon PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes). ++ Raise this if log has "Evicting client" messages for cluster PIDs (a good value is the number of resources in the cluster multiplied by the number of nodes). + + +- Maximum IPC message backlog before disconnecting a cluster daemon ++ Maximum IPC message backlog before disconnecting a client + + + +diff --git a/doc/sphinx/Pacemaker_Explained/cluster-options.rst b/doc/sphinx/Pacemaker_Explained/cluster-options.rst +index 6ebe5f3..22e1a50 100644 +--- a/doc/sphinx/Pacemaker_Explained/cluster-options.rst ++++ b/doc/sphinx/Pacemaker_Explained/cluster-options.rst +@@ -693,11 +693,13 @@ values, by running the ``man pacemaker-schedulerd`` and + cluster-ipc-limit + - :ref:`nonnegative integer ` + - 500 +- - The maximum IPC message backlog before one cluster daemon will +- disconnect another. This is of use in large clusters, for which a good +- value is the number of resources in the cluster multiplied by the number +- of nodes. The default of 500 is also the minimum. Raise this if you see +- "Evicting client" log messages for cluster daemon process IDs. ++ - The maximum IPC message backlog before a cluster daemon will disconnect ++ a client. Other cluster daemons are not subject to this limit as long as ++ they are still processing messages. This is of use in large clusters, ++ for which a good value is the number of resources in the cluster ++ multiplied by the number of nodes. The default of 500 is also the ++ minimum. Raise this if you see "Evicting client" log messages for ++ cluster process IDs. + * - .. _pe_error_series_max: + + .. index:: +diff --git a/lib/common/options.c b/lib/common/options.c +index b8f4943..af1b073 100644 +--- a/lib/common/options.c ++++ b/lib/common/options.c +@@ -432,10 +432,10 @@ static const pcmk__cluster_option_t cluster_options[] = { + PCMK_OPT_CLUSTER_IPC_LIMIT, NULL, PCMK_VALUE_NONNEGATIVE_INTEGER, NULL, + "500", pcmk__valid_positive_int, + pcmk__opt_based, +- N_("Maximum IPC message backlog before disconnecting a cluster daemon"), ++ N_("Maximum IPC message backlog before disconnecting a client"), + N_("Raise this if log has \"Evicting client\" messages for cluster " +- "daemon PIDs (a good value is the number of resources in the " +- "cluster multiplied by the number of nodes)."), ++ "PIDs (a good value is the number of resources in the cluster " ++ "multiplied by the number of nodes)."), + }, + + // Orphans and stopping +-- +2.47.1 + diff --git a/006-fewer_messages.patch b/006-fewer_messages.patch new file mode 100644 index 0000000..8935d44 --- /dev/null +++ b/006-fewer_messages.patch @@ -0,0 +1,88 @@ +From 8ddaf5330cf7605c7b710061c72dba8112db6cc6 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Fri, 31 Oct 2025 11:24:14 -0400 +Subject: [PATCH] Med: daemons: Don't add repeated I_PE_CALC messages to the + fsa queue. + +Let's say you have a two node cluster, node1 and node2. For purposes of +testing, it's easiest if you use fence_dummy instead of a real fencing +agent as this will fake fencing happening but without rebooting the node +so you can see all the log files. + +Assume the DC is node1. 
Now do the following on node2: + +- pcs node standby node1 +- pcs resource defaults update resource-stickiness=1 +- for i in $(seq 1 300); do echo $i; pcs resource create dummy$i ocf:heartbeat:Dummy --group dummy-group; done +- pcs node unstandby node1 + +It will take a long time to create that many resources. After node1 +comes out of standby, it'll take a minute or two but eventually you'll +see that node1 was fenced. On node1, you'll see a lot of transition +abort messages happen. Each of these transition aborts causes an +I_PE_CALC message to be generated and added to the fsa queue. In my +testing, I've seen the queue grow to ~ 600 messages, all of which are +exactly the same thing. + +The FSA is triggered at G_PRIORITY_HIGH, and once it is triggered, it +will run until its queue is empty. With so many messages being added so +quickly, we've basically ensured it won't be empty any time soon. While +controld is processing the FSA messages, it will be unable to read +anything out of the IPC backlog. + +based continues to attempt to send IPC events to controld but is unable +to do so, so the backlog continues to grow. Eventually, the backlog +reaches that 500 message threshold without anything having been read by +controld, which triggers the eviction process. + +There doesn't seem to be any reason for all these I_PE_CALC messages to +be generated. They're all exactly the same, they don't appear to be +tagged with any unique data tying them to a specific query, and their +presence just slows everything down. + +Thus, the fix here is very simple: if the latest message in the queue is +an I_PE_CALC message, just don't add another one. We could also make +sure there's only ever one I_PE_CALC message in the queue, but there +could potentially be valid reasons for there to be multiple interleaved +with other message types. I am erring on the side of caution with this +minimal fix. + +Related: RHEL-76276 +--- + daemons/controld/controld_messages.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c +index df215e6..866fde3 100644 +--- a/daemons/controld/controld_messages.c ++++ b/daemons/controld/controld_messages.c +@@ -73,6 +73,26 @@ register_fsa_input_adv(enum crmd_fsa_cause cause, enum crmd_fsa_input input, + return; + } + ++ if (input == I_PE_CALC) { ++ GList *ele = NULL; ++ ++ if (prepend) { ++ ele = g_list_first(controld_globals.fsa_message_queue); ++ } else { ++ ele = g_list_last(controld_globals.fsa_message_queue); ++ } ++ ++ if (ele != NULL) { ++ fsa_data_t *message = (fsa_data_t *) ele->data; ++ ++ if (message->fsa_input == I_PE_CALC) { ++ crm_debug("%s item in fsa queue is I_PE_CALC, not adding another", ++ (prepend ? 
"First" : "Last")); ++ return; ++ } ++ } ++ } ++ + if (input == I_WAIT_FOR_EVENT) { + controld_set_global_flags(controld_fsa_is_stalled); + crm_debug("Stalling the FSA pending further input: source=%s cause=%s data=%p queue=%d", +-- +2.47.1 + diff --git a/pacemaker.spec b/pacemaker.spec index efa30bc..83b50ac 100644 --- a/pacemaker.spec +++ b/pacemaker.spec @@ -41,7 +41,7 @@ ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) %global pcmkversion 3.0.1 -%global specversion 3 +%global specversion 4 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build %global commit 9a5e54bae85847c4bb6ed7c7fb06103ebebbc64a @@ -201,6 +201,10 @@ Source1: pacemaker.sysusers # upstream commits Patch001: 001-econnrefused.patch Patch002: 002-corosync.patch +Patch003: 003-promotable-follows.patch +Patch004: 004-crm_resource_wait.patch +Patch005: 005-ipc_evict.patch +Patch006: 006-fewer_messages.patch Requires: resource-agents Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} @@ -792,6 +796,15 @@ exit 0 %{_datadir}/pkgconfig/pacemaker-schemas.pc %changelog +* Thu Nov 13 2025 Chris Lumens - 3.0.1-4 +- Fix promoting instances of a cloned resource +- Handle large timeouts correctly in crm_resource --wait +- Don't evict IPC clients as long as they're still processing messages +- Don't overwhelm the FSA queue with repeated CIB queries +- Resolves: RHEL-120932 +- Resolves: RHEL-86148 +- Resolves: RHEL-114895 + * Wed Aug 13 2025 Reid Wahl - 3.0.1-3 - CTS launches Corosync using systemd if available. - Resolves: RHEL-110075