Don't hang waiting on certain pending monitor actions

- Resolves: RHEL-153661
This commit is contained in:
Chris Lumens 2026-05-11 12:51:08 -04:00
parent 2982dbace6
commit 25ec17dbd1
2 changed files with 289 additions and 1 deletions

283
005-pending_actions.patch Normal file
View File

@@ -0,0 +1,283 @@
From e2db52ba7f9cb2d976771897435324c2f1637581 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Thu, 19 Mar 2026 11:24:07 -0400
Subject: [PATCH 1/3] Refactor: tools: Move pending xpath query into its own
define.
---
tools/crm_resource_runtime.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
index 286c10c..6655c85 100644
--- a/tools/crm_resource_runtime.c
+++ b/tools/crm_resource_runtime.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2024 the Pacemaker project contributors
+ * Copyright 2004-2026 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -1946,6 +1946,13 @@ print_pending_actions(pcmk__output_t *out, GList *actions)
/* For --wait, how long to sleep between cluster state checks */
#define WAIT_SLEEP_S (2)
+#define XPATH_PENDING_ACTION "/" PCMK_XE_CIB "/" PCMK_XE_STATUS \
+ "/" PCMK__XE_NODE_STATE "/" PCMK__XE_LRM \
+ "/" PCMK__XE_LRM_RESOURCES \
+ "/" PCMK__XE_LRM_RESOURCE \
+ "/" PCMK__XE_LRM_RSC_OP \
+ "[@" PCMK__XA_RC_CODE "='%d']"
+
/*!
* \internal
* \brief Wait until all pending cluster actions are complete
@@ -1985,13 +1992,7 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
return ENOMEM;
}
- xpath = crm_strdup_printf("/" PCMK_XE_CIB "/" PCMK_XE_STATUS
- "/" PCMK__XE_NODE_STATE "/" PCMK__XE_LRM
- "/" PCMK__XE_LRM_RESOURCES
- "/" PCMK__XE_LRM_RESOURCE
- "/" PCMK__XE_LRM_RSC_OP
- "[@" PCMK__XA_RC_CODE "='%d']",
- PCMK_OCF_UNKNOWN);
+ xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN);
do {
/* Abort if timeout is reached */
time_diff = expire_time - time(NULL);
--
2.53.0
From f6915f09ec3412aaaf824fe1fdd3f7c9dc4a27da Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Thu, 19 Mar 2026 11:28:47 -0400
Subject: [PATCH 2/3] Refactor: tools: Add pending_actions_in_cib to
crm_resource.
This just refactors a little bit of code into its own function to reduce
complexity in wait_till_stable and to give me a place to add a bunch
more code later.
---
tools/crm_resource_runtime.c | 28 +++++++++++++++++++---------
1 file changed, 19 insertions(+), 9 deletions(-)
diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
index 6655c85..06ff68d 100644
--- a/tools/crm_resource_runtime.c
+++ b/tools/crm_resource_runtime.c
@@ -1953,6 +1953,23 @@ print_pending_actions(pcmk__output_t *out, GList *actions)
"/" PCMK__XE_LRM_RSC_OP \
"[@" PCMK__XA_RC_CODE "='%d']"
+static bool
+pending_actions_in_cib(pcmk_scheduler_t *scheduler)
+{
+ xmlXPathObject *search = NULL;
+ bool pending = false;
+ char *xpath = NULL;
+
+ xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN);
+ search = xpath_search(scheduler->input, xpath);
+ pending = (numXpathResults(search) > 0);
+
+ xmlXPathFreeObject(search);
+ free(xpath);
+
+ return pending;
+}
+
/*!
* \internal
* \brief Wait until all pending cluster actions are complete
@@ -1973,13 +1990,10 @@ int
wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
{
pcmk_scheduler_t *scheduler = NULL;
- xmlXPathObjectPtr search;
int rc = pcmk_rc_ok;
- bool pending_unknown_state_resources;
time_t expire_time = time(NULL);
time_t time_diff;
bool printed_version_warning = out->is_quiet(out); // i.e. don't print if quiet
- char *xpath = NULL;
if (timeout_ms == 0) {
expire_time += WAIT_DEFAULT_TIMEOUT_S;
@@ -1992,7 +2006,6 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
return ENOMEM;
}
- xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN);
do {
/* Abort if timeout is reached */
time_diff = expire_time - time(NULL);
@@ -2038,13 +2051,10 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
}
}
- search = xpath_search(scheduler->input, xpath);
- pending_unknown_state_resources = (numXpathResults(search) > 0);
- freeXpathObject(search);
- } while (actions_are_pending(scheduler->actions) || pending_unknown_state_resources);
+ } while (actions_are_pending(scheduler->actions)
+ || pending_actions_in_cib(scheduler));
pe_free_working_set(scheduler);
- free(xpath);
return rc;
}
--
2.53.0
From 5783c35095c8f8b06550ea71174183af87473b4e Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Mon, 11 May 2026 12:43:09 -0400
Subject: [PATCH 3/3] Low: tools: Don't wait on certain "pending" monitor
actions.
d253cdf introduced a change where `crm_resource --wait` will wait on
pending actions in the CIB before returning. Most of the time this is
fine, but there's a very specific case where it's not.
It's possible to end up in a situation where you have a resource that is
disabled and constrained such that it can't run where it wants to. In
that case, the CIB will still contain a pending lrm_rsc_op history entry
for a recurring monitor operation even after the resource fails to
start. It will look something like this:
<lrm_rsc_op id="dummy1_monitor_10000" operation_key="dummy1_monitor_10000" operation="monitor" call-id="-1" rc-code="193" op-status="-1" interval="10000" last-rc-change="1773083270" exec-time="0" .../>
By design, pacemaker doesn't replace these pending monitor entries with
a new entry when they fail. The scheduler requires this for remote
connection resources at least. See bbadfe553.
So instead, we'll fix this in the tools. If the pending monitor action
has failed, it will also have a history entry like this:
<lrm_rsc_op id="dummy1_last_failure_0" operation_key="dummy1_monitor_10000" operation="monitor" call-id="-1" rc-code="193" op-status="-1" interval="10000" last-rc-change="1773083270" exec-time="0" queue-time="0" .../>
We can look through the history entries for one with an operation_key
matching the pending recurring monitor entry, and see if its ID ends with
_last_failure_0. If so, that's a monitor action we shouldn't wait on.
Fixes RHEL-78393
---
tools/crm_resource_runtime.c | 86 ++++++++++++++++++++++++++++++++++--
1 file changed, 83 insertions(+), 3 deletions(-)
diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
index 06ff68d..e2cb94b 100644
--- a/tools/crm_resource_runtime.c
+++ b/tools/crm_resource_runtime.c
@@ -1953,21 +1953,101 @@ print_pending_actions(pcmk__output_t *out, GList *actions)
"/" PCMK__XE_LRM_RSC_OP \
"[@" PCMK__XA_RC_CODE "='%d']"
+#define XPATH_LAST_FAILURE "/" PCMK_XE_CIB "/" PCMK_XE_STATUS \
+ "/" PCMK__XE_NODE_STATE "/" PCMK__XE_LRM \
+ "/" PCMK__XE_LRM_RESOURCES \
+ "/" PCMK__XE_LRM_RESOURCE \
+ "/" PCMK__XE_LRM_RSC_OP \
+ "[@" PCMK__XA_OPERATION_KEY "='%s']"
+/*!
+ * \internal
+ * \brief Check if there's an lrm_rsc_op last_failure entry for a given key
+ *
+ * \param[in] scheduler The scheduler object
+ * \param[in] key The operation_key attribute of some lrm_rsc_op entry
+ *
+ * \return \c true if there is an lrm_rsc_op history entry with \p key as its
+ * operation_key and with an id attribute ending in "_last_failure_0",
+ * \c false otherwise
+ */
+static bool
+action_has_matching_last_failure(pcmk_scheduler_t *scheduler, const char *key)
+{
+ xmlXPathObject *search = NULL;
+ bool retval = false;
+ char *xpath = NULL;
+
+ xpath = crm_strdup_printf(XPATH_LAST_FAILURE, key);
+ search = xpath_search(scheduler->input, xpath);
+
+ for (int i = 0; i < numXpathResults(search); i++) {
+ const xmlNode *lrm_op_xml = getXpathResult(search, i);
+
+ if (g_str_has_suffix(crm_element_value(lrm_op_xml, PCMK_XA_ID),
+ "_last_failure_0")) {
+ retval = true;
+ break;
+ }
+ }
+
+ xmlXPathFreeObject(search);
+ free(xpath);
+
+ return retval;
+}
+
+/*!
+ * \internal
+ * \brief Check whether the CIB has any pending actions worth waiting for
+ *
+ * \param[in] scheduler The scheduler object
+ *
+ * \return \c true if there are any pending actions in the CIB, after
+ * filtering out pending recurring monitor actions with a last_failure
+ * history entry; \c false otherwise
+ *
+ * \note We filter out certain recurring monitor actions because they might
+ * always be present. By design, Pacemaker doesn't replace the pending
+ * history entry with a failure entry (see bbadfe553), but it's still not
+ * a pending action and we don't want to wait for it.
+ */
static bool
pending_actions_in_cib(pcmk_scheduler_t *scheduler)
{
xmlXPathObject *search = NULL;
- bool pending = false;
char *xpath = NULL;
+ bool any_pending = false;
xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN);
search = xpath_search(scheduler->input, xpath);
- pending = (numXpathResults(search) > 0);
+
+ for (int i = 0; i < numXpathResults(search); i++) {
+ const char *op_key = NULL;
+ const xmlNode *lrm_op_xml = getXpathResult(search, i);
+
+ if (!pcmk__str_eq(PCMK_ACTION_MONITOR,
+ crm_element_value(lrm_op_xml, PCMK_XA_OPERATION),
+ pcmk__str_none)) {
+ any_pending = true;
+ break;
+ }
+
+ if (pcmk_xe_is_probe(lrm_op_xml)) {
+ any_pending = true;
+ break;
+ }
+
+ op_key = crm_element_value(lrm_op_xml, PCMK__XA_OPERATION_KEY);
+ if ((op_key == NULL) || !action_has_matching_last_failure(scheduler, op_key)) {
+ any_pending = true;
+ break;
+ }
+ }
xmlXPathFreeObject(search);
free(xpath);
- return pending;
+ return any_pending;
}
/*!
--
2.53.0

View File

@@ -36,7 +36,7 @@
## can be incremented to build packages reliably considered "newer"
## than previously built packages with the same pcmkversion)
%global pcmkversion 2.1.10
-%global specversion 2
+%global specversion 3
## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
%global commit 5693eaeeef06faa1622515963082b5a1731d9fc0
@@ -251,6 +251,7 @@ Patch001: 001-crm_resource_wait.patch
Patch002: 002-ipc_connect_retry.patch
Patch003: 003-ipc_evict.patch
Patch004: 004-fewer_messages.patch
Patch005: 005-pending_actions.patch
Requires: resource-agents
Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
@@ -917,6 +918,10 @@ exit 0
%license %{nagios_name}-%{nagios_hash}/COPYING
%changelog
* Mon May 11 2026 Chris Lumens <clumens@redhat.com> - 2.1.10-3
- Don't hang waiting on certain pending monitor actions
- Resolves: RHEL-153661
* Wed Nov 12 2025 Chris Lumens <clumens@redhat.com> - 2.1.10-2
- Handle large timeouts correctly in crm_resource --wait
- Do not try to connect to subdaemons before they're respawned