Don't hang waiting on certain pending monitor actions
- Resolves: RHEL-153661
This commit is contained in:
parent
2982dbace6
commit
25ec17dbd1
283
005-pending_actions.patch
Normal file
283
005-pending_actions.patch
Normal file
@ -0,0 +1,283 @@
|
||||
From e2db52ba7f9cb2d976771897435324c2f1637581 Mon Sep 17 00:00:00 2001
|
||||
From: Chris Lumens <clumens@redhat.com>
|
||||
Date: Thu, 19 Mar 2026 11:24:07 -0400
|
||||
Subject: [PATCH 1/3] Refactor: tools: Move pending xpath query into its own
|
||||
define.
|
||||
|
||||
---
|
||||
tools/crm_resource_runtime.c | 17 +++++++++--------
|
||||
1 file changed, 9 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
|
||||
index 286c10c..6655c85 100644
|
||||
--- a/tools/crm_resource_runtime.c
|
||||
+++ b/tools/crm_resource_runtime.c
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
- * Copyright 2004-2024 the Pacemaker project contributors
|
||||
+ * Copyright 2004-2026 the Pacemaker project contributors
|
||||
*
|
||||
* The version control history for this file may have further details.
|
||||
*
|
||||
@@ -1946,6 +1946,13 @@ print_pending_actions(pcmk__output_t *out, GList *actions)
|
||||
/* For --wait, how long to sleep between cluster state checks */
|
||||
#define WAIT_SLEEP_S (2)
|
||||
|
||||
+#define XPATH_PENDING_ACTION "/" PCMK_XE_CIB "/" PCMK_XE_STATUS \
|
||||
+ "/" PCMK__XE_NODE_STATE "/" PCMK__XE_LRM \
|
||||
+ "/" PCMK__XE_LRM_RESOURCES \
|
||||
+ "/" PCMK__XE_LRM_RESOURCE \
|
||||
+ "/" PCMK__XE_LRM_RSC_OP \
|
||||
+ "[@" PCMK__XA_RC_CODE "='%d']"
|
||||
+
|
||||
/*!
|
||||
* \internal
|
||||
* \brief Wait until all pending cluster actions are complete
|
||||
@@ -1985,13 +1992,7 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
|
||||
return ENOMEM;
|
||||
}
|
||||
|
||||
- xpath = crm_strdup_printf("/" PCMK_XE_CIB "/" PCMK_XE_STATUS
|
||||
- "/" PCMK__XE_NODE_STATE "/" PCMK__XE_LRM
|
||||
- "/" PCMK__XE_LRM_RESOURCES
|
||||
- "/" PCMK__XE_LRM_RESOURCE
|
||||
- "/" PCMK__XE_LRM_RSC_OP
|
||||
- "[@" PCMK__XA_RC_CODE "='%d']",
|
||||
- PCMK_OCF_UNKNOWN);
|
||||
+ xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN);
|
||||
do {
|
||||
/* Abort if timeout is reached */
|
||||
time_diff = expire_time - time(NULL);
|
||||
--
|
||||
2.53.0
|
||||
|
||||
From f6915f09ec3412aaaf824fe1fdd3f7c9dc4a27da Mon Sep 17 00:00:00 2001
|
||||
From: Chris Lumens <clumens@redhat.com>
|
||||
Date: Thu, 19 Mar 2026 11:28:47 -0400
|
||||
Subject: [PATCH 2/3] Refactor: tools: Add pending_actions_in_cib to
|
||||
crm_resource.
|
||||
|
||||
This just refactors a little bit of code into its own function to reduce
|
||||
complexity in wait_till_stable and to give me a place to add a bunch
|
||||
more code later.
|
||||
---
|
||||
tools/crm_resource_runtime.c | 28 +++++++++++++++++++---------
|
||||
1 file changed, 19 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
|
||||
index 6655c85..06ff68d 100644
|
||||
--- a/tools/crm_resource_runtime.c
|
||||
+++ b/tools/crm_resource_runtime.c
|
||||
@@ -1953,6 +1953,23 @@ print_pending_actions(pcmk__output_t *out, GList *actions)
|
||||
"/" PCMK__XE_LRM_RSC_OP \
|
||||
"[@" PCMK__XA_RC_CODE "='%d']"
|
||||
|
||||
+static bool
|
||||
+pending_actions_in_cib(pcmk_scheduler_t *scheduler)
|
||||
+{
|
||||
+ xmlXPathObject *search = NULL;
|
||||
+ bool pending = false;
|
||||
+ char *xpath = NULL;
|
||||
+
|
||||
+ xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN);
|
||||
+ search = xpath_search(scheduler->input, xpath);
|
||||
+ pending = (numXpathResults(search) > 0);
|
||||
+
|
||||
+ xmlXPathFreeObject(search);
|
||||
+ free(xpath);
|
||||
+
|
||||
+ return pending;
|
||||
+}
|
||||
+
|
||||
/*!
|
||||
* \internal
|
||||
* \brief Wait until all pending cluster actions are complete
|
||||
@@ -1973,13 +1990,10 @@ int
|
||||
wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
|
||||
{
|
||||
pcmk_scheduler_t *scheduler = NULL;
|
||||
- xmlXPathObjectPtr search;
|
||||
int rc = pcmk_rc_ok;
|
||||
- bool pending_unknown_state_resources;
|
||||
time_t expire_time = time(NULL);
|
||||
time_t time_diff;
|
||||
bool printed_version_warning = out->is_quiet(out); // i.e. don't print if quiet
|
||||
- char *xpath = NULL;
|
||||
|
||||
if (timeout_ms == 0) {
|
||||
expire_time += WAIT_DEFAULT_TIMEOUT_S;
|
||||
@@ -1992,7 +2006,6 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
|
||||
return ENOMEM;
|
||||
}
|
||||
|
||||
- xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN);
|
||||
do {
|
||||
/* Abort if timeout is reached */
|
||||
time_diff = expire_time - time(NULL);
|
||||
@@ -2038,13 +2051,10 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib)
|
||||
}
|
||||
}
|
||||
|
||||
- search = xpath_search(scheduler->input, xpath);
|
||||
- pending_unknown_state_resources = (numXpathResults(search) > 0);
|
||||
- freeXpathObject(search);
|
||||
- } while (actions_are_pending(scheduler->actions) || pending_unknown_state_resources);
|
||||
+ } while (actions_are_pending(scheduler->actions)
|
||||
+ || pending_actions_in_cib(scheduler));
|
||||
|
||||
pe_free_working_set(scheduler);
|
||||
- free(xpath);
|
||||
return rc;
|
||||
}
|
||||
|
||||
--
|
||||
2.53.0
|
||||
|
||||
From 5783c35095c8f8b06550ea71174183af87473b4e Mon Sep 17 00:00:00 2001
|
||||
From: Chris Lumens <clumens@redhat.com>
|
||||
Date: Mon, 11 May 2026 12:43:09 -0400
|
||||
Subject: [PATCH 3/3] Low: tools: Don't wait on certain "pending" monitor
|
||||
actions.
|
||||
|
||||
d253cdf introduced a change where `crm_resource --wait` will wait on
|
||||
pending actions in the CIB before returning. Most of the time this is
|
||||
fine, but there's a very specific case where it's not.
|
||||
|
||||
It's possible to end up in a situation where you have a resource that is
|
||||
disabled and constrained such that it can't run where it wants to. In
|
||||
that case, the CIB will still contain a pending lrm_rsc_op history entry
|
||||
for a recurring monitor operation even after the resource fails to
|
||||
start. It will look something like this:
|
||||
|
||||
<lrm_rsc_op id="dummy1_monitor_10000" operation_key="dummy1_monitor_10000" operation="monitor" call-id="-1" rc-code="193" op-status="-1" interval="10000" last-rc-change="1773083270" exec-time="0" .../>
|
||||
|
||||
By design, pacemaker doesn't replace these pending monitor entries with
|
||||
a new entry when they fail. The scheduler requires this for remote
|
||||
connection resources at least. See bbadfe553.
|
||||
|
||||
So instead, we'll fix this in the tools. If the pending monitor action
|
||||
has failed, it will also have a history entry like this:
|
||||
|
||||
<lrm_rsc_op id="dummy1_last_failure_0" operation_key="dummy1_monitor_10000" operation="monitor" call-id="-1" rc-code="193" op-status="-1" interval="10000" last-rc-change="1773083270" exec-time="0" queue-time="0" .../>
|
||||
|
||||
We can look through the history entries for one with an operation_key
|
||||
matching the pending recurring monitor entry, and see if its ID ends with
|
||||
_last_failure_0. If so, that's a monitor action we shouldn't wait on.
|
||||
|
||||
Fixes RHEL-78393
|
||||
---
|
||||
tools/crm_resource_runtime.c | 86 ++++++++++++++++++++++++++++++++++--
|
||||
1 file changed, 83 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c
|
||||
index 06ff68d..e2cb94b 100644
|
||||
--- a/tools/crm_resource_runtime.c
|
||||
+++ b/tools/crm_resource_runtime.c
|
||||
@@ -1953,21 +1953,101 @@ print_pending_actions(pcmk__output_t *out, GList *actions)
|
||||
"/" PCMK__XE_LRM_RSC_OP \
|
||||
"[@" PCMK__XA_RC_CODE "='%d']"
|
||||
|
||||
+#define XPATH_LAST_FAILURE "/" PCMK_XE_CIB "/" PCMK_XE_STATUS \
|
||||
+ "/" PCMK__XE_NODE_STATE "/" PCMK__XE_LRM \
|
||||
+ "/" PCMK__XE_LRM_RESOURCES \
|
||||
+ "/" PCMK__XE_LRM_RESOURCE \
|
||||
+ "/" PCMK__XE_LRM_RSC_OP \
|
||||
+ "[@" PCMK__XA_OPERATION_KEY "='%s']"
|
||||
+/*!
|
||||
+ * \internal
|
||||
+ * \brief Check if there's an lrm_rsc_op last_failure entry for a given key
|
||||
+ *
|
||||
+ * \param[in] scheduler The scheduler object
|
||||
+ * \param[in] key The operation_key attribute of some lrm_rsc_op entry
|
||||
+ *
|
||||
+ * \return \c true if there is an lrm_rsc_op history entry with \p key as its
|
||||
+ * operation_key and with an id attribute ending in "_last_failure_0",
|
||||
+ * \c false otherwise
|
||||
+ */
|
||||
+static bool
|
||||
+action_has_matching_last_failure(pcmk_scheduler_t *scheduler, const char *key)
|
||||
+{
|
||||
+ xmlXPathObject *search = NULL;
|
||||
+ bool retval = false;
|
||||
+ char *xpath = NULL;
|
||||
+
|
||||
+ xpath = crm_strdup_printf(XPATH_LAST_FAILURE, key);
|
||||
+ search = xpath_search(scheduler->input, xpath);
|
||||
+
|
||||
+ for (int i = 0; i < numXpathResults(search); i++) {
|
||||
+ const xmlNode *lrm_op_xml = getXpathResult(search, i);
|
||||
+
|
||||
+ if (g_str_has_suffix(crm_element_value(lrm_op_xml, PCMK_XA_ID),
|
||||
+ "_last_failure_0")) {
|
||||
+ retval = true;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ xmlXPathFreeObject(search);
|
||||
+ free(xpath);
|
||||
+
|
||||
+ return retval;
|
||||
+}
|
||||
+
|
||||
+/*!
|
||||
+ * \internal
|
||||
+ * \brief Determine if there are certain pending actions in the CIB
|
||||
+ *
|
||||
+ * \param[in] scheduler The scheduler object
|
||||
+ *
|
||||
+ * \return \c true if there are any pending actions in the CIB, after
|
||||
+ * filtering out pending recurring monitor actions with a last_failure
|
||||
+ * history entry; \c false otherwise
|
||||
+ *
|
||||
+ * \note We filter out certain recurring monitor actions because they might
|
||||
+ * always be present. The scheduler can't replace the history entry
|
||||
+ * with a failure entry (see bbadfe553), but it's still not a pending
|
||||
+ * action and we don't want to wait for it.
|
||||
+ */
|
||||
static bool
|
||||
pending_actions_in_cib(pcmk_scheduler_t *scheduler)
|
||||
{
|
||||
xmlXPathObject *search = NULL;
|
||||
- bool pending = false;
|
||||
char *xpath = NULL;
|
||||
+ bool any_pending = false;
|
||||
|
||||
xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN);
|
||||
search = xpath_search(scheduler->input, xpath);
|
||||
- pending = (numXpathResults(search) > 0);
|
||||
+
|
||||
+ for (int i = 0; i < numXpathResults(search); i++) {
|
||||
+ const char *op_key = NULL;
|
||||
+ const xmlNode *lrm_op_xml = getXpathResult(search, i);
|
||||
+
|
||||
+ if (!pcmk__str_eq(PCMK_ACTION_MONITOR,
|
||||
+ crm_element_value(lrm_op_xml, PCMK_XA_OPERATION),
|
||||
+ pcmk__str_none)) {
|
||||
+ any_pending = true;
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ if (pcmk_xe_is_probe(lrm_op_xml)) {
|
||||
+ any_pending = true;
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ op_key = crm_element_value(lrm_op_xml, PCMK__XA_OPERATION_KEY);
|
||||
+ if ((op_key == NULL) || !action_has_matching_last_failure(scheduler, op_key)) {
|
||||
+ any_pending = true;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
|
||||
xmlXPathFreeObject(search);
|
||||
free(xpath);
|
||||
|
||||
- return pending;
|
||||
+ return any_pending;
|
||||
}
|
||||
|
||||
/*!
|
||||
--
|
||||
2.53.0
|
||||
|
||||
@ -36,7 +36,7 @@
|
||||
## can be incremented to build packages reliably considered "newer"
|
||||
## than previously built packages with the same pcmkversion)
|
||||
%global pcmkversion 2.1.10
|
||||
%global specversion 2
|
||||
%global specversion 3
|
||||
|
||||
## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
|
||||
%global commit 5693eaeeef06faa1622515963082b5a1731d9fc0
|
||||
@ -251,6 +251,7 @@ Patch001: 001-crm_resource_wait.patch
|
||||
Patch002: 002-ipc_connect_retry.patch
|
||||
Patch003: 003-ipc_evict.patch
|
||||
Patch004: 004-fewer_messages.patch
|
||||
Patch005: 005-pending_actions.patch
|
||||
|
||||
Requires: resource-agents
|
||||
Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
|
||||
@ -917,6 +918,10 @@ exit 0
|
||||
%license %{nagios_name}-%{nagios_hash}/COPYING
|
||||
|
||||
%changelog
|
||||
* Mon May 11 2026 Chris Lumens <clumens@redhat.com> - 2.1.10-3
|
||||
- Don't hang waiting on certain pending monitor actions
|
||||
- Resolves: RHEL-153661
|
||||
|
||||
* Wed Nov 12 2025 Chris Lumens <clumens@redhat.com> - 2.1.10-2
|
||||
- Handle large timeouts correctly in crm_resource --wait
|
||||
- Do not try to connect to subdaemons before they're respawned
|
||||
|
||||
Loading…
Reference in New Issue
Block a user