diff --git a/005-pending_actions.patch b/005-pending_actions.patch new file mode 100644 index 0000000..3b005e1 --- /dev/null +++ b/005-pending_actions.patch @@ -0,0 +1,283 @@ +From e2db52ba7f9cb2d976771897435324c2f1637581 Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 19 Mar 2026 11:24:07 -0400 +Subject: [PATCH 1/3] Refactor: tools: Move pending xpath query into its own + define. + +--- + tools/crm_resource_runtime.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index 286c10c..6655c85 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2004-2024 the Pacemaker project contributors ++ * Copyright 2004-2026 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * +@@ -1946,6 +1946,13 @@ print_pending_actions(pcmk__output_t *out, GList *actions) + /* For --wait, how long to sleep between cluster state checks */ + #define WAIT_SLEEP_S (2) + ++#define XPATH_PENDING_ACTION "/" PCMK_XE_CIB "/" PCMK_XE_STATUS \ ++ "/" PCMK__XE_NODE_STATE "/" PCMK__XE_LRM \ ++ "/" PCMK__XE_LRM_RESOURCES \ ++ "/" PCMK__XE_LRM_RESOURCE \ ++ "/" PCMK__XE_LRM_RSC_OP \ ++ "[@" PCMK__XA_RC_CODE "='%d']" ++ + /*! + * \internal + * \brief Wait until all pending cluster actions are complete +@@ -1985,13 +1992,7 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib) + return ENOMEM; + } + +- xpath = crm_strdup_printf("/" PCMK_XE_CIB "/" PCMK_XE_STATUS +- "/" PCMK__XE_NODE_STATE "/" PCMK__XE_LRM +- "/" PCMK__XE_LRM_RESOURCES +- "/" PCMK__XE_LRM_RESOURCE +- "/" PCMK__XE_LRM_RSC_OP +- "[@" PCMK__XA_RC_CODE "='%d']", +- PCMK_OCF_UNKNOWN); ++ xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN); + do { + /* Abort if timeout is reached */ + time_diff = expire_time - time(NULL); +-- +2.53.0 + +From f6915f09ec3412aaaf824fe1fdd3f7c9dc4a27da Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Thu, 19 Mar 2026 11:28:47 -0400 +Subject: [PATCH 2/3] Refactor: tools: Add pending_actions_in_cib to + crm_resource. + +This just refactors a little bit of code into its own function to reduce +complexity in wait_till_stable and to give me a place to add a bunch +more code later. +--- + tools/crm_resource_runtime.c | 28 +++++++++++++++++++--------- + 1 file changed, 19 insertions(+), 9 deletions(-) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index 6655c85..06ff68d 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -1953,6 +1953,23 @@ print_pending_actions(pcmk__output_t *out, GList *actions) + "/" PCMK__XE_LRM_RSC_OP \ + "[@" PCMK__XA_RC_CODE "='%d']" + ++static bool ++pending_actions_in_cib(pcmk_scheduler_t *scheduler) ++{ ++ xmlXPathObject *search = NULL; ++ bool pending = false; ++ char *xpath = NULL; ++ ++ xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN); ++ search = xpath_search(scheduler->input, xpath); ++ pending = (numXpathResults(search) > 0); ++ ++ xmlXPathFreeObject(search); ++ free(xpath); ++ ++ return pending; ++} ++ + /*! + * \internal + * \brief Wait until all pending cluster actions are complete +@@ -1973,13 +1990,10 @@ int + wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib) + { + pcmk_scheduler_t *scheduler = NULL; +- xmlXPathObjectPtr search; + int rc = pcmk_rc_ok; +- bool pending_unknown_state_resources; + time_t expire_time = time(NULL); + time_t time_diff; + bool printed_version_warning = out->is_quiet(out); // i.e. don't print if quiet +- char *xpath = NULL; + + if (timeout_ms == 0) { + expire_time += WAIT_DEFAULT_TIMEOUT_S; +@@ -1992,7 +2006,6 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib) + return ENOMEM; + } + +- xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN); + do { + /* Abort if timeout is reached */ + time_diff = expire_time - time(NULL); +@@ -2038,13 +2051,10 @@ wait_till_stable(pcmk__output_t *out, guint timeout_ms, cib_t * cib) + } + } + +- search = xpath_search(scheduler->input, xpath); +- pending_unknown_state_resources = (numXpathResults(search) > 0); +- freeXpathObject(search); +- } while (actions_are_pending(scheduler->actions) || pending_unknown_state_resources); ++ } while (actions_are_pending(scheduler->actions) ++ || pending_actions_in_cib(scheduler)); + + pe_free_working_set(scheduler); +- free(xpath); + return rc; + } + +-- +2.53.0 + +From 5783c35095c8f8b06550ea71174183af87473b4e Mon Sep 17 00:00:00 2001 +From: Chris Lumens +Date: Mon, 11 May 2026 12:43:09 -0400 +Subject: [PATCH 3/3] Low: tools: Don't wait on certain "pending" monitor + actions. + +d253cdf introduced a change where `crm_resource --wait` will wait on +pending actions in the CIB before returning. Most of the time this is +fine, but there's a very specific case where it's not. + +It's possible to end up in a situation where you have a resource that is +disabled and constrained such that it can't run where it wants to. In +that case, the CIB will still contain a pending lrm_rsc_op history entry +for a recurring monitor operation even after the resource fails to +start. It will look something like this: + + + +By design, pacemaker doesn't replace these pending monitor entries with +a new entry when they fail. The scheduler requires this for remote +connection resources at least. See bbadfe553. + +So instead, we'll fix this in the tools. If the pending monitor action +has failed, it will also have a history entry like this: + + + +We can look through the history entries for one with an operation_key +matching the pending recurring monitor entry, and see if its ID contains +_last_failure_0. If so, that's a monitor action we shouldn't wait on. + +Fixes RHEL-78393 +--- + tools/crm_resource_runtime.c | 86 ++++++++++++++++++++++++++++++++++-- + 1 file changed, 83 insertions(+), 3 deletions(-) + +diff --git a/tools/crm_resource_runtime.c b/tools/crm_resource_runtime.c +index 06ff68d..e2cb94b 100644 +--- a/tools/crm_resource_runtime.c ++++ b/tools/crm_resource_runtime.c +@@ -1953,21 +1953,101 @@ print_pending_actions(pcmk__output_t *out, GList *actions) + "/" PCMK__XE_LRM_RSC_OP \ + "[@" PCMK__XA_RC_CODE "='%d']" + ++#define XPATH_LAST_FAILURE "/" PCMK_XE_CIB "/" PCMK_XE_STATUS \ ++ "/" PCMK__XE_NODE_STATE "/" PCMK__XE_LRM \ ++ "/" PCMK__XE_LRM_RESOURCES \ ++ "/" PCMK__XE_LRM_RESOURCE \ ++ "/" PCMK__XE_LRM_RSC_OP \ ++ "[@" PCMK__XA_OPERATION_KEY "='%s']" ++/*! ++ * \internal ++ * \brief Check if there's a lrm_rsc_op last_failure entry for a given key ++ * ++ * \param[in] scheduler The scheduler object ++ * \param[in] key The operation_key attribute of some lrm_rsc_op entry ++ * ++ * \return \c true if there is an lrm_rsc_op history entry with \p key as its ++ * operation_key and with an id attribute ending in "_last_failure_0", ++ * \c false otherwise ++ */ ++static bool ++action_has_matching_last_failure(pcmk_scheduler_t *scheduler, const char *key) ++{ ++ xmlXPathObject *search = NULL; ++ bool retval = false; ++ char *xpath = NULL; ++ ++ xpath = crm_strdup_printf(XPATH_LAST_FAILURE, key); ++ search = xpath_search(scheduler->input, xpath); ++ ++ for (int i = 0; i < numXpathResults(search); i++) { ++ const xmlNode *lrm_op_xml = getXpathResult(search, i); ++ ++ if (g_str_has_suffix(crm_element_value(lrm_op_xml, PCMK_XA_ID), ++ "_last_failure_0")) { ++ retval = true; ++ break; ++ } ++ } ++ ++ xmlXPathFreeObject(search); ++ free(xpath); ++ ++ return retval; ++} ++ ++/*! ++ * \internal ++ * \brief Determine if there are certain pending actions in the CIB ++ * ++ * \param[in] scheduler The scheduler object ++ * ++ * \return \c true if there are any pending actions in the CIB, after ++ * filtering out pending recurring monitor actions with a last_failure ++ * history entry; \c false otherwise ++ * ++ * \note We filter out certain recurring monitor actions because they might ++ * always be present. The scheduler can't replace the history entry ++ * with a failure entry (see bbadfe553), but it's still not a pending ++ * action and we don't want to wait for it. ++ */ + static bool + pending_actions_in_cib(pcmk_scheduler_t *scheduler) + { + xmlXPathObject *search = NULL; +- bool pending = false; + char *xpath = NULL; ++ bool any_pending = false; + + xpath = crm_strdup_printf(XPATH_PENDING_ACTION, PCMK_OCF_UNKNOWN); + search = xpath_search(scheduler->input, xpath); +- pending = (numXpathResults(search) > 0); ++ ++ for (int i = 0; i < numXpathResults(search); i++) { ++ const char *op_key = NULL; ++ const xmlNode *lrm_op_xml = getXpathResult(search, i); ++ ++ if (!pcmk__str_eq(PCMK_ACTION_MONITOR, ++ crm_element_value(lrm_op_xml, PCMK_XA_OPERATION), ++ pcmk__str_none)) { ++ any_pending = true; ++ break; ++ } ++ ++ if (pcmk_xe_is_probe(lrm_op_xml)) { ++ any_pending = true; ++ break; ++ } ++ ++ op_key = crm_element_value(lrm_op_xml, PCMK__XA_OPERATION_KEY); ++ if ((op_key == NULL) || !action_has_matching_last_failure(scheduler, op_key)) { ++ any_pending = true; ++ break; ++ } ++ } + + xmlXPathFreeObject(search); + free(xpath); + +- return pending; ++ return any_pending; + } + + /*! +-- +2.53.0 + diff --git a/pacemaker.spec b/pacemaker.spec index 2eade75..73187b0 100644 --- a/pacemaker.spec +++ b/pacemaker.spec @@ -36,7 +36,7 @@ ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) %global pcmkversion 2.1.10 -%global specversion 2 +%global specversion 3 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build %global commit 5693eaeeef06faa1622515963082b5a1731d9fc0 @@ -251,6 +251,7 @@ Patch001: 001-crm_resource_wait.patch Patch002: 002-ipc_connect_retry.patch Patch003: 003-ipc_evict.patch Patch004: 004-fewer_messages.patch +Patch005: 005-pending_actions.patch Requires: resource-agents Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} @@ -917,6 +918,10 @@ exit 0 %license %{nagios_name}-%{nagios_hash}/COPYING %changelog +* Mon May 11 2026 Chris Lumens - 2.1.10-3 +- Don't hang waiting on certain pending monitor actions +- Resolves: RHEL-153661 + * Wed Nov 12 2025 Chris Lumens - 2.1.10-2 - Handle large timeouts correctly in crm_resource --wait - Do not try to connect to subdaemons before they're respawned