From 3a11e8a74338fc4d5227ec3e91e6b6940153410b Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Wed, 1 Jun 2022 18:11:03 +0000 Subject: [PATCH] import pacemaker-2.1.3-1.el8 --- .gitignore | 2 +- .pacemaker.metadata | 2 +- SOURCES/001-acl-group-schema.patch | 230 -- SOURCES/002-fencing-reasons.patch | 2100 ------------ SOURCES/003-fencing-reasons.patch | 2476 -------------- SOURCES/004-systemd-metadata.patch | 73 - SOURCES/005-fencing-reasons.patch | 2200 ------------- SOURCES/006-stateful-metadata.patch | 143 - SOURCES/007-memory-leak.patch | 39 - SOURCES/008-fencing-history.patch | 43 - SOURCES/009-fencing-reasons.patch | 2985 ----------------- SOURCES/010-probe-failures.patch | 4157 ------------------------ SOURCES/011-fencing-reasons.patch | 1450 --------- SOURCES/012-notify-crash.patch | 65 - SOURCES/013-probe-failures.patch | 26 - SOURCES/014-pcmk_delay_base.patch | 43 - SOURCES/015-fencing-reasons.patch | 1093 ------- SOURCES/016-fencing-crash.patch | 56 - SOURCES/017-fencing-reasons.patch | 875 ----- SOURCES/018-failure-messages.patch | 796 ----- SOURCES/019-corosync-tracking.patch | 29 - SOURCES/020-systemd-unit.patch | 41 - SOURCES/021-failure-messages.patch | 1338 -------- SOURCES/022-memory-leak.patch | 82 - SOURCES/023-regression.patch | 30 - SOURCES/024-stop_unexpected.patch | 806 ----- SOURCES/025-stop_unexpected-test.patch | 495 --- SOURCES/026-stop_unexpected-fix.patch | 589 ---- SPECS/pacemaker.spec | 101 +- 29 files changed, 41 insertions(+), 22324 deletions(-) delete mode 100644 SOURCES/001-acl-group-schema.patch delete mode 100644 SOURCES/002-fencing-reasons.patch delete mode 100644 SOURCES/003-fencing-reasons.patch delete mode 100644 SOURCES/004-systemd-metadata.patch delete mode 100644 SOURCES/005-fencing-reasons.patch delete mode 100644 SOURCES/006-stateful-metadata.patch delete mode 100644 SOURCES/007-memory-leak.patch delete mode 100644 SOURCES/008-fencing-history.patch delete mode 100644 SOURCES/009-fencing-reasons.patch delete mode 100644 SOURCES/010-probe-failures.patch delete mode 100644 SOURCES/011-fencing-reasons.patch delete mode 100644 SOURCES/012-notify-crash.patch delete mode 100644 SOURCES/013-probe-failures.patch delete mode 100644 SOURCES/014-pcmk_delay_base.patch delete mode 100644 SOURCES/015-fencing-reasons.patch delete mode 100644 SOURCES/016-fencing-crash.patch delete mode 100644 SOURCES/017-fencing-reasons.patch delete mode 100644 SOURCES/018-failure-messages.patch delete mode 100644 SOURCES/019-corosync-tracking.patch delete mode 100644 SOURCES/020-systemd-unit.patch delete mode 100644 SOURCES/021-failure-messages.patch delete mode 100644 SOURCES/022-memory-leak.patch delete mode 100644 SOURCES/023-regression.patch delete mode 100644 SOURCES/024-stop_unexpected.patch delete mode 100644 SOURCES/025-stop_unexpected-test.patch delete mode 100644 SOURCES/026-stop_unexpected-fix.patch diff --git a/.gitignore b/.gitignore index fad0d0f..234232d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ SOURCES/nagios-agents-metadata-105ab8a.tar.gz -SOURCES/pacemaker-ada5c3b.tar.gz +SOURCES/pacemaker-dff7c3a.tar.gz diff --git a/.pacemaker.metadata b/.pacemaker.metadata index 5af6b2a..4a22e2e 100644 --- a/.pacemaker.metadata +++ b/.pacemaker.metadata @@ -1,2 +1,2 @@ ea6c0a27fd0ae8ce02f84a11f08a0d79377041c3 SOURCES/nagios-agents-metadata-105ab8a.tar.gz -f9fd69263d5b21446b530f9750c262f7b492cad4 SOURCES/pacemaker-ada5c3b.tar.gz +aa7a8aecfe487f051545845476fd83d493da0326 SOURCES/pacemaker-dff7c3a.tar.gz diff --git a/SOURCES/001-acl-group-schema.patch b/SOURCES/001-acl-group-schema.patch deleted file mode 100644 index 4835e3e..0000000 --- a/SOURCES/001-acl-group-schema.patch +++ /dev/null @@ -1,230 +0,0 @@ -From f5ffbaf1f537d3d5b00e594211cd322f97df51ac Mon Sep 17 00:00:00 2001 -From: Grace Chin -Date: Fri, 5 Nov 2021 11:39:39 -0400 -Subject: [PATCH 1/3] Low: xml: clone acls schema in preparation for changes - ---- - xml/acls-3.8.rng | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 80 insertions(+) - create mode 100644 xml/acls-3.8.rng - -diff --git a/xml/acls-3.8.rng b/xml/acls-3.8.rng -new file mode 100644 -index 000000000..0fe6eed96 ---- /dev/null -+++ b/xml/acls-3.8.rng -@@ -0,0 +1,80 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ read -+ write -+ deny -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -2.27.0 - - -From 7838213fc639236bdedf5f15320152d973f1bdad Mon Sep 17 00:00:00 2001 -From: Grace Chin -Date: Fri, 5 Nov 2021 11:40:48 -0400 -Subject: [PATCH 2/3] Add a 'name' attribute to acl_target and acl_group - elements - ---- - xml/acls-3.8.rng | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/xml/acls-3.8.rng b/xml/acls-3.8.rng -index 0fe6eed96..48bcdffe3 100644 ---- a/xml/acls-3.8.rng -+++ b/xml/acls-3.8.rng -@@ -13,6 +13,9 @@ - - - -+ -+ -+ - - - -@@ -22,6 +25,9 @@ - - - -+ -+ -+ - - - --- -2.27.0 - - -From c3c498f4636f57e29670f8e385b625024ed222d7 Mon Sep 17 00:00:00 2001 -From: Grace Chin -Date: Fri, 5 Nov 2021 11:42:48 -0400 -Subject: [PATCH 3/3] Changes made by run of 'cts/cts-cli -s' - ---- - cts/cli/regression.upgrade.exp | 7 +++++-- - cts/cli/regression.validity.exp | 22 ++++++++++++++++++---- - 2 files changed, 23 insertions(+), 6 deletions(-) - -diff --git a/cts/cli/regression.upgrade.exp b/cts/cli/regression.upgrade.exp -index e38adebdd..7ce7ec13b 100644 ---- a/cts/cli/regression.upgrade.exp -+++ b/cts/cli/regression.upgrade.exp -@@ -91,8 +91,11 @@ update_validation debug: Configuration valid for schema: pacemaker-3.6 - update_validation debug: pacemaker-3.6-style configuration is also valid for pacemaker-3.7 - update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) - update_validation debug: Configuration valid for schema: pacemaker-3.7 --update_validation trace: Stopping at pacemaker-3.7 --update_validation info: Transformed the configuration from pacemaker-2.10 to pacemaker-3.7 -+update_validation debug: pacemaker-3.7-style configuration is also valid for pacemaker-3.8 -+update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) -+update_validation debug: Configuration valid for schema: pacemaker-3.8 -+update_validation trace: Stopping at pacemaker-3.8 -+update_validation info: Transformed the configuration from pacemaker-2.10 to pacemaker-3.8 - =#=#=#= Current cib after: Upgrade to latest CIB schema (trigger 2.10.xsl + the wrapping) =#=#=#= - - -diff --git a/cts/cli/regression.validity.exp b/cts/cli/regression.validity.exp -index 5ace430e7..125035a47 100644 ---- a/cts/cli/regression.validity.exp -+++ b/cts/cli/regression.validity.exp -@@ -121,7 +121,11 @@ update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) - element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order - element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order - update_validation trace: pacemaker-3.7 validation failed --Cannot upgrade configuration (claiming schema pacemaker-1.2) to at least pacemaker-3.0 because it does not validate with any schema from pacemaker-1.2 to pacemaker-3.7 -+update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) -+element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order -+element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order -+update_validation trace: pacemaker-3.8 validation failed -+Cannot upgrade configuration (claiming schema pacemaker-1.2) to at least pacemaker-3.0 because it does not validate with any schema from pacemaker-1.2 to pacemaker-3.8 - =#=#=#= End test: Run crm_simulate with invalid CIB (enum violation) - Invalid configuration (78) =#=#=#= - * Passed: crm_simulate - Run crm_simulate with invalid CIB (enum violation) - =#=#=#= Begin test: Try to make resulting CIB invalid (unrecognized validate-with) =#=#=#= -@@ -226,7 +230,10 @@ update_validation trace: pacemaker-3.6 validation failed - update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) - element cib: Relax-NG validity error : Invalid attribute validate-with for element cib - update_validation trace: pacemaker-3.7 validation failed --Cannot upgrade configuration (claiming schema pacemaker-9999.0) to at least pacemaker-3.0 because it does not validate with any schema from unknown to pacemaker-3.7 -+update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) -+element cib: Relax-NG validity error : Invalid attribute validate-with for element cib -+update_validation trace: pacemaker-3.8 validation failed -+Cannot upgrade configuration (claiming schema pacemaker-9999.0) to at least pacemaker-3.0 because it does not validate with any schema from unknown to pacemaker-3.8 - =#=#=#= End test: Run crm_simulate with invalid CIB (unrecognized validate-with) - Invalid configuration (78) =#=#=#= - * Passed: crm_simulate - Run crm_simulate with invalid CIB (unrecognized validate-with) - =#=#=#= Begin test: Try to make resulting CIB invalid, but possibly recoverable (valid with X.Y+1) =#=#=#= -@@ -326,8 +333,11 @@ update_validation debug: Configuration valid for schema: pacemaker-3.6 - update_validation debug: pacemaker-3.6-style configuration is also valid for pacemaker-3.7 - update_validation debug: Testing 'pacemaker-3.7' validation (21 of X) - update_validation debug: Configuration valid for schema: pacemaker-3.7 --update_validation trace: Stopping at pacemaker-3.7 --update_validation info: Transformed the configuration from pacemaker-1.2 to pacemaker-3.7 -+update_validation debug: pacemaker-3.7-style configuration is also valid for pacemaker-3.8 -+update_validation debug: Testing 'pacemaker-3.8' validation (22 of X) -+update_validation debug: Configuration valid for schema: pacemaker-3.8 -+update_validation trace: Stopping at pacemaker-3.8 -+update_validation info: Transformed the configuration from pacemaker-1.2 to pacemaker-3.8 - unpack_resources error: Resource start-up disabled since no STONITH resources have been defined - unpack_resources error: Either configure some or disable STONITH with the stonith-enabled option - unpack_resources error: NOTE: Clusters with shared data need STONITH to ensure data integrity -@@ -437,6 +447,8 @@ element rsc_order: Relax-NG validity error : Invalid attribute first-action for - element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order - element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order - element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order -+element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order -+element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order - =#=#=#= Current cib after: Make resulting CIB invalid, and without validate-with attribute =#=#=#= - - -@@ -502,6 +514,8 @@ validity.bad.xml:10: element rsc_order: Relax-NG validity error : Invalid attrib - validity.bad.xml:10: element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order - validity.bad.xml:10: element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order - validity.bad.xml:10: element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order -+validity.bad.xml:10: element rsc_order: Relax-NG validity error : Invalid attribute first-action for element rsc_order -+validity.bad.xml:10: element rsc_order: Relax-NG validity error : Element constraints has extra content: rsc_order - unpack_resources error: Resource start-up disabled since no STONITH resources have been defined - unpack_resources error: Either configure some or disable STONITH with the stonith-enabled option - unpack_resources error: NOTE: Clusters with shared data need STONITH to ensure data integrity --- -2.27.0 - diff --git a/SOURCES/002-fencing-reasons.patch b/SOURCES/002-fencing-reasons.patch deleted file mode 100644 index f89cbec..0000000 --- a/SOURCES/002-fencing-reasons.patch +++ /dev/null @@ -1,2100 +0,0 @@ -From 95b4f87aae5fb2cf771cf9a8f8e5420b65fb213f Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 Sep 2021 10:47:51 -0500 -Subject: [PATCH 01/12] Refactor: fencing: use pcmk__action_result_t in - stonith_action_t - -stonith_action_t previously had an rc member for a legacy return code, along -with output and error members for action stdout/stderr. When setting rc based -on the svc_action_t result, it used a mapping function svc_action_to_errno(). - -This replaces those with a pcmk__action_result_t member, which means we now -track the exit status and execution status as originally set by libcrmservice, -rather than the mapped rc. The library now calls the mapping function, now -returning standard codes and called result2rc(), when calling the client -callback. - -The exit_reason member is unused as of this commit. - -The behavior should be identical, with the small exception of -services_action_async() failure leaving the exit status as set by the services -library, which means callers will get the result2rc() mapping of the actual -result instead of the former -ECONNABORTED. ---- - lib/fencing/st_client.c | 118 +++++++++++++++++++++++----------------- - 1 file changed, 68 insertions(+), 50 deletions(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 08adb51c6..6c607b010 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -29,6 +29,7 @@ - #include - #include - #include -+#include - - #include - -@@ -57,9 +58,7 @@ struct stonith_action_s { - int max_retries; - - int pid; -- int rc; -- char *output; -- char *error; -+ pcmk__action_result_t result; - }; - - typedef struct stonith_private_s { -@@ -120,6 +119,7 @@ static void stonith_connection_destroy(gpointer user_data); - static void stonith_send_notification(gpointer data, gpointer user_data); - static int internal_stonith_action_execute(stonith_action_t * action); - static void log_action(stonith_action_t *action, pid_t pid); -+static int result2rc(const pcmk__action_result_t *result); - - /*! - * \brief Get agent namespace by name -@@ -196,6 +196,23 @@ stonith_get_namespace(const char *agent, const char *namespace_s) - return st_namespace_invalid; - } - -+/*! -+ * \internal -+ * \brief Set an action's result based on services library result -+ * -+ * \param[in] action Fence action to set result for -+ * \param[in] svc_action Service action to get result from -+ */ -+static void -+set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) -+{ -+ pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, -+ NULL); -+ pcmk__set_result_output(&(action->result), -+ services__grab_stdout(svc_action), -+ services__grab_stderr(svc_action)); -+} -+ - gboolean - stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) - { -@@ -259,19 +276,19 @@ stonith__watchdog_fencing_enabled_for_node(const char *node) - static void - log_action(stonith_action_t *action, pid_t pid) - { -- if (action->output) { -+ if (action->result.action_stdout != NULL) { - /* Logging the whole string confuses syslog when the string is xml */ - char *prefix = crm_strdup_printf("%s[%d] stdout:", action->agent, pid); - -- crm_log_output(LOG_TRACE, prefix, action->output); -+ crm_log_output(LOG_TRACE, prefix, action->result.action_stdout); - free(prefix); - } - -- if (action->error) { -+ if (action->result.action_stderr != NULL) { - /* Logging the whole string confuses syslog when the string is xml */ - char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); - -- crm_log_output(LOG_WARNING, prefix, action->error); -+ crm_log_output(LOG_WARNING, prefix, action->result.action_stderr); - free(prefix); - } - } -@@ -645,8 +662,7 @@ stonith__destroy_action(stonith_action_t *action) - if (action->svc_action) { - services_action_free(action->svc_action); - } -- free(action->output); -- free(action->error); -+ pcmk__reset_result(&(action->result)); - free(action); - } - } -@@ -678,15 +694,15 @@ stonith__action_result(stonith_action_t *action, int *rc, char **output, - } - if (action != NULL) { - if (rc) { -- *rc = action->rc; -+ *rc = pcmk_rc2legacy(result2rc(&(action->result))); - } -- if (output && action->output) { -- *output = action->output; -- action->output = NULL; // hand off memory management to caller -+ if ((output != NULL) && (action->result.action_stdout != NULL)) { -+ *output = action->result.action_stdout; -+ action->result.action_stdout = NULL; // hand off ownership to caller - } -- if (error_output && action->error) { -- *error_output = action->error; -- action->error = NULL; // hand off memory management to caller -+ if ((error_output != NULL) && (action->result.action_stderr != NULL)) { -+ *error_output = action->result.action_stderr; -+ action->result.action_stderr = NULL; // hand off ownership to caller - } - } - } -@@ -715,6 +731,9 @@ stonith_action_create(const char *agent, - action->timeout = action->remaining_timeout = timeout; - action->max_retries = FAILURE_MAX_RETRIES; - -+ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, -+ NULL); -+ - if (device_args) { - char buffer[512]; - const char *value = NULL; -@@ -739,7 +758,8 @@ update_remaining_timeout(stonith_action_t * action) - crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", - action->agent, action->action, action->max_retries); - action->remaining_timeout = 0; -- } else if ((action->rc != -ETIME) && diff < (action->timeout * 0.7)) { -+ } else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT) -+ && (diff < (action->timeout * 0.7))) { - /* only set remaining timeout period if there is 30% - * or greater of the original timeout period left */ - action->remaining_timeout = action->timeout - diff; -@@ -750,31 +770,31 @@ update_remaining_timeout(stonith_action_t * action) - } - - static int --svc_action_to_errno(svc_action_t *svc_action) { -- int rv = pcmk_ok; -+result2rc(const pcmk__action_result_t *result) { -+ int rc = pcmk_rc_ok; - -- if (svc_action->status == PCMK_EXEC_TIMEOUT) { -- rv = -ETIME; -+ if (result->execution_status == PCMK_EXEC_TIMEOUT) { -+ rc = ETIME; - -- } else if (svc_action->rc != PCMK_OCF_OK) { -+ } else if (result->exit_status != CRM_EX_OK) { - /* Try to provide a useful error code based on the fence agent's - * error output. - */ -- if (svc_action->stderr_data == NULL) { -- rv = -ENODATA; -+ if (result->action_stderr == NULL) { -+ rc = ENODATA; - -- } else if (strstr(svc_action->stderr_data, "imed out")) { -+ } else if (strstr(result->action_stderr, "imed out")) { - /* Some agents have their own internal timeouts */ -- rv = -ETIME; -+ rc = ETIME; - -- } else if (strstr(svc_action->stderr_data, "Unrecognised action")) { -- rv = -EOPNOTSUPP; -+ } else if (strstr(result->action_stderr, "Unrecognised action")) { -+ rc = EOPNOTSUPP; - - } else { -- rv = -pcmk_err_generic; -+ rc = pcmk_rc_error; - } - } -- return rv; -+ return rc; - } - - static void -@@ -782,11 +802,7 @@ stonith_action_async_done(svc_action_t *svc_action) - { - stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; - -- action->rc = svc_action_to_errno(svc_action); -- action->output = svc_action->stdout_data; -- svc_action->stdout_data = NULL; -- action->error = svc_action->stderr_data; -- svc_action->stderr_data = NULL; -+ set_result_from_svc_action(action, svc_action); - - svc_action->params = NULL; - -@@ -795,7 +811,9 @@ stonith_action_async_done(svc_action_t *svc_action) - - log_action(action, action->pid); - -- if (action->rc != pcmk_ok && update_remaining_timeout(action)) { -+ if ((action->result.exit_status != CRM_EX_OK) -+ && update_remaining_timeout(action)) { -+ - int rc = internal_stonith_action_execute(action); - if (rc == pcmk_ok) { - return; -@@ -803,7 +821,8 @@ stonith_action_async_done(svc_action_t *svc_action) - } - - if (action->done_cb) { -- action->done_cb(action->pid, action->rc, action->output, action->userdata); -+ action->done_cb(action->pid, pcmk_rc2legacy(result2rc(&(action->result))), -+ action->result.action_stdout, action->userdata); - } - - action->svc_action = NULL; // don't remove our caller -@@ -835,9 +854,13 @@ internal_stonith_action_execute(stonith_action_t * action) - static int stonith_sequence = 0; - char *buffer = NULL; - -- if ((action == NULL) || (action->action == NULL) || (action->args == NULL) -+ CRM_CHECK(action != NULL, return -EINVAL); -+ -+ if ((action->action == NULL) || (action->args == NULL) - || (action->agent == NULL)) { -- return -EPROTO; -+ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, -+ PCMK_EXEC_ERROR_FATAL, NULL); -+ return -EINVAL; - } - - if (!action->tries) { -@@ -857,6 +880,7 @@ internal_stonith_action_execute(stonith_action_t * action) - free(buffer); - - if (svc_action->rc != PCMK_OCF_UNKNOWN) { -+ set_result_from_svc_action(action, svc_action); - services_action_free(svc_action); - return -E2BIG; - } -@@ -877,10 +901,7 @@ internal_stonith_action_execute(stonith_action_t * action) - - /* keep retries from executing out of control and free previous results */ - if (is_retry) { -- free(action->output); -- action->output = NULL; -- free(action->error); -- action->error = NULL; -+ pcmk__reset_result(&(action->result)); - sleep(1); - } - -@@ -889,22 +910,19 @@ internal_stonith_action_execute(stonith_action_t * action) - if (services_action_async_fork_notify(svc_action, - &stonith_action_async_done, - &stonith_action_async_forked)) { -+ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, -+ PCMK_EXEC_PENDING, NULL); - return pcmk_ok; - } - - } else if (services_action_sync(svc_action)) { // sync success - rc = pcmk_ok; -- action->rc = svc_action_to_errno(svc_action); -- action->output = svc_action->stdout_data; -- svc_action->stdout_data = NULL; -- action->error = svc_action->stderr_data; -- svc_action->stderr_data = NULL; - - } else { // sync failure -- action->rc = -ECONNABORTED; -- rc = action->rc; -+ rc = -ECONNABORTED; - } - -+ set_result_from_svc_action(action, svc_action); - svc_action->params = NULL; - services_action_free(svc_action); - return rc; --- -2.27.0 - - -From 4c8e0b0ecc53cb3883f0da0eede20b900fff48d1 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 Sep 2021 11:14:31 -0500 -Subject: [PATCH 02/12] Low: fencing: improve return code given back to library - callers - -Expose result2rc() internally for future reuse, and expand it to handle more -cases. In theory, this can give us better log messages and status output for -failures. ---- - include/crm/fencing/internal.h | 1 + - lib/fencing/st_client.c | 63 +++++++++++++++++++++------------- - 2 files changed, 41 insertions(+), 23 deletions(-) - -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index fa9059e6f..0d23967bb 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -60,6 +60,7 @@ stonith_action_t *stonith_action_create(const char *agent, - void stonith__destroy_action(stonith_action_t *action); - void stonith__action_result(stonith_action_t *action, int *rc, char **output, - char **error_output); -+int stonith__result2rc(const pcmk__action_result_t *result); - - int - stonith_action_execute_async(stonith_action_t * action, -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 6c607b010..809be1640 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -119,7 +119,6 @@ static void stonith_connection_destroy(gpointer user_data); - static void stonith_send_notification(gpointer data, gpointer user_data); - static int internal_stonith_action_execute(stonith_action_t * action); - static void log_action(stonith_action_t *action, pid_t pid); --static int result2rc(const pcmk__action_result_t *result); - - /*! - * \brief Get agent namespace by name -@@ -694,7 +693,7 @@ stonith__action_result(stonith_action_t *action, int *rc, char **output, - } - if (action != NULL) { - if (rc) { -- *rc = pcmk_rc2legacy(result2rc(&(action->result))); -+ *rc = pcmk_rc2legacy(stonith__result2rc(&(action->result))); - } - if ((output != NULL) && (action->result.action_stdout != NULL)) { - *output = action->result.action_stdout; -@@ -769,32 +768,49 @@ update_remaining_timeout(stonith_action_t * action) - return action->remaining_timeout ? TRUE : FALSE; - } - --static int --result2rc(const pcmk__action_result_t *result) { -- int rc = pcmk_rc_ok; -+/*! -+ * \internal -+ * \brief Map a fencing action result to a standard return code -+ * -+ * \param[in] result Fencing action result to map -+ * -+ * \return Standard Pacemaker return code that best corresponds to \p result -+ */ -+int -+stonith__result2rc(const pcmk__action_result_t *result) -+{ -+ switch (result->execution_status) { -+ case PCMK_EXEC_CANCELLED: return ECANCELED; -+ case PCMK_EXEC_TIMEOUT: return ETIME; -+ case PCMK_EXEC_NOT_INSTALLED: return ENOENT; -+ case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP; -+ case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; -+ case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; -+ case PCMK_EXEC_NO_SECRETS: return EACCES; -+ default: break; -+ } - -- if (result->execution_status == PCMK_EXEC_TIMEOUT) { -- rc = ETIME; -+ if (result->exit_status == CRM_EX_OK) { -+ return pcmk_rc_ok; -+ } - -- } else if (result->exit_status != CRM_EX_OK) { -- /* Try to provide a useful error code based on the fence agent's -- * error output. -- */ -- if (result->action_stderr == NULL) { -- rc = ENODATA; -+ // Try to provide useful error code based on result's error output - -- } else if (strstr(result->action_stderr, "imed out")) { -- /* Some agents have their own internal timeouts */ -- rc = ETIME; -+ if (result->action_stderr == NULL) { -+ return ENODATA; - -- } else if (strstr(result->action_stderr, "Unrecognised action")) { -- rc = EOPNOTSUPP; -+ } else if (strcasestr(result->action_stderr, "timed out") -+ || strcasestr(result->action_stderr, "timeout")) { -+ return ETIME; - -- } else { -- rc = pcmk_rc_error; -- } -+ } else if (strcasestr(result->action_stderr, "unrecognised action") -+ || strcasestr(result->action_stderr, "unrecognized action") -+ || strcasestr(result->action_stderr, "unsupported action")) { -+ return EOPNOTSUPP; - } -- return rc; -+ -+ // Oh well, we tried -+ return pcmk_rc_error; - } - - static void -@@ -821,7 +837,8 @@ stonith_action_async_done(svc_action_t *svc_action) - } - - if (action->done_cb) { -- action->done_cb(action->pid, pcmk_rc2legacy(result2rc(&(action->result))), -+ action->done_cb(action->pid, -+ pcmk_rc2legacy(stonith__result2rc(&(action->result))), - action->result.action_stdout, action->userdata); - } - --- -2.27.0 - - -From 153c9b552a5bad9dd36e8635fa478ed9cad1f240 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 7 Oct 2021 11:35:44 -0500 -Subject: [PATCH 03/12] Refactor: fencing: return full result from - stonith__action_result() - -Previously, stonith__action_result() grabbed an action's legacy rc, stdout, and -stderr separately. Now, directly return a pointer to the action's result -object, and map that to a legacy rc in the callers when needed. ---- - include/crm/fencing/internal.h | 3 +-- - lib/fencing/st_client.c | 36 ++++--------------------- - lib/fencing/st_rhcs.c | 48 ++++++++++++++++++++++++---------- - 3 files changed, 40 insertions(+), 47 deletions(-) - -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index 0d23967bb..4e9f50fe8 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -58,8 +58,7 @@ stonith_action_t *stonith_action_create(const char *agent, - GHashTable * port_map, - const char * host_arg); - void stonith__destroy_action(stonith_action_t *action); --void stonith__action_result(stonith_action_t *action, int *rc, char **output, -- char **error_output); -+pcmk__action_result_t *stonith__action_result(stonith_action_t *action); - int stonith__result2rc(const pcmk__action_result_t *result); - - int -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 809be1640..b9df18465 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -670,40 +670,14 @@ stonith__destroy_action(stonith_action_t *action) - * \internal - * \brief Get the result of an executed stonith action - * -- * \param[in,out] action Executed action -- * \param[out] rc Where to store result code (or NULL) -- * \param[out] output Where to store standard output (or NULL) -- * \param[out] error_output Where to store standard error output (or NULL) -+ * \param[in] action Executed action - * -- * \note If output or error_output is not NULL, the caller is responsible for -- * freeing the memory. -+ * \return Pointer to action's result (or NULL if \p action is NULL) - */ --void --stonith__action_result(stonith_action_t *action, int *rc, char **output, -- char **error_output) -+pcmk__action_result_t * -+stonith__action_result(stonith_action_t *action) - { -- if (rc) { -- *rc = pcmk_ok; -- } -- if (output) { -- *output = NULL; -- } -- if (error_output) { -- *error_output = NULL; -- } -- if (action != NULL) { -- if (rc) { -- *rc = pcmk_rc2legacy(stonith__result2rc(&(action->result))); -- } -- if ((output != NULL) && (action->result.action_stdout != NULL)) { -- *output = action->result.action_stdout; -- action->result.action_stdout = NULL; // hand off ownership to caller -- } -- if ((error_output != NULL) && (action->result.action_stderr != NULL)) { -- *error_output = action->result.action_stderr; -- action->result.action_stderr = NULL; // hand off ownership to caller -- } -- } -+ return (action == NULL)? NULL : &(action->result); - } - - #define FAILURE_MAX_RETRIES 2 -diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c -index 89a2625bd..23e694975 100644 ---- a/lib/fencing/st_rhcs.c -+++ b/lib/fencing/st_rhcs.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2020 the Pacemaker project contributors -+ * Copyright 2004-2021 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -123,10 +123,10 @@ stonith_rhcs_parameter_not_required(xmlNode *metadata, const char *parameter) - static int - stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) - { -- char *buffer = NULL; - xmlNode *xml = NULL; - xmlNode *actions = NULL; - xmlXPathObject *xpathObj = NULL; -+ pcmk__action_result_t *result = NULL; - stonith_action_t *action = stonith_action_create(agent, "metadata", NULL, 0, - 5, NULL, NULL, NULL); - int rc = stonith__execute(action); -@@ -138,23 +138,31 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) - return rc; - } - -- stonith__action_result(action, &rc, &buffer, NULL); -- stonith__destroy_action(action); -- if (rc < 0) { -- crm_warn("Metadata action for %s failed: %s " CRM_XS "rc=%d", -- agent, pcmk_strerror(rc), rc); -- free(buffer); -- return rc; -+ result = stonith__action_result(action); -+ -+ if (result->execution_status != PCMK_EXEC_DONE) { -+ crm_warn("Could not execute metadata action for %s: %s", -+ agent, pcmk_exec_status_str(result->execution_status)); -+ stonith__destroy_action(action); -+ return pcmk_rc2legacy(stonith__result2rc(result)); - } - -- if (buffer == NULL) { -+ if (result->exit_status != CRM_EX_OK) { -+ crm_warn("Metadata action for %s returned error code %d", -+ agent, result->exit_status); -+ stonith__destroy_action(action); -+ return pcmk_rc2legacy(stonith__result2rc(result)); -+ } -+ -+ if (result->action_stdout == NULL) { - crm_warn("Metadata action for %s returned no data", agent); -+ stonith__destroy_action(action); - return -ENODATA; - } - -- xml = string2xml(buffer); -- free(buffer); -- buffer = NULL; -+ xml = string2xml(result->action_stdout); -+ stonith__destroy_action(action); -+ - if (xml == NULL) { - crm_warn("Metadata for %s is invalid", agent); - return -pcmk_err_schema_validation; -@@ -289,7 +297,19 @@ stonith__rhcs_validate(stonith_t *st, int call_options, const char *target, - - rc = stonith__execute(action); - if (rc == pcmk_ok) { -- stonith__action_result(action, &rc, output, error_output); -+ pcmk__action_result_t *result = stonith__action_result(action); -+ -+ rc = pcmk_rc2legacy(stonith__result2rc(result)); -+ -+ // Take ownership of output so stonith__destroy_action() doesn't free it -+ if (output != NULL) { -+ *output = result->action_stdout; -+ result->action_stdout = NULL; -+ } -+ if (error_output != NULL) { -+ *error_output = result->action_stderr; -+ result->action_stderr = NULL; -+ } - } - stonith__destroy_action(action); - return rc; --- -2.27.0 - - -From 7f7067014357cccb229a0bef091e234eb3765f7a Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 Sep 2021 13:05:54 -0500 -Subject: [PATCH 04/12] Refactor: fencing: pass full result to async action - callback - -When executing an asynchronous fence agent command, the fencing library gets -the full result (exit status, execution status, and exit reason) from the -services library, then maps that to a legacy return code. - -Now, pass the full result object to the fencing async callback, rather than -separate arguments for legacy code and stdout. The mapping to a legacy code now -happens in the fencer rather than the fencing library. - -The goal of this and following commits is to push the full result object -further down the code path, so that ultimately the full result is always -available internally, and the legacy code mapping is only done for backward -compatibility when sending the result back to a client. - -This commit focuses on the async callback (done_cb() in both the fencer's -async_command_t and the fencing library's stonith_action_t). Later commits will -follow the chain: - - st_child_done() and stonith_fence_get_devices_cb() - -> stonith_send_async_reply() - -> stonith_construct_async_reply() and log_async_result() ---- - daemons/fenced/fenced_commands.c | 78 +++++++++++++++++++++----------- - include/crm/fencing/internal.h | 3 +- - lib/fencing/st_client.c | 10 ++-- - 3 files changed, 58 insertions(+), 33 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index b5ae28d90..d5d04ae69 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -62,7 +62,8 @@ struct device_search_s { - }; - - static gboolean stonith_device_dispatch(gpointer user_data); --static void st_child_done(int pid, int rc, const char *output, void *user_data); -+static void st_child_done(int pid, const pcmk__action_result_t *result, -+ void *user_data); - static void stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer, - const char *client_id); - -@@ -99,7 +100,8 @@ typedef struct async_command_s { - GList *device_next; - - void *internal_user_data; -- void (*done_cb) (int pid, int rc, const char *output, void *user_data); -+ void (*done_cb) (int pid, const pcmk__action_result_t *result, -+ void *user_data); - guint timer_sigterm; - guint timer_sigkill; - /*! If the operation timed out, this is the last signal -@@ -377,13 +379,25 @@ get_agent_metadata_cb(gpointer data) { - * \internal - * \brief Call a command's action callback for an internal (not library) result - * -- * \param[in] cmd Command to report result for -- * \param[in] rc Legacy return code to pass to callback -+ * \param[in] cmd Command to report result for -+ * \param[in] execution_status Execution status to use for result -+ * \param[in] exit_status Exit status to use for result -+ * \param[in] exit_reason Exit reason to use for result - */ - static void --report_internal_result(async_command_t *cmd, int rc) -+report_internal_result(async_command_t *cmd, int exit_status, -+ int execution_status, const char *exit_reason) - { -- cmd->done_cb(0, rc, NULL, cmd); -+ pcmk__action_result_t result = { -+ // Ensure we don't pass garbage to free() -+ .exit_reason = NULL, -+ .action_stdout = NULL, -+ .action_stderr = NULL -+ }; -+ -+ pcmk__set_result(&result, exit_status, execution_status, exit_reason); -+ cmd->done_cb(0, &result, cmd); -+ pcmk__reset_result(&result); - } - - static gboolean -@@ -446,7 +460,7 @@ stonith_device_execute(stonith_device_t * device) - } - } else { - crm_info("Faking success for %s watchdog operation", cmd->action); -- report_internal_result(cmd, pcmk_ok); -+ report_internal_result(cmd, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - goto done; - } - } -@@ -462,7 +476,8 @@ stonith_device_execute(stonith_device_t * device) - crm_err("Considering %s unconfigured " - "because unable to load CIB secrets: %s", - device->id, pcmk_rc_str(exec_rc)); -- report_internal_result(cmd, -EACCES); -+ report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_SECRETS, -+ NULL); - goto done; - } - } -@@ -501,7 +516,7 @@ stonith_device_execute(stonith_device_t * device) - cmd->done_cb, fork_cb); - if (exec_rc < 0) { - cmd->activating_on = NULL; -- report_internal_result(cmd, exec_rc); -+ cmd->done_cb(0, stonith__action_result(action), cmd); - stonith__destroy_action(action); - } - -@@ -625,7 +640,8 @@ free_device(gpointer data) - async_command_t *cmd = gIter->data; - - crm_warn("Removal of device '%s' purged operation '%s'", device->id, cmd->action); -- report_internal_result(cmd, -ENODEV); -+ report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, -+ NULL); - } - g_list_free(device->pending_ops); - -@@ -1079,7 +1095,8 @@ schedule_internal_command(const char *origin, - const char *victim, - int timeout, - void *internal_user_data, -- void (*done_cb) (int pid, int rc, const char *output, -+ void (*done_cb) (int pid, -+ const pcmk__action_result_t *result, - void *user_data)) - { - async_command_t *cmd = NULL; -@@ -1111,7 +1128,7 @@ enum fence_status_code { - }; - - static void --status_search_cb(int pid, int rc, const char *output, void *user_data) -+status_search_cb(int pid, const pcmk__action_result_t *result, void *user_data) - { - async_command_t *cmd = user_data; - struct device_search_s *search = cmd->internal_user_data; -@@ -1127,7 +1144,7 @@ status_search_cb(int pid, int rc, const char *output, void *user_data) - - mainloop_set_trigger(dev->work); - -- switch (rc) { -+ switch (result->exit_status) { - case fence_status_unknown: - crm_trace("%s reported it cannot fence %s", dev->id, search->host); - break; -@@ -1141,14 +1158,15 @@ status_search_cb(int pid, int rc, const char *output, void *user_data) - default: - crm_warn("Assuming %s cannot fence %s " - "(status returned unknown code %d)", -- dev->id, search->host, rc); -+ dev->id, search->host, result->exit_status); - break; - } - search_devices_record_result(search, dev->id, can); - } - - static void --dynamic_list_search_cb(int pid, int rc, const char *output, void *user_data) -+dynamic_list_search_cb(int pid, const pcmk__action_result_t *result, -+ void *user_data) - { - async_command_t *cmd = user_data; - struct device_search_s *search = cmd->internal_user_data; -@@ -1169,21 +1187,21 @@ dynamic_list_search_cb(int pid, int rc, const char *output, void *user_data) - - mainloop_set_trigger(dev->work); - -- if (rc == CRM_EX_OK) { -+ if (result->exit_status == CRM_EX_OK) { - crm_info("Refreshing target list for %s", dev->id); - g_list_free_full(dev->targets, free); -- dev->targets = stonith__parse_targets(output); -+ dev->targets = stonith__parse_targets(result->action_stdout); - dev->targets_age = time(NULL); - - } else if (dev->targets != NULL) { - crm_info("Reusing most recent target list for %s " - "because list returned error code %d", -- dev->id, rc); -+ dev->id, result->exit_status); - - } else { // We have never successfully executed list - crm_warn("Assuming %s cannot fence %s " - "because list returned error code %d", -- dev->id, search->host, rc); -+ dev->id, search->host, result->exit_status); - - /* Fall back to pcmk_host_check="status" if the user didn't explicitly - * specify "dynamic-list". -@@ -2407,7 +2425,7 @@ cancel_stonith_command(async_command_t * cmd) - } - - static void --st_child_done(int pid, int rc, const char *output, void *user_data) -+st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) - { - stonith_device_t *device = NULL; - stonith_device_t *next_device = NULL; -@@ -2423,7 +2441,7 @@ st_child_done(int pid, int rc, const char *output, void *user_data) - /* The device is ready to do something else now */ - device = g_hash_table_lookup(device_list, cmd->device); - if (device) { -- if (!device->verified && (rc == pcmk_ok) && -+ if (!device->verified && (result->exit_status == CRM_EX_OK) && - (pcmk__strcase_any_of(cmd->action, "list", "monitor", "status", NULL))) { - - device->verified = TRUE; -@@ -2432,7 +2450,7 @@ st_child_done(int pid, int rc, const char *output, void *user_data) - mainloop_set_trigger(device->work); - } - -- if (rc == 0) { -+ if (result->exit_status == CRM_EX_OK) { - GList *iter; - /* see if there are any required devices left to execute for this op */ - for (iter = cmd->device_next; iter != NULL; iter = iter->next) { -@@ -2445,7 +2463,8 @@ st_child_done(int pid, int rc, const char *output, void *user_data) - next_device = NULL; - } - -- } else if (rc != 0 && cmd->device_next && (is_action_required(cmd->action, device) == FALSE)) { -+ } else if ((cmd->device_next != NULL) -+ && !is_action_required(cmd->action, device)) { - /* if this device didn't work out, see if there are any others we can try. - * if the failed device was 'required', we can't pick another device. */ - next_device = g_hash_table_lookup(device_list, cmd->device_next->data); -@@ -2454,16 +2473,19 @@ st_child_done(int pid, int rc, const char *output, void *user_data) - - /* this operation requires more fencing, hooray! */ - if (next_device) { -- log_async_result(cmd, rc, pid, next_device->id, output, FALSE); -+ log_async_result(cmd, pcmk_rc2legacy(stonith__result2rc(result)), pid, -+ next_device->id, result->action_stdout, FALSE); - schedule_stonith_command(cmd, next_device); - /* Prevent cmd from being freed */ - cmd = NULL; - goto done; - } - -- stonith_send_async_reply(cmd, output, rc, pid, false); -+ stonith_send_async_reply(cmd, result->action_stdout, -+ pcmk_rc2legacy(stonith__result2rc(result)), pid, -+ false); - -- if (rc != 0) { -+ if (result->exit_status != CRM_EX_OK) { - goto done; - } - -@@ -2509,7 +2531,9 @@ st_child_done(int pid, int rc, const char *output, void *user_data) - - cmd_list = g_list_remove_link(cmd_list, gIter); - -- stonith_send_async_reply(cmd_other, output, rc, pid, true); -+ stonith_send_async_reply(cmd_other, result->action_stdout, -+ pcmk_rc2legacy(stonith__result2rc(result)), -+ pid, true); - cancel_stonith_command(cmd_other); - - free_async_command(cmd_other); -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index 4e9f50fe8..6a7e4232c 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -64,7 +64,8 @@ int stonith__result2rc(const pcmk__action_result_t *result); - int - stonith_action_execute_async(stonith_action_t * action, - void *userdata, -- void (*done) (int pid, int rc, const char *output, -+ void (*done) (int pid, -+ const pcmk__action_result_t *result, - void *user_data), - void (*fork_cb) (int pid, void *user_data)); - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index b9df18465..59dcab9a3 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -46,7 +46,8 @@ struct stonith_action_s { - int timeout; - int async; - void *userdata; -- void (*done_cb) (int pid, int status, const char *output, void *user_data); -+ void (*done_cb) (int pid, const pcmk__action_result_t *result, -+ void *user_data); - void (*fork_cb) (int pid, void *user_data); - - svc_action_t *svc_action; -@@ -811,9 +812,7 @@ stonith_action_async_done(svc_action_t *svc_action) - } - - if (action->done_cb) { -- action->done_cb(action->pid, -- pcmk_rc2legacy(stonith__result2rc(&(action->result))), -- action->result.action_stdout, action->userdata); -+ action->done_cb(action->pid, &(action->result), action->userdata); - } - - action->svc_action = NULL; // don't remove our caller -@@ -933,7 +932,8 @@ internal_stonith_action_execute(stonith_action_t * action) - int - stonith_action_execute_async(stonith_action_t * action, - void *userdata, -- void (*done) (int pid, int rc, const char *output, -+ void (*done) (int pid, -+ const pcmk__action_result_t *result, - void *user_data), - void (*fork_cb) (int pid, void *user_data)) - { --- -2.27.0 - - -From bbd022306df7a873c0ecb2be2d33c56fbf327b8c Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 Sep 2021 11:51:28 -0500 -Subject: [PATCH 05/12] Feature: fencing: set exit reason for internal - execution errors - -... most importantly, copying any exit reason set by the services library. -This ensures that the stonith_action_t exit reason is set when appropriate. -However, nothing uses it as of this commit. ---- - daemons/fenced/fenced_commands.c | 4 ++-- - lib/fencing/st_client.c | 6 +++--- - 2 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index d5d04ae69..f55a32649 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -477,7 +477,7 @@ stonith_device_execute(stonith_device_t * device) - "because unable to load CIB secrets: %s", - device->id, pcmk_rc_str(exec_rc)); - report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_SECRETS, -- NULL); -+ "Failed to get CIB secrets"); - goto done; - } - } -@@ -641,7 +641,7 @@ free_device(gpointer data) - - crm_warn("Removal of device '%s' purged operation '%s'", device->id, cmd->action); - report_internal_result(cmd, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, -- NULL); -+ "Device was removed before action could be executed"); - } - g_list_free(device->pending_ops); - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 59dcab9a3..3d4127eff 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -207,7 +207,7 @@ static void - set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) - { - pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, -- NULL); -+ services__exit_reason(svc_action)); - pcmk__set_result_output(&(action->result), - services__grab_stdout(svc_action), - services__grab_stderr(svc_action)); -@@ -706,7 +706,7 @@ stonith_action_create(const char *agent, - action->max_retries = FAILURE_MAX_RETRIES; - - pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, -- NULL); -+ "Initialization bug in fencing library"); - - if (device_args) { - char buffer[512]; -@@ -849,7 +849,7 @@ internal_stonith_action_execute(stonith_action_t * action) - if ((action->action == NULL) || (action->args == NULL) - || (action->agent == NULL)) { - pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, -- PCMK_EXEC_ERROR_FATAL, NULL); -+ PCMK_EXEC_ERROR_FATAL, "Bug in fencing library"); - return -EINVAL; - } - --- -2.27.0 - - -From ed08f600688af1d25412d2427502ba5d4a55c0d6 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 7 Oct 2021 12:06:10 -0500 -Subject: [PATCH 06/12] Fix: fencer: handle dynamic target query failures - better - -Previously, the callbacks for list and status queries checked only the result's -exit status. However, the services library will use PCMK_OCF_UNKNOWN_ERROR (1) -as the exit status for internal failures, and that value signifies a recognized -node (not an error) for fence list actions. - -Now, the callbacks check the execution status as well. ---- - daemons/fenced/fenced_commands.c | 46 +++++++++++++++++++++++++++----- - 1 file changed, 39 insertions(+), 7 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index f55a32649..7b3fb25a1 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -1144,6 +1144,18 @@ status_search_cb(int pid, const pcmk__action_result_t *result, void *user_data) - - mainloop_set_trigger(dev->work); - -+ if (result->execution_status != PCMK_EXEC_DONE) { -+ crm_warn("Assuming %s cannot fence %s " -+ "because status could not be executed: %s%s%s%s", -+ dev->id, search->host, -+ pcmk_exec_status_str(result->execution_status), -+ ((result->exit_reason == NULL)? "" : " ("), -+ ((result->exit_reason == NULL)? "" : result->exit_reason), -+ ((result->exit_reason == NULL)? "" : ")")); -+ search_devices_record_result(search, dev->id, FALSE); -+ return; -+ } -+ - switch (result->exit_status) { - case fence_status_unknown: - crm_trace("%s reported it cannot fence %s", dev->id, search->host); -@@ -1187,21 +1199,41 @@ dynamic_list_search_cb(int pid, const pcmk__action_result_t *result, - - mainloop_set_trigger(dev->work); - -- if (result->exit_status == CRM_EX_OK) { -+ if ((result->execution_status == PCMK_EXEC_DONE) -+ && (result->exit_status == CRM_EX_OK)) { - crm_info("Refreshing target list for %s", dev->id); - g_list_free_full(dev->targets, free); - dev->targets = stonith__parse_targets(result->action_stdout); - dev->targets_age = time(NULL); - - } else if (dev->targets != NULL) { -- crm_info("Reusing most recent target list for %s " -- "because list returned error code %d", -- dev->id, result->exit_status); -+ if (result->execution_status == PCMK_EXEC_DONE) { -+ crm_info("Reusing most recent target list for %s " -+ "because list returned error code %d", -+ dev->id, result->exit_status); -+ } else { -+ crm_info("Reusing most recent target list for %s " -+ "because list could not be executed: %s%s%s%s", -+ dev->id, pcmk_exec_status_str(result->execution_status), -+ ((result->exit_reason == NULL)? "" : " ("), -+ ((result->exit_reason == NULL)? "" : result->exit_reason), -+ ((result->exit_reason == NULL)? "" : ")")); -+ } - - } else { // We have never successfully executed list -- crm_warn("Assuming %s cannot fence %s " -- "because list returned error code %d", -- dev->id, search->host, result->exit_status); -+ if (result->execution_status == PCMK_EXEC_DONE) { -+ crm_warn("Assuming %s cannot fence %s " -+ "because list returned error code %d", -+ dev->id, search->host, result->exit_status); -+ } else { -+ crm_warn("Assuming %s cannot fence %s " -+ "because list could not be executed: %s%s%s%s", -+ dev->id, search->host, -+ pcmk_exec_status_str(result->execution_status), -+ ((result->exit_reason == NULL)? "" : " ("), -+ ((result->exit_reason == NULL)? "" : result->exit_reason), -+ ((result->exit_reason == NULL)? "" : ")")); -+ } - - /* Fall back to pcmk_host_check="status" if the user didn't explicitly - * specify "dynamic-list". --- -2.27.0 - - -From 5a30238a3b8691a5fc20f53906c0efcc50193306 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 Sep 2021 15:57:50 -0500 -Subject: [PATCH 07/12] Refactor: fencer: pass result object when sending an - async reply - -... via stonith_send_async_reply(), instead of sending the mapped legacy code -and action stdout separately. Also, drop the "stonith_" prefix since the -function is static. - -This moves the mapping from the stonith_send_async_reply() callers to the -function itself, so we use the result object and standard codes as long as -possible, and map to a legacy code only where needed. ---- - daemons/fenced/fenced_commands.c | 62 +++++++++++++++++++------------- - daemons/fenced/fenced_remote.c | 2 +- - 2 files changed, 39 insertions(+), 25 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 7b3fb25a1..e5f8162ce 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2376,12 +2376,28 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, - } - } - -+/*! -+ * \internal -+ * \brief Reply to requester after asynchronous command completion -+ * -+ * \param[in] cmd Command that completed -+ * \param[in] result Result of command -+ * \param[in] pid Process ID of command, if available -+ * \param[in] merged If true, command was merged with another, not executed -+ */ - static void --stonith_send_async_reply(async_command_t *cmd, const char *output, int rc, -- int pid, bool merged) -+send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, -+ int pid, bool merged) - { - xmlNode *reply = NULL; - gboolean bcast = FALSE; -+ const char *output = NULL; -+ int rc = pcmk_ok; -+ -+ CRM_CHECK((cmd != NULL) && (result != NULL), return); -+ -+ output = result->action_stdout; -+ rc = pcmk_rc2legacy(stonith__result2rc(result)); - - reply = stonith_construct_async_reply(cmd, output, NULL, rc); - -@@ -2513,9 +2529,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) - goto done; - } - -- stonith_send_async_reply(cmd, result->action_stdout, -- pcmk_rc2legacy(stonith__result2rc(result)), pid, -- false); -+ send_async_reply(cmd, result, pid, false); - - if (result->exit_status != CRM_EX_OK) { - goto done; -@@ -2563,9 +2577,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) - - cmd_list = g_list_remove_link(cmd_list, gIter); - -- stonith_send_async_reply(cmd_other, result->action_stdout, -- pcmk_rc2legacy(stonith__result2rc(result)), -- pid, true); -+ send_async_reply(cmd_other, result, pid, true); - cancel_stonith_command(cmd_other); - - free_async_command(cmd_other); -@@ -2604,26 +2616,28 @@ stonith_fence_get_devices_cb(GList * devices, void *user_data) - /* Order based on priority */ - devices = g_list_sort(devices, sort_device_priority); - device = g_hash_table_lookup(device_list, devices->data); -- -- if (device) { -- cmd->device_list = devices; -- cmd->device_next = devices->next; -- devices = NULL; /* list owned by cmd now */ -- } - } - -- /* we have a device, schedule it for fencing. */ -- if (device) { -- schedule_stonith_command(cmd, device); -- /* in progress */ -- return; -- } -+ if (device == NULL) { // No device found -+ pcmk__action_result_t result = { -+ // Ensure we don't pass garbage to free() -+ .exit_reason = NULL, -+ .action_stdout = NULL, -+ .action_stderr = NULL -+ }; - -- /* no device found! */ -- stonith_send_async_reply(cmd, NULL, -ENODEV, 0, false); -+ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, -+ "No fence device configured for target"); -+ send_async_reply(cmd, &result, 0, false); -+ pcmk__reset_result(&result); -+ free_async_command(cmd); -+ g_list_free_full(devices, free); - -- free_async_command(cmd); -- g_list_free_full(devices, free); -+ } else { // Device found, schedule it for fencing -+ cmd->device_list = devices; -+ cmd->device_next = devices->next; -+ schedule_stonith_command(cmd, device); -+ } - } - - static int -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index ffaf60018..b09d2865e 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -996,7 +996,7 @@ stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op) - - remote_op_done(op, msg, pcmk_ok, FALSE); - -- /* Replies are sent via done_cb->stonith_send_async_reply()->do_local_reply() */ -+ // Replies are sent via done_cb -> send_async_reply() -> do_local_reply() - return -EINPROGRESS; - } - --- -2.27.0 - - -From c67b6bfbe0baa1253058417ddfb9bc4cf0844e27 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 7 Oct 2021 17:25:38 -0500 -Subject: [PATCH 08/12] Refactor: fencer: pass result object when building - async reply - -... via stonith_construct_async_reply(), instead of passing a mapped legacy rc -and action output separately, which will be helpful when we add the exit reason -to the reply. Also, drop the "stonith_" prefix since the function is static, and -drop an unused argument. ---- - daemons/fenced/fenced_commands.c | 33 +++++++++++++++----------------- - 1 file changed, 15 insertions(+), 18 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index e5f8162ce..6bc12e6c4 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -112,8 +112,8 @@ typedef struct async_command_s { - stonith_device_t *activating_on; - } async_command_t; - --static xmlNode *stonith_construct_async_reply(async_command_t * cmd, const char *output, -- xmlNode * data, int rc); -+static xmlNode *construct_async_reply(async_command_t *cmd, -+ const pcmk__action_result_t *result); - - static gboolean - is_action_required(const char *action, stonith_device_t *device) -@@ -2399,7 +2399,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - output = result->action_stdout; - rc = pcmk_rc2legacy(stonith__result2rc(result)); - -- reply = stonith_construct_async_reply(cmd, output, NULL, rc); -+ reply = construct_async_reply(cmd, result); - - // Only replies for certain actions are broadcast - if (pcmk__str_any_of(cmd->action, "metadata", "monitor", "list", "status", -@@ -2732,17 +2732,20 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i - return reply; - } - -+/*! -+ * \internal -+ * \brief Build an XML reply to an asynchronous fencing command -+ * -+ * \param[in] cmd Fencing command that reply is for -+ * \param[in] result Command result -+ */ - static xmlNode * --stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode * data, int rc) -+construct_async_reply(async_command_t *cmd, const pcmk__action_result_t *result) - { -- xmlNode *reply = NULL; -- -- crm_trace("Creating a basic reply"); -- reply = create_xml_node(NULL, T_STONITH_REPLY); -+ xmlNode *reply = create_xml_node(NULL, T_STONITH_REPLY); - - crm_xml_add(reply, "st_origin", __func__); - crm_xml_add(reply, F_TYPE, T_STONITH_NG); -- - crm_xml_add(reply, F_STONITH_OPERATION, cmd->op); - crm_xml_add(reply, F_STONITH_DEVICE, cmd->device); - crm_xml_add(reply, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); -@@ -2753,15 +2756,9 @@ stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode - crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin); - crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); - crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); -- -- crm_xml_add_int(reply, F_STONITH_RC, rc); -- -- crm_xml_add(reply, "st_output", output); -- -- if (data != NULL) { -- crm_info("Attaching reply output"); -- add_message_xml(reply, F_STONITH_CALLDATA, data); -- } -+ crm_xml_add_int(reply, F_STONITH_RC, -+ pcmk_rc2legacy(stonith__result2rc(result))); -+ crm_xml_add(reply, "st_output", result->action_stdout); - return reply; - } - --- -2.27.0 - - -From 2686caeb3b74f687ddd86a4e483250ca8096ba7c Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 19 Oct 2021 18:27:31 -0500 -Subject: [PATCH 09/12] Log: fencer: improve messages for asynchronous results - -Now that we have the full result object, pass it to log_async_result(). -Instead of logging a mapped legacy rc, log the execution status or exit status -as appropriate, along with the exit reason. ---- - daemons/fenced/fenced_commands.c | 43 +++++++++++++++++--------------- - 1 file changed, 23 insertions(+), 20 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 6bc12e6c4..9d06c68dc 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2305,15 +2305,14 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int - * \brief Log the result of an asynchronous command - * - * \param[in] cmd Command the result is for -- * \param[in] rc Legacy return code corresponding to result -+ * \param[in] result Result of command - * \param[in] pid Process ID of command, if available - * \param[in] next Alternate device that will be tried if command failed -- * \param[in] output Command output, if any - * \param[in] op_merged Whether this command was merged with an earlier one - */ - static void --log_async_result(async_command_t *cmd, int rc, int pid, const char *next, -- const char *output, gboolean op_merged) -+log_async_result(async_command_t *cmd, const pcmk__action_result_t *result, -+ int pid, const char *next, bool op_merged) - { - int log_level = LOG_ERR; - int output_log_level = LOG_NEVER; -@@ -2321,17 +2320,18 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, - - GString *msg = g_string_sized_new(80); // Reasonable starting size - -- // Choose log levels appropriately -- if (rc == 0) { // Success -+ // Choose log levels appropriately if we have a result -+ if ((result->execution_status == PCMK_EXEC_DONE) -+ && (result->exit_status == CRM_EX_OK)) { // Success - log_level = (cmd->victim == NULL)? LOG_DEBUG : LOG_NOTICE; -- if ((output != NULL) -+ if ((result->action_stdout != NULL) - && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { - output_log_level = LOG_DEBUG; - } - next = NULL; - } else { // Failure - log_level = (cmd->victim == NULL)? LOG_NOTICE : LOG_ERR; -- if ((output != NULL) -+ if ((result->action_stdout != NULL) - && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { - output_log_level = LOG_WARNING; - } -@@ -2347,10 +2347,18 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, - } - g_string_append_printf(msg, "using %s ", cmd->device); - -- // Add result -- g_string_append_printf(msg, "returned %d (%s)", rc, pcmk_strerror(rc)); -+ // Add exit status or execution status as appropriate -+ if (result->execution_status == PCMK_EXEC_DONE) { -+ g_string_append_printf(msg, "returned %d", result->exit_status); -+ } else { -+ g_string_append_printf(msg, "could not be executed: %s", -+ pcmk_exec_status_str(result->execution_status)); -+ } - -- // Add next device if appropriate -+ // Add exit reason and next device if appropriate -+ if (result->exit_reason != NULL) { -+ g_string_append_printf(msg, " (%s)", result->exit_reason); -+ } - if (next != NULL) { - g_string_append_printf(msg, ", retrying with %s", next); - } -@@ -2371,7 +2379,7 @@ log_async_result(async_command_t *cmd, int rc, int pid, const char *next, - if (output_log_level != LOG_NEVER) { - char *prefix = crm_strdup_printf("%s[%d]", cmd->device, pid); - -- crm_log_output(output_log_level, prefix, output); -+ crm_log_output(output_log_level, prefix, result->action_stdout); - free(prefix); - } - } -@@ -2391,14 +2399,9 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - { - xmlNode *reply = NULL; - gboolean bcast = FALSE; -- const char *output = NULL; -- int rc = pcmk_ok; - - CRM_CHECK((cmd != NULL) && (result != NULL), return); - -- output = result->action_stdout; -- rc = pcmk_rc2legacy(stonith__result2rc(result)); -- - reply = construct_async_reply(cmd, result); - - // Only replies for certain actions are broadcast -@@ -2412,7 +2415,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - bcast = TRUE; - } - -- log_async_result(cmd, rc, pid, NULL, output, merged); -+ log_async_result(cmd, result, pid, NULL, merged); - crm_log_xml_trace(reply, "Reply"); - - if (merged) { -@@ -2436,6 +2439,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - if (stand_alone) { - /* Do notification with a clean data object */ - xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); -+ int rc = pcmk_rc2legacy(stonith__result2rc(result)); - - crm_xml_add_int(notify_data, F_STONITH_RC, rc); - crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); -@@ -2521,8 +2525,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) - - /* this operation requires more fencing, hooray! */ - if (next_device) { -- log_async_result(cmd, pcmk_rc2legacy(stonith__result2rc(result)), pid, -- next_device->id, result->action_stdout, FALSE); -+ log_async_result(cmd, result, pid, next_device->id, false); - schedule_stonith_command(cmd, next_device); - /* Prevent cmd from being freed */ - cmd = NULL; --- -2.27.0 - - -From 9f9dea518da50f629589d505ea0f330a47111d76 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 28 Oct 2021 13:29:31 -0500 -Subject: [PATCH 10/12] Test: cts-fencing: update expected log messages - -... which now log the original exit status rather than a mapped legacy rc ---- - cts/cts-fencing.in | 28 ++++++++++++++-------------- - 1 file changed, 14 insertions(+), 14 deletions(-) - -diff --git a/cts/cts-fencing.in b/cts/cts-fencing.in -index babfb6351..5cd9f7b8f 100644 ---- a/cts/cts-fencing.in -+++ b/cts/cts-fencing.in -@@ -886,7 +886,7 @@ class Tests(object): - test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20") - - test.add_stonith_log_pattern("Total timeout set to 40") -- test.add_stonith_log_pattern("targeting node3 using false returned -201") -+ test.add_stonith_log_pattern("targeting node3 using false returned 1") - test.add_stonith_log_pattern("targeting node3 using true returned 0") - - # test what happens when the first fencing level fails. -@@ -920,8 +920,8 @@ class Tests(object): - test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 3") - - test.add_stonith_log_pattern("Total timeout set to 18") -- test.add_stonith_log_pattern("targeting node3 using false1 returned -201") -- test.add_stonith_log_pattern("targeting node3 using false2 returned -201") -+ test.add_stonith_log_pattern("targeting node3 using false1 returned 1") -+ test.add_stonith_log_pattern("targeting node3 using false2 returned 1") - test.add_stonith_log_pattern("targeting node3 using true3 returned 0") - test.add_stonith_log_pattern("targeting node3 using true4 returned 0") - -@@ -987,7 +987,7 @@ class Tests(object): - test.add_cmd("stonith_admin", "--output-as=xml -F node3 -t 20") - - test.add_stonith_log_pattern("Total timeout set to 8") -- test.add_stonith_log_pattern("targeting node3 using false1 returned -201") -+ test.add_stonith_log_pattern("targeting node3 using false1 returned 1") - test.add_stonith_neg_log_pattern("targeting node3 using false2 returned ") - test.add_stonith_log_pattern("targeting node3 using true3 returned 0") - test.add_stonith_log_pattern("targeting node3 using true4 returned 0") -@@ -1147,7 +1147,7 @@ class Tests(object): - "--output-as=xml -R true1 -a fence_dummy_no_reboot -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") - test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V") - test.add_stonith_log_pattern("does not support reboot") -- test.add_stonith_log_pattern("using true1 returned 0 (OK)") -+ test.add_stonith_log_pattern("using true1 returned 0") - - # make sure reboot is used when reboot action is advertised - for test_type in test_types: -@@ -1158,7 +1158,7 @@ class Tests(object): - "--output-as=xml -R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"") - test.add_cmd("stonith_admin", "--output-as=xml -B node1 -t 5 -V") - test.add_stonith_neg_log_pattern("does not advertise support for 'reboot', performing 'off'") -- test.add_stonith_log_pattern("using true1 returned 0 (OK)") -+ test.add_stonith_log_pattern("using true1 returned 0") - - # make sure requested fencing delay is applied only for the first device in the first level - # make sure static delay from pcmk_delay_base is added -@@ -1240,8 +1240,8 @@ class Tests(object): - '--output-as=xml -R true2 -a fence_dummy_auto_unfence -o "mode=pass" -o "pcmk_host_list=%s"' % (our_uname)) - test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) - # both devices should be executed -- test.add_stonith_log_pattern("using true1 returned 0 (OK)") -- test.add_stonith_log_pattern("using true2 returned 0 (OK)") -+ test.add_stonith_log_pattern("using true1 returned 0") -+ test.add_stonith_log_pattern("using true2 returned 0") - - ### verify unfencing using automatic unfencing fails if any of the required agents fail - test = self.new_test("cpg_unfence_required_2", -@@ -1264,8 +1264,8 @@ class Tests(object): - test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 1 -v true1" % (our_uname)) - test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 2 -v true2" % (our_uname)) - test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) -- test.add_stonith_log_pattern("using true1 returned 0 (OK)") -- test.add_stonith_log_pattern("using true2 returned 0 (OK)") -+ test.add_stonith_log_pattern("using true1 returned 0") -+ test.add_stonith_log_pattern("using true2 returned 0") - - ### verify unfencing using automatic devices with topology - test = self.new_test("cpg_unfence_required_4", -@@ -1296,10 +1296,10 @@ class Tests(object): - test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 3 -v false4" % (our_uname)) - test.add_cmd("stonith_admin", "--output-as=xml -r %s -i 4 -v true4" % (our_uname)) - test.add_cmd("stonith_admin", "--output-as=xml -U %s -t 3" % (our_uname)) -- test.add_stonith_log_pattern("using true1 returned 0 (OK)") -- test.add_stonith_log_pattern("using true2 returned 0 (OK)") -- test.add_stonith_log_pattern("using true3 returned 0 (OK)") -- test.add_stonith_log_pattern("using true4 returned 0 (OK)") -+ test.add_stonith_log_pattern("using true1 returned 0") -+ test.add_stonith_log_pattern("using true2 returned 0") -+ test.add_stonith_log_pattern("using true3 returned 0") -+ test.add_stonith_log_pattern("using true4 returned 0") - - def build_unfence_on_target_tests(self): - """ Register tests that verify unfencing that runs on the target """ --- -2.27.0 - - -From be72166ed9ccb53c218529783660503df95da719 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 16 Sep 2021 16:50:23 -0500 -Subject: [PATCH 11/12] Log: libcrmservice: downgrade failed action messages - -Previously, we would often get duplicate log messages for failed actions, -from the service library and again from its callers. - -Now that the service library tracks and provides exit reasons, callers can log -sufficient detail with better context, so downgrade the library's messages to -info level or lower. Similarly, avoid duplicate logs of process output. - -Certain messages (such as out-of-memory) remain at higher severity. ---- - daemons/controld/controld_execd.c | 15 +++--- - lib/fencing/st_client.c | 11 ++--- - lib/services/services.c | 14 +++--- - lib/services/services_linux.c | 80 ++++++++++++++++--------------- - lib/services/systemd.c | 20 ++++---- - lib/services/upstart.c | 19 ++++---- - 6 files changed, 80 insertions(+), 79 deletions(-) - -diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c -index bded6e6b6..3ddff6e13 100644 ---- a/daemons/controld/controld_execd.c -+++ b/daemons/controld/controld_execd.c -@@ -2684,16 +2684,15 @@ log_executor_event(lrmd_event_data_t *op, const char *op_key, - do_crm_log(log_level, "%s", str->str); - g_string_free(str, TRUE); - -- if (op->output != NULL) { -- char *prefix = crm_strdup_printf("%s-" PCMK__OP_FMT ":%d", node_name, -+ /* The services library has already logged the output at info or debug -+ * level, so just raise to notice if it looks like a failure. -+ */ -+ if ((op->output != NULL) && (op->rc != PCMK_OCF_OK)) { -+ char *prefix = crm_strdup_printf(PCMK__OP_FMT "@%s output", - op->rsc_id, op->op_type, -- op->interval_ms, op->call_id); -+ op->interval_ms, node_name); - -- if (op->rc) { -- crm_log_output(LOG_NOTICE, prefix, op->output); -- } else { -- crm_log_output(LOG_DEBUG, prefix, op->output); -- } -+ crm_log_output(LOG_NOTICE, prefix, op->output); - free(prefix); - } - } -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 3d4127eff..2fbff7f24 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -276,14 +276,9 @@ stonith__watchdog_fencing_enabled_for_node(const char *node) - static void - log_action(stonith_action_t *action, pid_t pid) - { -- if (action->result.action_stdout != NULL) { -- /* Logging the whole string confuses syslog when the string is xml */ -- char *prefix = crm_strdup_printf("%s[%d] stdout:", action->agent, pid); -- -- crm_log_output(LOG_TRACE, prefix, action->result.action_stdout); -- free(prefix); -- } -- -+ /* The services library has already logged the output at info or debug -+ * level, so just raise to warning for stderr. -+ */ - if (action->result.action_stderr != NULL) { - /* Logging the whole string confuses syslog when the string is xml */ - char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); -diff --git a/lib/services/services.c b/lib/services/services.c -index 86a0a213c..cf8bbc70e 100644 ---- a/lib/services/services.c -+++ b/lib/services/services.c -@@ -319,13 +319,13 @@ services__create_resource_action(const char *name, const char *standard, - rc = services__nagios_prepare(op); - #endif - } else { -- crm_err("Unknown resource standard: %s", op->standard); -+ crm_info("Unknown resource standard: %s", op->standard); - rc = ENOENT; - } - - if (rc != pcmk_rc_ok) { -- crm_err("Cannot prepare %s operation for %s: %s", -- action, name, strerror(rc)); -+ crm_info("Cannot prepare %s operation for %s: %s", -+ action, name, strerror(rc)); - services__handle_exec_error(op, rc); - } - return op; -@@ -967,14 +967,14 @@ execute_metadata_action(svc_action_t *op) - const char *class = op->standard; - - if (op->agent == NULL) { -- crm_err("meta-data requested without specifying agent"); -+ crm_info("Meta-data requested without specifying agent"); - services__set_result(op, services__generic_error(op), - PCMK_EXEC_ERROR_FATAL, "Agent not specified"); - return EINVAL; - } - - if (class == NULL) { -- crm_err("meta-data requested for agent %s without specifying class", -+ crm_info("Meta-data requested for agent %s without specifying class", - op->agent); - services__set_result(op, services__generic_error(op), - PCMK_EXEC_ERROR_FATAL, -@@ -986,8 +986,8 @@ execute_metadata_action(svc_action_t *op) - class = resources_find_service_class(op->agent); - } - if (class == NULL) { -- crm_err("meta-data requested for %s, but could not determine class", -- op->agent); -+ crm_info("Meta-data requested for %s, but could not determine class", -+ op->agent); - services__set_result(op, services__generic_error(op), - PCMK_EXEC_ERROR_HARD, - "Agent standard could not be determined"); -diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c -index b2ff27a0d..9a4c6cf80 100644 ---- a/lib/services/services_linux.c -+++ b/lib/services/services_linux.c -@@ -64,8 +64,8 @@ sigchld_setup(struct sigchld_data_s *data) - - // Block SIGCHLD (saving previous set of blocked signals to restore later) - if (sigprocmask(SIG_BLOCK, &(data->mask), &(data->old_mask)) < 0) { -- crm_err("Wait for child process completion failed: %s " -- CRM_XS " source=sigprocmask", pcmk_strerror(errno)); -+ crm_info("Wait for child process completion failed: %s " -+ CRM_XS " source=sigprocmask", pcmk_strerror(errno)); - return false; - } - return true; -@@ -81,8 +81,8 @@ sigchld_open(struct sigchld_data_s *data) - - fd = signalfd(-1, &(data->mask), SFD_NONBLOCK); - if (fd < 0) { -- crm_err("Wait for child process completion failed: %s " -- CRM_XS " source=signalfd", pcmk_strerror(errno)); -+ crm_info("Wait for child process completion failed: %s " -+ CRM_XS " source=signalfd", pcmk_strerror(errno)); - } - return fd; - } -@@ -108,8 +108,8 @@ sigchld_received(int fd) - } - s = read(fd, &fdsi, sizeof(struct signalfd_siginfo)); - if (s != sizeof(struct signalfd_siginfo)) { -- crm_err("Wait for child process completion failed: %s " -- CRM_XS " source=read", pcmk_strerror(errno)); -+ crm_info("Wait for child process completion failed: %s " -+ CRM_XS " source=read", pcmk_strerror(errno)); - - } else if (fdsi.ssi_signo == SIGCHLD) { - return true; -@@ -149,8 +149,8 @@ sigchld_handler() - if ((last_sigchld_data != NULL) - && (last_sigchld_data->pipe_fd[1] >= 0) - && (write(last_sigchld_data->pipe_fd[1], "", 1) == -1)) { -- crm_err("Wait for child process completion failed: %s " -- CRM_XS " source=write", pcmk_strerror(errno)); -+ crm_info("Wait for child process completion failed: %s " -+ CRM_XS " source=write", pcmk_strerror(errno)); - } - } - -@@ -162,19 +162,19 @@ sigchld_setup(struct sigchld_data_s *data) - data->pipe_fd[0] = data->pipe_fd[1] = -1; - - if (pipe(data->pipe_fd) == -1) { -- crm_err("Wait for child process completion failed: %s " -- CRM_XS " source=pipe", pcmk_strerror(errno)); -+ crm_info("Wait for child process completion failed: %s " -+ CRM_XS " source=pipe", pcmk_strerror(errno)); - return false; - } - - rc = pcmk__set_nonblocking(data->pipe_fd[0]); - if (rc != pcmk_rc_ok) { -- crm_warn("Could not set pipe input non-blocking: %s " CRM_XS " rc=%d", -+ crm_info("Could not set pipe input non-blocking: %s " CRM_XS " rc=%d", - pcmk_rc_str(rc), rc); - } - rc = pcmk__set_nonblocking(data->pipe_fd[1]); - if (rc != pcmk_rc_ok) { -- crm_warn("Could not set pipe output non-blocking: %s " CRM_XS " rc=%d", -+ crm_info("Could not set pipe output non-blocking: %s " CRM_XS " rc=%d", - pcmk_rc_str(rc), rc); - } - -@@ -183,8 +183,8 @@ sigchld_setup(struct sigchld_data_s *data) - data->sa.sa_flags = 0; - sigemptyset(&(data->sa.sa_mask)); - if (sigaction(SIGCHLD, &(data->sa), &(data->old_sa)) < 0) { -- crm_err("Wait for child process completion failed: %s " -- CRM_XS " source=sigaction", pcmk_strerror(errno)); -+ crm_info("Wait for child process completion failed: %s " -+ CRM_XS " source=sigaction", pcmk_strerror(errno)); - } - - // Remember data for use in signal handler -@@ -585,7 +585,11 @@ log_op_output(svc_action_t *op) - { - char *prefix = crm_strdup_printf("%s[%d] error output", op->id, op->pid); - -- crm_log_output(LOG_NOTICE, prefix, op->stderr_data); -+ /* The library caller has better context to know how important the output -+ * is, so log it at info and debug severity here. They can log it again at -+ * higher severity if appropriate. -+ */ -+ crm_log_output(LOG_INFO, prefix, op->stderr_data); - strcpy(prefix + strlen(prefix) - strlen("error output"), "output"); - crm_log_output(LOG_DEBUG, prefix, op->stdout_data); - free(prefix); -@@ -673,7 +677,7 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo, - parse_exit_reason_from_stderr(op); - - } else if (mainloop_child_timeout(p)) { -- crm_warn("%s[%d] timed out after %dms", op->id, op->pid, op->timeout); -+ crm_info("%s[%d] timed out after %dms", op->id, op->pid, op->timeout); - services__set_result(op, services__generic_error(op), PCMK_EXEC_TIMEOUT, - "Process did not exit within specified timeout"); - -@@ -686,7 +690,7 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo, - services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_CANCELLED, NULL); - - } else { -- crm_warn("%s[%d] terminated with signal %d (%s)", -+ crm_info("%s[%d] terminated with signal %d (%s)", - op->id, op->pid, signo, strsignal(signo)); - services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR, - "Process interrupted by signal"); -@@ -908,12 +912,12 @@ action_launch_child(svc_action_t *op) - sp.sched_priority = 0; - - if (sched_setscheduler(0, SCHED_OTHER, &sp) == -1) { -- crm_warn("Could not reset scheduling policy for %s", op->id); -+ crm_info("Could not reset scheduling policy for %s", op->id); - } - } - #endif - if (setpriority(PRIO_PROCESS, 0, 0) == -1) { -- crm_warn("Could not reset process priority for %s", op->id); -+ crm_info("Could not reset process priority for %s", op->id); - } - - /* Man: The call setpgrp() is equivalent to setpgid(0,0) -@@ -941,7 +945,7 @@ action_launch_child(svc_action_t *op) - } else { - crm_err("Considering %s unconfigured " - "because unable to load CIB secrets: %s", -- op->rsc, pcmk_rc_str(rc)); -+ op->rsc, pcmk_rc_str(rc)); - exit_child(op, services__configuration_error(op, false), - "Unable to load CIB secrets"); - } -@@ -1043,7 +1047,7 @@ wait_for_sync_result(svc_action_t *op, struct sigchld_data_s *data) - - } else if (wait_rc < 0) { - wait_reason = pcmk_rc_str(errno); -- crm_warn("Wait for completion of %s[%d] failed: %s " -+ crm_info("Wait for completion of %s[%d] failed: %s " - CRM_XS " source=waitpid", - op->id, op->pid, wait_reason); - wait_rc = 0; // Act as if process is still running -@@ -1057,8 +1061,8 @@ wait_for_sync_result(svc_action_t *op, struct sigchld_data_s *data) - - } else if ((poll_rc < 0) && (errno != EINTR)) { - wait_reason = pcmk_rc_str(errno); -- crm_err("Wait for completion of %s[%d] failed: %s " -- CRM_XS " source=poll", op->id, op->pid, wait_reason); -+ crm_info("Wait for completion of %s[%d] failed: %s " -+ CRM_XS " source=poll", op->id, op->pid, wait_reason); - break; - } - -@@ -1078,7 +1082,7 @@ wait_for_sync_result(svc_action_t *op, struct sigchld_data_s *data) - services__set_result(op, services__generic_error(op), - PCMK_EXEC_TIMEOUT, - "Process did not exit within specified timeout"); -- crm_warn("%s[%d] timed out after %dms", -+ crm_info("%s[%d] timed out after %dms", - op->id, op->pid, op->timeout); - - } else { -@@ -1110,8 +1114,8 @@ wait_for_sync_result(svc_action_t *op, struct sigchld_data_s *data) - - services__set_result(op, services__generic_error(op), PCMK_EXEC_ERROR, - "Process interrupted by signal"); -- crm_err("%s[%d] terminated with signal %d (%s)", -- op->id, op->pid, signo, strsignal(signo)); -+ crm_info("%s[%d] terminated with signal %d (%s)", -+ op->id, op->pid, signo, strsignal(signo)); - - #ifdef WCOREDUMP - if (WCOREDUMP(status)) { -@@ -1155,7 +1159,7 @@ services__execute_file(svc_action_t *op) - // Catch common failure conditions early - if (stat(op->opaque->exec, &st) != 0) { - rc = errno; -- crm_warn("Cannot execute '%s': %s " CRM_XS " stat rc=%d", -+ crm_info("Cannot execute '%s': %s " CRM_XS " stat rc=%d", - op->opaque->exec, pcmk_strerror(rc), rc); - services__handle_exec_error(op, rc); - goto done; -@@ -1163,8 +1167,8 @@ services__execute_file(svc_action_t *op) - - if (pipe(stdout_fd) < 0) { - rc = errno; -- crm_err("Cannot execute '%s': %s " CRM_XS " pipe(stdout) rc=%d", -- op->opaque->exec, pcmk_strerror(rc), rc); -+ crm_info("Cannot execute '%s': %s " CRM_XS " pipe(stdout) rc=%d", -+ op->opaque->exec, pcmk_strerror(rc), rc); - services__handle_exec_error(op, rc); - goto done; - } -@@ -1174,8 +1178,8 @@ services__execute_file(svc_action_t *op) - - close_pipe(stdout_fd); - -- crm_err("Cannot execute '%s': %s " CRM_XS " pipe(stderr) rc=%d", -- op->opaque->exec, pcmk_strerror(rc), rc); -+ crm_info("Cannot execute '%s': %s " CRM_XS " pipe(stderr) rc=%d", -+ op->opaque->exec, pcmk_strerror(rc), rc); - services__handle_exec_error(op, rc); - goto done; - } -@@ -1187,8 +1191,8 @@ services__execute_file(svc_action_t *op) - close_pipe(stdout_fd); - close_pipe(stderr_fd); - -- crm_err("Cannot execute '%s': %s " CRM_XS " pipe(stdin) rc=%d", -- op->opaque->exec, pcmk_strerror(rc), rc); -+ crm_info("Cannot execute '%s': %s " CRM_XS " pipe(stdin) rc=%d", -+ op->opaque->exec, pcmk_strerror(rc), rc); - services__handle_exec_error(op, rc); - goto done; - } -@@ -1212,8 +1216,8 @@ services__execute_file(svc_action_t *op) - close_pipe(stdout_fd); - close_pipe(stderr_fd); - -- crm_err("Cannot execute '%s': %s " CRM_XS " fork rc=%d", -- op->opaque->exec, pcmk_strerror(rc), rc); -+ crm_info("Cannot execute '%s': %s " CRM_XS " fork rc=%d", -+ op->opaque->exec, pcmk_strerror(rc), rc); - services__handle_exec_error(op, rc); - if (op->synchronous) { - sigchld_cleanup(&data); -@@ -1271,7 +1275,7 @@ services__execute_file(svc_action_t *op) - op->opaque->stdout_fd = stdout_fd[0]; - rc = pcmk__set_nonblocking(op->opaque->stdout_fd); - if (rc != pcmk_rc_ok) { -- crm_warn("Could not set '%s' output non-blocking: %s " -+ crm_info("Could not set '%s' output non-blocking: %s " - CRM_XS " rc=%d", - op->opaque->exec, pcmk_rc_str(rc), rc); - } -@@ -1279,7 +1283,7 @@ services__execute_file(svc_action_t *op) - op->opaque->stderr_fd = stderr_fd[0]; - rc = pcmk__set_nonblocking(op->opaque->stderr_fd); - if (rc != pcmk_rc_ok) { -- crm_warn("Could not set '%s' error output non-blocking: %s " -+ crm_info("Could not set '%s' error output non-blocking: %s " - CRM_XS " rc=%d", - op->opaque->exec, pcmk_rc_str(rc), rc); - } -@@ -1290,7 +1294,7 @@ services__execute_file(svc_action_t *op) - // as long as no other standard uses stdin_fd assume stonith - rc = pcmk__set_nonblocking(op->opaque->stdin_fd); - if (rc != pcmk_rc_ok) { -- crm_warn("Could not set '%s' input non-blocking: %s " -+ crm_info("Could not set '%s' input non-blocking: %s " - CRM_XS " fd=%d,rc=%d", op->opaque->exec, - pcmk_rc_str(rc), op->opaque->stdin_fd, rc); - } -diff --git a/lib/services/systemd.c b/lib/services/systemd.c -index 6f5bef960..8e9fff484 100644 ---- a/lib/services/systemd.c -+++ b/lib/services/systemd.c -@@ -232,7 +232,8 @@ systemd_daemon_reload_complete(DBusPendingCall *pending, void *user_data) - } - - if (pcmk_dbus_find_error(pending, reply, &error)) { -- crm_err("Could not issue systemd reload %d: %s", reload_count, error.message); -+ crm_warn("Could not issue systemd reload %d: %s", -+ reload_count, error.message); - dbus_error_free(&error); - - } else { -@@ -291,8 +292,8 @@ set_result_from_method_error(svc_action_t *op, const DBusError *error) - PCMK_EXEC_NOT_INSTALLED, "systemd unit not found"); - } - -- crm_err("DBus request for %s of systemd unit %s for resource %s failed: %s", -- op->action, op->agent, crm_str(op->rsc), error->message); -+ crm_info("DBus request for %s of systemd unit %s for resource %s failed: %s", -+ op->action, op->agent, crm_str(op->rsc), error->message); - } - - /*! -@@ -325,11 +326,11 @@ execute_after_loadunit(DBusMessage *reply, svc_action_t *op) - if (op != NULL) { - services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR, - "systemd DBus method had unexpected reply"); -- crm_err("Could not load systemd unit %s for %s: " -- "DBus reply has unexpected type", op->agent, op->id); -+ crm_info("Could not load systemd unit %s for %s: " -+ "DBus reply has unexpected type", op->agent, op->id); - } else { -- crm_err("Could not load systemd unit: " -- "DBus reply has unexpected type"); -+ crm_info("Could not load systemd unit: " -+ "DBus reply has unexpected type"); - } - - } else { -@@ -688,7 +689,7 @@ process_unit_method_reply(DBusMessage *reply, svc_action_t *op) - - } else if (!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, - __func__, __LINE__)) { -- crm_warn("DBus request for %s of %s succeeded but " -+ crm_info("DBus request for %s of %s succeeded but " - "return type was unexpected", op->action, crm_str(op->rsc)); - services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, - "systemd DBus method had unexpected reply"); -@@ -981,7 +982,8 @@ systemd_timeout_callback(gpointer p) - svc_action_t * op = p; - - op->opaque->timerid = 0; -- crm_warn("%s operation on systemd unit %s named '%s' timed out", op->action, op->agent, op->rsc); -+ crm_info("%s action for systemd unit %s named '%s' timed out", -+ op->action, op->agent, op->rsc); - services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT, - "Systemd action did not complete within specified timeout"); - services__finalize_async_op(op); -diff --git a/lib/services/upstart.c b/lib/services/upstart.c -index 2fdc229ad..2ece803e1 100644 ---- a/lib/services/upstart.c -+++ b/lib/services/upstart.c -@@ -308,21 +308,21 @@ get_first_instance(const gchar * job, int timeout) - dbus_message_unref(msg); - - if (dbus_error_is_set(&error)) { -- crm_err("Call to %s failed: %s", method, error.message); -+ crm_info("Call to %s failed: %s", method, error.message); - dbus_error_free(&error); - goto done; - - } else if(reply == NULL) { -- crm_err("Call to %s failed: no reply", method); -+ crm_info("Call to %s failed: no reply", method); - goto done; - - } else if (!dbus_message_iter_init(reply, &args)) { -- crm_err("Call to %s failed: Message has no arguments", method); -+ crm_info("Call to %s failed: Message has no arguments", method); - goto done; - } - - if(!pcmk_dbus_type_check(reply, &args, DBUS_TYPE_ARRAY, __func__, __LINE__)) { -- crm_err("Call to %s failed: Message has invalid arguments", method); -+ crm_info("Call to %s failed: Message has invalid arguments", method); - goto done; - } - -@@ -432,8 +432,8 @@ set_result_from_method_error(svc_action_t *op, const DBusError *error) - return; - } - -- crm_err("DBus request for %s of Upstart job %s for resource %s failed: %s", -- op->action, op->agent, crm_str(op->rsc), error->message); -+ crm_info("DBus request for %s of Upstart job %s for resource %s failed: %s", -+ op->action, op->agent, crm_str(op->rsc), error->message); - } - - /*! -@@ -468,7 +468,7 @@ job_method_complete(DBusPendingCall *pending, void *user_data) - - } else if (!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, - __func__, __LINE__)) { -- crm_warn("DBus request for %s of %s succeeded but " -+ crm_info("DBus request for %s of %s succeeded but " - "return type was unexpected", op->action, crm_str(op->rsc)); - services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); - -@@ -667,7 +667,8 @@ services__execute_upstart(svc_action_t *op) - - } else if (!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, - __func__, __LINE__)) { -- crm_warn("Call to %s passed but return type was unexpected", op->action); -+ crm_info("Call to %s passed but return type was unexpected", -+ op->action); - services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); - - } else { -@@ -675,7 +676,7 @@ services__execute_upstart(svc_action_t *op) - - dbus_message_get_args(reply, NULL, DBUS_TYPE_OBJECT_PATH, &path, - DBUS_TYPE_INVALID); -- crm_info("Call to %s passed: %s", op->action, path); -+ crm_debug("Call to %s passed: %s", op->action, path); - services__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); - } - --- -2.27.0 - - -From 39f6861c72eb9dd76d2cf3da287fe7485615631b Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 8 Nov 2021 09:43:38 -0600 -Subject: [PATCH 12/12] Low: fencing: avoid use-after-free with new result - object - -itnroduced by 153c9b552 (not released) ---- - lib/fencing/st_rhcs.c | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c -index 23e694975..6c8cbedc7 100644 ---- a/lib/fencing/st_rhcs.c -+++ b/lib/fencing/st_rhcs.c -@@ -143,15 +143,17 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) - if (result->execution_status != PCMK_EXEC_DONE) { - crm_warn("Could not execute metadata action for %s: %s", - agent, pcmk_exec_status_str(result->execution_status)); -+ rc = pcmk_rc2legacy(stonith__result2rc(result)); - stonith__destroy_action(action); -- return pcmk_rc2legacy(stonith__result2rc(result)); -+ return rc; - } - - if (result->exit_status != CRM_EX_OK) { - crm_warn("Metadata action for %s returned error code %d", - agent, result->exit_status); -+ rc = pcmk_rc2legacy(stonith__result2rc(result)); - stonith__destroy_action(action); -- return pcmk_rc2legacy(stonith__result2rc(result)); -+ return rc; - } - - if (result->action_stdout == NULL) { --- -2.27.0 - diff --git a/SOURCES/003-fencing-reasons.patch b/SOURCES/003-fencing-reasons.patch deleted file mode 100644 index 666a12a..0000000 --- a/SOURCES/003-fencing-reasons.patch +++ /dev/null @@ -1,2476 +0,0 @@ -From 8e6362cb2129bd56f817d449a195f3da87a545fa Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 12 Nov 2021 14:28:56 -0600 -Subject: [PATCH 01/13] Refactor: libcrmcommon,fencer: convenience macro for - initializing results - -for future reuse ---- - daemons/fenced/fenced_commands.c | 14 ++------------ - include/crm/common/results_internal.h | 15 +++++++++++++++ - 2 files changed, 17 insertions(+), 12 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 87600573e..9f2f1cc40 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -388,12 +388,7 @@ static void - report_internal_result(async_command_t *cmd, int exit_status, - int execution_status, const char *exit_reason) - { -- pcmk__action_result_t result = { -- // Ensure we don't pass garbage to free() -- .exit_reason = NULL, -- .action_stdout = NULL, -- .action_stderr = NULL -- }; -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - pcmk__set_result(&result, exit_status, execution_status, exit_reason); - cmd->done_cb(0, &result, cmd); -@@ -2616,12 +2611,7 @@ stonith_fence_get_devices_cb(GList * devices, void *user_data) - } - - if (device == NULL) { // No device found -- pcmk__action_result_t result = { -- // Ensure we don't pass garbage to free() -- .exit_reason = NULL, -- .action_stdout = NULL, -- .action_stderr = NULL -- }; -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, - "No fence device configured for target"); -diff --git a/include/crm/common/results_internal.h b/include/crm/common/results_internal.h -index 804bf2a7a..6befaa0ed 100644 ---- a/include/crm/common/results_internal.h -+++ b/include/crm/common/results_internal.h -@@ -30,6 +30,21 @@ typedef struct { - char *action_stderr; // Action error output - } pcmk__action_result_t; - -+/*! -+ * \internal -+ * \brief Static initialization for an action result -+ * -+ * \note Importantly, this ensures pcmk__reset_result() won't try to free -+ * garbage. -+ */ -+#define PCMK__UNKNOWN_RESULT { \ -+ .exit_status = CRM_EX_OK, \ -+ .execution_status = PCMK_EXEC_UNKNOWN, \ -+ .exit_reason = NULL, \ -+ .action_stdout = NULL, \ -+ .action_stderr = NULL, \ -+ } -+ - void pcmk__set_result(pcmk__action_result_t *result, int exit_status, - enum pcmk_exec_status exec_status, - const char *exit_reason); --- -2.27.0 - - -From 0937c92476ac737a5f5146932824bde8bdd7db98 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 12 Nov 2021 16:02:27 -0600 -Subject: [PATCH 02/13] Refactor: various: add convenience function for - checking result success - -A successful pcmk__action_result_t has both exit status CRM_EX_OK (a.k.a -PCMK_OCF_OK) and execution status PCMK_EXEC_DONE. Since checking that is -clunky, we sometimes just check exit status, which is less than ideal. - -The convenience function makes it easy to check both, and improves readability. ---- - daemons/controld/controld_remote_ra.c | 4 ++-- - daemons/execd/execd_commands.c | 12 ++++++------ - daemons/fenced/fenced_commands.c | 14 ++++++-------- - include/crm/common/results_internal.h | 16 ++++++++++++++++ - lib/fencing/st_client.c | 4 ++-- - lib/fencing/st_rhcs.c | 2 +- - 6 files changed, 33 insertions(+), 19 deletions(-) - -diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c -index 74cbfd673..55ac162c7 100644 ---- a/daemons/controld/controld_remote_ra.c -+++ b/daemons/controld/controld_remote_ra.c -@@ -297,7 +297,7 @@ static void - check_remote_node_state(remote_ra_cmd_t *cmd) - { - /* Only successful actions can change node state */ -- if (cmd->result.exit_status != PCMK_OCF_OK) { -+ if (!pcmk__result_ok(&(cmd->result))) { - return; - } - -@@ -365,7 +365,7 @@ report_remote_ra_result(remote_ra_cmd_t * cmd) - lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status, - cmd->result.exit_reason); - -- if (cmd->reported_success && (cmd->result.exit_status != PCMK_OCF_OK)) { -+ if (cmd->reported_success && !pcmk__result_ok(&(cmd->result))) { - op.t_rcchange = (unsigned int) time(NULL); - /* This edge case will likely never ever occur, but if it does the - * result is that a failure will not be processed correctly. This is only -diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c -index 667525039..02070bf11 100644 ---- a/daemons/execd/execd_commands.c -+++ b/daemons/execd/execd_commands.c -@@ -878,7 +878,7 @@ action_complete(svc_action_t * action) - } - - if (pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_SYSTEMD, pcmk__str_casei)) { -- if ((cmd->result.exit_status == PCMK_OCF_OK) -+ if (pcmk__result_ok(&(cmd->result)) - && pcmk__strcase_any_of(cmd->action, "start", "stop", NULL)) { - /* systemd returns from start and stop actions after the action - * begins, not after it completes. We have to jump through a few -@@ -894,7 +894,7 @@ action_complete(svc_action_t * action) - if (cmd->result.execution_status == PCMK_EXEC_PENDING) { - goagain = true; - -- } else if ((cmd->result.exit_status == PCMK_OCF_OK) -+ } else if (pcmk__result_ok(&(cmd->result)) - && pcmk__str_eq(cmd->real_action, "stop", pcmk__str_casei)) { - goagain = true; - -@@ -927,12 +927,12 @@ action_complete(svc_action_t * action) - #if SUPPORT_NAGIOS - if (rsc && pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_NAGIOS, pcmk__str_casei)) { - if (action_matches(cmd, "monitor", 0) -- && (cmd->result.exit_status == PCMK_OCF_OK)) { -+ && pcmk__result_ok(&(cmd->result))) { - /* Successfully executed --version for the nagios plugin */ - cmd->result.exit_status = PCMK_OCF_NOT_RUNNING; - - } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei) -- && (cmd->result.exit_status != PCMK_OCF_OK)) { -+ && !pcmk__result_ok(&(cmd->result))) { - #ifdef PCMK__TIME_USE_CGT - goagain = true; - #endif -@@ -955,7 +955,7 @@ action_complete(svc_action_t * action) - cmd->start_delay = delay; - cmd->timeout = timeout_left; - -- if (cmd->result.exit_status == PCMK_OCF_OK) { -+ if (pcmk__result_ok(&(cmd->result))) { - crm_debug("%s %s may still be in progress: re-scheduling (elapsed=%dms, remaining=%dms, start_delay=%dms)", - cmd->rsc_id, cmd->real_action, time_sum, timeout_left, delay); - -@@ -1066,7 +1066,7 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc) - cmd->interval_ms, rc); - - // Certain successful actions change the known state of the resource -- if ((rsc != NULL) && (cmd->result.exit_status == PCMK_OCF_OK)) { -+ if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { - if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { - rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK - } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 9f2f1cc40..26501a4b3 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -1188,8 +1188,7 @@ dynamic_list_search_cb(int pid, const pcmk__action_result_t *result, - - mainloop_set_trigger(dev->work); - -- if ((result->execution_status == PCMK_EXEC_DONE) -- && (result->exit_status == CRM_EX_OK)) { -+ if (pcmk__result_ok(result)) { - crm_info("Refreshing target list for %s", dev->id); - g_list_free_full(dev->targets, free); - dev->targets = stonith__parse_targets(result->action_stdout); -@@ -2310,15 +2309,14 @@ log_async_result(async_command_t *cmd, const pcmk__action_result_t *result, - GString *msg = g_string_sized_new(80); // Reasonable starting size - - // Choose log levels appropriately if we have a result -- if ((result->execution_status == PCMK_EXEC_DONE) -- && (result->exit_status == CRM_EX_OK)) { // Success -+ if (pcmk__result_ok(result)) { - log_level = (cmd->victim == NULL)? LOG_DEBUG : LOG_NOTICE; - if ((result->action_stdout != NULL) - && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { - output_log_level = LOG_DEBUG; - } - next = NULL; -- } else { // Failure -+ } else { - log_level = (cmd->victim == NULL)? LOG_NOTICE : LOG_ERR; - if ((result->action_stdout != NULL) - && !pcmk__str_eq(cmd->action, "metadata", pcmk__str_casei)) { -@@ -2482,7 +2480,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) - /* The device is ready to do something else now */ - device = g_hash_table_lookup(device_list, cmd->device); - if (device) { -- if (!device->verified && (result->exit_status == CRM_EX_OK) && -+ if (!device->verified && pcmk__result_ok(result) && - (pcmk__strcase_any_of(cmd->action, "list", "monitor", "status", NULL))) { - - device->verified = TRUE; -@@ -2491,7 +2489,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) - mainloop_set_trigger(device->work); - } - -- if (result->exit_status == CRM_EX_OK) { -+ if (pcmk__result_ok(result)) { - GList *iter; - /* see if there are any required devices left to execute for this op */ - for (iter = cmd->device_next; iter != NULL; iter = iter->next) { -@@ -2523,7 +2521,7 @@ st_child_done(int pid, const pcmk__action_result_t *result, void *user_data) - - send_async_reply(cmd, result, pid, false); - -- if (result->exit_status != CRM_EX_OK) { -+ if (!pcmk__result_ok(result)) { - goto done; - } - -diff --git a/include/crm/common/results_internal.h b/include/crm/common/results_internal.h -index 6befaa0ed..0c5833937 100644 ---- a/include/crm/common/results_internal.h -+++ b/include/crm/common/results_internal.h -@@ -54,4 +54,20 @@ void pcmk__set_result_output(pcmk__action_result_t *result, - - void pcmk__reset_result(pcmk__action_result_t *result); - -+/*! -+ * \internal -+ * \brief Check whether a result is OK -+ * -+ * \param[in] result -+ * -+ * \return true if the result's exit status is CRM_EX_OK and its -+ * execution status is PCMK_EXEC_DONE, otherwise false -+ */ -+static inline bool -+pcmk__result_ok(const pcmk__action_result_t *result) -+{ -+ return (result != NULL) && (result->exit_status == CRM_EX_OK) -+ && (result->execution_status == PCMK_EXEC_DONE); -+} -+ - #endif // PCMK__COMMON_RESULTS_INTERNAL__H -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 2fbff7f24..af461d0d4 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -760,7 +760,7 @@ stonith__result2rc(const pcmk__action_result_t *result) - default: break; - } - -- if (result->exit_status == CRM_EX_OK) { -+ if (pcmk__result_ok(result)) { - return pcmk_rc_ok; - } - -@@ -797,7 +797,7 @@ stonith_action_async_done(svc_action_t *svc_action) - - log_action(action, action->pid); - -- if ((action->result.exit_status != CRM_EX_OK) -+ if (!pcmk__result_ok(&(action->result)) - && update_remaining_timeout(action)) { - - int rc = internal_stonith_action_execute(action); -diff --git a/lib/fencing/st_rhcs.c b/lib/fencing/st_rhcs.c -index 6c8cbedc7..865e04bc2 100644 ---- a/lib/fencing/st_rhcs.c -+++ b/lib/fencing/st_rhcs.c -@@ -148,7 +148,7 @@ stonith__rhcs_get_metadata(const char *agent, int timeout, xmlNode **metadata) - return rc; - } - -- if (result->exit_status != CRM_EX_OK) { -+ if (!pcmk__result_ok(result)) { - crm_warn("Metadata action for %s returned error code %d", - agent, result->exit_status); - rc = pcmk_rc2legacy(stonith__result2rc(result)); --- -2.27.0 - - -From 4c39ff00a0c028354a9da7f80986f7e34b05ba08 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 12 Nov 2021 16:07:01 -0600 -Subject: [PATCH 03/13] Low: fencing: improve mapping of execution status to - legacy return code - -PCMK_EXEC_PENDING is likely not possible with the current code, but map it to -EINPROGRESS for completeness. - -PCMK_EXEC_INVALID is not yet used by the fencer but will be. ---- - lib/fencing/st_client.c | 30 ++++++++++++++++++++++++++---- - 1 file changed, 26 insertions(+), 4 deletions(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index af461d0d4..93513e9f3 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -749,7 +749,12 @@ update_remaining_timeout(stonith_action_t * action) - int - stonith__result2rc(const pcmk__action_result_t *result) - { -+ if (pcmk__result_ok(result)) { -+ return pcmk_rc_ok; -+ } -+ - switch (result->execution_status) { -+ case PCMK_EXEC_PENDING: return EINPROGRESS; - case PCMK_EXEC_CANCELLED: return ECANCELED; - case PCMK_EXEC_TIMEOUT: return ETIME; - case PCMK_EXEC_NOT_INSTALLED: return ENOENT; -@@ -757,11 +762,28 @@ stonith__result2rc(const pcmk__action_result_t *result) - case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; - case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; - case PCMK_EXEC_NO_SECRETS: return EACCES; -- default: break; -- } - -- if (pcmk__result_ok(result)) { -- return pcmk_rc_ok; -+ /* For the fencing API, PCMK_EXEC_INVALID is used with fencer API -+ * operations that don't involve executing an agent (for example, -+ * registering devices). This allows us to use the CRM_EX_* codes in the -+ * exit status for finer-grained responses. -+ */ -+ case PCMK_EXEC_INVALID: -+ switch (result->exit_status) { -+ case CRM_EX_INSUFFICIENT_PRIV: return EACCES; -+ case CRM_EX_PROTOCOL: return EPROTO; -+ -+ /* CRM_EX_EXPIRED is used for orphaned fencing operations left -+ * over from a previous instance of the fencer. For API backward -+ * compatibility, this is mapped to the previously used code for -+ * this case, EHOSTUNREACH. -+ */ -+ case CRM_EX_EXPIRED: return EHOSTUNREACH; -+ default: break; -+ } -+ -+ default: -+ break; - } - - // Try to provide useful error code based on result's error output --- -2.27.0 - - -From 4e638783d1cd7c9398a603fc6df7e9d868262b16 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 18 Nov 2021 11:41:12 -0600 -Subject: [PATCH 04/13] Refactor: libstonithd: separate action-related code - into own source file - -Everything related to stonith_action_t has been moved from st_client.c to a new -st_actions.c, since st_client.c was ridiculously large, and the action stuff -isn't all client-related. No code was changed. - -Before: - 2804 st_client.c - -After: - 545 lib/fencing/st_actions.c - 2278 lib/fencing/st_client.c ---- - lib/fencing/Makefile.am | 2 +- - lib/fencing/st_actions.c | 545 +++++++++++++++++++++++++++++++++++++++ - lib/fencing/st_client.c | 528 +------------------------------------ - 3 files changed, 547 insertions(+), 528 deletions(-) - create mode 100644 lib/fencing/st_actions.c - -diff --git a/lib/fencing/Makefile.am b/lib/fencing/Makefile.am -index 205c4873d..dac215c16 100644 ---- a/lib/fencing/Makefile.am -+++ b/lib/fencing/Makefile.am -@@ -22,7 +22,7 @@ libstonithd_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) - libstonithd_la_LIBADD = $(top_builddir)/lib/common/libcrmcommon.la - libstonithd_la_LIBADD += $(top_builddir)/lib/services/libcrmservice.la - --libstonithd_la_SOURCES = st_client.c st_output.c st_rhcs.c -+libstonithd_la_SOURCES = st_actions.c st_client.c st_output.c st_rhcs.c - if BUILD_LHA_SUPPORT - libstonithd_la_SOURCES += st_lha.c - endif -diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c -new file mode 100644 -index 000000000..64d3afd5d ---- /dev/null -+++ b/lib/fencing/st_actions.c -@@ -0,0 +1,545 @@ -+/* -+ * Copyright 2004-2021 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU Lesser General Public License -+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. -+ */ -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include "fencing_private.h" -+ -+struct stonith_action_s { -+ /*! user defined data */ -+ char *agent; -+ char *action; -+ char *victim; -+ GHashTable *args; -+ int timeout; -+ int async; -+ void *userdata; -+ void (*done_cb) (int pid, const pcmk__action_result_t *result, -+ void *user_data); -+ void (*fork_cb) (int pid, void *user_data); -+ -+ svc_action_t *svc_action; -+ -+ /*! internal timing information */ -+ time_t initial_start_time; -+ int tries; -+ int remaining_timeout; -+ int max_retries; -+ -+ int pid; -+ pcmk__action_result_t result; -+}; -+ -+static int internal_stonith_action_execute(stonith_action_t *action); -+static void log_action(stonith_action_t *action, pid_t pid); -+ -+/*! -+ * \internal -+ * \brief Set an action's result based on services library result -+ * -+ * \param[in] action Fence action to set result for -+ * \param[in] svc_action Service action to get result from -+ */ -+static void -+set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) -+{ -+ pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, -+ services__exit_reason(svc_action)); -+ pcmk__set_result_output(&(action->result), -+ services__grab_stdout(svc_action), -+ services__grab_stderr(svc_action)); -+} -+ -+static void -+log_action(stonith_action_t *action, pid_t pid) -+{ -+ /* The services library has already logged the output at info or debug -+ * level, so just raise to warning for stderr. -+ */ -+ if (action->result.action_stderr != NULL) { -+ /* Logging the whole string confuses syslog when the string is xml */ -+ char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); -+ -+ crm_log_output(LOG_WARNING, prefix, action->result.action_stderr); -+ free(prefix); -+ } -+} -+ -+static void -+append_config_arg(gpointer key, gpointer value, gpointer user_data) -+{ -+ /* The fencer will filter "action" out when it registers the device, -+ * but ignore it here in case any external API users don't. -+ * -+ * Also filter out parameters handled directly by Pacemaker. -+ */ -+ if (!pcmk__str_eq(key, STONITH_ATTR_ACTION_OP, pcmk__str_casei) -+ && !pcmk_stonith_param(key) -+ && (strstr(key, CRM_META) == NULL) -+ && !pcmk__str_eq(key, "crm_feature_set", pcmk__str_casei)) { -+ -+ crm_trace("Passing %s=%s with fence action", -+ (const char *) key, (const char *) (value? value : "")); -+ g_hash_table_insert((GHashTable *) user_data, -+ strdup(key), strdup(value? value : "")); -+ } -+} -+ -+static GHashTable * -+make_args(const char *agent, const char *action, const char *victim, -+ uint32_t victim_nodeid, GHashTable * device_args, -+ GHashTable * port_map, const char *host_arg) -+{ -+ GHashTable *arg_list = NULL; -+ const char *value = NULL; -+ -+ CRM_CHECK(action != NULL, return NULL); -+ -+ arg_list = pcmk__strkey_table(free, free); -+ -+ // Add action to arguments (using an alias if requested) -+ if (device_args) { -+ char buffer[512]; -+ -+ snprintf(buffer, sizeof(buffer), "pcmk_%s_action", action); -+ value = g_hash_table_lookup(device_args, buffer); -+ if (value) { -+ crm_debug("Substituting '%s' for fence action %s targeting %s", -+ value, action, victim); -+ action = value; -+ } -+ } -+ g_hash_table_insert(arg_list, strdup(STONITH_ATTR_ACTION_OP), -+ strdup(action)); -+ -+ /* If this is a fencing operation against another node, add more standard -+ * arguments. -+ */ -+ if (victim && device_args) { -+ const char *param = NULL; -+ -+ /* Always pass the target's name, per -+ * https://github.com/ClusterLabs/fence-agents/blob/master/doc/FenceAgentAPI.md -+ */ -+ g_hash_table_insert(arg_list, strdup("nodename"), strdup(victim)); -+ -+ // If the target's node ID was specified, pass it, too -+ if (victim_nodeid) { -+ char *nodeid = crm_strdup_printf("%" PRIu32, victim_nodeid); -+ -+ // cts-fencing looks for this log message -+ crm_info("Passing '%s' as nodeid with fence action '%s' targeting %s", -+ nodeid, action, victim); -+ g_hash_table_insert(arg_list, strdup("nodeid"), nodeid); -+ } -+ -+ // Check whether target must be specified in some other way -+ param = g_hash_table_lookup(device_args, PCMK_STONITH_HOST_ARGUMENT); -+ if (!pcmk__str_eq(agent, "fence_legacy", pcmk__str_none) -+ && !pcmk__str_eq(param, "none", pcmk__str_casei)) { -+ -+ if (param == NULL) { -+ /* Use the caller's default for pcmk_host_argument, or "port" if -+ * none was given -+ */ -+ param = (host_arg == NULL)? "port" : host_arg; -+ } -+ value = g_hash_table_lookup(device_args, param); -+ -+ if (pcmk__str_eq(value, "dynamic", -+ pcmk__str_casei|pcmk__str_null_matches)) { -+ /* If the host argument was "dynamic" or not explicitly specified, -+ * add it with the target -+ */ -+ const char *alias = NULL; -+ -+ if (port_map) { -+ alias = g_hash_table_lookup(port_map, victim); -+ } -+ if (alias == NULL) { -+ alias = victim; -+ } -+ crm_debug("Passing %s='%s' with fence action %s targeting %s", -+ param, alias, action, victim); -+ g_hash_table_insert(arg_list, strdup(param), strdup(alias)); -+ } -+ } -+ } -+ -+ if (device_args) { -+ g_hash_table_foreach(device_args, append_config_arg, arg_list); -+ } -+ -+ return arg_list; -+} -+ -+/*! -+ * \internal -+ * \brief Free all memory used by a stonith action -+ * -+ * \param[in,out] action Action to free -+ */ -+void -+stonith__destroy_action(stonith_action_t *action) -+{ -+ if (action) { -+ free(action->agent); -+ if (action->args) { -+ g_hash_table_destroy(action->args); -+ } -+ free(action->action); -+ free(action->victim); -+ if (action->svc_action) { -+ services_action_free(action->svc_action); -+ } -+ pcmk__reset_result(&(action->result)); -+ free(action); -+ } -+} -+ -+/*! -+ * \internal -+ * \brief Get the result of an executed stonith action -+ * -+ * \param[in] action Executed action -+ * -+ * \return Pointer to action's result (or NULL if \p action is NULL) -+ */ -+pcmk__action_result_t * -+stonith__action_result(stonith_action_t *action) -+{ -+ return (action == NULL)? NULL : &(action->result); -+} -+ -+#define FAILURE_MAX_RETRIES 2 -+stonith_action_t * -+stonith_action_create(const char *agent, -+ const char *_action, -+ const char *victim, -+ uint32_t victim_nodeid, -+ int timeout, GHashTable * device_args, -+ GHashTable * port_map, const char *host_arg) -+{ -+ stonith_action_t *action; -+ -+ action = calloc(1, sizeof(stonith_action_t)); -+ action->args = make_args(agent, _action, victim, victim_nodeid, -+ device_args, port_map, host_arg); -+ crm_debug("Preparing '%s' action for %s using agent %s", -+ _action, (victim? victim : "no target"), agent); -+ action->agent = strdup(agent); -+ action->action = strdup(_action); -+ if (victim) { -+ action->victim = strdup(victim); -+ } -+ action->timeout = action->remaining_timeout = timeout; -+ action->max_retries = FAILURE_MAX_RETRIES; -+ -+ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, -+ "Initialization bug in fencing library"); -+ -+ if (device_args) { -+ char buffer[512]; -+ const char *value = NULL; -+ -+ snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", _action); -+ value = g_hash_table_lookup(device_args, buffer); -+ -+ if (value) { -+ action->max_retries = atoi(value); -+ } -+ } -+ -+ return action; -+} -+ -+static gboolean -+update_remaining_timeout(stonith_action_t * action) -+{ -+ int diff = time(NULL) - action->initial_start_time; -+ -+ if (action->tries >= action->max_retries) { -+ crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", -+ action->agent, action->action, action->max_retries); -+ action->remaining_timeout = 0; -+ } else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT) -+ && (diff < (action->timeout * 0.7))) { -+ /* only set remaining timeout period if there is 30% -+ * or greater of the original timeout period left */ -+ action->remaining_timeout = action->timeout - diff; -+ } else { -+ action->remaining_timeout = 0; -+ } -+ return action->remaining_timeout ? TRUE : FALSE; -+} -+ -+/*! -+ * \internal -+ * \brief Map a fencing action result to a standard return code -+ * -+ * \param[in] result Fencing action result to map -+ * -+ * \return Standard Pacemaker return code that best corresponds to \p result -+ */ -+int -+stonith__result2rc(const pcmk__action_result_t *result) -+{ -+ if (pcmk__result_ok(result)) { -+ return pcmk_rc_ok; -+ } -+ -+ switch (result->execution_status) { -+ case PCMK_EXEC_PENDING: return EINPROGRESS; -+ case PCMK_EXEC_CANCELLED: return ECANCELED; -+ case PCMK_EXEC_TIMEOUT: return ETIME; -+ case PCMK_EXEC_NOT_INSTALLED: return ENOENT; -+ case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP; -+ case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; -+ case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; -+ case PCMK_EXEC_NO_SECRETS: return EACCES; -+ -+ /* For the fencing API, PCMK_EXEC_INVALID is used with fencer API -+ * operations that don't involve executing an agent (for example, -+ * registering devices). This allows us to use the CRM_EX_* codes in the -+ * exit status for finer-grained responses. -+ */ -+ case PCMK_EXEC_INVALID: -+ switch (result->exit_status) { -+ case CRM_EX_INSUFFICIENT_PRIV: return EACCES; -+ case CRM_EX_PROTOCOL: return EPROTO; -+ -+ /* CRM_EX_EXPIRED is used for orphaned fencing operations left -+ * over from a previous instance of the fencer. For API backward -+ * compatibility, this is mapped to the previously used code for -+ * this case, EHOSTUNREACH. -+ */ -+ case CRM_EX_EXPIRED: return EHOSTUNREACH; -+ default: break; -+ } -+ -+ default: -+ break; -+ } -+ -+ // Try to provide useful error code based on result's error output -+ -+ if (result->action_stderr == NULL) { -+ return ENODATA; -+ -+ } else if (strcasestr(result->action_stderr, "timed out") -+ || strcasestr(result->action_stderr, "timeout")) { -+ return ETIME; -+ -+ } else if (strcasestr(result->action_stderr, "unrecognised action") -+ || strcasestr(result->action_stderr, "unrecognized action") -+ || strcasestr(result->action_stderr, "unsupported action")) { -+ return EOPNOTSUPP; -+ } -+ -+ // Oh well, we tried -+ return pcmk_rc_error; -+} -+ -+static void -+stonith_action_async_done(svc_action_t *svc_action) -+{ -+ stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; -+ -+ set_result_from_svc_action(action, svc_action); -+ -+ svc_action->params = NULL; -+ -+ crm_debug("Child process %d performing action '%s' exited with rc %d", -+ action->pid, action->action, svc_action->rc); -+ -+ log_action(action, action->pid); -+ -+ if (!pcmk__result_ok(&(action->result)) -+ && update_remaining_timeout(action)) { -+ -+ int rc = internal_stonith_action_execute(action); -+ if (rc == pcmk_ok) { -+ return; -+ } -+ } -+ -+ if (action->done_cb) { -+ action->done_cb(action->pid, &(action->result), action->userdata); -+ } -+ -+ action->svc_action = NULL; // don't remove our caller -+ stonith__destroy_action(action); -+} -+ -+static void -+stonith_action_async_forked(svc_action_t *svc_action) -+{ -+ stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; -+ -+ action->pid = svc_action->pid; -+ action->svc_action = svc_action; -+ -+ if (action->fork_cb) { -+ (action->fork_cb) (svc_action->pid, action->userdata); -+ } -+ -+ crm_trace("Child process %d performing action '%s' successfully forked", -+ action->pid, action->action); -+} -+ -+static int -+internal_stonith_action_execute(stonith_action_t * action) -+{ -+ int rc = -EPROTO; -+ int is_retry = 0; -+ svc_action_t *svc_action = NULL; -+ static int stonith_sequence = 0; -+ char *buffer = NULL; -+ -+ CRM_CHECK(action != NULL, return -EINVAL); -+ -+ if ((action->action == NULL) || (action->args == NULL) -+ || (action->agent == NULL)) { -+ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, -+ PCMK_EXEC_ERROR_FATAL, "Bug in fencing library"); -+ return -EINVAL; -+ } -+ -+ if (!action->tries) { -+ action->initial_start_time = time(NULL); -+ } -+ action->tries++; -+ -+ if (action->tries > 1) { -+ crm_info("Attempt %d to execute %s (%s). remaining timeout is %d", -+ action->tries, action->agent, action->action, action->remaining_timeout); -+ is_retry = 1; -+ } -+ -+ buffer = crm_strdup_printf(PCMK__FENCE_BINDIR "/%s", -+ basename(action->agent)); -+ svc_action = services_action_create_generic(buffer, NULL); -+ free(buffer); -+ -+ if (svc_action->rc != PCMK_OCF_UNKNOWN) { -+ set_result_from_svc_action(action, svc_action); -+ services_action_free(svc_action); -+ return -E2BIG; -+ } -+ -+ svc_action->timeout = 1000 * action->remaining_timeout; -+ svc_action->standard = strdup(PCMK_RESOURCE_CLASS_STONITH); -+ svc_action->id = crm_strdup_printf("%s_%s_%d", basename(action->agent), -+ action->action, action->tries); -+ svc_action->agent = strdup(action->agent); -+ svc_action->sequence = stonith_sequence++; -+ svc_action->params = action->args; -+ svc_action->cb_data = (void *) action; -+ svc_action->flags = pcmk__set_flags_as(__func__, __LINE__, -+ LOG_TRACE, "Action", -+ svc_action->id, svc_action->flags, -+ SVC_ACTION_NON_BLOCKED, -+ "SVC_ACTION_NON_BLOCKED"); -+ -+ /* keep retries from executing out of control and free previous results */ -+ if (is_retry) { -+ pcmk__reset_result(&(action->result)); -+ sleep(1); -+ } -+ -+ if (action->async) { -+ /* async */ -+ if (services_action_async_fork_notify(svc_action, -+ &stonith_action_async_done, -+ &stonith_action_async_forked)) { -+ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, -+ PCMK_EXEC_PENDING, NULL); -+ return pcmk_ok; -+ } -+ -+ } else if (services_action_sync(svc_action)) { // sync success -+ rc = pcmk_ok; -+ -+ } else { // sync failure -+ rc = -ECONNABORTED; -+ } -+ -+ set_result_from_svc_action(action, svc_action); -+ svc_action->params = NULL; -+ services_action_free(svc_action); -+ return rc; -+} -+ -+/*! -+ * \internal -+ * \brief Kick off execution of an async stonith action -+ * -+ * \param[in,out] action Action to be executed -+ * \param[in,out] userdata Datapointer to be passed to callbacks -+ * \param[in] done Callback to notify action has failed/succeeded -+ * \param[in] fork_callback Callback to notify successful fork of child -+ * -+ * \return pcmk_ok if ownership of action has been taken, -errno otherwise -+ */ -+int -+stonith_action_execute_async(stonith_action_t * action, -+ void *userdata, -+ void (*done) (int pid, -+ const pcmk__action_result_t *result, -+ void *user_data), -+ void (*fork_cb) (int pid, void *user_data)) -+{ -+ if (!action) { -+ return -EINVAL; -+ } -+ -+ action->userdata = userdata; -+ action->done_cb = done; -+ action->fork_cb = fork_cb; -+ action->async = 1; -+ -+ return internal_stonith_action_execute(action); -+} -+ -+/*! -+ * \internal -+ * \brief Execute a stonith action -+ * -+ * \param[in,out] action Action to execute -+ * -+ * \return pcmk_ok on success, -errno otherwise -+ */ -+int -+stonith__execute(stonith_action_t *action) -+{ -+ int rc = pcmk_ok; -+ -+ CRM_CHECK(action != NULL, return -EINVAL); -+ -+ // Keep trying until success, max retries, or timeout -+ do { -+ rc = internal_stonith_action_execute(action); -+ } while ((rc != pcmk_ok) && update_remaining_timeout(action)); -+ -+ return rc; -+} -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 93513e9f3..944cd1863 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -8,28 +8,20 @@ - */ - - #include --#include -+ - #include - #include - #include - #include - #include --#include - #include -- --#include - #include --#include -- - #include - - #include - #include - #include - #include --#include --#include --#include - - #include - -@@ -37,31 +29,6 @@ - - CRM_TRACE_INIT_DATA(stonith); - --struct stonith_action_s { -- /*! user defined data */ -- char *agent; -- char *action; -- char *victim; -- GHashTable *args; -- int timeout; -- int async; -- void *userdata; -- void (*done_cb) (int pid, const pcmk__action_result_t *result, -- void *user_data); -- void (*fork_cb) (int pid, void *user_data); -- -- svc_action_t *svc_action; -- -- /*! internal timing information */ -- time_t initial_start_time; -- int tries; -- int remaining_timeout; -- int max_retries; -- -- int pid; -- pcmk__action_result_t result; --}; -- - typedef struct stonith_private_s { - char *token; - crm_ipc_t *ipc; -@@ -118,8 +85,6 @@ static int stonith_send_command(stonith_t *stonith, const char *op, - - static void stonith_connection_destroy(gpointer user_data); - static void stonith_send_notification(gpointer data, gpointer user_data); --static int internal_stonith_action_execute(stonith_action_t * action); --static void log_action(stonith_action_t *action, pid_t pid); - - /*! - * \brief Get agent namespace by name -@@ -196,23 +161,6 @@ stonith_get_namespace(const char *agent, const char *namespace_s) - return st_namespace_invalid; - } - --/*! -- * \internal -- * \brief Set an action's result based on services library result -- * -- * \param[in] action Fence action to set result for -- * \param[in] svc_action Service action to get result from -- */ --static void --set_result_from_svc_action(stonith_action_t *action, svc_action_t *svc_action) --{ -- pcmk__set_result(&(action->result), svc_action->rc, svc_action->status, -- services__exit_reason(svc_action)); -- pcmk__set_result_output(&(action->result), -- services__grab_stdout(svc_action), -- services__grab_stderr(svc_action)); --} -- - gboolean - stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) - { -@@ -273,21 +221,6 @@ stonith__watchdog_fencing_enabled_for_node(const char *node) - return stonith__watchdog_fencing_enabled_for_node_api(NULL, node); - } - --static void --log_action(stonith_action_t *action, pid_t pid) --{ -- /* The services library has already logged the output at info or debug -- * level, so just raise to warning for stderr. -- */ -- if (action->result.action_stderr != NULL) { -- /* Logging the whole string confuses syslog when the string is xml */ -- char *prefix = crm_strdup_printf("%s[%d] stderr:", action->agent, pid); -- -- crm_log_output(LOG_WARNING, prefix, action->result.action_stderr); -- free(prefix); -- } --} -- - /* when cycling through the list we don't want to delete items - so just mark them and when we know nobody is using the list - loop over it to remove the marked items -@@ -530,465 +463,6 @@ stonith_api_register_level(stonith_t * st, int options, const char *node, int le - level, device_list); - } - --static void --append_config_arg(gpointer key, gpointer value, gpointer user_data) --{ -- /* The fencer will filter "action" out when it registers the device, -- * but ignore it here in case any external API users don't. -- * -- * Also filter out parameters handled directly by Pacemaker. -- */ -- if (!pcmk__str_eq(key, STONITH_ATTR_ACTION_OP, pcmk__str_casei) -- && !pcmk_stonith_param(key) -- && (strstr(key, CRM_META) == NULL) -- && !pcmk__str_eq(key, "crm_feature_set", pcmk__str_casei)) { -- -- crm_trace("Passing %s=%s with fence action", -- (const char *) key, (const char *) (value? value : "")); -- g_hash_table_insert((GHashTable *) user_data, -- strdup(key), strdup(value? value : "")); -- } --} -- --static GHashTable * --make_args(const char *agent, const char *action, const char *victim, -- uint32_t victim_nodeid, GHashTable * device_args, -- GHashTable * port_map, const char *host_arg) --{ -- GHashTable *arg_list = NULL; -- const char *value = NULL; -- -- CRM_CHECK(action != NULL, return NULL); -- -- arg_list = pcmk__strkey_table(free, free); -- -- // Add action to arguments (using an alias if requested) -- if (device_args) { -- char buffer[512]; -- -- snprintf(buffer, sizeof(buffer), "pcmk_%s_action", action); -- value = g_hash_table_lookup(device_args, buffer); -- if (value) { -- crm_debug("Substituting '%s' for fence action %s targeting %s", -- value, action, victim); -- action = value; -- } -- } -- g_hash_table_insert(arg_list, strdup(STONITH_ATTR_ACTION_OP), -- strdup(action)); -- -- /* If this is a fencing operation against another node, add more standard -- * arguments. -- */ -- if (victim && device_args) { -- const char *param = NULL; -- -- /* Always pass the target's name, per -- * https://github.com/ClusterLabs/fence-agents/blob/master/doc/FenceAgentAPI.md -- */ -- g_hash_table_insert(arg_list, strdup("nodename"), strdup(victim)); -- -- // If the target's node ID was specified, pass it, too -- if (victim_nodeid) { -- char *nodeid = crm_strdup_printf("%" PRIu32, victim_nodeid); -- -- // cts-fencing looks for this log message -- crm_info("Passing '%s' as nodeid with fence action '%s' targeting %s", -- nodeid, action, victim); -- g_hash_table_insert(arg_list, strdup("nodeid"), nodeid); -- } -- -- // Check whether target must be specified in some other way -- param = g_hash_table_lookup(device_args, PCMK_STONITH_HOST_ARGUMENT); -- if (!pcmk__str_eq(agent, "fence_legacy", pcmk__str_none) -- && !pcmk__str_eq(param, "none", pcmk__str_casei)) { -- -- if (param == NULL) { -- /* Use the caller's default for pcmk_host_argument, or "port" if -- * none was given -- */ -- param = (host_arg == NULL)? "port" : host_arg; -- } -- value = g_hash_table_lookup(device_args, param); -- -- if (pcmk__str_eq(value, "dynamic", -- pcmk__str_casei|pcmk__str_null_matches)) { -- /* If the host argument was "dynamic" or not explicitly specified, -- * add it with the target -- */ -- const char *alias = NULL; -- -- if (port_map) { -- alias = g_hash_table_lookup(port_map, victim); -- } -- if (alias == NULL) { -- alias = victim; -- } -- crm_debug("Passing %s='%s' with fence action %s targeting %s", -- param, alias, action, victim); -- g_hash_table_insert(arg_list, strdup(param), strdup(alias)); -- } -- } -- } -- -- if (device_args) { -- g_hash_table_foreach(device_args, append_config_arg, arg_list); -- } -- -- return arg_list; --} -- --/*! -- * \internal -- * \brief Free all memory used by a stonith action -- * -- * \param[in,out] action Action to free -- */ --void --stonith__destroy_action(stonith_action_t *action) --{ -- if (action) { -- free(action->agent); -- if (action->args) { -- g_hash_table_destroy(action->args); -- } -- free(action->action); -- free(action->victim); -- if (action->svc_action) { -- services_action_free(action->svc_action); -- } -- pcmk__reset_result(&(action->result)); -- free(action); -- } --} -- --/*! -- * \internal -- * \brief Get the result of an executed stonith action -- * -- * \param[in] action Executed action -- * -- * \return Pointer to action's result (or NULL if \p action is NULL) -- */ --pcmk__action_result_t * --stonith__action_result(stonith_action_t *action) --{ -- return (action == NULL)? NULL : &(action->result); --} -- --#define FAILURE_MAX_RETRIES 2 --stonith_action_t * --stonith_action_create(const char *agent, -- const char *_action, -- const char *victim, -- uint32_t victim_nodeid, -- int timeout, GHashTable * device_args, -- GHashTable * port_map, const char *host_arg) --{ -- stonith_action_t *action; -- -- action = calloc(1, sizeof(stonith_action_t)); -- action->args = make_args(agent, _action, victim, victim_nodeid, -- device_args, port_map, host_arg); -- crm_debug("Preparing '%s' action for %s using agent %s", -- _action, (victim? victim : "no target"), agent); -- action->agent = strdup(agent); -- action->action = strdup(_action); -- if (victim) { -- action->victim = strdup(victim); -- } -- action->timeout = action->remaining_timeout = timeout; -- action->max_retries = FAILURE_MAX_RETRIES; -- -- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_UNKNOWN, -- "Initialization bug in fencing library"); -- -- if (device_args) { -- char buffer[512]; -- const char *value = NULL; -- -- snprintf(buffer, sizeof(buffer), "pcmk_%s_retries", _action); -- value = g_hash_table_lookup(device_args, buffer); -- -- if (value) { -- action->max_retries = atoi(value); -- } -- } -- -- return action; --} -- --static gboolean --update_remaining_timeout(stonith_action_t * action) --{ -- int diff = time(NULL) - action->initial_start_time; -- -- if (action->tries >= action->max_retries) { -- crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", -- action->agent, action->action, action->max_retries); -- action->remaining_timeout = 0; -- } else if ((action->result.execution_status != PCMK_EXEC_TIMEOUT) -- && (diff < (action->timeout * 0.7))) { -- /* only set remaining timeout period if there is 30% -- * or greater of the original timeout period left */ -- action->remaining_timeout = action->timeout - diff; -- } else { -- action->remaining_timeout = 0; -- } -- return action->remaining_timeout ? TRUE : FALSE; --} -- --/*! -- * \internal -- * \brief Map a fencing action result to a standard return code -- * -- * \param[in] result Fencing action result to map -- * -- * \return Standard Pacemaker return code that best corresponds to \p result -- */ --int --stonith__result2rc(const pcmk__action_result_t *result) --{ -- if (pcmk__result_ok(result)) { -- return pcmk_rc_ok; -- } -- -- switch (result->execution_status) { -- case PCMK_EXEC_PENDING: return EINPROGRESS; -- case PCMK_EXEC_CANCELLED: return ECANCELED; -- case PCMK_EXEC_TIMEOUT: return ETIME; -- case PCMK_EXEC_NOT_INSTALLED: return ENOENT; -- case PCMK_EXEC_NOT_SUPPORTED: return EOPNOTSUPP; -- case PCMK_EXEC_NOT_CONNECTED: return ENOTCONN; -- case PCMK_EXEC_NO_FENCE_DEVICE: return ENODEV; -- case PCMK_EXEC_NO_SECRETS: return EACCES; -- -- /* For the fencing API, PCMK_EXEC_INVALID is used with fencer API -- * operations that don't involve executing an agent (for example, -- * registering devices). This allows us to use the CRM_EX_* codes in the -- * exit status for finer-grained responses. -- */ -- case PCMK_EXEC_INVALID: -- switch (result->exit_status) { -- case CRM_EX_INSUFFICIENT_PRIV: return EACCES; -- case CRM_EX_PROTOCOL: return EPROTO; -- -- /* CRM_EX_EXPIRED is used for orphaned fencing operations left -- * over from a previous instance of the fencer. For API backward -- * compatibility, this is mapped to the previously used code for -- * this case, EHOSTUNREACH. -- */ -- case CRM_EX_EXPIRED: return EHOSTUNREACH; -- default: break; -- } -- -- default: -- break; -- } -- -- // Try to provide useful error code based on result's error output -- -- if (result->action_stderr == NULL) { -- return ENODATA; -- -- } else if (strcasestr(result->action_stderr, "timed out") -- || strcasestr(result->action_stderr, "timeout")) { -- return ETIME; -- -- } else if (strcasestr(result->action_stderr, "unrecognised action") -- || strcasestr(result->action_stderr, "unrecognized action") -- || strcasestr(result->action_stderr, "unsupported action")) { -- return EOPNOTSUPP; -- } -- -- // Oh well, we tried -- return pcmk_rc_error; --} -- --static void --stonith_action_async_done(svc_action_t *svc_action) --{ -- stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; -- -- set_result_from_svc_action(action, svc_action); -- -- svc_action->params = NULL; -- -- crm_debug("Child process %d performing action '%s' exited with rc %d", -- action->pid, action->action, svc_action->rc); -- -- log_action(action, action->pid); -- -- if (!pcmk__result_ok(&(action->result)) -- && update_remaining_timeout(action)) { -- -- int rc = internal_stonith_action_execute(action); -- if (rc == pcmk_ok) { -- return; -- } -- } -- -- if (action->done_cb) { -- action->done_cb(action->pid, &(action->result), action->userdata); -- } -- -- action->svc_action = NULL; // don't remove our caller -- stonith__destroy_action(action); --} -- --static void --stonith_action_async_forked(svc_action_t *svc_action) --{ -- stonith_action_t *action = (stonith_action_t *) svc_action->cb_data; -- -- action->pid = svc_action->pid; -- action->svc_action = svc_action; -- -- if (action->fork_cb) { -- (action->fork_cb) (svc_action->pid, action->userdata); -- } -- -- crm_trace("Child process %d performing action '%s' successfully forked", -- action->pid, action->action); --} -- --static int --internal_stonith_action_execute(stonith_action_t * action) --{ -- int rc = -EPROTO; -- int is_retry = 0; -- svc_action_t *svc_action = NULL; -- static int stonith_sequence = 0; -- char *buffer = NULL; -- -- CRM_CHECK(action != NULL, return -EINVAL); -- -- if ((action->action == NULL) || (action->args == NULL) -- || (action->agent == NULL)) { -- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN_ERROR, -- PCMK_EXEC_ERROR_FATAL, "Bug in fencing library"); -- return -EINVAL; -- } -- -- if (!action->tries) { -- action->initial_start_time = time(NULL); -- } -- action->tries++; -- -- if (action->tries > 1) { -- crm_info("Attempt %d to execute %s (%s). remaining timeout is %d", -- action->tries, action->agent, action->action, action->remaining_timeout); -- is_retry = 1; -- } -- -- buffer = crm_strdup_printf(PCMK__FENCE_BINDIR "/%s", -- basename(action->agent)); -- svc_action = services_action_create_generic(buffer, NULL); -- free(buffer); -- -- if (svc_action->rc != PCMK_OCF_UNKNOWN) { -- set_result_from_svc_action(action, svc_action); -- services_action_free(svc_action); -- return -E2BIG; -- } -- -- svc_action->timeout = 1000 * action->remaining_timeout; -- svc_action->standard = strdup(PCMK_RESOURCE_CLASS_STONITH); -- svc_action->id = crm_strdup_printf("%s_%s_%d", basename(action->agent), -- action->action, action->tries); -- svc_action->agent = strdup(action->agent); -- svc_action->sequence = stonith_sequence++; -- svc_action->params = action->args; -- svc_action->cb_data = (void *) action; -- svc_action->flags = pcmk__set_flags_as(__func__, __LINE__, -- LOG_TRACE, "Action", -- svc_action->id, svc_action->flags, -- SVC_ACTION_NON_BLOCKED, -- "SVC_ACTION_NON_BLOCKED"); -- -- /* keep retries from executing out of control and free previous results */ -- if (is_retry) { -- pcmk__reset_result(&(action->result)); -- sleep(1); -- } -- -- if (action->async) { -- /* async */ -- if (services_action_async_fork_notify(svc_action, -- &stonith_action_async_done, -- &stonith_action_async_forked)) { -- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, -- PCMK_EXEC_PENDING, NULL); -- return pcmk_ok; -- } -- -- } else if (services_action_sync(svc_action)) { // sync success -- rc = pcmk_ok; -- -- } else { // sync failure -- rc = -ECONNABORTED; -- } -- -- set_result_from_svc_action(action, svc_action); -- svc_action->params = NULL; -- services_action_free(svc_action); -- return rc; --} -- --/*! -- * \internal -- * \brief Kick off execution of an async stonith action -- * -- * \param[in,out] action Action to be executed -- * \param[in,out] userdata Datapointer to be passed to callbacks -- * \param[in] done Callback to notify action has failed/succeeded -- * \param[in] fork_callback Callback to notify successful fork of child -- * -- * \return pcmk_ok if ownership of action has been taken, -errno otherwise -- */ --int --stonith_action_execute_async(stonith_action_t * action, -- void *userdata, -- void (*done) (int pid, -- const pcmk__action_result_t *result, -- void *user_data), -- void (*fork_cb) (int pid, void *user_data)) --{ -- if (!action) { -- return -EINVAL; -- } -- -- action->userdata = userdata; -- action->done_cb = done; -- action->fork_cb = fork_cb; -- action->async = 1; -- -- return internal_stonith_action_execute(action); --} -- --/*! -- * \internal -- * \brief Execute a stonith action -- * -- * \param[in,out] action Action to execute -- * -- * \return pcmk_ok on success, -errno otherwise -- */ --int --stonith__execute(stonith_action_t *action) --{ -- int rc = pcmk_ok; -- -- CRM_CHECK(action != NULL, return -EINVAL); -- -- // Keep trying until success, max retries, or timeout -- do { -- rc = internal_stonith_action_execute(action); -- } while ((rc != pcmk_ok) && update_remaining_timeout(action)); -- -- return rc; --} -- - static int - stonith_api_device_list(stonith_t * stonith, int call_options, const char *namespace, - stonith_key_value_t ** devices, int timeout) --- -2.27.0 - - -From 883a3cf7d3f73d02417d3997a7885dd5a7bebac7 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 10 Nov 2021 15:39:17 -0600 -Subject: [PATCH 05/13] Low: fencing,executor: improve mapping of legacy return - code to execution status - -Move stonith_rc2status() from the executor to the fencing library for future -reuse, exposing it internally as stonith__legacy2status(). Update it to use -recently added execution status codes. ---- - daemons/execd/execd_commands.c | 66 ++++++++-------------------------- - include/crm/fencing/internal.h | 2 ++ - lib/fencing/st_actions.c | 36 +++++++++++++++++++ - 3 files changed, 52 insertions(+), 52 deletions(-) - -diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c -index 02070bf11..0ccaa1ced 100644 ---- a/daemons/execd/execd_commands.c -+++ b/daemons/execd/execd_commands.c -@@ -21,6 +21,7 @@ - #include - - #include -+#include - #include - #include - #include -@@ -999,56 +1000,6 @@ action_complete(svc_action_t * action) - cmd_finalize(cmd, rsc); - } - --/*! -- * \internal -- * \brief Determine operation status of a stonith operation -- * -- * Non-stonith resource operations get their operation status directly from the -- * service library, but the fencer does not have an equivalent, so we must infer -- * an operation status from the fencer API's return code. -- * -- * \param[in] action Name of action performed on stonith resource -- * \param[in] interval_ms Action interval -- * \param[in] rc Action result from fencer -- * -- * \return Operation status corresponding to fencer API return code -- */ --static int --stonith_rc2status(const char *action, guint interval_ms, int rc) --{ -- int status = PCMK_EXEC_DONE; -- -- switch (rc) { -- case pcmk_ok: -- break; -- -- case -EOPNOTSUPP: -- case -EPROTONOSUPPORT: -- status = PCMK_EXEC_NOT_SUPPORTED; -- break; -- -- case -ETIME: -- case -ETIMEDOUT: -- status = PCMK_EXEC_TIMEOUT; -- break; -- -- case -ENOTCONN: -- case -ECOMM: -- // Couldn't talk to fencer -- status = PCMK_EXEC_ERROR; -- break; -- -- case -ENODEV: -- // The device is not registered with the fencer -- status = PCMK_EXEC_ERROR; -- break; -- -- default: -- break; -- } -- return status; --} -- - static void - stonith_action_complete(lrmd_cmd_t * cmd, int rc) - { -@@ -1062,8 +1013,19 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc) - * the fencer return code. - */ - if (cmd->result.execution_status != PCMK_EXEC_CANCELLED) { -- cmd->result.execution_status = stonith_rc2status(cmd->action, -- cmd->interval_ms, rc); -+ cmd->result.execution_status = stonith__legacy2status(rc); -+ -+ // Simplify status codes from fencer -+ switch (cmd->result.execution_status) { -+ case PCMK_EXEC_NOT_CONNECTED: -+ case PCMK_EXEC_INVALID: -+ case PCMK_EXEC_NO_FENCE_DEVICE: -+ case PCMK_EXEC_NO_SECRETS: -+ cmd->result.execution_status = PCMK_EXEC_ERROR; -+ break; -+ default: -+ break; -+ } - - // Certain successful actions change the known state of the resource - if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index 6a7e4232c..80f6443be 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -182,6 +182,8 @@ bool stonith__event_state_pending(stonith_history_t *history, void *user_data); - bool stonith__event_state_eq(stonith_history_t *history, void *user_data); - bool stonith__event_state_neq(stonith_history_t *history, void *user_data); - -+int stonith__legacy2status(int rc); -+ - /*! - * \internal - * \brief Is a fencing operation in pending state? -diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c -index 64d3afd5d..9e785595a 100644 ---- a/lib/fencing/st_actions.c -+++ b/lib/fencing/st_actions.c -@@ -360,6 +360,42 @@ stonith__result2rc(const pcmk__action_result_t *result) - return pcmk_rc_error; - } - -+/*! -+ * \internal -+ * \brief Determine execution status equivalent of legacy fencer return code -+ * -+ * Fence action notifications, and fence action callbacks from older fencers -+ * (<=2.1.2) in a rolling upgrade, will have only a legacy return code. Map this -+ * to an execution status as best as possible (essentially, the inverse of -+ * stonith__result2rc()). -+ * -+ * \param[in] rc Legacy return code from fencer -+ * -+ * \return Execution status best corresponding to \p rc -+ */ -+int -+stonith__legacy2status(int rc) -+{ -+ if (rc >= 0) { -+ return PCMK_EXEC_DONE; -+ } -+ switch (-rc) { -+ case EACCES: return PCMK_EXEC_NO_SECRETS; -+ case ECANCELED: return PCMK_EXEC_CANCELLED; -+ case EHOSTUNREACH: return PCMK_EXEC_INVALID; -+ case EINPROGRESS: return PCMK_EXEC_PENDING; -+ case ENODEV: return PCMK_EXEC_NO_FENCE_DEVICE; -+ case ENOENT: return PCMK_EXEC_NOT_INSTALLED; -+ case ENOTCONN: return PCMK_EXEC_NOT_CONNECTED; -+ case EOPNOTSUPP: return PCMK_EXEC_NOT_SUPPORTED; -+ case EPROTO: return PCMK_EXEC_INVALID; -+ case EPROTONOSUPPORT: return PCMK_EXEC_NOT_SUPPORTED; -+ case ETIME: return PCMK_EXEC_TIMEOUT; -+ case ETIMEDOUT: return PCMK_EXEC_TIMEOUT; -+ default: return PCMK_EXEC_ERROR; -+ } -+} -+ - static void - stonith_action_async_done(svc_action_t *svc_action) - { --- -2.27.0 - - -From 639a9f4a2cbeb6cc41b754a1dcb1f360a9500e03 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 11 Nov 2021 16:54:32 -0600 -Subject: [PATCH 06/13] Refactor: fencing: add functions for getting/setting - result via XML - -These will come in handy as we update the various fencer messages to include a -full result rather than just a legacy return code. The functions are in a new -source file fenced_messages.c which can have other stuff moved to it later. ---- - include/crm/fencing/internal.h | 3 + - lib/fencing/st_actions.c | 107 +++++++++++++++++++++++++++++++++ - 2 files changed, 110 insertions(+) - -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index 80f6443be..4b5fd3959 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -60,6 +60,9 @@ stonith_action_t *stonith_action_create(const char *agent, - void stonith__destroy_action(stonith_action_t *action); - pcmk__action_result_t *stonith__action_result(stonith_action_t *action); - int stonith__result2rc(const pcmk__action_result_t *result); -+void stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result); -+void stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result); -+xmlNode *stonith__find_xe_with_result(xmlNode *xml); - - int - stonith_action_execute_async(stonith_action_t * action, -diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c -index 9e785595a..d4fc3f5ed 100644 ---- a/lib/fencing/st_actions.c -+++ b/lib/fencing/st_actions.c -@@ -396,6 +396,113 @@ stonith__legacy2status(int rc) - } - } - -+/*! -+ * \internal -+ * \brief Add a fencing result to an XML element as attributes -+ * -+ * \param[in] xml XML element to add result to -+ * \param[in] result Fencing result to add (assume success if NULL) -+ */ -+void -+stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result) -+{ -+ int exit_status = CRM_EX_OK; -+ enum pcmk_exec_status execution_status = PCMK_EXEC_DONE; -+ const char *exit_reason = NULL; -+ const char *action_stdout = NULL; -+ int rc = pcmk_ok; -+ -+ CRM_CHECK(xml != NULL, return); -+ -+ if (result != NULL) { -+ exit_status = result->exit_status; -+ execution_status = result->execution_status; -+ exit_reason = result->exit_reason; -+ action_stdout = result->action_stdout; -+ rc = pcmk_rc2legacy(stonith__result2rc(result)); -+ } -+ -+ crm_xml_add_int(xml, XML_LRM_ATTR_OPSTATUS, (int) execution_status); -+ crm_xml_add_int(xml, XML_LRM_ATTR_RC, exit_status); -+ crm_xml_add(xml, XML_LRM_ATTR_EXIT_REASON, exit_reason); -+ crm_xml_add(xml, "st_output", action_stdout); -+ -+ /* @COMPAT Peers in rolling upgrades, Pacemaker Remote nodes, and external -+ * code that use libstonithd <=2.1.2 don't check for the full result, and -+ * need a legacy return code instead. -+ */ -+ crm_xml_add_int(xml, F_STONITH_RC, rc); -+} -+ -+/*! -+ * \internal -+ * \brief Find a fencing result beneath an XML element -+ * -+ * \param[in] xml XML element to search -+ * -+ * \return \p xml or descendent of it that contains a fencing result, else NULL -+ */ -+xmlNode * -+stonith__find_xe_with_result(xmlNode *xml) -+{ -+ xmlNode *match = get_xpath_object("//@" XML_LRM_ATTR_RC, xml, LOG_NEVER); -+ -+ if (match == NULL) { -+ /* @COMPAT Peers <=2.1.2 in a rolling upgrade provide only a legacy -+ * return code, not a full result, so check for that. -+ */ -+ match = get_xpath_object("//@" F_STONITH_RC, xml, LOG_ERR); -+ } -+ return match; -+} -+ -+/*! -+ * \internal -+ * \brief Get a fencing result from an XML element's attributes -+ * -+ * \param[in] xml XML element with fencing result -+ * \param[out] result Where to store fencing result -+ */ -+void -+stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result) -+{ -+ int exit_status = CRM_EX_OK; -+ int execution_status = PCMK_EXEC_DONE; -+ const char *exit_reason = NULL; -+ char *action_stdout = NULL; -+ -+ CRM_CHECK((xml != NULL) && (result != NULL), return); -+ -+ exit_reason = crm_element_value(xml, XML_LRM_ATTR_EXIT_REASON); -+ action_stdout = crm_element_value_copy(xml, "st_output"); -+ -+ // A result must include an exit status and execution status -+ if ((crm_element_value_int(xml, XML_LRM_ATTR_RC, &exit_status) < 0) -+ || (crm_element_value_int(xml, XML_LRM_ATTR_OPSTATUS, -+ &execution_status) < 0)) { -+ int rc = pcmk_ok; -+ exit_status = CRM_EX_ERROR; -+ -+ /* @COMPAT Peers <=2.1.2 in rolling upgrades provide only a legacy -+ * return code, not a full result, so check for that. -+ */ -+ if (crm_element_value_int(xml, F_STONITH_RC, &rc) == 0) { -+ if ((rc == pcmk_ok) || (rc == -EINPROGRESS)) { -+ exit_status = CRM_EX_OK; -+ } -+ execution_status = stonith__legacy2status(rc); -+ exit_reason = pcmk_strerror(rc); -+ -+ } else { -+ execution_status = PCMK_EXEC_ERROR; -+ exit_reason = "Fencer reply contained neither a full result " -+ "nor a legacy return code (bug?)"; -+ } -+ } -+ pcmk__set_result(result, exit_status, execution_status, exit_reason); -+ pcmk__set_result_output(result, action_stdout, NULL); -+} -+ - static void - stonith_action_async_done(svc_action_t *svc_action) - { --- -2.27.0 - - -From 1f0121c6ad0d0235bcf01c8b60f9153592b3db83 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 11 Nov 2021 10:10:53 -0600 -Subject: [PATCH 07/13] Refactor: fencing: rename functions for invoking fence - callbacks - -... to make it clearer what the difference between them is ---- - lib/fencing/st_client.c | 44 +++++++++++++++++++++++++++++++++-------- - 1 file changed, 36 insertions(+), 8 deletions(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 944cd1863..dfc5860fc 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -847,9 +847,21 @@ stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks) - return pcmk_ok; - } - -+/*! -+ * \internal -+ * \brief Invoke a (single) specified fence action callback -+ * -+ * \param[in] st Fencer API connection -+ * \param[in] call_id If positive, call ID of completed fence action, otherwise -+ * legacy return code for early action failure -+ * \param[in] rc Legacy return code for action result -+ * \param[in] userdata User data to pass to callback -+ * \param[in] callback Fence action callback to invoke -+ */ - static void --invoke_callback(stonith_t * st, int call_id, int rc, void *userdata, -- void (*callback) (stonith_t * st, stonith_callback_data_t * data)) -+invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata, -+ void (*callback) (stonith_t *st, -+ stonith_callback_data_t *data)) - { - stonith_callback_data_t data = { 0, }; - -@@ -860,8 +872,21 @@ invoke_callback(stonith_t * st, int call_id, int rc, void *userdata, - callback(st, &data); - } - -+/*! -+ * \internal -+ * \brief Invoke any callbacks registered for a specified fence action result -+ * -+ * Given a fence action result from the fencer, invoke any callback registered -+ * for that action, as well as any global callback registered. -+ * -+ * \param[in] st Fencer API connection -+ * \param[in] msg If non-NULL, fencer reply -+ * \param[in] call_id If \p msg is NULL, call ID of action that timed out -+ * \param[in] rc Legacy return code for result of action -+ */ - static void --stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc) -+invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id, -+ int rc) - { - stonith_private_t *private = NULL; - stonith_callback_client_t *blob = NULL; -@@ -899,7 +924,8 @@ stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc - - if (local_blob.callback != NULL && (rc == pcmk_ok || local_blob.only_success == FALSE)) { - crm_trace("Invoking callback %s for call %d", crm_str(local_blob.id), call_id); -- invoke_callback(stonith, call_id, rc, local_blob.user_data, local_blob.callback); -+ invoke_fence_action_callback(stonith, call_id, rc, local_blob.user_data, -+ local_blob.callback); - - } else if (private->op_callback == NULL && rc != pcmk_ok) { - crm_warn("Fencing command failed: %s", pcmk_strerror(rc)); -@@ -908,7 +934,8 @@ stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc - - if (private->op_callback != NULL) { - crm_trace("Invoking global callback for call %d", call_id); -- invoke_callback(stonith, call_id, rc, NULL, private->op_callback); -+ invoke_fence_action_callback(stonith, call_id, rc, NULL, -+ private->op_callback); - } - crm_trace("OP callback activated."); - } -@@ -919,7 +946,7 @@ stonith_async_timeout_handler(gpointer data) - struct timer_rec_s *timer = data; - - crm_err("Async call %d timed out after %dms", timer->call_id, timer->timeout); -- stonith_perform_callback(timer->stonith, NULL, timer->call_id, -ETIME); -+ invoke_registered_callbacks(timer->stonith, NULL, timer->call_id, -ETIME); - - /* Always return TRUE, never remove the handler - * We do that in stonith_del_callback() -@@ -994,7 +1021,7 @@ stonith_dispatch_internal(const char *buffer, ssize_t length, gpointer userdata) - crm_trace("Activating %s callbacks...", type); - - if (pcmk__str_eq(type, T_STONITH_NG, pcmk__str_casei)) { -- stonith_perform_callback(st, blob.xml, 0, 0); -+ invoke_registered_callbacks(st, blob.xml, 0, 0); - - } else if (pcmk__str_eq(type, T_STONITH_NOTIFY, pcmk__str_casei)) { - foreach_notify_entry(private, stonith_send_notification, &blob); -@@ -1229,7 +1256,8 @@ stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int opti - } else if (call_id < 0) { - if (!(options & st_opt_report_only_success)) { - crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id)); -- invoke_callback(stonith, call_id, call_id, user_data, callback); -+ invoke_fence_action_callback(stonith, call_id, call_id, user_data, -+ callback); - } else { - crm_warn("Fencer call failed: %s", pcmk_strerror(call_id)); - } --- -2.27.0 - - -From c32f11e70a88244f5a3217608055a4eaf8d28231 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 11 Nov 2021 10:21:00 -0600 -Subject: [PATCH 08/13] Refactor: fencing: drop unnecessary argument when - invoking callbacks - -Refactor invoke_registered_callbacks() to treat a NULL message as a timeout, so -we can drop the rc argument. ---- - lib/fencing/st_client.c | 17 +++++++++++------ - 1 file changed, 11 insertions(+), 6 deletions(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index dfc5860fc..9f2b0c1c1 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -882,15 +882,14 @@ invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata, - * \param[in] st Fencer API connection - * \param[in] msg If non-NULL, fencer reply - * \param[in] call_id If \p msg is NULL, call ID of action that timed out -- * \param[in] rc Legacy return code for result of action - */ - static void --invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id, -- int rc) -+invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - { - stonith_private_t *private = NULL; - stonith_callback_client_t *blob = NULL; - stonith_callback_client_t local_blob; -+ int rc = pcmk_ok; - - CRM_CHECK(stonith != NULL, return); - CRM_CHECK(stonith->st_private != NULL, return); -@@ -902,7 +901,13 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id, - local_blob.user_data = NULL; - local_blob.only_success = FALSE; - -- if (msg != NULL) { -+ if (msg == NULL) { -+ // Fencer didn't reply in time -+ rc = -ETIME; -+ -+ } else { -+ // We have the fencer reply -+ - crm_element_value_int(msg, F_STONITH_RC, &rc); - crm_element_value_int(msg, F_STONITH_CALLID, &call_id); - } -@@ -946,7 +951,7 @@ stonith_async_timeout_handler(gpointer data) - struct timer_rec_s *timer = data; - - crm_err("Async call %d timed out after %dms", timer->call_id, timer->timeout); -- invoke_registered_callbacks(timer->stonith, NULL, timer->call_id, -ETIME); -+ invoke_registered_callbacks(timer->stonith, NULL, timer->call_id); - - /* Always return TRUE, never remove the handler - * We do that in stonith_del_callback() -@@ -1021,7 +1026,7 @@ stonith_dispatch_internal(const char *buffer, ssize_t length, gpointer userdata) - crm_trace("Activating %s callbacks...", type); - - if (pcmk__str_eq(type, T_STONITH_NG, pcmk__str_casei)) { -- invoke_registered_callbacks(st, blob.xml, 0, 0); -+ invoke_registered_callbacks(st, blob.xml, 0); - - } else if (pcmk__str_eq(type, T_STONITH_NOTIFY, pcmk__str_casei)) { - foreach_notify_entry(private, stonith_send_notification, &blob); --- -2.27.0 - - -From 5d8279b51ea9df738354649e4065663f2c16f1e6 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 11 Nov 2021 10:21:57 -0600 -Subject: [PATCH 09/13] Log: fencing: improve message for callback errors - -Improve checking of fencer replies, which also allows us to distinguish an -internal bug from a bad fencer reply in logs. Lower the bad reply message to -warning. ---- - lib/fencing/st_client.c | 13 +++++++++---- - 1 file changed, 9 insertions(+), 4 deletions(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 9f2b0c1c1..170b9d450 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -904,15 +904,20 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - if (msg == NULL) { - // Fencer didn't reply in time - rc = -ETIME; -+ CRM_LOG_ASSERT(call_id > 0); - - } else { - // We have the fencer reply - -- crm_element_value_int(msg, F_STONITH_RC, &rc); -- crm_element_value_int(msg, F_STONITH_CALLID, &call_id); -- } -+ if (crm_element_value_int(msg, F_STONITH_RC, &rc) != 0) { -+ rc = -pcmk_err_generic; -+ } - -- CRM_CHECK(call_id > 0, crm_log_xml_err(msg, "Bad result")); -+ if ((crm_element_value_int(msg, F_STONITH_CALLID, &call_id) != 0) -+ || (call_id <= 0)) { -+ crm_log_xml_warn(msg, "Bad fencer reply"); -+ } -+ } - - blob = pcmk__intkey_table_lookup(private->stonith_op_callback_table, - call_id); --- -2.27.0 - - -From e03c14d24e8cb011e870b9460930d139705bf0a2 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 9 Nov 2021 14:59:12 -0600 -Subject: [PATCH 10/13] Doc: fencing: correct stonith_api_operations_t method - descriptions - -Many of the methods return a positive call ID on success ---- - include/crm/stonith-ng.h | 60 ++++++++++++++++++++++------------------ - 1 file changed, 33 insertions(+), 27 deletions(-) - -diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h -index 8d6ad477d..9643820e9 100644 ---- a/include/crm/stonith-ng.h -+++ b/include/crm/stonith-ng.h -@@ -164,39 +164,38 @@ typedef struct stonith_api_operations_s - int (*disconnect)(stonith_t *st); - - /*! -- * \brief Remove a registered stonith device with the local stonith daemon. -+ * \brief Unregister a fence device with the local fencer - * -- * \note Synchronous, guaranteed to occur in daemon before function returns. -- * -- * \return Legacy Pacemaker return code -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - */ - int (*remove_device)( - stonith_t *st, int options, const char *name); - - /*! -- * \brief Register a stonith device with the local stonith daemon. -+ * \brief Register a fence device with the local fencer - * -- * \note Synchronous, guaranteed to occur in daemon before function returns. -- * -- * \return Legacy Pacemaker return code -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - */ - int (*register_device)( - stonith_t *st, int options, const char *id, - const char *provider, const char *agent, stonith_key_value_t *params); - - /*! -- * \brief Remove a fencing level for a specific node. -+ * \brief Unregister a fencing level for specified node with local fencer - * -- * \return Legacy Pacemaker return code -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - */ - int (*remove_level)( - stonith_t *st, int options, const char *node, int level); - - /*! -- * \brief Register a fencing level containing the fencing devices to be used -- * at that level for a specific node. -+ * \brief Register a fencing level for specified node with local fencer - * -- * \return Legacy Pacemaker return code -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - */ - int (*register_level)( - stonith_t *st, int options, const char *node, int level, stonith_key_value_t *device_list); -@@ -226,21 +225,24 @@ typedef struct stonith_api_operations_s - /*! - * \brief Retrieve string listing hosts and port assignments from a local stonith device. - * -- * \return Legacy Pacemaker return code -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - */ - int (*list)(stonith_t *st, int options, const char *id, char **list_output, int timeout); - - /*! - * \brief Check to see if a local stonith device is reachable - * -- * \return Legacy Pacemaker return code -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - */ - int (*monitor)(stonith_t *st, int options, const char *id, int timeout); - - /*! - * \brief Check to see if a local stonith device's port is reachable - * -- * \return Legacy Pacemaker return code -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - */ - int (*status)(stonith_t *st, int options, const char *id, const char *port, int timeout); - -@@ -267,7 +269,8 @@ typedef struct stonith_api_operations_s - * \param timeout, The default per device timeout to use with each device - * capable of fencing the target. - * -- * \return Legacy Pacemaker return code -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - */ - int (*fence)(stonith_t *st, int options, const char *node, const char *action, - int timeout, int tolerance); -@@ -275,7 +278,8 @@ typedef struct stonith_api_operations_s - /*! - * \brief Manually confirm that a node is down. - * -- * \return Legacy Pacemaker return code -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - */ - int (*confirm)(stonith_t *st, int options, const char *node); - -@@ -304,9 +308,6 @@ typedef struct stonith_api_operations_s - * \param[in] callback The callback function to register - * - * \return \c TRUE on success, \c FALSE if call_id is negative, -errno otherwise -- * -- * \todo This function should return \c pcmk_ok on success, and \c call_id -- * when negative, but that would break backward compatibility. - */ - int (*register_callback)(stonith_t *st, - int call_id, -@@ -317,12 +318,14 @@ typedef struct stonith_api_operations_s - void (*callback)(stonith_t *st, stonith_callback_data_t *data)); - - /*! -- * \brief Remove a registered callback for a given call id. -+ * \brief Remove a registered callback for a given call id -+ * -+ * \return pcmk_ok - */ - int (*remove_callback)(stonith_t *st, int call_id, bool all_callbacks); - - /*! -- * \brief Remove fencing level for specific node, node regex or attribute -+ * \brief Unregister fencing level for specified node, pattern or attribute - * - * \param[in] st Fencer connection to use - * \param[in] options Bitmask of stonith_call_options to pass to the fencer -@@ -332,7 +335,8 @@ typedef struct stonith_api_operations_s - * \param[in] value If not NULL, target by this node attribute value - * \param[in] level Index number of level to remove - * -- * \return 0 on success, negative error code otherwise -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - * - * \note The caller should set only one of node, pattern or attr/value. - */ -@@ -341,7 +345,7 @@ typedef struct stonith_api_operations_s - const char *attr, const char *value, int level); - - /*! -- * \brief Register fencing level for specific node, node regex or attribute -+ * \brief Register fencing level for specified node, pattern or attribute - * - * \param[in] st Fencer connection to use - * \param[in] options Bitmask of stonith_call_options to pass to fencer -@@ -352,7 +356,8 @@ typedef struct stonith_api_operations_s - * \param[in] level Index number of level to add - * \param[in] device_list Devices to use in level - * -- * \return 0 on success, negative error code otherwise -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - * - * \note The caller should set only one of node, pattern or attr/value. - */ -@@ -398,7 +403,8 @@ typedef struct stonith_api_operations_s - * \param delay, Apply a fencing delay. Value -1 means disable also any - * static/random fencing delays from pcmk_delay_base/max - * -- * \return Legacy Pacemaker return code -+ * \return pcmk_ok (if synchronous) or positive call ID (if asynchronous) -+ * on success, otherwise a negative legacy Pacemaker return code - */ - int (*fence_with_delay)(stonith_t *st, int options, const char *node, const char *action, - int timeout, int tolerance, int delay); --- -2.27.0 - - -From 18c382731889b626b21ba6a14f9213ef1e45a524 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 23 Nov 2021 11:14:24 -0600 -Subject: [PATCH 11/13] Refactor: fencing: define constant for XML attribute - for action output - ---- - daemons/fenced/fenced_commands.c | 4 ++-- - include/crm/fencing/internal.h | 1 + - lib/fencing/st_actions.c | 4 ++-- - lib/fencing/st_client.c | 2 +- - 4 files changed, 6 insertions(+), 5 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 26501a4b3..aa14c52af 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2677,7 +2677,7 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i - - crm_xml_add(reply, "st_origin", __func__); - crm_xml_add(reply, F_TYPE, T_STONITH_NG); -- crm_xml_add(reply, "st_output", output); -+ crm_xml_add(reply, F_STONITH_OUTPUT, output); - crm_xml_add_int(reply, F_STONITH_RC, rc); - - if (request == NULL) { -@@ -2743,7 +2743,7 @@ construct_async_reply(async_command_t *cmd, const pcmk__action_result_t *result) - crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); - crm_xml_add_int(reply, F_STONITH_RC, - pcmk_rc2legacy(stonith__result2rc(result))); -- crm_xml_add(reply, "st_output", result->action_stdout); -+ crm_xml_add(reply, F_STONITH_OUTPUT, result->action_stdout); - return reply; - } - -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index 4b5fd3959..f0d294a0b 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -105,6 +105,7 @@ void stonith__device_parameter_flags(uint32_t *device_flags, - # define F_STONITH_REMOTE_OP_ID "st_remote_op" - # define F_STONITH_REMOTE_OP_ID_RELAY "st_remote_op_relay" - # define F_STONITH_RC "st_rc" -+# define F_STONITH_OUTPUT "st_output" - /*! Timeout period per a device execution */ - # define F_STONITH_TIMEOUT "st_timeout" - # define F_STONITH_TOLERANCE "st_tolerance" -diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c -index d4fc3f5ed..5636810a5 100644 ---- a/lib/fencing/st_actions.c -+++ b/lib/fencing/st_actions.c -@@ -425,7 +425,7 @@ stonith__xe_set_result(xmlNode *xml, const pcmk__action_result_t *result) - crm_xml_add_int(xml, XML_LRM_ATTR_OPSTATUS, (int) execution_status); - crm_xml_add_int(xml, XML_LRM_ATTR_RC, exit_status); - crm_xml_add(xml, XML_LRM_ATTR_EXIT_REASON, exit_reason); -- crm_xml_add(xml, "st_output", action_stdout); -+ crm_xml_add(xml, F_STONITH_OUTPUT, action_stdout); - - /* @COMPAT Peers in rolling upgrades, Pacemaker Remote nodes, and external - * code that use libstonithd <=2.1.2 don't check for the full result, and -@@ -474,7 +474,7 @@ stonith__xe_get_result(xmlNode *xml, pcmk__action_result_t *result) - CRM_CHECK((xml != NULL) && (result != NULL), return); - - exit_reason = crm_element_value(xml, XML_LRM_ATTR_EXIT_REASON); -- action_stdout = crm_element_value_copy(xml, "st_output"); -+ action_stdout = crm_element_value_copy(xml, F_STONITH_OUTPUT); - - // A result must include an exit status and execution status - if ((crm_element_value_int(xml, XML_LRM_ATTR_RC, &exit_status) < 0) -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 170b9d450..2dfadf922 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -600,7 +600,7 @@ stonith_api_list(stonith_t * stonith, int call_options, const char *id, char **l - if (output && list_info) { - const char *list_str; - -- list_str = crm_element_value(output, "st_output"); -+ list_str = crm_element_value(output, F_STONITH_OUTPUT); - - if (list_str) { - *list_info = strdup(list_str); --- -2.27.0 - - -From 9fe9ed5d46c810cb9c12eb07271373ab92d271cd Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 23 Nov 2021 11:39:32 -0600 -Subject: [PATCH 12/13] Refactor: fencing: simplify invoking callbacks - ---- - lib/fencing/st_client.c | 42 +++++++++++++++++------------------------ - 1 file changed, 17 insertions(+), 25 deletions(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 2dfadf922..2ca094566 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -887,8 +887,7 @@ static void - invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - { - stonith_private_t *private = NULL; -- stonith_callback_client_t *blob = NULL; -- stonith_callback_client_t local_blob; -+ stonith_callback_client_t *cb_info = NULL; - int rc = pcmk_ok; - - CRM_CHECK(stonith != NULL, return); -@@ -896,11 +895,6 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - - private = stonith->st_private; - -- local_blob.id = NULL; -- local_blob.callback = NULL; -- local_blob.user_data = NULL; -- local_blob.only_success = FALSE; -- - if (msg == NULL) { - // Fencer didn't reply in time - rc = -ETIME; -@@ -919,26 +913,21 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - } - } - -- blob = pcmk__intkey_table_lookup(private->stonith_op_callback_table, -- call_id); -- if (blob != NULL) { -- local_blob = *blob; -- blob = NULL; -- -- stonith_api_del_callback(stonith, call_id, FALSE); -- -- } else { -- crm_trace("No callback found for call %d", call_id); -- local_blob.callback = NULL; -+ if (call_id > 0) { -+ cb_info = pcmk__intkey_table_lookup(private->stonith_op_callback_table, -+ call_id); - } - -- if (local_blob.callback != NULL && (rc == pcmk_ok || local_blob.only_success == FALSE)) { -- crm_trace("Invoking callback %s for call %d", crm_str(local_blob.id), call_id); -- invoke_fence_action_callback(stonith, call_id, rc, local_blob.user_data, -- local_blob.callback); -+ if ((cb_info != NULL) && (cb_info->callback != NULL) -+ && (rc == pcmk_ok || !(cb_info->only_success))) { -+ crm_trace("Invoking callback %s for call %d", -+ crm_str(cb_info->id), call_id); -+ invoke_fence_action_callback(stonith, call_id, rc, cb_info->user_data, -+ cb_info->callback); - -- } else if (private->op_callback == NULL && rc != pcmk_ok) { -- crm_warn("Fencing command failed: %s", pcmk_strerror(rc)); -+ } else if ((private->op_callback == NULL) && (rc != pcmk_ok)) { -+ crm_warn("Fencing action without registered callback failed: %s", -+ pcmk_strerror(rc)); - crm_log_xml_debug(msg, "Failed fence update"); - } - -@@ -947,7 +936,10 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - invoke_fence_action_callback(stonith, call_id, rc, NULL, - private->op_callback); - } -- crm_trace("OP callback activated."); -+ -+ if (cb_info != NULL) { -+ stonith_api_del_callback(stonith, call_id, FALSE); -+ } - } - - static gboolean --- -2.27.0 - - -From 8113b800ce677ba17a16ca176e8f6f9b4a042316 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 23 Nov 2021 18:14:48 -0600 -Subject: [PATCH 13/13] Refactor: fencing: add a missing "break" statement - -No effect, but more correct ---- - lib/fencing/st_actions.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c -index 5636810a5..7eaa8b0f2 100644 ---- a/lib/fencing/st_actions.c -+++ b/lib/fencing/st_actions.c -@@ -336,6 +336,7 @@ stonith__result2rc(const pcmk__action_result_t *result) - case CRM_EX_EXPIRED: return EHOSTUNREACH; - default: break; - } -+ break; - - default: - break; --- -2.27.0 - diff --git a/SOURCES/004-systemd-metadata.patch b/SOURCES/004-systemd-metadata.patch deleted file mode 100644 index 142ef6a..0000000 --- a/SOURCES/004-systemd-metadata.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 09ef95a2eed48b4eb7488788a1b655d67eafe783 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 30 Nov 2021 14:47:12 -0500 -Subject: [PATCH] Low: libcrmservice: Handle systemd service templates. - -These unit files (which have an @ sign at the end) expect to be -parameterized by an instance name. Not providing an instance name -causes the dbus lookup to fail, and we fall back to assume this is an -LSB service. If the user doesn't provide an instance name, just add a -fake one. It doesn't seem to matter what name is given for the lookup. - -See: rhbz#2003151 ---- - lib/services/systemd.c | 22 ++++++++++++++++------ - 1 file changed, 16 insertions(+), 6 deletions(-) - -diff --git a/lib/services/systemd.c b/lib/services/systemd.c -index 8e9fff484..27a3b376d 100644 ---- a/lib/services/systemd.c -+++ b/lib/services/systemd.c -@@ -206,17 +206,27 @@ systemd_unit_extension(const char *name) - } - - static char * --systemd_service_name(const char *name) -+systemd_service_name(const char *name, bool add_instance_name) - { -- if (name == NULL) { -+ if (pcmk__str_empty(name)) { - return NULL; - } - - if (systemd_unit_extension(name)) { - return strdup(name); -- } - -- return crm_strdup_printf("%s.service", name); -+ /* Services that end with an @ sign are systemd templates. They expect an -+ * instance name to follow the service name. If no instance name was -+ * provided, just add "x" to the string as the instance name. It doesn't -+ * seem to matter for purposes of looking up whether a service exists or -+ * not. -+ */ -+ } else if (add_instance_name && *(name+strlen(name)-1) == '@') { -+ return crm_strdup_printf("%sx.service", name); -+ -+ } else { -+ return crm_strdup_printf("%s.service", name); -+ } - } - - static void -@@ -427,7 +437,7 @@ invoke_unit_by_name(const char *arg_name, svc_action_t *op, char **path) - CRM_ASSERT(msg != NULL); - - // Add the (expanded) unit name as the argument -- name = systemd_service_name(arg_name); -+ name = systemd_service_name(arg_name, op == NULL || pcmk__str_eq(op->action, "meta-data", pcmk__str_none)); - CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, - DBUS_TYPE_INVALID)); - free(name); -@@ -944,7 +954,7 @@ invoke_unit_by_path(svc_action_t *op, const char *unit) - /* (ss) */ - { - const char *replace_s = "replace"; -- char *name = systemd_service_name(op->agent); -+ char *name = systemd_service_name(op->agent, pcmk__str_eq(op->action, "meta-data", pcmk__str_none)); - - CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &name, DBUS_TYPE_INVALID)); - CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &replace_s, DBUS_TYPE_INVALID)); --- -2.27.0 - diff --git a/SOURCES/005-fencing-reasons.patch b/SOURCES/005-fencing-reasons.patch deleted file mode 100644 index e0772c6..0000000 --- a/SOURCES/005-fencing-reasons.patch +++ /dev/null @@ -1,2200 +0,0 @@ -From 3d10dad9a555aae040d8473edfe31a4e4279c066 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 11 Nov 2021 12:34:03 -0600 -Subject: [PATCH 01/19] Refactor: libcrmcommon: add internal API for checking - for fencing action - -The naming is a little awkward -- "fencing action" has multiple meanings -depending on the context. It can refer to fencer API requests, fence device -actions, fence agent actions, or just those actions that fence a node (off and -reboot). - -This new function pcmk__is_fencing_action() uses the last meaning, so it does -*not* return true for unfencing ("on" actions). ---- - include/crm/common/internal.h | 1 + - lib/common/operations.c | 14 ++++++++++++++ - 2 files changed, 15 insertions(+) - -diff --git a/include/crm/common/internal.h b/include/crm/common/internal.h -index a35c5769a..694fc6cd4 100644 ---- a/include/crm/common/internal.h -+++ b/include/crm/common/internal.h -@@ -218,6 +218,7 @@ char *pcmk__notify_key(const char *rsc_id, const char *notify_type, - char *pcmk__transition_key(int transition_id, int action_id, int target_rc, - const char *node); - void pcmk__filter_op_for_digest(xmlNode *param_set); -+bool pcmk__is_fencing_action(const char *action); - - - // bitwise arithmetic utilities -diff --git a/lib/common/operations.c b/lib/common/operations.c -index aa7106ce6..366c18970 100644 ---- a/lib/common/operations.c -+++ b/lib/common/operations.c -@@ -523,3 +523,17 @@ crm_op_needs_metadata(const char *rsc_class, const char *op) - CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, - CRMD_ACTION_NOTIFY, NULL); - } -+ -+/*! -+ * \internal -+ * \brief Check whether an action name is for a fencing action -+ * -+ * \param[in] action Action name to check -+ * -+ * \return true if \p action is "off", "reboot", or "poweroff", otherwise false -+ */ -+bool -+pcmk__is_fencing_action(const char *action) -+{ -+ return pcmk__str_any_of(action, "off", "reboot", "poweroff", NULL); -+} --- -2.27.0 - - -From 86ac00fb3e99d79ca2c442ae1670fe850146f734 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 11 Nov 2021 12:38:58 -0600 -Subject: [PATCH 02/19] Low: fencer,scheduler: compare fence action names - case-sensitively - -Use the new convenience function pcmk__is_fencing_action() to check whether -an action name is a fencing action ("off", "reboot", or "poweroff"). This -changes the behavior from case-insensitive to case-sensitive, which is more -appropriate (the case-insensitivity was inherited from lazy use of the old -safe_str_eq() function which was always case-insensitive). ---- - daemons/fenced/fenced_commands.c | 6 +++--- - daemons/fenced/fenced_remote.c | 2 +- - lib/pacemaker/pcmk_graph_producer.c | 2 +- - lib/pengine/common.c | 8 +------- - 4 files changed, 6 insertions(+), 12 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 63bfad3a9..46c840f2a 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -128,7 +128,7 @@ get_action_delay_max(stonith_device_t * device, const char * action) - const char *value = NULL; - int delay_max = 0; - -- if (!pcmk__strcase_any_of(action, "off", "reboot", NULL)) { -+ if (!pcmk__is_fencing_action(action)) { - return 0; - } - -@@ -146,7 +146,7 @@ get_action_delay_base(stonith_device_t *device, const char *action, const char * - char *hash_value = NULL; - int delay_base = 0; - -- if (!pcmk__strcase_any_of(action, "off", "reboot", NULL)) { -+ if (!pcmk__is_fencing_action(action)) { - return 0; - } - -@@ -448,7 +448,7 @@ stonith_device_execute(stonith_device_t * device) - - if (pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, - STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) { -- if (pcmk__strcase_any_of(cmd->action, "reboot", "off", NULL)) { -+ if (pcmk__is_fencing_action(cmd->action)) { - if (node_does_watchdog_fencing(stonith_our_uname)) { - pcmk__panic(__func__); - goto done; -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 963433bf3..358ea3aa7 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -1758,7 +1758,7 @@ all_topology_devices_found(remote_fencing_op_t * op) - if (!tp) { - return FALSE; - } -- if (pcmk__strcase_any_of(op->action, "off", "reboot", NULL)) { -+ if (pcmk__is_fencing_action(op->action)) { - /* Don't count the devices on the target node if we are killing - * the target node. */ - skip_target = TRUE; -diff --git a/lib/pacemaker/pcmk_graph_producer.c b/lib/pacemaker/pcmk_graph_producer.c -index ffcbd1274..5bec9d8ce 100644 ---- a/lib/pacemaker/pcmk_graph_producer.c -+++ b/lib/pacemaker/pcmk_graph_producer.c -@@ -721,7 +721,7 @@ add_downed_nodes(xmlNode *xml, const pe_action_t *action, - /* Fencing makes the action's node and any hosted guest nodes down */ - const char *fence = g_hash_table_lookup(action->meta, "stonith_action"); - -- if (pcmk__strcase_any_of(fence, "off", "reboot", NULL)) { -+ if (pcmk__is_fencing_action(fence)) { - xmlNode *downed = create_xml_node(xml, XML_GRAPH_TAG_DOWNED); - add_node_to_xml_by_id(action->node->details->id, downed); - pe_foreach_guest_node(data_set, action->node, add_node_to_xml, downed); -diff --git a/lib/pengine/common.c b/lib/pengine/common.c -index 236fc26b1..fe4223816 100644 ---- a/lib/pengine/common.c -+++ b/lib/pengine/common.c -@@ -27,12 +27,6 @@ check_health(const char *value) - "migrate-on-red", NULL); - } - --static bool --check_stonith_action(const char *value) --{ -- return pcmk__strcase_any_of(value, "reboot", "poweroff", "off", NULL); --} -- - static bool - check_placement_strategy(const char *value) - { -@@ -114,7 +108,7 @@ static pcmk__cluster_option_t pe_opts[] = { - }, - { - "stonith-action", NULL, "select", "reboot, off, poweroff", -- "reboot", check_stonith_action, -+ "reboot", pcmk__is_fencing_action, - "Action to send to fence device when a node needs to be fenced " - "(\"poweroff\" is a deprecated alias for \"off\")", - NULL --- -2.27.0 - - -From c8f6e8a04c4fa4271db817af0a23aa941c9d7689 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 12 Nov 2021 17:42:21 -0600 -Subject: [PATCH 03/19] Refactor: fencing: rename type for peer query replies - -st_query_result_t contains the device information parsed from a peer's query -reply, but the name could easily be confused with the actual success/failure -result of the query action itself. Rename it to peer_device_info_t. ---- - daemons/fenced/fenced_remote.c | 103 +++++++++++++++++---------------- - 1 file changed, 52 insertions(+), 51 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 358ea3aa7..9e2f62804 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -41,7 +41,7 @@ - - /* When one fencer queries its peers for devices able to handle a fencing - * request, each peer will reply with a list of such devices available to it. -- * Each reply will be parsed into a st_query_result_t, with each device's -+ * Each reply will be parsed into a peer_device_info_t, with each device's - * information kept in a device_properties_t. - */ - -@@ -72,18 +72,19 @@ typedef struct st_query_result_s { - int ndevices; - /* Devices available to this host that are capable of fencing the target */ - GHashTable *devices; --} st_query_result_t; -+} peer_device_info_t; - - GHashTable *stonith_remote_op_list = NULL; - --void call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc); -+void call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, -+ int rc); - static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); - extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, - int call_options); - - static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); - static int get_op_total_timeout(const remote_fencing_op_t *op, -- const st_query_result_t *chosen_peer); -+ const peer_device_info_t *chosen_peer); - - static gint - sort_strings(gconstpointer a, gconstpointer b) -@@ -95,7 +96,7 @@ static void - free_remote_query(gpointer data) - { - if (data) { -- st_query_result_t *query = data; -+ peer_device_info_t *query = data; - - crm_trace("Free'ing query result from %s", query->host); - g_hash_table_destroy(query->devices); -@@ -150,8 +151,8 @@ count_peer_device(gpointer key, gpointer value, gpointer user_data) - * \return Number of devices available to peer that were not already executed - */ - static int --count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer, -- gboolean verified_only) -+count_peer_devices(const remote_fencing_op_t *op, -+ const peer_device_info_t *peer, gboolean verified_only) - { - struct peer_count_data data; - -@@ -175,7 +176,7 @@ count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer, - * \return Device properties if found, NULL otherwise - */ - static device_properties_t * --find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer, -+find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer, - const char *device) - { - device_properties_t *props = g_hash_table_lookup(peer->devices, device); -@@ -196,7 +197,7 @@ find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer, - * \return TRUE if device was found and marked, FALSE otherwise - */ - static gboolean --grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer, -+grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer, - const char *device, gboolean verified_devices_only) - { - device_properties_t *props = find_peer_device(op, peer, device); -@@ -1216,7 +1217,7 @@ enum find_best_peer_options { - FIND_PEER_VERIFIED_ONLY = 0x0004, - }; - --static st_query_result_t * -+static peer_device_info_t * - find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options) - { - GList *iter = NULL; -@@ -1227,7 +1228,7 @@ find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer - } - - for (iter = op->query_results; iter != NULL; iter = iter->next) { -- st_query_result_t *peer = iter->data; -+ peer_device_info_t *peer = iter->data; - - crm_trace("Testing result from %s targeting %s with %d device%s: %d %x", - peer->host, op->target, peer->ndevices, -@@ -1257,11 +1258,11 @@ find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer - return NULL; - } - --static st_query_result_t * -+static peer_device_info_t * - stonith_choose_peer(remote_fencing_op_t * op) - { - const char *device = NULL; -- st_query_result_t *peer = NULL; -+ peer_device_info_t *peer = NULL; - uint32_t active = fencing_active_peers(); - - do { -@@ -1317,8 +1318,8 @@ stonith_choose_peer(remote_fencing_op_t * op) - } - - static int --get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer, -- const char *device) -+get_device_timeout(const remote_fencing_op_t *op, -+ const peer_device_info_t *peer, const char *device) - { - device_properties_t *props; - -@@ -1338,7 +1339,7 @@ get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer, - - struct timeout_data { - const remote_fencing_op_t *op; -- const st_query_result_t *peer; -+ const peer_device_info_t *peer; - int total_timeout; - }; - -@@ -1365,7 +1366,7 @@ add_device_timeout(gpointer key, gpointer value, gpointer user_data) - } - - static int --get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer) -+get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer) - { - struct timeout_data timeout; - -@@ -1380,7 +1381,7 @@ get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer) - - static int - get_op_total_timeout(const remote_fencing_op_t *op, -- const st_query_result_t *chosen_peer) -+ const peer_device_info_t *chosen_peer) - { - int total_timeout = 0; - stonith_topology_t *tp = find_topology_for_host(op->target); -@@ -1403,7 +1404,7 @@ get_op_total_timeout(const remote_fencing_op_t *op, - } - for (device_list = tp->levels[i]; device_list; device_list = device_list->next) { - for (iter = op->query_results; iter != NULL; iter = iter->next) { -- const st_query_result_t *peer = iter->data; -+ const peer_device_info_t *peer = iter->data; - - if (find_peer_device(op, peer, device_list->data)) { - total_timeout += get_device_timeout(op, peer, -@@ -1555,7 +1556,7 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) - } - - void --call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) -+call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) - { - const char *device = NULL; - int timeout = op->base_timeout; -@@ -1734,8 +1735,8 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) - static gint - sort_peers(gconstpointer a, gconstpointer b) - { -- const st_query_result_t *peer_a = a; -- const st_query_result_t *peer_b = b; -+ const peer_device_info_t *peer_a = a; -+ const peer_device_info_t *peer_b = b; - - return (peer_b->ndevices - peer_a->ndevices); - } -@@ -1768,7 +1769,7 @@ all_topology_devices_found(remote_fencing_op_t * op) - for (device = tp->levels[i]; device; device = device->next) { - match = NULL; - for (iter = op->query_results; iter && !match; iter = iter->next) { -- st_query_result_t *peer = iter->data; -+ peer_device_info_t *peer = iter->data; - - if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) { - continue; -@@ -1850,31 +1851,31 @@ parse_action_specific(xmlNode *xml, const char *peer, const char *device, - * - * \param[in] xml XML node containing device properties - * \param[in,out] op Operation that query and reply relate to -- * \param[in,out] result Peer's results -+ * \param[in,out] peer Peer's device information - * \param[in] device ID of device being parsed - */ - static void - add_device_properties(xmlNode *xml, remote_fencing_op_t *op, -- st_query_result_t *result, const char *device) -+ peer_device_info_t *peer, const char *device) - { - xmlNode *child; - int verified = 0; - device_properties_t *props = calloc(1, sizeof(device_properties_t)); - -- /* Add a new entry to this result's devices list */ -+ /* Add a new entry to this peer's devices list */ - CRM_ASSERT(props != NULL); -- g_hash_table_insert(result->devices, strdup(device), props); -+ g_hash_table_insert(peer->devices, strdup(device), props); - - /* Peers with verified (monitored) access will be preferred */ - crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified); - if (verified) { - crm_trace("Peer %s has confirmed a verified device %s", -- result->host, device); -+ peer->host, device); - props->verified = TRUE; - } - - /* Parse action-specific device properties */ -- parse_action_specific(xml, result->host, device, op_requested_action(op), -+ parse_action_specific(xml, peer->host, device, op_requested_action(op), - op, st_phase_requested, props); - for (child = pcmk__xml_first_child(xml); child != NULL; - child = pcmk__xml_next(child)) { -@@ -1883,10 +1884,10 @@ add_device_properties(xmlNode *xml, remote_fencing_op_t *op, - * winds up getting remapped. - */ - if (pcmk__str_eq(ID(child), "off", pcmk__str_casei)) { -- parse_action_specific(child, result->host, device, "off", -+ parse_action_specific(child, peer->host, device, "off", - op, st_phase_off, props); - } else if (pcmk__str_eq(ID(child), "on", pcmk__str_casei)) { -- parse_action_specific(child, result->host, device, "on", -+ parse_action_specific(child, peer->host, device, "on", - op, st_phase_on, props); - } - } -@@ -1903,17 +1904,17 @@ add_device_properties(xmlNode *xml, remote_fencing_op_t *op, - * - * \return Newly allocated result structure with parsed reply - */ --static st_query_result_t * -+static peer_device_info_t * - add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml) - { -- st_query_result_t *result = calloc(1, sizeof(st_query_result_t)); -+ peer_device_info_t *peer = calloc(1, sizeof(peer_device_info_t)); - xmlNode *child; - - // cppcheck seems not to understand the abort logic in CRM_CHECK - // cppcheck-suppress memleak -- CRM_CHECK(result != NULL, return NULL); -- result->host = strdup(host); -- result->devices = pcmk__strkey_table(free, free); -+ CRM_CHECK(peer != NULL, return NULL); -+ peer->host = strdup(host); -+ peer->devices = pcmk__strkey_table(free, free); - - /* Each child element describes one capable device available to the peer */ - for (child = pcmk__xml_first_child(xml); child != NULL; -@@ -1921,17 +1922,17 @@ add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml - const char *device = ID(child); - - if (device) { -- add_device_properties(child, op, result, device); -+ add_device_properties(child, op, peer, device); - } - } - -- result->ndevices = g_hash_table_size(result->devices); -- CRM_CHECK(ndevices == result->ndevices, -+ peer->ndevices = g_hash_table_size(peer->devices); -+ CRM_CHECK(ndevices == peer->ndevices, - crm_err("Query claimed to have %d device%s but %d found", -- ndevices, pcmk__plural_s(ndevices), result->ndevices)); -+ ndevices, pcmk__plural_s(ndevices), peer->ndevices)); - -- op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers); -- return result; -+ op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers); -+ return peer; - } - - /*! -@@ -1957,7 +1958,7 @@ process_remote_stonith_query(xmlNode * msg) - const char *id = NULL; - const char *host = NULL; - remote_fencing_op_t *op = NULL; -- st_query_result_t *result = NULL; -+ peer_device_info_t *peer = NULL; - uint32_t replies_expected; - xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); - -@@ -1991,7 +1992,7 @@ process_remote_stonith_query(xmlNode * msg) - op->replies, replies_expected, host, - op->target, op->action, ndevices, pcmk__plural_s(ndevices), id); - if (ndevices > 0) { -- result = add_result(op, host, ndevices, dev); -+ peer = add_result(op, host, ndevices, dev); - } - - if (pcmk_is_set(op->call_options, st_opt_topology)) { -@@ -2001,7 +2002,7 @@ process_remote_stonith_query(xmlNode * msg) - if (op->state == st_query && all_topology_devices_found(op)) { - /* All the query results are in for the topology, start the fencing ops. */ - crm_trace("All topology devices found"); -- call_remote_stonith(op, result, pcmk_ok); -+ call_remote_stonith(op, peer, pcmk_ok); - - } else if (have_all_replies) { - crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", -@@ -2010,15 +2011,15 @@ process_remote_stonith_query(xmlNode * msg) - } - - } else if (op->state == st_query) { -- int nverified = count_peer_devices(op, result, TRUE); -+ int nverified = count_peer_devices(op, peer, TRUE); - - /* We have a result for a non-topology fencing op that looks promising, - * go ahead and start fencing before query timeout */ -- if (result && (host_is_target == FALSE) && nverified) { -+ if ((peer != NULL) && !host_is_target && nverified) { - /* we have a verified device living on a peer that is not the target */ - crm_trace("Found %d verified device%s", - nverified, pcmk__plural_s(nverified)); -- call_remote_stonith(op, result, pcmk_ok); -+ call_remote_stonith(op, peer, pcmk_ok); - - } else if (have_all_replies) { - crm_info("All query replies have arrived, continuing (%d expected/%d received) ", -@@ -2029,10 +2030,10 @@ process_remote_stonith_query(xmlNode * msg) - crm_trace("Waiting for more peer results before launching fencing operation"); - } - -- } else if (result && (op->state == st_done)) { -+ } else if ((peer != NULL) && (op->state == st_done)) { - crm_info("Discarding query result from %s (%d device%s): " -- "Operation is %s", result->host, -- result->ndevices, pcmk__plural_s(result->ndevices), -+ "Operation is %s", peer->host, -+ peer->ndevices, pcmk__plural_s(peer->ndevices), - stonith_op_state_str(op->state)); - } - --- -2.27.0 - - -From 913e0620310089d2250e9ecde383df757f8e8063 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 11 Nov 2021 12:46:37 -0600 -Subject: [PATCH 04/19] Low: fencer: improve broadcasting replies for fenced - originators - -If the target of a fencing action was also the originator, the executioner -broadcasts the result on their behalf. - -Previously, it would check if the action was not in a list of actions that are -never broadcasted. However we really only want to broadcast off/reboot results -so just check for that instead. - -This also rearranges reply creation slightly so we don't trace-log the reply -until it is fully created. ---- - daemons/fenced/fenced_commands.c | 19 +++++++++---------- - 1 file changed, 9 insertions(+), 10 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 46c840f2a..e4185f6e1 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2385,32 +2385,31 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - int pid, bool merged) - { - xmlNode *reply = NULL; -- gboolean bcast = FALSE; -+ bool bcast = false; - - CRM_CHECK((cmd != NULL) && (result != NULL), return); - - reply = construct_async_reply(cmd, result); - -- // Only replies for certain actions are broadcast -- if (pcmk__str_any_of(cmd->action, "metadata", "monitor", "list", "status", -- NULL)) { -- crm_trace("Never broadcast '%s' replies", cmd->action); -+ // If target was also the originator, broadcast fencing results for it -+ if (!stand_alone && pcmk__is_fencing_action(cmd->action) -+ && pcmk__str_eq(cmd->origin, cmd->victim, pcmk__str_casei)) { - -- } else if (!stand_alone && pcmk__str_eq(cmd->origin, cmd->victim, pcmk__str_casei) && !pcmk__str_eq(cmd->action, "on", pcmk__str_casei)) { -- crm_trace("Broadcast '%s' reply for %s", cmd->action, cmd->victim); -+ crm_trace("Broadcast '%s' result for %s (target was also originator)", -+ cmd->action, cmd->victim); - crm_xml_add(reply, F_SUBTYPE, "broadcast"); -- bcast = TRUE; -+ crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); -+ bcast = true; - } - - log_async_result(cmd, result, pid, NULL, merged); -- crm_log_xml_trace(reply, "Reply"); - - if (merged) { - crm_xml_add(reply, F_STONITH_MERGED, "true"); - } -+ crm_log_xml_trace(reply, "Reply"); - - if (bcast) { -- crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); - send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); - - } else if (cmd->origin) { --- -2.27.0 - - -From 8b8f94fd9ca5e61922cb81e32c8a3d0f1d75fb0b Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 11 Nov 2021 14:40:49 -0600 -Subject: [PATCH 05/19] Refactor: fencer: avoid code duplication when sending - async reply - -... and clean up reply function ---- - daemons/fenced/fenced_commands.c | 33 ++++++++++++++++++-------------- - 1 file changed, 19 insertions(+), 14 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index e4185f6e1..4ea0a337a 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2411,15 +2411,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - - if (bcast) { - send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); -- -- } else if (cmd->origin) { -- crm_trace("Directed reply to %s", cmd->origin); -- send_cluster_message(crm_get_peer(0, cmd->origin), crm_msg_stonith_ng, reply, FALSE); -- - } else { -- crm_trace("Directed local %ssync reply to %s", -- (cmd->options & st_opt_sync_call) ? "" : "a-", cmd->client_name); -- do_local_reply(reply, cmd->client, cmd->options & st_opt_sync_call, FALSE); -+ stonith_send_reply(reply, cmd->options, cmd->origin, cmd->client); - } - - if (stand_alone) { -@@ -2814,16 +2807,28 @@ check_alternate_host(const char *target) - return alternate_host; - } - -+/*! -+ * \internal -+ * \brief Send a reply to a CPG peer or IPC client -+ * -+ * \param[in] reply XML reply to send -+ * \param[in] call_options Send synchronously if st_opt_sync_call is set here -+ * \param[in] remote_peer If not NULL, name of peer node to send CPG reply -+ * \param[in] client_id If not NULL, name of client to send IPC reply -+ */ - static void --stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer, -+stonith_send_reply(xmlNode *reply, int call_options, const char *remote_peer, - const char *client_id) - { -- if (remote_peer) { -- send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, reply, FALSE); -- } else { -+ CRM_CHECK((reply != NULL) && ((remote_peer != NULL) || (client_id != NULL)), -+ return); -+ -+ if (remote_peer == NULL) { - do_local_reply(reply, client_id, -- pcmk_is_set(call_options, st_opt_sync_call), -- (remote_peer != NULL)); -+ pcmk_is_set(call_options, st_opt_sync_call), FALSE); -+ } else { -+ send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, -+ reply, FALSE); - } - } - --- -2.27.0 - - -From 2cdbda58f0e9f38a0e302506107fd933cb415144 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 23 Nov 2021 17:24:09 -0600 -Subject: [PATCH 06/19] Refactor: fencer: ensure all requests get clean-up - -handle_request() has if-else blocks for each type of request. Previously, if a -request didn't need a reply, the function would do any clean-up needed and -return immediately. Now, we track whether a reply is needed, and all request -types flow to the end of the function for consistent clean-up. - -This doesn't change any behavior at this point, but allows us to do more at the -end of request handling. ---- - daemons/fenced/fenced_commands.c | 46 ++++++++++++++++++-------------- - 1 file changed, 26 insertions(+), 20 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 4ea0a337a..19477b49b 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2892,6 +2892,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - - xmlNode *data = NULL; - xmlNode *reply = NULL; -+ bool need_reply = true; - - char *output = NULL; - const char *op = crm_element_value(request, F_STONITH_OPERATION); -@@ -2921,10 +2922,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - pcmk__ipc_send_xml(client, id, reply, flags); - client->request_id = 0; - free_xml(reply); -- return 0; -+ rc = pcmk_ok; -+ need_reply = false; - - } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { - rc = stonith_device_action(request, &output); -+ need_reply = (rc != -EINPROGRESS); - - } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { - const char *call_id = crm_element_value(request, F_STONITH_CALLID); -@@ -2933,7 +2936,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - - crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); - do_stonith_async_timeout_update(client_id, call_id, op_timeout); -- return 0; -+ rc = pcmk_ok; -+ need_reply = false; - - } else if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { - if (remote_peer) { -@@ -2944,7 +2948,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - remove_relay_op(request); - - stonith_query(request, remote_peer, client_id, call_options); -- return 0; -+ rc = pcmk_ok; -+ need_reply = false; - - } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { - const char *flag_name = NULL; -@@ -2965,7 +2970,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } - - pcmk__ipc_send_ack(client, id, flags, "ack", CRM_EX_OK); -- return 0; -+ rc = pcmk_ok; -+ need_reply = false; - - } else if (pcmk__str_eq(op, STONITH_OP_RELAY, pcmk__str_none)) { - xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); -@@ -2977,8 +2983,11 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - crm_element_value(dev, F_STONITH_ACTION), - crm_element_value(dev, F_STONITH_TARGET)); - -- if (initiate_remote_stonith_op(NULL, request, FALSE) != NULL) { -+ if (initiate_remote_stonith_op(NULL, request, FALSE) == NULL) { -+ rc = -EPROTO; -+ } else { - rc = -EINPROGRESS; -+ need_reply = false; - } - - } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { -@@ -3012,7 +3021,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); - - if (stonith_check_fence_tolerance(tolerance, target, action)) { -- rc = 0; -+ rc = pcmk_ok; - goto done; - } - -@@ -3047,10 +3056,13 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - FALSE); - rc = -EINPROGRESS; - -- } else if (initiate_remote_stonith_op(client, request, FALSE) != NULL) { -+ } else if (initiate_remote_stonith_op(client, request, FALSE) == NULL) { -+ rc = -EPROTO; -+ } else { - rc = -EINPROGRESS; - } - } -+ need_reply = (rc != -EINPROGRESS); - - } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { - rc = stonith_fence_history(request, &data, remote_peer, call_options); -@@ -3058,8 +3070,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - /* we don't expect answers to the broadcast - * we might have sent out - */ -- free_xml(data); -- return pcmk_ok; -+ rc = pcmk_ok; -+ need_reply = false; - } - - } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_ADD, pcmk__str_none)) { -@@ -3111,8 +3123,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - crm_element_value_int(request, XML_ATTR_ID, &node_id); - name = crm_element_value(request, XML_ATTR_UNAME); - reap_crm_member(node_id, name); -- -- return pcmk_ok; -+ rc = pcmk_ok; -+ need_reply = false; - - } else { - crm_err("Unknown IPC request %s from %s %s", op, -@@ -3120,20 +3132,14 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - ((client == NULL)? remote_peer : pcmk__client_name(client))); - } - -- done: -- -+done: - if (rc == -EACCES) { - crm_warn("Rejecting IPC request '%s' from unprivileged client %s", - crm_str(op), pcmk__client_name(client)); - } - -- /* Always reply unless the request is in process still. -- * If in progress, a reply will happen async after the request -- * processing is finished */ -- if (rc != -EINPROGRESS) { -- crm_trace("Reply handling: %p %u %u %d %d %s", client, client?client->request_id:0, -- id, pcmk_is_set(call_options, st_opt_sync_call), call_options, -- crm_element_value(request, F_STONITH_CALLOPTS)); -+ // Reply if result is known -+ if (need_reply) { - - if (pcmk_is_set(call_options, st_opt_sync_call)) { - CRM_ASSERT(client == NULL || client->request_id == id); --- -2.27.0 - - -From 067d655ebd3fbb0ed27f4e7426db4c3b661ba777 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 23 Nov 2021 17:26:32 -0600 -Subject: [PATCH 07/19] Log: fencer: improve debug logs when processing CPG/IPC - messages - -By moving the result log messages from stonith_command() to handle_reply() and -handle_request(), we can simplify stonith_command() and give slightly better -messages. ---- - daemons/fenced/fenced_commands.c | 80 +++++++++++++++----------------- - 1 file changed, 38 insertions(+), 42 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 19477b49b..98af0e04f 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2883,7 +2883,7 @@ remove_relay_op(xmlNode * request) - } - } - --static int -+static void - handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - xmlNode *request, const char *remote_peer) - { -@@ -3152,73 +3152,69 @@ done: - free_xml(data); - free_xml(reply); - -- return rc; -+ crm_debug("Processed %s request from %s %s: %s (rc=%d)", -+ op, ((client == NULL)? "peer" : "client"), -+ ((client == NULL)? remote_peer : pcmk__client_name(client)), -+ ((rc > 0)? "" : pcmk_strerror(rc)), rc); - } - - static void - handle_reply(pcmk__client_t *client, xmlNode *request, const char *remote_peer) - { -- const char *op = crm_element_value(request, F_STONITH_OPERATION); -+ // Copy, because request might be freed before we want to log this -+ char *op = crm_element_value_copy(request, F_STONITH_OPERATION); - - if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { - process_remote_stonith_query(request); -- } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { -- process_remote_stonith_exec(request); -- } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { -- /* Reply to a complex fencing op */ -+ } else if (pcmk__str_any_of(op, T_STONITH_NOTIFY, STONITH_OP_FENCE, NULL)) { - process_remote_stonith_exec(request); - } else { -- crm_err("Unknown %s reply from %s %s", op, -- ((client == NULL)? "peer" : "client"), -+ crm_err("Ignoring unknown %s reply from %s %s", -+ crm_str(op), ((client == NULL)? "peer" : "client"), - ((client == NULL)? remote_peer : pcmk__client_name(client))); - crm_log_xml_warn(request, "UnknownOp"); -+ free(op); -+ return; - } -+ crm_debug("Processed %s reply from %s %s", -+ op, ((client == NULL)? "peer" : "client"), -+ ((client == NULL)? remote_peer : pcmk__client_name(client))); -+ free(op); - } - -+/*! -+ * \internal -+ * \brief Handle a message from an IPC client or CPG peer -+ * -+ * \param[in] client If not NULL, IPC client that sent message -+ * \param[in] id If from IPC client, IPC message ID -+ * \param[in] flags Message flags -+ * \param[in] message Message XML -+ * \param[in] remote_peer If not NULL, CPG peer that sent message -+ */ - void - stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags, -- xmlNode *request, const char *remote_peer) -+ xmlNode *message, const char *remote_peer) - { -- int call_options = 0; -- int rc = 0; -- gboolean is_reply = FALSE; -- -- /* Copy op for reporting. The original might get freed by handle_reply() -- * before we use it in crm_debug(): -- * handle_reply() -- * |- process_remote_stonith_exec() -- * |-- remote_op_done() -- * |--- handle_local_reply_and_notify() -- * |---- crm_xml_add(...F_STONITH_OPERATION...) -- * |--- free_xml(op->request) -- */ -- char *op = crm_element_value_copy(request, F_STONITH_OPERATION); -- -- if (get_xpath_object("//" T_STONITH_REPLY, request, LOG_NEVER)) { -- is_reply = TRUE; -- } -+ int call_options = st_opt_none; -+ bool is_reply = get_xpath_object("//" T_STONITH_REPLY, message, -+ LOG_NEVER) != NULL; - -- crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); -- crm_debug("Processing %s%s %u from %s %s with call options 0x%08x", -- op, (is_reply? " reply" : ""), id, -+ crm_element_value_int(message, F_STONITH_CALLOPTS, &call_options); -+ crm_debug("Processing %ssynchronous %s %s %u from %s %s", -+ pcmk_is_set(call_options, st_opt_sync_call)? "" : "a", -+ crm_element_value(message, F_STONITH_OPERATION), -+ (is_reply? "reply" : "request"), id, - ((client == NULL)? "peer" : "client"), -- ((client == NULL)? remote_peer : pcmk__client_name(client)), -- call_options); -+ ((client == NULL)? remote_peer : pcmk__client_name(client))); - - if (pcmk_is_set(call_options, st_opt_sync_call)) { - CRM_ASSERT(client == NULL || client->request_id == id); - } - - if (is_reply) { -- handle_reply(client, request, remote_peer); -+ handle_reply(client, message, remote_peer); - } else { -- rc = handle_request(client, id, flags, request, remote_peer); -+ handle_request(client, id, flags, message, remote_peer); - } -- -- crm_debug("Processed %s%s from %s %s: %s (rc=%d)", -- op, (is_reply? " reply" : ""), -- ((client == NULL)? "peer" : "client"), -- ((client == NULL)? remote_peer : pcmk__client_name(client)), -- ((rc > 0)? "" : pcmk_strerror(rc)), rc); -- free(op); - } --- -2.27.0 - - -From 44cb340c11b4652f452a47eb2b0050b4a459382b Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 15 Nov 2021 16:29:09 -0600 -Subject: [PATCH 08/19] Refactor: fencer: drop unused argument from - notification functions - ---- - daemons/fenced/fenced_commands.c | 12 ++++++------ - daemons/fenced/fenced_history.c | 6 +++--- - daemons/fenced/fenced_remote.c | 6 +++--- - daemons/fenced/pacemaker-fenced.c | 18 +++++++++--------- - daemons/fenced/pacemaker-fenced.h | 6 +++--- - 5 files changed, 24 insertions(+), 24 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 98af0e04f..946ce4042 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2428,8 +2428,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); - crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); - -- do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); -- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); -+ do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); -+ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); - } - - free_xml(reply); -@@ -3082,7 +3082,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } else { - rc = -EACCES; - } -- do_stonith_notify_device(call_options, op, rc, device_id); -+ do_stonith_notify_device(op, rc, device_id); - - } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { - xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); -@@ -3093,7 +3093,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } else { - rc = -EACCES; - } -- do_stonith_notify_device(call_options, op, rc, device_id); -+ do_stonith_notify_device(op, rc, device_id); - - } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { - char *device_id = NULL; -@@ -3103,7 +3103,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } else { - rc = -EACCES; - } -- do_stonith_notify_level(call_options, op, rc, device_id); -+ do_stonith_notify_level(op, rc, device_id); - free(device_id); - - } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { -@@ -3114,7 +3114,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } else { - rc = -EACCES; - } -- do_stonith_notify_level(call_options, op, rc, device_id); -+ do_stonith_notify_level(op, rc, device_id); - - } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { - int node_id = 0; -diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c -index 1ba034ba9..7127593b6 100644 ---- a/daemons/fenced/fenced_history.c -+++ b/daemons/fenced/fenced_history.c -@@ -100,7 +100,7 @@ stonith_fence_history_cleanup(const char *target, - g_hash_table_foreach_remove(stonith_remote_op_list, - stonith_remove_history_entry, - (gpointer) target); -- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); -+ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); - } - } - -@@ -396,7 +396,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, - - if (updated) { - stonith_fence_history_trim(); -- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); -+ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); - } - - if (cnt == 0) { -@@ -470,7 +470,7 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, - is done so send a notification for anything - that smells like history-sync - */ -- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY_SYNCED, 0, NULL); -+ do_stonith_notify(T_STONITH_NOTIFY_HISTORY_SYNCED, pcmk_ok, NULL); - if (crm_element_value(msg, F_STONITH_CALLID)) { - /* this is coming from the stonith-API - * -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 9e2f62804..c907cd120 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -423,8 +423,8 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) - do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); - - /* bcast to all local clients that the fencing operation happend */ -- do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); -- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); -+ do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); -+ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); - - /* mark this op as having notify's already sent */ - op->notify_sent = TRUE; -@@ -1119,7 +1119,7 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) - - if (op->state != st_duplicate) { - /* kick history readers */ -- do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL); -+ do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); - } - - /* safe to trim as long as that doesn't touch pending ops */ -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index a64004ce1..a290e1670 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -357,7 +357,7 @@ do_stonith_async_timeout_update(const char *client_id, const char *call_id, int - } - - void --do_stonith_notify(int options, const char *type, int result, xmlNode * data) -+do_stonith_notify(const char *type, int result, xmlNode *data) - { - /* TODO: Standardize the contents of data */ - xmlNode *update_msg = create_xml_node(NULL, "notify"); -@@ -380,7 +380,7 @@ do_stonith_notify(int options, const char *type, int result, xmlNode * data) - } - - static void --do_stonith_notify_config(int options, const char *op, int rc, -+do_stonith_notify_config(const char *op, int rc, - const char *desc, int active) - { - xmlNode *notify_data = create_xml_node(NULL, op); -@@ -390,20 +390,20 @@ do_stonith_notify_config(int options, const char *op, int rc, - crm_xml_add(notify_data, F_STONITH_DEVICE, desc); - crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); - -- do_stonith_notify(options, op, rc, notify_data); -+ do_stonith_notify(op, rc, notify_data); - free_xml(notify_data); - } - - void --do_stonith_notify_device(int options, const char *op, int rc, const char *desc) -+do_stonith_notify_device(const char *op, int rc, const char *desc) - { -- do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(device_list)); -+ do_stonith_notify_config(op, rc, desc, g_hash_table_size(device_list)); - } - - void --do_stonith_notify_level(int options, const char *op, int rc, const char *desc) -+do_stonith_notify_level(const char *op, int rc, const char *desc) - { -- do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(topology)); -+ do_stonith_notify_config(op, rc, desc, g_hash_table_size(topology)); - } - - static void -@@ -418,7 +418,7 @@ topology_remove_helper(const char *node, int level) - crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); - - rc = stonith_level_remove(data, &desc); -- do_stonith_notify_level(0, STONITH_OP_LEVEL_DEL, rc, desc); -+ do_stonith_notify_level(STONITH_OP_LEVEL_DEL, rc, desc); - - free_xml(data); - free(desc); -@@ -468,7 +468,7 @@ handle_topology_change(xmlNode *match, bool remove) - } - - rc = stonith_level_register(match, &desc); -- do_stonith_notify_level(0, STONITH_OP_LEVEL_ADD, rc, desc); -+ do_stonith_notify_level(STONITH_OP_LEVEL_ADD, rc, desc); - - free(desc); - } -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index a64b57693..3e41d867e 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -233,9 +233,9 @@ xmlNode *stonith_construct_reply(xmlNode * request, const char *output, xmlNode - void - do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); - --void do_stonith_notify(int options, const char *type, int result, xmlNode * data); --void do_stonith_notify_device(int options, const char *op, int rc, const char *desc); --void do_stonith_notify_level(int options, const char *op, int rc, const char *desc); -+void do_stonith_notify(const char *type, int result, xmlNode *data); -+void do_stonith_notify_device(const char *op, int rc, const char *desc); -+void do_stonith_notify_level(const char *op, int rc, const char *desc); - - remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, - xmlNode *request, --- -2.27.0 - - -From a49df4901b663b3366634c1d58f04625ecba4005 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 16 Nov 2021 11:57:14 -0600 -Subject: [PATCH 09/19] Refactor: fencer: functionize checking for privileged - client - -... for readability and to make planned changes easier ---- - daemons/fenced/fenced_commands.c | 49 +++++++++++++++++++------------- - 1 file changed, 30 insertions(+), 19 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 946ce4042..34c956f5c 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2883,6 +2883,32 @@ remove_relay_op(xmlNode * request) - } - } - -+/*! -+ * \internal -+ * \brief Check whether an API request was sent by a privileged user -+ * -+ * API commands related to fencing configuration may be done only by privileged -+ * IPC users (i.e. root or hacluster), because all other users should go through -+ * the CIB to have ACLs applied. If no client was given, this is a peer request, -+ * which is always allowed. -+ * -+ * \param[in] c IPC client that sent request (or NULL if sent by CPG peer) -+ * \param[in] op Requested API operation (for logging only) -+ * -+ * \return true if sender is peer or privileged client, otherwise false -+ */ -+static inline bool -+is_privileged(pcmk__client_t *c, const char *op) -+{ -+ if ((c == NULL) || pcmk_is_set(c->flags, pcmk__client_privileged)) { -+ return true; -+ } else { -+ crm_warn("Rejecting IPC request '%s' from unprivileged client %s", -+ crm_str(op), pcmk__client_name(c)); -+ return false; -+ } -+} -+ - static void - handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - xmlNode *request, const char *remote_peer) -@@ -2898,15 +2924,6 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - const char *op = crm_element_value(request, F_STONITH_OPERATION); - const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); - -- /* IPC commands related to fencing configuration may be done only by -- * privileged users (i.e. root or hacluster), because all other users should -- * go through the CIB to have ACLs applied. -- * -- * If no client was given, this is a peer request, which is always allowed. -- */ -- bool allowed = (client == NULL) -- || pcmk_is_set(client->flags, pcmk__client_privileged); -- - crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); - - if (pcmk_is_set(call_options, st_opt_sync_call)) { -@@ -3077,7 +3094,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_ADD, pcmk__str_none)) { - const char *device_id = NULL; - -- if (allowed) { -+ if (is_privileged(client, op)) { - rc = stonith_device_register(request, &device_id, FALSE); - } else { - rc = -EACCES; -@@ -3088,7 +3105,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); - const char *device_id = crm_element_value(dev, XML_ATTR_ID); - -- if (allowed) { -+ if (is_privileged(client, op)) { - rc = stonith_device_remove(device_id, FALSE); - } else { - rc = -EACCES; -@@ -3098,7 +3115,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { - char *device_id = NULL; - -- if (allowed) { -+ if (is_privileged(client, op)) { - rc = stonith_level_register(request, &device_id); - } else { - rc = -EACCES; -@@ -3109,7 +3126,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { - char *device_id = NULL; - -- if (allowed) { -+ if (is_privileged(client, op)) { - rc = stonith_level_remove(request, &device_id); - } else { - rc = -EACCES; -@@ -3133,14 +3150,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } - - done: -- if (rc == -EACCES) { -- crm_warn("Rejecting IPC request '%s' from unprivileged client %s", -- crm_str(op), pcmk__client_name(client)); -- } -- - // Reply if result is known - if (need_reply) { -- - if (pcmk_is_set(call_options, st_opt_sync_call)) { - CRM_ASSERT(client == NULL || client->request_id == id); - } --- -2.27.0 - - -From 10ca8a5ef5266159bc3f993802aeae6537ceeb11 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 16 Nov 2021 16:59:03 -0600 -Subject: [PATCH 10/19] Low: fencer: return -ETIME for peer fencing timeouts - -94c55684 set the result as pcmk_ok, but it appears that the intent was just to -keep the delegate from being set, and -ETIME should still do that, while being -more appropriate. ---- - daemons/fenced/fenced_remote.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index c907cd120..dc7b802da 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -608,7 +608,7 @@ remote_op_timeout_one(gpointer userdata) - - crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS - " id=%.8s", op->action, op->target, op->client_name, op->id); -- call_remote_stonith(op, NULL, pcmk_ok); -+ call_remote_stonith(op, NULL, -ETIME); - return FALSE; - } - --- -2.27.0 - - -From fb2eefeb695cc92e1a2aed6f1f1d2b900d4fb83e Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 16 Nov 2021 17:54:56 -0600 -Subject: [PATCH 11/19] Refactor: fencer: functionize common part of timeout - handling - -Previously, remote_op_timeout() was called from multiple places, but only one -of those places needed the full processing. The common part is now in a new -function finalize_timed_out_op() called from all the places, and -remote_op_timeout() now has just the additional processing needed by the one -place plus a call to the new function. - -This will allow a future change to set a different exit reason depending on -which step timed out. ---- - daemons/fenced/fenced_remote.c | 49 +++++++++++++++++++++++----------- - 1 file changed, 34 insertions(+), 15 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index dc7b802da..22c4b0772 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -612,20 +612,18 @@ remote_op_timeout_one(gpointer userdata) - return FALSE; - } - --static gboolean --remote_op_timeout(gpointer userdata) -+/*! -+ * \internal -+ * \brief Finalize a remote fencer operation that timed out -+ * -+ * \param[in] op Fencer operation that timed out -+ */ -+static void -+finalize_timed_out_op(remote_fencing_op_t *op) - { -- remote_fencing_op_t *op = userdata; - - op->op_timer_total = 0; - -- if (op->state == st_done) { -- crm_debug("Action '%s' targeting %s for client %s already completed " -- CRM_XS " id=%.8s", -- op->action, op->target, op->client_name, op->id); -- return FALSE; -- } -- - crm_debug("Action '%s' targeting %s for client %s timed out " - CRM_XS " id=%.8s", - op->action, op->target, op->client_name, op->id); -@@ -637,14 +635,35 @@ remote_op_timeout(gpointer userdata) - */ - op->state = st_done; - remote_op_done(op, NULL, pcmk_ok, FALSE); -- return FALSE; -+ return; - } - - op->state = st_failed; - - remote_op_done(op, NULL, -ETIME, FALSE); -+} - -- return FALSE; -+/*! -+ * \internal -+ * \brief Finalize a remote fencer operation that timed out -+ * -+ * \param[in] userdata Fencer operation that timed out -+ * -+ * \return G_SOURCE_REMOVE (which tells glib not to restart timer) -+ */ -+static gboolean -+remote_op_timeout(gpointer userdata) -+{ -+ remote_fencing_op_t *op = userdata; -+ -+ if (op->state == st_done) { -+ crm_debug("Action '%s' targeting %s for client %s already completed " -+ CRM_XS " id=%.8s", -+ op->action, op->target, op->client_name, op->id); -+ } else { -+ finalize_timed_out_op(userdata); -+ } -+ return G_SOURCE_REMOVE; - } - - static gboolean -@@ -670,7 +689,7 @@ remote_op_query_timeout(gpointer data) - g_source_remove(op->op_timer_total); - op->op_timer_total = 0; - } -- remote_op_timeout(op); -+ finalize_timed_out_op(op); - } - - return FALSE; -@@ -1675,8 +1694,8 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) - crm_info("No remaining peers capable of fencing (%s) %s for client %s " - CRM_XS " state=%s", op->action, op->target, op->client_name, - stonith_op_state_str(op->state)); -- CRM_LOG_ASSERT(op->state < st_done); -- remote_op_timeout(op); -+ CRM_CHECK(op->state < st_done, return); -+ finalize_timed_out_op(op); - - } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { - // int rc = -EHOSTUNREACH; --- -2.27.0 - - -From c047005a112ac7da5ba62084e39c79db739f0923 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 18 Nov 2021 10:05:18 -0600 -Subject: [PATCH 12/19] Low: fencer: handle malformed manual confirmation - requests better - -Rename stonith_manual_ack() to fenced_handle_manual_confirmation(), and move -more of the manual confirmation handling in handle_request() into it, for -better code isolation. This will also make planned changes easier. - -The one behavioral difference is that a failure of initiate_remote_stonith_op() -will now be ignored rather than segmentation fault trying to dereference NULL. ---- - daemons/fenced/fenced_commands.c | 20 ++++++++++++-------- - daemons/fenced/fenced_remote.c | 29 ++++++++++++++++++++++++----- - daemons/fenced/pacemaker-fenced.h | 2 +- - 3 files changed, 37 insertions(+), 14 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 34c956f5c..6f325b9e8 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -3012,14 +3012,18 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - if (remote_peer || stand_alone) { - rc = stonith_fence(request); - -- } else if (call_options & st_opt_manual_ack) { -- remote_fencing_op_t *rop = NULL; -- xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE); -- const char *target = crm_element_value(dev, F_STONITH_TARGET); -- -- crm_notice("Received manual confirmation that %s is fenced", target); -- rop = initiate_remote_stonith_op(client, request, TRUE); -- rc = stonith_manual_ack(request, rop); -+ } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { -+ switch (fenced_handle_manual_confirmation(client, request)) { -+ case pcmk_rc_ok: -+ rc = pcmk_ok; -+ break; -+ case EINPROGRESS: -+ rc = -EINPROGRESS; -+ break; -+ default: -+ rc = -EPROTO; -+ break; -+ } - - } else { - const char *alternate_host = NULL; -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 22c4b0772..60ee5e32e 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -1003,22 +1003,41 @@ static uint32_t fencing_active_peers(void) - return count; - } - -+/*! -+ * \internal -+ * \brief Process a manual confirmation of a pending fence action -+ * -+ * \param[in] client IPC client that sent confirmation -+ * \param[in] msg Request XML with manual confirmation -+ * -+ * \return Standard Pacemaker return code -+ */ - int --stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op) -+fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) - { -+ remote_fencing_op_t *op = NULL; - xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR); - -+ CRM_CHECK(dev != NULL, return EPROTO); -+ -+ crm_notice("Received manual confirmation that %s has been fenced", -+ crm_str(crm_element_value(dev, F_STONITH_TARGET))); -+ op = initiate_remote_stonith_op(client, msg, TRUE); -+ if (op == NULL) { -+ return EPROTO; -+ } - op->state = st_done; - set_fencing_completed(op); - op->delegate = strdup("a human"); - -- crm_notice("Injecting manual confirmation that %s is safely off/down", -- crm_element_value(dev, F_STONITH_TARGET)); -+ // For the fencer's purposes, the fencing operation is done - - remote_op_done(op, msg, pcmk_ok, FALSE); - -- // Replies are sent via done_cb -> send_async_reply() -> do_local_reply() -- return -EINPROGRESS; -+ /* For the requester's purposes, the operation is still pending. The -+ * actual result will be sent asynchronously via the operation's done_cb(). -+ */ -+ return EINPROGRESS; - } - - /*! -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index 3e41d867e..cf88644f1 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -256,7 +256,7 @@ bool fencing_peer_active(crm_node_t *peer); - - void set_fencing_completed(remote_fencing_op_t * op); - --int stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op); -+int fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg); - - gboolean node_has_attr(const char *node, const char *name, const char *value); - --- -2.27.0 - - -From ec60f014b5a8f774aa57a26e40a2b1b94a7e3d3a Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 18 Nov 2021 10:35:31 -0600 -Subject: [PATCH 13/19] Low: fencer: handle malformed topology level removal - requests better - -Log the malformed request, and return -EPROTO instead of -EINVAL. If a request -is missing a level number, treat it as malformed instead of as a request to -remove all. ---- - daemons/fenced/fenced_commands.c | 18 +++++++++--------- - 1 file changed, 9 insertions(+), 9 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 6f325b9e8..358844203 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -1678,27 +1678,27 @@ stonith_level_register(xmlNode *msg, char **desc) - int - stonith_level_remove(xmlNode *msg, char **desc) - { -- int id = 0; -+ int id = -1; - stonith_topology_t *tp; - char *target; - - /* Unlike additions, removal requests should always have one level tag */ - xmlNode *level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); - -- CRM_CHECK(level != NULL, return -EINVAL); -+ CRM_CHECK(level != NULL, return -EPROTO); - - target = stonith_level_key(level, -1); - crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); -+ -+ CRM_CHECK((id >= 0) && (id < ST_LEVEL_MAX), -+ crm_log_xml_warn(msg, "invalid level"); -+ free(target); -+ return -EPROTO); -+ - if (desc) { - *desc = crm_strdup_printf("%s[%d]", target, id); - } - -- /* Sanity-check arguments */ -- if (id >= ST_LEVEL_MAX) { -- free(target); -- return -EINVAL; -- } -- - tp = g_hash_table_lookup(topology, target); - if (tp == NULL) { - guint nentries = g_hash_table_size(topology); -@@ -1714,7 +1714,7 @@ stonith_level_remove(xmlNode *msg, char **desc) - "(%d active %s remaining)", target, nentries, - pcmk__plural_alt(nentries, "entry", "entries")); - -- } else if (id > 0 && tp->levels[id] != NULL) { -+ } else if (tp->levels[id] != NULL) { - guint nlevels; - - g_list_free_full(tp->levels[id], free); --- -2.27.0 - - -From ee0cfb6b284c2d6d21f8e77bf6ff286b1364235d Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 18 Nov 2021 12:33:05 -0600 -Subject: [PATCH 14/19] Refactor: fencer: avoid obscuring a variable - -handle_request() declared a xmlNode *reply variable, and then one of its "if" -blocks defined another one, obscuring the first. Drop the first declaration, -and instead move it to the one other place that needed it. - -Also remove a redundant assertion. ---- - daemons/fenced/fenced_commands.c | 13 +++++-------- - 1 file changed, 5 insertions(+), 8 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 358844203..af0a92450 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2917,7 +2917,6 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - int rc = -EOPNOTSUPP; - - xmlNode *data = NULL; -- xmlNode *reply = NULL; - bool need_reply = true; - - char *output = NULL; -@@ -2926,8 +2925,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - - crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); - -- if (pcmk_is_set(call_options, st_opt_sync_call)) { -- CRM_ASSERT(client == NULL || client->request_id == id); -+ if (pcmk_is_set(call_options, st_opt_sync_call) && (client != NULL)) { -+ CRM_ASSERT(client->request_id == id); - } - - if (pcmk__str_eq(op, CRM_OP_REGISTER, pcmk__str_none)) { -@@ -3156,16 +3155,14 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - done: - // Reply if result is known - if (need_reply) { -- if (pcmk_is_set(call_options, st_opt_sync_call)) { -- CRM_ASSERT(client == NULL || client->request_id == id); -- } -- reply = stonith_construct_reply(request, output, data, rc); -+ xmlNode *reply = stonith_construct_reply(request, output, data, rc); -+ - stonith_send_reply(reply, call_options, remote_peer, client_id); -+ free_xml(reply); - } - - free(output); - free_xml(data); -- free_xml(reply); - - crm_debug("Processed %s request from %s %s: %s (rc=%d)", - op, ((client == NULL)? "peer" : "client"), --- -2.27.0 - - -From a5fef7b95b7541860e29c1ff33be38db327208fb Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 18 Nov 2021 12:37:10 -0600 -Subject: [PATCH 15/19] Refactor: fencer: add convenience function for setting - protocol error result - -The fencer will soon track and return the full result (rather than just a -legacy return code) for fencing actions, for callbacks and notifications. -To simplify that process as well as move away from the legacy codes in general, -all fencer API operations will be modified to return a full result. - -This convenience function will come in handy for that. ---- - daemons/fenced/pacemaker-fenced.h | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index cf88644f1..3bc5dc3d1 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -262,6 +262,13 @@ gboolean node_has_attr(const char *node, const char *name, const char *value); - - gboolean node_does_watchdog_fencing(const char *node); - -+static inline void -+fenced_set_protocol_error(pcmk__action_result_t *result) -+{ -+ pcmk__set_result(result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, -+ "Fencer API request missing required information (bug?)"); -+} -+ - extern char *stonith_our_uname; - extern gboolean stand_alone; - extern GHashTable *device_list; --- -2.27.0 - - -From ed770d36fb34dc7b3344cd326830a6c06cc789ce Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 09:59:51 -0600 -Subject: [PATCH 16/19] Refactor: fencer: make a few functions return void - -... to make planned changes easier. The return values were previously ignored. ---- - daemons/fenced/fenced_commands.c | 17 ++++++++------- - daemons/fenced/fenced_history.c | 6 +----- - daemons/fenced/fenced_remote.c | 35 ++++++++++++++----------------- - daemons/fenced/pacemaker-fenced.c | 6 +++--- - daemons/fenced/pacemaker-fenced.h | 8 +++---- - 5 files changed, 33 insertions(+), 39 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index af0a92450..ea7d281ce 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -1411,8 +1411,8 @@ stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib) - return pcmk_ok; - } - --int --stonith_device_remove(const char *id, gboolean from_cib) -+void -+stonith_device_remove(const char *id, bool from_cib) - { - stonith_device_t *device = g_hash_table_lookup(device_list, id); - guint ndevices = 0; -@@ -1421,7 +1421,7 @@ stonith_device_remove(const char *id, gboolean from_cib) - ndevices = g_hash_table_size(device_list); - crm_info("Device '%s' not found (%d active device%s)", - id, ndevices, pcmk__plural_s(ndevices)); -- return pcmk_ok; -+ return; - } - - if (from_cib) { -@@ -1443,7 +1443,6 @@ stonith_device_remove(const char *id, gboolean from_cib) - (device->cib_registered? " cib" : ""), - (device->api_registered? " api" : "")); - } -- return pcmk_ok; - } - - /*! -@@ -3085,8 +3084,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - need_reply = (rc != -EINPROGRESS); - - } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { -- rc = stonith_fence_history(request, &data, remote_peer, call_options); -- if (call_options & st_opt_discard_reply) { -+ stonith_fence_history(request, &data, remote_peer, call_options); -+ rc = pcmk_ok; -+ if (pcmk_is_set(call_options, st_opt_discard_reply)) { - /* we don't expect answers to the broadcast - * we might have sent out - */ -@@ -3109,7 +3109,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - const char *device_id = crm_element_value(dev, XML_ATTR_ID); - - if (is_privileged(client, op)) { -- rc = stonith_device_remove(device_id, FALSE); -+ stonith_device_remove(device_id, false); -+ rc = pcmk_ok; - } else { - rc = -EACCES; - } -@@ -3179,7 +3180,7 @@ handle_reply(pcmk__client_t *client, xmlNode *request, const char *remote_peer) - if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { - process_remote_stonith_query(request); - } else if (pcmk__str_any_of(op, T_STONITH_NOTIFY, STONITH_OP_FENCE, NULL)) { -- process_remote_stonith_exec(request); -+ fenced_process_fencing_reply(request); - } else { - crm_err("Ignoring unknown %s reply from %s %s", - crm_str(op), ((client == NULL)? "peer" : "client"), -diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c -index 7127593b6..bc159383c 100644 ---- a/daemons/fenced/fenced_history.c -+++ b/daemons/fenced/fenced_history.c -@@ -433,14 +433,11 @@ stonith_local_history(gboolean add_id, const char *target) - * a reply from - * \param[in] remote_peer - * \param[in] options call-options from the request -- * -- * \return always success as there is actully nothing that can go really wrong - */ --int -+void - stonith_fence_history(xmlNode *msg, xmlNode **output, - const char *remote_peer, int options) - { -- int rc = 0; - const char *target = NULL; - xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_NEVER); - xmlNode *out_history = NULL; -@@ -525,5 +522,4 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, - *output = stonith_local_history(FALSE, target); - } - free_xml(out_history); -- return rc; - } -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 60ee5e32e..6338aebde 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -2086,11 +2086,9 @@ process_remote_stonith_query(xmlNode * msg) - * or attempt another device as appropriate. - * - * \param[in] msg XML reply received -- * -- * \return pcmk_ok on success, -errno on error - */ --int --process_remote_stonith_exec(xmlNode * msg) -+void -+fenced_process_fencing_reply(xmlNode *msg) - { - int rc = 0; - const char *id = NULL; -@@ -2098,13 +2096,13 @@ process_remote_stonith_exec(xmlNode * msg) - remote_fencing_op_t *op = NULL; - xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); - -- CRM_CHECK(dev != NULL, return -EPROTO); -+ CRM_CHECK(dev != NULL, return); - - id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); -- CRM_CHECK(id != NULL, return -EPROTO); -+ CRM_CHECK(id != NULL, return); - - dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR); -- CRM_CHECK(dev != NULL, return -EPROTO); -+ CRM_CHECK(dev != NULL, return); - - crm_element_value_int(dev, F_STONITH_RC, &rc); - -@@ -2125,35 +2123,35 @@ process_remote_stonith_exec(xmlNode * msg) - /* Could be for an event that began before we started */ - /* TODO: Record the op for later querying */ - crm_info("Received peer result of unknown or expired operation %s", id); -- return -EOPNOTSUPP; -+ return; - } - - if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { - crm_err("Received outdated reply for device %s (instead of %s) to " - "fence (%s) %s. Operation already timed out at peer level.", - device, (const char *) op->devices->data, op->action, op->target); -- return rc; -+ return; - } - - if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { - crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s " -- CRM_XS " rc=%d id=%.8s", -+ CRM_XS " id=%.8s", - op->action, op->target, op->client_name, op->originator, -- pcmk_strerror(rc), rc, op->id); -+ pcmk_strerror(rc), op->id); - if (rc == pcmk_ok) { - op->state = st_done; - } else { - op->state = st_failed; - } - remote_op_done(op, msg, rc, FALSE); -- return pcmk_ok; -+ return; - } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { - /* If this isn't a remote level broadcast, and we are not the - * originator of the operation, we should not be receiving this msg. */ - crm_err("Received non-broadcast fencing result for operation %.8s " - "we do not own (device %s targeting %s)", - op->id, device, op->target); -- return rc; -+ return; - } - - if (pcmk_is_set(op->call_options, st_opt_topology)) { -@@ -2168,7 +2166,7 @@ process_remote_stonith_exec(xmlNode * msg) - * and notify our local clients. */ - if (op->state == st_done) { - remote_op_done(op, msg, rc, FALSE); -- return rc; -+ return; - } - - if ((op->phase == 2) && (rc != pcmk_ok)) { -@@ -2184,14 +2182,14 @@ process_remote_stonith_exec(xmlNode * msg) - /* An operation completed successfully. Try another device if - * necessary, otherwise mark the operation as done. */ - advance_topology_device_in_level(op, device, msg, rc); -- return rc; -+ return; - } else { - /* This device failed, time to try another topology level. If no other - * levels are available, mark this operation as failed and report results. */ - if (advance_topology_level(op, false) != pcmk_rc_ok) { - op->state = st_failed; - remote_op_done(op, msg, rc, FALSE); -- return rc; -+ return; - } - } - } else if (rc == pcmk_ok && op->devices == NULL) { -@@ -2199,12 +2197,12 @@ process_remote_stonith_exec(xmlNode * msg) - - op->state = st_done; - remote_op_done(op, msg, rc, FALSE); -- return rc; -+ return; - } else if (rc == -ETIME && op->devices == NULL) { - /* If the operation timed out don't bother retrying other peers. */ - op->state = st_failed; - remote_op_done(op, msg, rc, FALSE); -- return rc; -+ return; - } else { - /* fall-through and attempt other fencing action using another peer */ - } -@@ -2213,7 +2211,6 @@ process_remote_stonith_exec(xmlNode * msg) - crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator, - op->client_name, rc); - call_remote_stonith(op, NULL, rc); -- return rc; - } - - gboolean -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index a290e1670..0a8b3bf6f 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -445,7 +445,7 @@ remove_cib_device(xmlXPathObjectPtr xpathObj) - - rsc_id = crm_element_value(match, XML_ATTR_ID); - -- stonith_device_remove(rsc_id, TRUE); -+ stonith_device_remove(rsc_id, true); - } - } - -@@ -610,7 +610,7 @@ watchdog_device_update(void) - } else { - /* be silent if no device - todo parameter to stonith_device_remove */ - if (g_hash_table_lookup(device_list, STONITH_WATCHDOG_ID)) { -- stonith_device_remove(STONITH_WATCHDOG_ID, TRUE); -+ stonith_device_remove(STONITH_WATCHDOG_ID, true); - } - } - } -@@ -847,7 +847,7 @@ update_cib_stonith_devices_v2(const char *event, xmlNode * msg) - } - if (search != NULL) { - *search = 0; -- stonith_device_remove(rsc_id, TRUE); -+ stonith_device_remove(rsc_id, true); - /* watchdog_device_update called afterwards - to fall back to implicit definition if needed */ - } else { -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index 3bc5dc3d1..5162ada75 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -214,7 +214,7 @@ void stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags, - - int stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib); - --int stonith_device_remove(const char *id, gboolean from_cib); -+void stonith_device_remove(const char *id, bool from_cib); - - char *stonith_level_key(xmlNode * msg, int mode); - int stonith_level_kind(xmlNode * msg); -@@ -241,14 +241,14 @@ remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, - xmlNode *request, - gboolean manual_ack); - --int process_remote_stonith_exec(xmlNode * msg); -+void fenced_process_fencing_reply(xmlNode *msg); - - int process_remote_stonith_query(xmlNode * msg); - - void *create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer); - --int stonith_fence_history(xmlNode *msg, xmlNode **output, -- const char *remote_peer, int options); -+void stonith_fence_history(xmlNode *msg, xmlNode **output, -+ const char *remote_peer, int options); - - void stonith_fence_history_trim(void); - --- -2.27.0 - - -From 27df49460930738e77f5ca42536aff1d3bdfcae7 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 10:06:43 -0600 -Subject: [PATCH 17/19] Refactor: fencer: drop unnecessary argument when - advancing topology device - -If we're advancing to the next device in a topology level, by necessity that -means any previous device succeeded. ---- - daemons/fenced/fenced_remote.c | 19 +++++++++---------- - 1 file changed, 9 insertions(+), 10 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 6338aebde..d54e6a4ef 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -1519,14 +1519,13 @@ report_timeout_period(remote_fencing_op_t * op, int op_timeout) - * \internal - * \brief Advance an operation to the next device in its topology - * -- * \param[in,out] op Operation to advance -- * \param[in] device ID of device just completed -- * \param[in] msg XML reply that contained device result (if available) -- * \param[in] rc Return code of device's execution -+ * \param[in] op Fencer operation to advance -+ * \param[in] device ID of device that just completed -+ * \param[in] msg If not NULL, XML reply of last delegated fencing operation - */ - static void - advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, -- xmlNode *msg, int rc) -+ xmlNode *msg) - { - /* Advance to the next device at this topology level, if any */ - if (op->devices) { -@@ -1556,8 +1555,8 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - - if (op->devices) { - /* Necessary devices remain, so execute the next one */ -- crm_trace("Next targeting %s on behalf of %s@%s (rc was %d)", -- op->target, op->client_name, op->originator, rc); -+ crm_trace("Next targeting %s on behalf of %s@%s", -+ op->target, op->client_name, op->originator); - - // The requested delay has been applied for the first device - if (op->delay > 0) { -@@ -1570,7 +1569,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - crm_trace("Marking complex fencing op targeting %s as complete", - op->target); - op->state = st_done; -- remote_op_done(op, msg, rc, FALSE); -+ remote_op_done(op, msg, pcmk_ok, FALSE); - } - } - -@@ -1701,7 +1700,7 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) - */ - crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s " - "after successful 'off'", device, op->target); -- advance_topology_device_in_level(op, device, NULL, pcmk_ok); -+ advance_topology_device_in_level(op, device, NULL); - return; - - } else if (op->owner == FALSE) { -@@ -2181,7 +2180,7 @@ fenced_process_fencing_reply(xmlNode *msg) - if (rc == pcmk_ok) { - /* An operation completed successfully. Try another device if - * necessary, otherwise mark the operation as done. */ -- advance_topology_device_in_level(op, device, msg, rc); -+ advance_topology_device_in_level(op, device, msg); - return; - } else { - /* This device failed, time to try another topology level. If no other --- -2.27.0 - - -From 05437e1339bc1f9071b43e97d5846a939687951d Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 29 Nov 2021 11:59:17 -0600 -Subject: [PATCH 18/19] Refactor: fencer: minor renames for consistency - -... per review ---- - daemons/fenced/fenced_remote.c | 13 ++++++------- - 1 file changed, 6 insertions(+), 7 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index d54e6a4ef..8feb40147 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -63,7 +63,7 @@ typedef struct device_properties_s { - int delay_base[st_phase_max]; - } device_properties_t; - --typedef struct st_query_result_s { -+typedef struct { - /* Name of peer that sent this result */ - char *host; - /* Only try peers for non-topology based operations once */ -@@ -95,13 +95,12 @@ sort_strings(gconstpointer a, gconstpointer b) - static void - free_remote_query(gpointer data) - { -- if (data) { -- peer_device_info_t *query = data; -+ if (data != NULL) { -+ peer_device_info_t *peer = data; - -- crm_trace("Free'ing query result from %s", query->host); -- g_hash_table_destroy(query->devices); -- free(query->host); -- free(query); -+ g_hash_table_destroy(peer->devices); -+ free(peer->host); -+ free(peer); - } - } - --- -2.27.0 - - -From 86974d7cef05bafbed540d02e59514292581ae65 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 30 Nov 2021 08:33:41 -0600 -Subject: [PATCH 19/19] Refactor: fencer: simplify send_async_reply() - -... as suggested in review ---- - daemons/fenced/fenced_commands.c | 28 ++++++++++++---------------- - 1 file changed, 12 insertions(+), 16 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index ea7d281ce..f34cb4f13 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2384,36 +2384,34 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - int pid, bool merged) - { - xmlNode *reply = NULL; -- bool bcast = false; - - CRM_CHECK((cmd != NULL) && (result != NULL), return); - -+ log_async_result(cmd, result, pid, NULL, merged); -+ - reply = construct_async_reply(cmd, result); -+ if (merged) { -+ crm_xml_add(reply, F_STONITH_MERGED, "true"); -+ } - -- // If target was also the originator, broadcast fencing results for it - if (!stand_alone && pcmk__is_fencing_action(cmd->action) - && pcmk__str_eq(cmd->origin, cmd->victim, pcmk__str_casei)) { -- -+ /* The target was also the originator, so broadcast the result on its -+ * behalf (since it will be unable to). -+ */ - crm_trace("Broadcast '%s' result for %s (target was also originator)", - cmd->action, cmd->victim); - crm_xml_add(reply, F_SUBTYPE, "broadcast"); - crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); -- bcast = true; -- } -- -- log_async_result(cmd, result, pid, NULL, merged); -- -- if (merged) { -- crm_xml_add(reply, F_STONITH_MERGED, "true"); -- } -- crm_log_xml_trace(reply, "Reply"); -- -- if (bcast) { - send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); - } else { -+ // Reply only to the originator - stonith_send_reply(reply, cmd->options, cmd->origin, cmd->client); - } - -+ crm_log_xml_trace(reply, "Reply"); -+ free_xml(reply); -+ - if (stand_alone) { - /* Do notification with a clean data object */ - xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); -@@ -2430,8 +2428,6 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); - do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); - } -- -- free_xml(reply); - } - - static void --- -2.27.0 - diff --git a/SOURCES/006-stateful-metadata.patch b/SOURCES/006-stateful-metadata.patch deleted file mode 100644 index a9ea6f4..0000000 --- a/SOURCES/006-stateful-metadata.patch +++ /dev/null @@ -1,143 +0,0 @@ -From b52fe799c89637e2a761a5725c2376db5c05f2d1 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 30 Nov 2021 15:51:54 -0600 -Subject: [PATCH 1/2] Low: resources: remove DOCTYPE from OCF 1.1-compliant - agents - -OCF 1.1 replaced the DTD schema with RNG, but DOCTYPE still refers to the DTD. -There's no DOCTYPE for RNG, and DOCTYPE is optional, so just remove it. ---- - extra/resources/Dummy | 3 +-- - extra/resources/HealthIOWait | 3 +-- - extra/resources/Stateful | 3 +-- - extra/resources/attribute | 3 +-- - extra/resources/ping | 3 +-- - extra/resources/remote | 3 +-- - 6 files changed, 6 insertions(+), 12 deletions(-) - -diff --git a/extra/resources/Dummy b/extra/resources/Dummy -index a344deac0..56584e564 100755 ---- a/extra/resources/Dummy -+++ b/extra/resources/Dummy -@@ -58,8 +58,7 @@ - meta_data() { - cat < -- -- -+ - 1.1 - - -diff --git a/extra/resources/HealthIOWait b/extra/resources/HealthIOWait -index 43a8b70c4..5f1483ef7 100755 ---- a/extra/resources/HealthIOWait -+++ b/extra/resources/HealthIOWait -@@ -25,8 +25,7 @@ - meta_data() { - cat < -- -- -+ - 1.1 - - -diff --git a/extra/resources/Stateful b/extra/resources/Stateful -index ae3424bbf..0d2062d51 100755 ---- a/extra/resources/Stateful -+++ b/extra/resources/Stateful -@@ -39,8 +39,7 @@ SCORE_PROMOTED=10 - meta_data() { - cat < -- -- -+ - 1.1 - - -diff --git a/extra/resources/attribute b/extra/resources/attribute -index 1800dff8f..a2bd353e0 100755 ---- a/extra/resources/attribute -+++ b/extra/resources/attribute -@@ -57,8 +57,7 @@ END - meta_data() { - cat < -- -- -+ - 1.1 - Manages a node attribute - -diff --git a/extra/resources/ping b/extra/resources/ping -index 6e296979f..7cc6b802d 100755 ---- a/extra/resources/ping -+++ b/extra/resources/ping -@@ -36,8 +36,7 @@ - meta_data() { - cat < -- -- -+ - 1.1 - - -diff --git a/extra/resources/remote b/extra/resources/remote -index a53262bb6..f7e40dc81 100755 ---- a/extra/resources/remote -+++ b/extra/resources/remote -@@ -24,8 +24,7 @@ - meta_data() { - cat < -- -- -+ - 1.1 - Pacemaker Remote connection - --- -2.27.0 - - -From 70f469120f8db6a024c786466ee74a6c7fbd1f43 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 30 Nov 2021 15:53:39 -0600 -Subject: [PATCH 2/2] Fix: resources: use correct syntax in Stateful meta-data - -The OCF standard only allows "0" or "1" for booleans. - -This fixes incorrect ocf:pacemaker:Stateful meta-data syntax introduced by -7024398 as a regression in the 2.1.0 release. ---- - extra/resources/Stateful | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/extra/resources/Stateful b/extra/resources/Stateful -index 0d2062d51..2ebe6725f 100755 ---- a/extra/resources/Stateful -+++ b/extra/resources/Stateful -@@ -57,7 +57,7 @@ Location to store the resource state in - - - -- -+ - - If this is set, the environment will be dumped to this file for every call. - -@@ -65,7 +65,7 @@ If this is set, the environment will be dumped to this file for every call. - - - -- -+ - - The notify action will sleep for this many seconds before returning, - to simulate a long-running notify. --- -2.27.0 - diff --git a/SOURCES/007-memory-leak.patch b/SOURCES/007-memory-leak.patch deleted file mode 100644 index 38ad3a2..0000000 --- a/SOURCES/007-memory-leak.patch +++ /dev/null @@ -1,39 +0,0 @@ -From f491d9d5a7ed554fed985de356bb085fdec3421c Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 7 Dec 2021 09:01:00 -0600 -Subject: [PATCH] Fix: fencer: avoid memory leak when broadcasting history - differences - -Regression introduced in 2.1.0 by dbc27b2 ---- - daemons/fenced/fenced_history.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c -index bc159383c..a9c57dc86 100644 ---- a/daemons/fenced/fenced_history.c -+++ b/daemons/fenced/fenced_history.c -@@ -484,8 +484,6 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, - !pcmk__str_eq(remote_peer, stonith_our_uname, pcmk__str_casei)) { - xmlNode *history = get_xpath_object("//" F_STONITH_HISTORY_LIST, - msg, LOG_NEVER); -- GHashTable *received_history = -- history?stonith_xml_history_to_list(history):NULL; - - /* either a broadcast created directly upon stonith-API request - * or a diff as response to such a thing -@@ -497,6 +495,11 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, - if (!history || - !crm_is_true(crm_element_value(history, - F_STONITH_DIFFERENTIAL))) { -+ GHashTable *received_history = NULL; -+ -+ if (history != NULL) { -+ received_history = stonith_xml_history_to_list(history); -+ } - out_history = - stonith_local_history_diff_and_merge(received_history, TRUE, NULL); - if (out_history) { --- -2.27.0 - diff --git a/SOURCES/008-fencing-history.patch b/SOURCES/008-fencing-history.patch deleted file mode 100644 index 1ea9ac7..0000000 --- a/SOURCES/008-fencing-history.patch +++ /dev/null @@ -1,43 +0,0 @@ -From 0339e89f3238b31df78b864dae8684b82c370741 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 13 Dec 2021 15:22:40 -0600 -Subject: [PATCH] Fix: fencer: get current time correctly - -f52bc8e1ce (2.1.2) introduced a regression by using clock_gettime() with -CLOCK_MONOTONIC to get the current time. Use qb_util_timespec_from_epoch_get() -instead (which as of this writing uses clock_gettime() with CLOCK_REALTIME if -available, and falls back to gettimeofday() if not). ---- - daemons/fenced/fenced_commands.c | 11 +++-------- - 1 file changed, 3 insertions(+), 8 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index f34cb4f13..7685cb8c3 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2746,19 +2746,14 @@ bool fencing_peer_active(crm_node_t *peer) - return FALSE; - } - --void set_fencing_completed(remote_fencing_op_t * op) -+void -+set_fencing_completed(remote_fencing_op_t *op) - { --#ifdef CLOCK_MONOTONIC - struct timespec tv; - -- clock_gettime(CLOCK_MONOTONIC, &tv); -- -+ qb_util_timespec_from_epoch_get(&tv); - op->completed = tv.tv_sec; - op->completed_nsec = tv.tv_nsec; --#else -- op->completed = time(NULL); -- op->completed_nsec = 0L; --#endif - } - - /*! --- -2.27.0 - diff --git a/SOURCES/009-fencing-reasons.patch b/SOURCES/009-fencing-reasons.patch deleted file mode 100644 index 3fb5bc7..0000000 --- a/SOURCES/009-fencing-reasons.patch +++ /dev/null @@ -1,2985 +0,0 @@ -From fcd42a5926e9a63d425586552ecc7b543838d352 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 11 Nov 2021 16:57:03 -0600 -Subject: [PATCH 01/23] Feature: fencer: pass full result in async command - replies - -The services library callbacks for async commands, which call -send_async_reply() -> construct_async_reply() to create the reply, now add -fields for exit status, operation status, and exit reason, in addition to the -existing action standard output and legacy return code. - -Nothing uses the new fields yet. ---- - daemons/fenced/fenced_commands.c | 10 ++++------ - 1 file changed, 4 insertions(+), 6 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index f34cb4f136..3497428c18 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2415,9 +2415,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - if (stand_alone) { - /* Do notification with a clean data object */ - xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); -- int rc = pcmk_rc2legacy(stonith__result2rc(result)); - -- crm_xml_add_int(notify_data, F_STONITH_RC, rc); -+ stonith__xe_set_result(notify_data, result); - crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); - crm_xml_add(notify_data, F_STONITH_OPERATION, cmd->op); - crm_xml_add(notify_data, F_STONITH_DELEGATE, "localhost"); -@@ -2425,7 +2424,7 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); - crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); - -- do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); -+ do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); - do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); - } - } -@@ -2728,9 +2727,8 @@ construct_async_reply(async_command_t *cmd, const pcmk__action_result_t *result) - crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin); - crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); - crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); -- crm_xml_add_int(reply, F_STONITH_RC, -- pcmk_rc2legacy(stonith__result2rc(result))); -- crm_xml_add(reply, F_STONITH_OUTPUT, result->action_stdout); -+ -+ stonith__xe_set_result(reply, result); - return reply; - } - --- -2.27.0 - - -From 4bac2e9811872f92571e4f5a47d8c5032cfc3016 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 18 Nov 2021 12:41:29 -0600 -Subject: [PATCH 02/23] Refactor: fencer: track full result for direct agent - actions - -This renames stonith_device_action() to execute_agent_action() for readability, -and has it set a full result rather than return a legacy return code. - -As of this commit, handle_request() just maps the result back to a legacy code, -but it will make better use of it with planned changes. ---- - daemons/fenced/fenced_commands.c | 95 +++++++++++++++++++------------- - 1 file changed, 56 insertions(+), 39 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 3497428c18..2f59ef84b7 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -1729,23 +1729,6 @@ stonith_level_remove(xmlNode *msg, char **desc) - return pcmk_ok; - } - --/*! -- * \internal -- * \brief Schedule an (asynchronous) action directly on a stonith device -- * -- * Handle a STONITH_OP_EXEC API message by scheduling a requested agent action -- * directly on a specified device. Only list, monitor, and status actions are -- * expected to use this call, though it should work with any agent command. -- * -- * \param[in] msg API message XML with desired action -- * \param[out] output Unused -- * -- * \return -EINPROGRESS on success, -errno otherwise -- * \note If the action is monitor, the device must be registered via the API -- * (CIB registration is not sufficient), because monitor should not be -- * possible unless the device is "started" (API registered). -- */ -- - static char * - list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) - { -@@ -1778,8 +1761,23 @@ list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) - return rv; - } - --static int --stonith_device_action(xmlNode * msg, char **output) -+/*! -+ * \internal -+ * \brief Execute a fence agent action directly (and asynchronously) -+ * -+ * Handle a STONITH_OP_EXEC API message by scheduling a requested agent action -+ * directly on a specified device. Only list, monitor, and status actions are -+ * expected to use this call, though it should work with any agent command. -+ * -+ * \param[in] msg Request XML specifying action -+ * \param[out] result Where to store result of action -+ * -+ * \note If the action is monitor, the device must be registered via the API -+ * (CIB registration is not sufficient), because monitor should not be -+ * possible unless the device is "started" (API registered). -+ */ -+static void -+execute_agent_action(xmlNode *msg, pcmk__action_result_t *result) - { - xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, msg, LOG_ERR); - xmlNode *op = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_ERR); -@@ -1792,39 +1790,56 @@ stonith_device_action(xmlNode * msg, char **output) - crm_info("Malformed API action request: device %s, action %s", - (id? id : "not specified"), - (action? action : "not specified")); -- return -EPROTO; -+ fenced_set_protocol_error(result); -+ return; - } - - if (pcmk__str_eq(id, STONITH_WATCHDOG_ID, pcmk__str_none)) { -+ // Watchdog agent actions are implemented internally - if (stonith_watchdog_timeout_ms <= 0) { -- return -ENODEV; -- } else { -- if (pcmk__str_eq(action, "list", pcmk__str_casei)) { -- *output = list_to_string(stonith_watchdog_targets, "\n", TRUE); -- return pcmk_ok; -- } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { -- return pcmk_ok; -- } -+ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, -+ "Watchdog fence device not configured"); -+ return; -+ -+ } else if (pcmk__str_eq(action, "list", pcmk__str_casei)) { -+ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ pcmk__set_result_output(result, -+ list_to_string(stonith_watchdog_targets, -+ "\n", TRUE), -+ NULL); -+ return; -+ -+ } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { -+ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ return; - } - } - - device = g_hash_table_lookup(device_list, id); -- if ((device == NULL) -- || (!device->api_registered && !strcmp(action, "monitor"))) { -+ if (device == NULL) { -+ crm_info("Ignoring API '%s' action request because device %s not found", -+ action, id); -+ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, -+ NULL); -+ return; - -+ } else if (!device->api_registered && !strcmp(action, "monitor")) { - // Monitors may run only on "started" (API-registered) devices -- crm_info("Ignoring API '%s' action request because device %s not found", -+ crm_info("Ignoring API '%s' action request because device %s not active", - action, id); -- return -ENODEV; -+ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, -+ "Fence device not active"); -+ return; - } - - cmd = create_async_command(msg); - if (cmd == NULL) { -- return -EPROTO; -+ fenced_set_protocol_error(result); -+ return; - } - - schedule_stonith_command(cmd, device); -- return -EINPROGRESS; -+ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); - } - - static void -@@ -2911,8 +2926,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - - xmlNode *data = NULL; - bool need_reply = true; -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - -- char *output = NULL; - const char *op = crm_element_value(request, F_STONITH_OPERATION); - const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); - -@@ -2935,8 +2950,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - need_reply = false; - - } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { -- rc = stonith_device_action(request, &output); -- need_reply = (rc != -EINPROGRESS); -+ execute_agent_action(request, &result); -+ need_reply = (result.execution_status != PCMK_EXEC_PENDING); -+ rc = pcmk_rc2legacy(stonith__result2rc(&result)); - - } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { - const char *call_id = crm_element_value(request, F_STONITH_CALLID); -@@ -3150,19 +3166,20 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - done: - // Reply if result is known - if (need_reply) { -- xmlNode *reply = stonith_construct_reply(request, output, data, rc); -+ xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, rc); - - stonith_send_reply(reply, call_options, remote_peer, client_id); - free_xml(reply); - } - -- free(output); - free_xml(data); - - crm_debug("Processed %s request from %s %s: %s (rc=%d)", - op, ((client == NULL)? "peer" : "client"), - ((client == NULL)? remote_peer : pcmk__client_name(client)), - ((rc > 0)? "" : pcmk_strerror(rc)), rc); -+ -+ pcmk__reset_result(&result); - } - - static void --- -2.27.0 - - -From 9601b2aff1ea6a4eef0bb2701c22c1e971a657eb Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 18 Nov 2021 17:31:20 -0600 -Subject: [PATCH 03/23] Refactor: fencer: track full result for local fencing - -This renames stonith_fence() to fence_locally() for readability, and has it set -a full result rather than return a legacy return code. - -As of this commit, handle_request() just maps the result back to a legacy code, -but it will make better use of it with planned changes. ---- - daemons/fenced/fenced_commands.c | 38 +++++++++++++++++++++----------- - 1 file changed, 25 insertions(+), 13 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 2f59ef84b7..bfb0d71e5f 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2626,37 +2626,49 @@ stonith_fence_get_devices_cb(GList * devices, void *user_data) - } - } - --static int --stonith_fence(xmlNode * msg) -+/*! -+ * \internal -+ * \brief Execute a fence action via the local node -+ * -+ * \param[in] msg Fencing request -+ * \param[out] result Where to store result of fence action -+ */ -+static void -+fence_locally(xmlNode *msg, pcmk__action_result_t *result) - { - const char *device_id = NULL; - stonith_device_t *device = NULL; - async_command_t *cmd = create_async_command(msg); - xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR); - -+ CRM_CHECK(result != NULL, return); -+ - if (cmd == NULL) { -- return -EPROTO; -+ fenced_set_protocol_error(result); -+ return; - } - - device_id = crm_element_value(dev, F_STONITH_DEVICE); -- if (device_id) { -+ if (device_id != NULL) { - device = g_hash_table_lookup(device_list, device_id); - if (device == NULL) { - crm_err("Requested device '%s' is not available", device_id); -- return -ENODEV; -+ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, -+ "Requested fence device not found"); -+ return; - } - schedule_stonith_command(cmd, device); - - } else { - const char *host = crm_element_value(dev, F_STONITH_TARGET); - -- if (cmd->options & st_opt_cs_nodeid) { -- int nodeid; -- crm_node_t *node; -+ if (pcmk_is_set(cmd->options, st_opt_cs_nodeid)) { -+ int nodeid = 0; -+ crm_node_t *node = NULL; - - pcmk__scan_min_int(host, &nodeid, 0); - node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY); -- if (node) { -+ if (node != NULL) { - host = node->uname; - } - } -@@ -2666,7 +2678,7 @@ stonith_fence(xmlNode * msg) - TRUE, cmd, stonith_fence_get_devices_cb); - } - -- return -EINPROGRESS; -+ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); - } - - xmlNode * -@@ -3016,9 +3028,9 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } - - } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { -- -- if (remote_peer || stand_alone) { -- rc = stonith_fence(request); -+ if ((remote_peer != NULL) || stand_alone) { -+ fence_locally(request, &result); -+ rc = pcmk_rc2legacy(stonith__result2rc(&result)); - - } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { - switch (fenced_handle_manual_confirmation(client, request)) { --- -2.27.0 - - -From b7c7676cfd36fd72d3b29e86a23db97081e19b03 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 29 Nov 2021 17:06:52 -0600 -Subject: [PATCH 04/23] Low: fencer: handle topology level registration errors - better - -Rename stonith_level_register() to fenced_register_level() for consistency, and -refactor it to return a full result rather than a legacy return code. - -Return a protocol error for missing information in the request XML, and log -invalid level numbers at warning level. Use a new combination of -PCMK_EXEC_INVALID with CRM_EX_INVALID_PARAM for invalid levels, so it gets -mapped back to the legacy code -EINVAL (which was returned before). ---- - daemons/fenced/fenced_commands.c | 52 +++++++++++++++++++++---------- - daemons/fenced/pacemaker-fenced.c | 9 +++--- - daemons/fenced/pacemaker-fenced.h | 3 +- - lib/fencing/st_actions.c | 1 + - 4 files changed, 44 insertions(+), 21 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index bfb0d71e5f..975f8633a4 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -1583,20 +1583,19 @@ parse_device_list(const char *devices) - - /*! - * \internal -- * \brief Register a STONITH level for a target -+ * \brief Register a fencing topology level for a target - * - * Given an XML request specifying the target name, level index, and device IDs - * for the level, this will create an entry for the target in the global topology - * table if one does not already exist, then append the specified device IDs to - * the entry's device list for the specified level. - * -- * \param[in] msg XML request for STONITH level registration -- * \param[out] desc If not NULL, will be set to string representation ("TARGET[LEVEL]") -- * -- * \return pcmk_ok on success, -EINVAL if XML does not specify valid level index -+ * \param[in] msg XML request for STONITH level registration -+ * \param[out] desc If not NULL, set to string representation "TARGET[LEVEL]" -+ * \param[out] result Where to set result of registration - */ --int --stonith_level_register(xmlNode *msg, char **desc) -+void -+fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) - { - int id = 0; - xmlNode *level; -@@ -1607,6 +1606,13 @@ stonith_level_register(xmlNode *msg, char **desc) - stonith_key_value_t *dIter = NULL; - stonith_key_value_t *devices = NULL; - -+ CRM_CHECK(result != NULL, return); -+ -+ if (msg == NULL) { -+ fenced_set_protocol_error(result); -+ return; -+ } -+ - /* Allow the XML here to point to the level tag directly, or wrapped in - * another tag. If directly, don't search by xpath, because it might give - * multiple hits (e.g. if the XML is the CIB). -@@ -1614,11 +1620,15 @@ stonith_level_register(xmlNode *msg, char **desc) - if (pcmk__str_eq(TYPE(msg), XML_TAG_FENCING_LEVEL, pcmk__str_casei)) { - level = msg; - } else { -- level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); -+ level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_WARNING); -+ } -+ if (level == NULL) { -+ fenced_set_protocol_error(result); -+ return; - } -- CRM_CHECK(level != NULL, return -EINVAL); - - mode = stonith_level_kind(level); -+ - target = stonith_level_key(level, mode); - crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); - -@@ -1626,18 +1636,26 @@ stonith_level_register(xmlNode *msg, char **desc) - *desc = crm_strdup_printf("%s[%d]", target, id); - } - -- /* Sanity-check arguments */ -- if (mode >= 3 || (id <= 0) || (id >= ST_LEVEL_MAX)) { -- crm_trace("Could not add %s[%d] (%d) to the topology (%d active entries)", target, id, mode, g_hash_table_size(topology)); -+ // Ensure level ID is in allowed range -+ if ((id <= 0) || (id >= ST_LEVEL_MAX)) { -+ crm_warn("Ignoring topology registration for %s with invalid level %d", -+ target, id); - free(target); -- crm_log_xml_err(level, "Bad topology"); -- return -EINVAL; -+ crm_log_xml_warn(level, "Bad level"); -+ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, -+ "Invalid topology level"); -+ return; - } - - /* Find or create topology table entry */ - tp = g_hash_table_lookup(topology, target); - if (tp == NULL) { - tp = calloc(1, sizeof(stonith_topology_t)); -+ if (tp == NULL) { -+ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_ERROR, -+ strerror(ENOMEM)); -+ return; -+ } - tp->kind = mode; - tp->target = target; - tp->target_value = crm_element_value_copy(level, XML_ATTR_STONITH_TARGET_VALUE); -@@ -1671,7 +1689,8 @@ stonith_level_register(xmlNode *msg, char **desc) - crm_info("Target %s has %d active fencing level%s", - tp->target, nlevels, pcmk__plural_s(nlevels)); - } -- return pcmk_ok; -+ -+ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - } - - int -@@ -3142,7 +3161,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - char *device_id = NULL; - - if (is_privileged(client, op)) { -- rc = stonith_level_register(request, &device_id); -+ fenced_register_level(request, &device_id, &result); -+ rc = pcmk_rc2legacy(stonith__result2rc(&result)); - } else { - rc = -EACCES; - } -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index 0a8b3bf6f2..469304f67c 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -452,8 +452,8 @@ remove_cib_device(xmlXPathObjectPtr xpathObj) - static void - handle_topology_change(xmlNode *match, bool remove) - { -- int rc; - char *desc = NULL; -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - CRM_CHECK(match != NULL, return); - crm_trace("Updating %s", ID(match)); -@@ -467,9 +467,10 @@ handle_topology_change(xmlNode *match, bool remove) - free(key); - } - -- rc = stonith_level_register(match, &desc); -- do_stonith_notify_level(STONITH_OP_LEVEL_ADD, rc, desc); -- -+ fenced_register_level(match, &desc, &result); -+ do_stonith_notify_level(STONITH_OP_LEVEL_ADD, -+ pcmk_rc2legacy(stonith__result2rc(&result)), desc); -+ pcmk__reset_result(&result); - free(desc); - } - -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index 5162ada75d..cf114fb979 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -218,7 +218,8 @@ void stonith_device_remove(const char *id, bool from_cib); - - char *stonith_level_key(xmlNode * msg, int mode); - int stonith_level_kind(xmlNode * msg); --int stonith_level_register(xmlNode * msg, char **desc); -+void fenced_register_level(xmlNode *msg, char **desc, -+ pcmk__action_result_t *result); - - int stonith_level_remove(xmlNode * msg, char **desc); - -diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c -index 7eaa8b0f2b..37fa849847 100644 ---- a/lib/fencing/st_actions.c -+++ b/lib/fencing/st_actions.c -@@ -325,6 +325,7 @@ stonith__result2rc(const pcmk__action_result_t *result) - */ - case PCMK_EXEC_INVALID: - switch (result->exit_status) { -+ case CRM_EX_INVALID_PARAM: return EINVAL; - case CRM_EX_INSUFFICIENT_PRIV: return EACCES; - case CRM_EX_PROTOCOL: return EPROTO; - --- -2.27.0 - - -From 27cedca4070328ecac1761f81c2890059af19dcf Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 29 Nov 2021 17:29:38 -0600 -Subject: [PATCH 05/23] Low: fencer: handle topology level unregistration - errors better - -Rename stonith_level_remove() to fenced_unregister_level() for consistency, and -refactor it to return a full result rather than a legacy return code. - -Return a protocol error for missing information in the request XML, and log -invalid level numbers at warning level. Use PCMK_EXEC_INVALID with -CRM_EX_INVALID_PARAM for invalid levels, so it gets mapped back to the legacy -code -EINVAL (which reverses the recent change in ec60f014b, both for backward -compatibility and because it makes sense -- a missing parameter is a protocol -error, while an invalid parameter is an invalid parameter error). ---- - daemons/fenced/fenced_commands.c | 52 ++++++++++++++++++++++++------- - daemons/fenced/pacemaker-fenced.c | 9 +++--- - daemons/fenced/pacemaker-fenced.h | 4 +-- - 3 files changed, 48 insertions(+), 17 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 975f8633a4..ef41dc0e52 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -1693,25 +1693,54 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) - pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - } - --int --stonith_level_remove(xmlNode *msg, char **desc) -+/*! -+ * \internal -+ * \brief Unregister a fencing topology level for a target -+ * -+ * Given an XML request specifying the target name and level index (or 0 for all -+ * levels), this will remove any corresponding entry for the target from the -+ * global topology table. -+ * -+ * \param[in] msg XML request for STONITH level registration -+ * \param[out] desc If not NULL, set to string representation "TARGET[LEVEL]" -+ * \param[out] result Where to set result of unregistration -+ */ -+void -+fenced_unregister_level(xmlNode *msg, char **desc, -+ pcmk__action_result_t *result) - { - int id = -1; - stonith_topology_t *tp; - char *target; -+ xmlNode *level = NULL; -+ -+ CRM_CHECK(result != NULL, return); - -- /* Unlike additions, removal requests should always have one level tag */ -- xmlNode *level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_ERR); -+ if (msg == NULL) { -+ fenced_set_protocol_error(result); -+ return; -+ } - -- CRM_CHECK(level != NULL, return -EPROTO); -+ // Unlike additions, removal requests should always have one level tag -+ level = get_xpath_object("//" XML_TAG_FENCING_LEVEL, msg, LOG_WARNING); -+ if (level == NULL) { -+ fenced_set_protocol_error(result); -+ return; -+ } - - target = stonith_level_key(level, -1); - crm_element_value_int(level, XML_ATTR_STONITH_INDEX, &id); - -- CRM_CHECK((id >= 0) && (id < ST_LEVEL_MAX), -- crm_log_xml_warn(msg, "invalid level"); -- free(target); -- return -EPROTO); -+ // Ensure level ID is in allowed range -+ if ((id < 0) || (id >= ST_LEVEL_MAX)) { -+ crm_warn("Ignoring topology unregistration for %s with invalid level %d", -+ target, id); -+ free(target); -+ crm_log_xml_warn(level, "Bad level"); -+ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, -+ "Invalid topology level"); -+ return; -+ } - - if (desc) { - *desc = crm_strdup_printf("%s[%d]", target, id); -@@ -1745,7 +1774,7 @@ stonith_level_remove(xmlNode *msg, char **desc) - } - - free(target); -- return pcmk_ok; -+ pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - } - - static char * -@@ -3173,7 +3202,8 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - char *device_id = NULL; - - if (is_privileged(client, op)) { -- rc = stonith_level_remove(request, &device_id); -+ fenced_unregister_level(request, &device_id, &result); -+ rc = pcmk_rc2legacy(stonith__result2rc(&result)); - } else { - rc = -EACCES; - } -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index 469304f67c..56acc93f31 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -409,17 +409,18 @@ do_stonith_notify_level(const char *op, int rc, const char *desc) - static void - topology_remove_helper(const char *node, int level) - { -- int rc; - char *desc = NULL; -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - xmlNode *data = create_xml_node(NULL, XML_TAG_FENCING_LEVEL); - - crm_xml_add(data, F_STONITH_ORIGIN, __func__); - crm_xml_add_int(data, XML_ATTR_STONITH_INDEX, level); - crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); - -- rc = stonith_level_remove(data, &desc); -- do_stonith_notify_level(STONITH_OP_LEVEL_DEL, rc, desc); -- -+ fenced_unregister_level(data, &desc, &result); -+ do_stonith_notify_level(STONITH_OP_LEVEL_DEL, -+ pcmk_rc2legacy(stonith__result2rc(&result)), desc); -+ pcmk__reset_result(&result); - free_xml(data); - free(desc); - } -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index cf114fb979..0006e02e7d 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -220,8 +220,8 @@ char *stonith_level_key(xmlNode * msg, int mode); - int stonith_level_kind(xmlNode * msg); - void fenced_register_level(xmlNode *msg, char **desc, - pcmk__action_result_t *result); -- --int stonith_level_remove(xmlNode * msg, char **desc); -+void fenced_unregister_level(xmlNode *msg, char **desc, -+ pcmk__action_result_t *result); - - stonith_topology_t *find_topology_for_host(const char *host); - --- -2.27.0 - - -From 3f603defca78eb2bdd46c51a80ed04a4c773442b Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 12:22:33 -0600 -Subject: [PATCH 06/23] Log: fencer: track and log full result when handling - requests - -handle_request() now tracks and logs a full result rather than just a -legacy return code. ---- - daemons/fenced/fenced_commands.c | 95 ++++++++++++++++++-------------- - 1 file changed, 53 insertions(+), 42 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index ef41dc0e52..996c18faaa 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2981,9 +2981,7 @@ static void - handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - xmlNode *request, const char *remote_peer) - { -- int call_options = 0; -- int rc = -EOPNOTSUPP; -- -+ int call_options = st_opt_none; - xmlNode *data = NULL; - bool need_reply = true; - pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -@@ -3006,13 +3004,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - pcmk__ipc_send_xml(client, id, reply, flags); - client->request_id = 0; - free_xml(reply); -- rc = pcmk_ok; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - need_reply = false; - - } else if (pcmk__str_eq(op, STONITH_OP_EXEC, pcmk__str_none)) { - execute_agent_action(request, &result); - need_reply = (result.execution_status != PCMK_EXEC_PENDING); -- rc = pcmk_rc2legacy(stonith__result2rc(&result)); - - } else if (pcmk__str_eq(op, STONITH_OP_TIMEOUT_UPDATE, pcmk__str_none)) { - const char *call_id = crm_element_value(request, F_STONITH_CALLID); -@@ -3021,7 +3018,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - - crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); - do_stonith_async_timeout_update(client_id, call_id, op_timeout); -- rc = pcmk_ok; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - need_reply = false; - - } else if (pcmk__str_eq(op, STONITH_OP_QUERY, pcmk__str_none)) { -@@ -3033,7 +3030,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - remove_relay_op(request); - - stonith_query(request, remote_peer, client_id, call_options); -- rc = pcmk_ok; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - need_reply = false; - - } else if (pcmk__str_eq(op, T_STONITH_NOTIFY, pcmk__str_none)) { -@@ -3055,7 +3052,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - } - - pcmk__ipc_send_ack(client, id, flags, "ack", CRM_EX_OK); -- rc = pcmk_ok; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - need_reply = false; - - } else if (pcmk__str_eq(op, STONITH_OP_RELAY, pcmk__str_none)) { -@@ -3069,27 +3066,27 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - crm_element_value(dev, F_STONITH_TARGET)); - - if (initiate_remote_stonith_op(NULL, request, FALSE) == NULL) { -- rc = -EPROTO; -+ fenced_set_protocol_error(&result); - } else { -- rc = -EINPROGRESS; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); - need_reply = false; - } - - } else if (pcmk__str_eq(op, STONITH_OP_FENCE, pcmk__str_none)) { - if ((remote_peer != NULL) || stand_alone) { - fence_locally(request, &result); -- rc = pcmk_rc2legacy(stonith__result2rc(&result)); - - } else if (pcmk_is_set(call_options, st_opt_manual_ack)) { - switch (fenced_handle_manual_confirmation(client, request)) { - case pcmk_rc_ok: -- rc = pcmk_ok; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - break; - case EINPROGRESS: -- rc = -EINPROGRESS; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, -+ NULL); - break; - default: -- rc = -EPROTO; -+ fenced_set_protocol_error(&result); - break; - } - -@@ -3100,17 +3097,15 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - const char *action = crm_element_value(dev, F_STONITH_ACTION); - const char *device = crm_element_value(dev, F_STONITH_DEVICE); - -- if (client) { -+ if (client != NULL) { - int tolerance = 0; - - crm_notice("Client %s wants to fence (%s) %s using %s", - pcmk__client_name(client), action, - target, (device? device : "any device")); -- - crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance); -- - if (stonith_check_fence_tolerance(tolerance, target, action)) { -- rc = pcmk_ok; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - goto done; - } - -@@ -3143,24 +3138,24 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - crm_xml_add(request, F_STONITH_REMOTE_OP_ID, op->id); - send_cluster_message(crm_get_peer(0, alternate_host), crm_msg_stonith_ng, request, - FALSE); -- rc = -EINPROGRESS; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); - - } else if (initiate_remote_stonith_op(client, request, FALSE) == NULL) { -- rc = -EPROTO; -+ fenced_set_protocol_error(&result); -+ - } else { -- rc = -EINPROGRESS; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); - } - } -- need_reply = (rc != -EINPROGRESS); -+ need_reply = (result.execution_status != PCMK_EXEC_PENDING); - - } else if (pcmk__str_eq(op, STONITH_OP_FENCE_HISTORY, pcmk__str_none)) { - stonith_fence_history(request, &data, remote_peer, call_options); -- rc = pcmk_ok; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - if (pcmk_is_set(call_options, st_opt_discard_reply)) { - /* we don't expect answers to the broadcast - * we might have sent out - */ -- rc = pcmk_ok; - need_reply = false; - } - -@@ -3168,11 +3163,18 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - const char *device_id = NULL; - - if (is_privileged(client, op)) { -- rc = stonith_device_register(request, &device_id, FALSE); -+ int rc = stonith_device_register(request, &device_id, FALSE); -+ -+ pcmk__set_result(&result, -+ ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), -+ stonith__legacy2status(rc), -+ ((rc == pcmk_ok)? NULL : pcmk_strerror(rc))); - } else { -- rc = -EACCES; -+ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, -+ PCMK_EXEC_INVALID, -+ "Unprivileged users must register device via CIB"); - } -- do_stonith_notify_device(op, rc, device_id); -+ do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); - - } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { - xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); -@@ -3180,22 +3182,25 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - - if (is_privileged(client, op)) { - stonith_device_remove(device_id, false); -- rc = pcmk_ok; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - } else { -- rc = -EACCES; -+ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, -+ PCMK_EXEC_INVALID, -+ "Unprivileged users must delete device via CIB"); - } -- do_stonith_notify_device(op, rc, device_id); -+ do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); - - } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { - char *device_id = NULL; - - if (is_privileged(client, op)) { - fenced_register_level(request, &device_id, &result); -- rc = pcmk_rc2legacy(stonith__result2rc(&result)); - } else { -- rc = -EACCES; -+ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, -+ PCMK_EXEC_INVALID, -+ "Unprivileged users must add level via CIB"); - } -- do_stonith_notify_level(op, rc, device_id); -+ do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); - free(device_id); - - } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { -@@ -3203,11 +3208,12 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - - if (is_privileged(client, op)) { - fenced_unregister_level(request, &device_id, &result); -- rc = pcmk_rc2legacy(stonith__result2rc(&result)); - } else { -- rc = -EACCES; -+ pcmk__set_result(&result, CRM_EX_INSUFFICIENT_PRIV, -+ PCMK_EXEC_INVALID, -+ "Unprivileged users must delete level via CIB"); - } -- do_stonith_notify_level(op, rc, device_id); -+ do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); - - } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { - int node_id = 0; -@@ -3216,31 +3222,36 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - crm_element_value_int(request, XML_ATTR_ID, &node_id); - name = crm_element_value(request, XML_ATTR_UNAME); - reap_crm_member(node_id, name); -- rc = pcmk_ok; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - need_reply = false; - - } else { - crm_err("Unknown IPC request %s from %s %s", op, - ((client == NULL)? "peer" : "client"), - ((client == NULL)? remote_peer : pcmk__client_name(client))); -+ pcmk__set_result(&result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, -+ "Unknown IPC request type (bug?)"); - } - - done: - // Reply if result is known - if (need_reply) { -- xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, rc); -+ xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, -+ pcmk_rc2legacy(stonith__result2rc(&result))); - - stonith_send_reply(reply, call_options, remote_peer, client_id); - free_xml(reply); - } - -- free_xml(data); -- -- crm_debug("Processed %s request from %s %s: %s (rc=%d)", -+ crm_debug("Processed %s request from %s %s: %s%s%s%s", - op, ((client == NULL)? "peer" : "client"), - ((client == NULL)? remote_peer : pcmk__client_name(client)), -- ((rc > 0)? "" : pcmk_strerror(rc)), rc); -+ pcmk_exec_status_str(result.execution_status), -+ (result.exit_reason == NULL)? "" : " (", -+ (result.exit_reason == NULL)? "" : result.exit_reason, -+ (result.exit_reason == NULL)? "" : ")"); - -+ free_xml(data); - pcmk__reset_result(&result); - } - --- -2.27.0 - - -From 5e13199699a4e9279520b3668c072e3db49c9782 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 15:10:36 -0600 -Subject: [PATCH 07/23] Feature: fencer: pass full result in replies to - requests - -Rename stonith_construct_reply() to fenced_construct_reply() for consistency, -make it take a full result as an argument rather than separate arguments for -legacy return code and output, and add the full result to the reply (along with -the legacy return code, for backward compatibility). - -This is used for peer query replies and some request replies (including replies -to local clients who requested fencing). Other replies, such as those built by -construct_async_reply(), are not affected by this commit. ---- - daemons/fenced/fenced_commands.c | 33 ++++++++++++++++++++++--------- - daemons/fenced/fenced_remote.c | 9 ++++++++- - daemons/fenced/pacemaker-fenced.h | 4 ++-- - 3 files changed, 34 insertions(+), 12 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 996c18faaa..84f89e8daf 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2322,6 +2322,7 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int - const char *target = NULL; - int timeout = 0; - xmlNode *dev = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_NEVER); -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - crm_element_value_int(msg, F_STONITH_TIMEOUT, &timeout); - if (dev) { -@@ -2338,7 +2339,8 @@ stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int - crm_log_xml_debug(msg, "Query"); - query = calloc(1, sizeof(struct st_query_data)); - -- query->reply = stonith_construct_reply(msg, NULL, NULL, pcmk_ok); -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ query->reply = fenced_construct_reply(msg, NULL, &result); - query->remote_peer = remote_peer ? strdup(remote_peer) : NULL; - query->client_id = client_id ? strdup(client_id) : NULL; - query->target = target ? strdup(target) : NULL; -@@ -2729,8 +2731,23 @@ fence_locally(xmlNode *msg, pcmk__action_result_t *result) - pcmk__set_result(result, CRM_EX_OK, PCMK_EXEC_PENDING, NULL); - } - -+/*! -+ * \internal -+ * \brief Build an XML reply for a fencing operation -+ * -+ * \param[in] request Request that reply is for -+ * \param[in] data If not NULL, add to reply as call data -+ * \param[in] result Full result of fencing operation -+ * -+ * \return Newly created XML reply -+ * \note The caller is responsible for freeing the result. -+ * \note This has some overlap with construct_async_reply(), but that copies -+ * values from an async_command_t, whereas this one copies them from the -+ * request. -+ */ - xmlNode * --stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, int rc) -+fenced_construct_reply(xmlNode *request, xmlNode *data, -+ pcmk__action_result_t *result) - { - xmlNode *reply = NULL; - -@@ -2738,8 +2755,7 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i - - crm_xml_add(reply, "st_origin", __func__); - crm_xml_add(reply, F_TYPE, T_STONITH_NG); -- crm_xml_add(reply, F_STONITH_OUTPUT, output); -- crm_xml_add_int(reply, F_STONITH_RC, rc); -+ stonith__xe_set_result(reply, result); - - if (request == NULL) { - /* Most likely, this is the result of a stonith operation that was -@@ -2749,12 +2765,14 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i - * @TODO Maybe synchronize this information at start-up? - */ - crm_warn("Missing request information for client notifications for " -- "operation with result %d (initiated before we came up?)", rc); -+ "operation with result '%s' (initiated before we came up?)", -+ pcmk_exec_status_str(result->execution_status)); - - } else { - const char *name = NULL; - const char *value = NULL; - -+ // Attributes to copy from request to reply - const char *names[] = { - F_STONITH_OPERATION, - F_STONITH_CALLID, -@@ -2764,8 +2782,6 @@ stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, i - F_STONITH_CALLOPTS - }; - -- crm_trace("Creating a result reply with%s reply output (rc=%d)", -- (data? "" : "out"), rc); - for (int lpc = 0; lpc < PCMK__NELEM(names); lpc++) { - name = names[lpc]; - value = crm_element_value(request, name); -@@ -3236,8 +3252,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - done: - // Reply if result is known - if (need_reply) { -- xmlNode *reply = stonith_construct_reply(request, result.action_stdout, data, -- pcmk_rc2legacy(stonith__result2rc(&result))); -+ xmlNode *reply = fenced_construct_reply(request, data, &result); - - stonith_send_reply(reply, call_options, remote_peer, client_id); - free_xml(reply); -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 8feb401477..baa07d9e78 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -415,7 +415,14 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) - crm_xml_add(data, F_STONITH_TARGET, op->target); - crm_xml_add(data, F_STONITH_OPERATION, op->action); - -- reply = stonith_construct_reply(op->request, NULL, data, rc); -+ { -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -+ -+ pcmk__set_result(&result, -+ ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), -+ stonith__legacy2status(rc), NULL); -+ reply = fenced_construct_reply(op->request, data, &result); -+ } - crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); - - /* Send fencing OP reply to local client that initiated fencing */ -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index 0006e02e7d..d5f4bc79fd 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -228,8 +228,8 @@ stonith_topology_t *find_topology_for_host(const char *host); - void do_local_reply(xmlNode * notify_src, const char *client_id, gboolean sync_reply, - gboolean from_peer); - --xmlNode *stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, -- int rc); -+xmlNode *fenced_construct_reply(xmlNode *request, xmlNode *data, -+ pcmk__action_result_t *result); - - void - do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); --- -2.27.0 - - -From b32aa252b321ff40c834d153cb23f8b3be471611 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 15:43:20 -0600 -Subject: [PATCH 08/23] Log: fencer: grab and log full result when processing - peer fencing replies - -fenced_process_fencing_reply() now checks for the full result, instead of only -a legacy return code, in peer replies, and uses it in log messages. ---- - daemons/fenced/fenced_remote.c | 63 ++++++++++++++++++++-------------- - 1 file changed, 37 insertions(+), 26 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index baa07d9e78..c6369f0051 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -2095,21 +2095,21 @@ process_remote_stonith_query(xmlNode * msg) - void - fenced_process_fencing_reply(xmlNode *msg) - { -- int rc = 0; - const char *id = NULL; - const char *device = NULL; - remote_fencing_op_t *op = NULL; - xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - CRM_CHECK(dev != NULL, return); - - id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); - CRM_CHECK(id != NULL, return); - -- dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR); -+ dev = stonith__find_xe_with_result(msg); - CRM_CHECK(dev != NULL, return); - -- crm_element_value_int(dev, F_STONITH_RC, &rc); -+ stonith__xe_get_result(dev, &result); - - device = crm_element_value(dev, F_STONITH_DEVICE); - -@@ -2117,7 +2117,7 @@ fenced_process_fencing_reply(xmlNode *msg) - op = g_hash_table_lookup(stonith_remote_op_list, id); - } - -- if (op == NULL && rc == pcmk_ok) { -+ if ((op == NULL) && pcmk__result_ok(&result)) { - /* Record successful fencing operations */ - const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID); - -@@ -2139,16 +2139,19 @@ fenced_process_fencing_reply(xmlNode *msg) - } - - if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { -- crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s " -+ crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s%s%s%s " - CRM_XS " id=%.8s", - op->action, op->target, op->client_name, op->originator, -- pcmk_strerror(rc), op->id); -- if (rc == pcmk_ok) { -+ pcmk_exec_status_str(result.execution_status), -+ (result.exit_reason == NULL)? "" : " (", -+ (result.exit_reason == NULL)? "" : result.exit_reason, -+ (result.exit_reason == NULL)? "" : ")", op->id); -+ if (pcmk__result_ok(&result)) { - op->state = st_done; - } else { - op->state = st_failed; - } -- remote_op_done(op, msg, rc, FALSE); -+ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); - return; - } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { - /* If this isn't a remote level broadcast, and we are not the -@@ -2162,28 +2165,35 @@ fenced_process_fencing_reply(xmlNode *msg) - if (pcmk_is_set(op->call_options, st_opt_topology)) { - const char *device = crm_element_value(msg, F_STONITH_DEVICE); - -- crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s " -- CRM_XS " rc=%d", -+ crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s%s%s%s", - op->action, op->target, device, op->client_name, -- op->originator, pcmk_strerror(rc), rc); -+ op->originator, -+ pcmk_exec_status_str(result.execution_status), -+ (result.exit_reason == NULL)? "" : " (", -+ (result.exit_reason == NULL)? "" : result.exit_reason, -+ (result.exit_reason == NULL)? "" : ")"); - - /* We own the op, and it is complete. broadcast the result to all nodes - * and notify our local clients. */ - if (op->state == st_done) { -- remote_op_done(op, msg, rc, FALSE); -+ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); - return; - } - -- if ((op->phase == 2) && (rc != pcmk_ok)) { -+ if ((op->phase == 2) && !pcmk__result_ok(&result)) { - /* A remapped "on" failed, but the node was already turned off - * successfully, so ignore the error and continue. - */ -- crm_warn("Ignoring %s 'on' failure (exit code %d) targeting %s " -- "after successful 'off'", device, rc, op->target); -- rc = pcmk_ok; -+ crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s " -+ "after successful 'off'", -+ device, pcmk_exec_status_str(result.execution_status), -+ (result.exit_reason == NULL)? "" : ": ", -+ (result.exit_reason == NULL)? "" : result.exit_reason, -+ op->target); -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - } - -- if (rc == pcmk_ok) { -+ if (pcmk__result_ok(&result)) { - /* An operation completed successfully. Try another device if - * necessary, otherwise mark the operation as done. */ - advance_topology_device_in_level(op, device, msg); -@@ -2193,29 +2203,30 @@ fenced_process_fencing_reply(xmlNode *msg) - * levels are available, mark this operation as failed and report results. */ - if (advance_topology_level(op, false) != pcmk_rc_ok) { - op->state = st_failed; -- remote_op_done(op, msg, rc, FALSE); -+ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); - return; - } - } -- } else if (rc == pcmk_ok && op->devices == NULL) { -+ } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { - crm_trace("All done for %s", op->target); -- - op->state = st_done; -- remote_op_done(op, msg, rc, FALSE); -+ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); - return; -- } else if (rc == -ETIME && op->devices == NULL) { -+ } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) -+ && (op->devices == NULL)) { - /* If the operation timed out don't bother retrying other peers. */ - op->state = st_failed; -- remote_op_done(op, msg, rc, FALSE); -+ remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); - return; - } else { - /* fall-through and attempt other fencing action using another peer */ - } - - /* Retry on failure */ -- crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator, -- op->client_name, rc); -- call_remote_stonith(op, NULL, rc); -+ crm_trace("Next for %s on behalf of %s@%s (result was: %s)", -+ op->target, op->originator, op->client_name, -+ pcmk_exec_status_str(result.execution_status)); -+ call_remote_stonith(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result))); - } - - gboolean --- -2.27.0 - - -From afb5706ac606a8ea883aa1597ee63d9891cc2e13 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 15:56:30 -0600 -Subject: [PATCH 09/23] Refactor: fencer: pass full result of previous failed - action when initiating peer fencing - -Rename call_remote_stonith() to request_peer_fencing() for readability, and -make it take the full result of the previous failed action, rather than just -its legacy return code, as an argument. - -This does cause one change in behavior: if topology is in use, a previous -attempt failed, and no more peers have the appropriate device, then the -legacy return code returned will be -ENODEV rather than -EHOSTUNREACH. -These are treated similarly internally, and hopefully that will not cause -problems for external code. ---- - daemons/fenced/fenced_remote.c | 89 +++++++++++++++++++++++++--------- - 1 file changed, 67 insertions(+), 22 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index c6369f0051..31d5ee6e93 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -76,12 +76,13 @@ typedef struct { - - GHashTable *stonith_remote_op_list = NULL; - --void call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, -- int rc); - static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); - extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, - int call_options); - -+static void request_peer_fencing(remote_fencing_op_t *op, -+ peer_device_info_t *peer, -+ pcmk__action_result_t *result); - static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); - static int get_op_total_timeout(const remote_fencing_op_t *op, - const peer_device_info_t *chosen_peer); -@@ -609,12 +610,16 @@ static gboolean - remote_op_timeout_one(gpointer userdata) - { - remote_fencing_op_t *op = userdata; -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - op->op_timer_one = 0; - - crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS - " id=%.8s", op->action, op->target, op->client_name, op->id); -- call_remote_stonith(op, NULL, -ETIME); -+ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, NULL); -+ -+ // Try another device, if appropriate -+ request_peer_fencing(op, NULL, &result); - return FALSE; - } - -@@ -685,9 +690,13 @@ remote_op_query_timeout(gpointer data) - crm_debug("Operation %.8s targeting %s already in progress", - op->id, op->target); - } else if (op->query_results) { -+ // Result won't be used in this case, but we need to pass something -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -+ -+ // Query succeeded, so attempt the actual fencing - crm_debug("Query %.8s targeting %s complete (state=%s)", - op->id, op->target, stonith_op_state_str(op->state)); -- call_remote_stonith(op, NULL, pcmk_ok); -+ request_peer_fencing(op, NULL, &result); - } else { - crm_debug("Query %.8s targeting %s timed out (state=%s)", - op->id, op->target, stonith_op_state_str(op->state)); -@@ -1533,6 +1542,10 @@ static void - advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - xmlNode *msg) - { -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -+ -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ - /* Advance to the next device at this topology level, if any */ - if (op->devices) { - op->devices = op->devices->next; -@@ -1569,7 +1582,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - op->delay = 0; - } - -- call_remote_stonith(op, NULL, pcmk_ok); -+ request_peer_fencing(op, NULL, &result); - } else { - /* We're done with all devices and phases, so finalize operation */ - crm_trace("Marking complex fencing op targeting %s as complete", -@@ -1598,15 +1611,30 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) - return FALSE; - } - --void --call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) -+/*! -+ * \internal -+ * \brief Ask a peer to execute a fencing operation -+ * -+ * \param[in] op Fencing operation to be executed -+ * \param[in] peer If NULL or topology is in use, choose best peer to execute -+ * the fencing, otherwise use this peer -+ * \param[in] result Full result of previous failed attempt, if any (used as -+ * final result only if a previous attempt failed, topology -+ * is not in use, and no devices remain to be attempted) -+ */ -+static void -+request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, -+ pcmk__action_result_t *result) - { - const char *device = NULL; -- int timeout = op->base_timeout; -+ int timeout; -+ -+ CRM_CHECK(op != NULL, return); - - crm_trace("Action %.8s targeting %s for %s is %s", - op->id, op->target, op->client_name, - stonith_op_state_str(op->state)); -+ timeout = op->base_timeout; - if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) { - peer = stonith_choose_peer(op); - } -@@ -1623,9 +1651,14 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) - } - - if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) { -- /* Ignore any peer preference, they might not have the device we need */ -- /* When using topology, stonith_choose_peer() removes the device from -- * further consideration, so be sure to calculate timeout beforehand */ -+ /* Ignore the caller's peer preference if topology is in use, because -+ * that peer might not have access to the required device. With -+ * topology, stonith_choose_peer() removes the device from further -+ * consideration, so the timeout must be calculated beforehand. -+ * -+ * @TODO Basing the total timeout on the caller's preferred peer (above) -+ * is less than ideal. -+ */ - peer = stonith_choose_peer(op); - - device = op->devices->data; -@@ -1722,8 +1755,6 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) - finalize_timed_out_op(op); - - } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { --// int rc = -EHOSTUNREACH; -- - /* if the operation never left the query state, - * but we have all the expected replies, then no devices - * are available to execute the fencing operation. */ -@@ -1735,17 +1766,28 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) - } - } - -+ // This is the only case in which result will be used -+ CRM_CHECK(result != NULL, return); -+ - if (op->state == st_query) { - crm_info("No peers (out of %d) have devices capable of fencing " - "(%s) %s for client %s " CRM_XS " state=%s", - op->replies, op->action, op->target, op->client_name, - stonith_op_state_str(op->state)); - -- rc = -ENODEV; -+ pcmk__reset_result(result); -+ pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, -+ NULL); - } else { - if (pcmk_is_set(op->call_options, st_opt_topology)) { -- rc = -EHOSTUNREACH; -- } -+ pcmk__reset_result(result); -+ pcmk__set_result(result, CRM_EX_ERROR, -+ PCMK_EXEC_NO_FENCE_DEVICE, NULL); -+ } -+ /* ... else use result provided by caller -- overwriting it with -+ PCMK_EXEC_NO_FENCE_DEVICE would prevent remote_op_done() from -+ setting the correct delegate if needed. -+ */ - - crm_info("No peers (out of %d) are capable of fencing (%s) %s " - "for client %s " CRM_XS " state=%s", -@@ -1754,7 +1796,7 @@ call_remote_stonith(remote_fencing_op_t *op, peer_device_info_t *peer, int rc) - } - - op->state = st_failed; -- remote_op_done(op, NULL, rc, FALSE); -+ remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(result)), FALSE); - - } else { - crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " -@@ -2004,6 +2046,7 @@ process_remote_stonith_query(xmlNode * msg) - peer_device_info_t *peer = NULL; - uint32_t replies_expected; - xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - CRM_CHECK(dev != NULL, return -EPROTO); - -@@ -2038,6 +2081,8 @@ process_remote_stonith_query(xmlNode * msg) - peer = add_result(op, host, ndevices, dev); - } - -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ - if (pcmk_is_set(op->call_options, st_opt_topology)) { - /* If we start the fencing before all the topology results are in, - * it is possible fencing levels will be skipped because of the missing -@@ -2045,12 +2090,12 @@ process_remote_stonith_query(xmlNode * msg) - if (op->state == st_query && all_topology_devices_found(op)) { - /* All the query results are in for the topology, start the fencing ops. */ - crm_trace("All topology devices found"); -- call_remote_stonith(op, peer, pcmk_ok); -+ request_peer_fencing(op, peer, &result); - - } else if (have_all_replies) { - crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", - replies_expected, op->replies); -- call_remote_stonith(op, NULL, pcmk_ok); -+ request_peer_fencing(op, NULL, &result); - } - - } else if (op->state == st_query) { -@@ -2062,12 +2107,12 @@ process_remote_stonith_query(xmlNode * msg) - /* we have a verified device living on a peer that is not the target */ - crm_trace("Found %d verified device%s", - nverified, pcmk__plural_s(nverified)); -- call_remote_stonith(op, peer, pcmk_ok); -+ request_peer_fencing(op, peer, &result); - - } else if (have_all_replies) { - crm_info("All query replies have arrived, continuing (%d expected/%d received) ", - replies_expected, op->replies); -- call_remote_stonith(op, NULL, pcmk_ok); -+ request_peer_fencing(op, NULL, &result); - - } else { - crm_trace("Waiting for more peer results before launching fencing operation"); -@@ -2226,7 +2271,7 @@ fenced_process_fencing_reply(xmlNode *msg) - crm_trace("Next for %s on behalf of %s@%s (result was: %s)", - op->target, op->originator, op->client_name, - pcmk_exec_status_str(result.execution_status)); -- call_remote_stonith(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result))); -+ request_peer_fencing(op, NULL, &result); - } - - gboolean --- -2.27.0 - - -From 43e08ba7ee1635e47bfaf2a57636101c675b89ae Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:02:04 -0600 -Subject: [PATCH 10/23] Feature: fencer: set exit reason for timeouts waiting - for peer replies - ---- - daemons/fenced/fenced_remote.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 31d5ee6e93..415a7c1b98 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -616,7 +616,9 @@ remote_op_timeout_one(gpointer userdata) - - crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS - " id=%.8s", op->action, op->target, op->client_name, op->id); -- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, NULL); -+ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, -+ "Peer did not send fence result within timeout"); -+ - - // Try another device, if appropriate - request_peer_fencing(op, NULL, &result); --- -2.27.0 - - -From 34e5baebac78b7235825b31bebc44e3d65ae45cc Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:10:28 -0600 -Subject: [PATCH 11/23] Refactor: fencer: pass full result when handling - duplicate actions - -Rename handle_duplicates() to finalize_op_duplicates() for readability, and -make it take a full result rather than a legacy return code as an argument. ---- - daemons/fenced/fenced_remote.c | 29 +++++++++++++++++++++-------- - 1 file changed, 21 insertions(+), 8 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 415a7c1b98..850bfb6eb3 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -439,12 +439,19 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) - free_xml(notify_data); - } - -+/*! -+ * \internal -+ * \brief Finalize all duplicates of a given fencer operation -+ * -+ * \param[in] op Fencer operation that completed -+ * \param[in] data Top-level XML to add notification to -+ * \param[in] result Full operation result -+ */ - static void --handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc) -+finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, -+ pcmk__action_result_t *result) - { -- GList *iter = NULL; -- -- for (iter = op->duplicates; iter != NULL; iter = iter->next) { -+ for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) { - remote_fencing_op_t *other = iter->data; - - if (other->state == st_duplicate) { -@@ -452,8 +459,9 @@ handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc) - crm_debug("Performing duplicate notification for %s@%s: %s " - CRM_XS " id=%.8s", - other->client_name, other->originator, -- pcmk_strerror(rc), other->id); -- remote_op_done(other, data, rc, TRUE); -+ pcmk_exec_status_str(result->execution_status), -+ other->id); -+ remote_op_done(other, data, pcmk_rc2legacy(stonith__result2rc(result)), TRUE); - - } else { - // Possible if (for example) it timed out already -@@ -570,8 +578,13 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) - - handle_local_reply_and_notify(op, data, rc); - -- if (dup == FALSE) { -- handle_duplicates(op, data, rc); -+ if (!dup) { -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -+ -+ pcmk__set_result(&result, -+ ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), -+ stonith__legacy2status(rc), NULL); -+ finalize_op_duplicates(op, data, &result); - } - - /* Free non-essential parts of the record --- -2.27.0 - - -From 939bd6f5f0f79b19d0cc4d869f3c8980fda2e461 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:23:20 -0600 -Subject: [PATCH 12/23] Feature: fencer: set exit reasons for fencing timeouts - -finalize_timed_out_op() now takes an exit reason as an argument. -It is called for fencing timeouts, peer query reply timeouts, -and all capable nodes failing to fence. - -At this point, the exit reason is not used, but that is planned. ---- - daemons/fenced/fenced_remote.c | 25 +++++++++++++++---------- - 1 file changed, 15 insertions(+), 10 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 850bfb6eb3..c10a32442e 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -643,10 +643,12 @@ remote_op_timeout_one(gpointer userdata) - * \brief Finalize a remote fencer operation that timed out - * - * \param[in] op Fencer operation that timed out -+ * \param[in] reason Readable description of what step timed out - */ - static void --finalize_timed_out_op(remote_fencing_op_t *op) -+finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) - { -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - op->op_timer_total = 0; - -@@ -660,13 +662,13 @@ finalize_timed_out_op(remote_fencing_op_t *op) - * devices, and return success. - */ - op->state = st_done; -- remote_op_done(op, NULL, pcmk_ok, FALSE); -- return; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ } else { -+ op->state = st_failed; -+ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); - } -- -- op->state = st_failed; -- -- remote_op_done(op, NULL, -ETIME, FALSE); -+ remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); -+ pcmk__reset_result(&result); - } - - /*! -@@ -687,7 +689,8 @@ remote_op_timeout(gpointer userdata) - CRM_XS " id=%.8s", - op->action, op->target, op->client_name, op->id); - } else { -- finalize_timed_out_op(userdata); -+ finalize_timed_out_op(userdata, "Fencing could not be completed " -+ "within overall timeout"); - } - return G_SOURCE_REMOVE; - } -@@ -719,7 +722,8 @@ remote_op_query_timeout(gpointer data) - g_source_remove(op->op_timer_total); - op->op_timer_total = 0; - } -- finalize_timed_out_op(op); -+ finalize_timed_out_op(op, "No capable peers replied to device query " -+ "within timeout"); - } - - return FALSE; -@@ -1767,7 +1771,8 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, - CRM_XS " state=%s", op->action, op->target, op->client_name, - stonith_op_state_str(op->state)); - CRM_CHECK(op->state < st_done, return); -- finalize_timed_out_op(op); -+ finalize_timed_out_op(op, "All nodes failed, or are unable, to " -+ "fence target"); - - } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) { - /* if the operation never left the query state, --- -2.27.0 - - -From b80b02799260feb98723a460f2f8e8ad5cdc467f Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:32:04 -0600 -Subject: [PATCH 13/23] Refactor: fencer: pass full result when finalizing peer - fencing actions - -Rename remote_op_done() to finalize_op() for readability, and make it take a -full result as an argument, rather than a legacy return code. - -This does cause one change in behavior: when all topology levels fail, -the legacy return code returned will be -pcmk_err_generic instead of EINVAL. ---- - daemons/fenced/fenced_history.c | 2 +- - daemons/fenced/fenced_remote.c | 177 ++++++++++++++++++-------------- - 2 files changed, 103 insertions(+), 76 deletions(-) - -diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c -index bc159383c2..9e38ff0a20 100644 ---- a/daemons/fenced/fenced_history.c -+++ b/daemons/fenced/fenced_history.c -@@ -374,7 +374,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, - set_fencing_completed(op); - /* use -EHOSTUNREACH to not introduce a new return-code that might - trigger unexpected results at other places and to prevent -- remote_op_done from setting the delegate if not present -+ finalize_op from setting the delegate if not present - */ - stonith_bcast_result_to_peers(op, -EHOSTUNREACH, FALSE); - } -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index c10a32442e..aefc5f311c 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -76,13 +76,14 @@ typedef struct { - - GHashTable *stonith_remote_op_list = NULL; - --static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup); - extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, - int call_options); - - static void request_peer_fencing(remote_fencing_op_t *op, - peer_device_info_t *peer, - pcmk__action_result_t *result); -+static void finalize_op(remote_fencing_op_t *op, xmlNode *data, -+ pcmk__action_result_t *result, bool dup); - static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); - static int get_op_total_timeout(const remote_fencing_op_t *op, - const peer_device_info_t *chosen_peer); -@@ -461,7 +462,7 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, - other->client_name, other->originator, - pcmk_exec_status_str(result->execution_status), - other->id); -- remote_op_done(other, data, pcmk_rc2legacy(stonith__result2rc(result)), TRUE); -+ finalize_op(other, data, result, true); - - } else { - // Possible if (for example) it timed out already -@@ -487,104 +488,100 @@ delegate_from_xml(xmlNode *xml) - - /*! - * \internal -- * \brief Finalize a remote operation. -+ * \brief Finalize a peer fencing operation - * -- * \description This function has two code paths. -+ * Clean up after a fencing operation completes. This function has two code -+ * paths: the executioner uses it to broadcast the result to CPG peers, and then -+ * each peer (including the executioner) uses it to process that broadcast and -+ * notify its IPC clients of the result. - * -- * Path 1. This node is the owner of the operation and needs -- * to notify the cpg group via a broadcast as to the operation's -- * results. -- * -- * Path 2. The cpg broadcast is received. All nodes notify their local -- * stonith clients the operation results. -- * -- * So, The owner of the operation first notifies the cluster of the result, -- * and once that cpg notify is received back it notifies all the local clients. -- * -- * Nodes that are passive watchers of the operation will receive the -- * broadcast and only need to notify their local clients the operation finished. -- * -- * \param op, The fencing operation to finalize -- * \param data, The xml msg reply (if present) of the last delegated fencing -- * operation. -- * \param dup, Is this operation a duplicate, if so treat it a little differently -- * making sure the broadcast is not sent out. -+ * \param[in] op Fencer operation that completed -+ * \param[in] data If not NULL, XML reply of last delegated fencing operation -+ * \param[in] result Full operation result -+ * \param[in] dup Whether this operation is a duplicate of another -+ * (in which case, do not broadcast the result) - */ - static void --remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) -+finalize_op(remote_fencing_op_t *op, xmlNode *data, -+ pcmk__action_result_t *result, bool dup) - { - int level = LOG_ERR; - const char *subt = NULL; - xmlNode *local_data = NULL; - gboolean op_merged = FALSE; - -+ CRM_CHECK((op != NULL) && (result != NULL), return); -+ -+ if (op->notify_sent) { -+ // Most likely, this is a timed-out action that eventually completed -+ crm_notice("Operation '%s'%s%s by %s for %s@%s%s: " -+ "Result arrived too late " CRM_XS " id=%.8s", -+ op->action, (op->target? " targeting " : ""), -+ (op->target? op->target : ""), -+ (op->delegate? op->delegate : "unknown node"), -+ op->client_name, op->originator, -+ (op_merged? " (merged)" : ""), -+ op->id); -+ return; -+ } -+ - set_fencing_completed(op); - clear_remote_op_timers(op); - undo_op_remap(op); - -- if (op->notify_sent == TRUE) { -- crm_err("Already sent notifications for '%s' targeting %s by %s for " -- "client %s@%s: %s " CRM_XS " rc=%d state=%s id=%.8s", -- op->action, op->target, -- (op->delegate? op->delegate : "unknown node"), -- op->client_name, op->originator, pcmk_strerror(rc), -- rc, stonith_op_state_str(op->state), op->id); -- goto remote_op_done_cleanup; -- } -- - if (data == NULL) { - data = create_xml_node(NULL, "remote-op"); - local_data = data; - - } else if (op->delegate == NULL) { -- switch (rc) { -- case -ENODEV: -- case -EHOSTUNREACH: -+ switch (result->execution_status) { -+ case PCMK_EXEC_NO_FENCE_DEVICE: - break; -+ case PCMK_EXEC_INVALID: -+ if (result->exit_status == CRM_EX_EXPIRED) { -+ break; -+ } -+ // else fall through - default: - op->delegate = delegate_from_xml(data); - break; - } - } - -- if(dup) { -- op_merged = TRUE; -- } else if (crm_element_value(data, F_STONITH_MERGED)) { -- op_merged = TRUE; -- } -+ if (dup || (crm_element_value(data, F_STONITH_MERGED) != NULL)) { -+ op_merged = true; -+ } - - /* Tell everyone the operation is done, we will continue - * with doing the local notifications once we receive - * the broadcast back. */ - subt = crm_element_value(data, F_SUBTYPE); -- if (dup == FALSE && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { -+ if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { - /* Defer notification until the bcast message arrives */ -- stonith_bcast_result_to_peers(op, rc, (op_merged? TRUE: FALSE)); -- goto remote_op_done_cleanup; -+ stonith_bcast_result_to_peers(op, pcmk_rc2legacy(stonith__result2rc(result)), op_merged); -+ free_xml(local_data); -+ return; - } - -- if (rc == pcmk_ok || dup) { -- level = LOG_NOTICE; -- } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { -+ if (pcmk__result_ok(result) || dup -+ || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { - level = LOG_NOTICE; - } -- -- do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s " -+ do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) " - CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""), - (op->target? op->target : ""), - (op->delegate? op->delegate : "unknown node"), - op->client_name, op->originator, -- (op_merged? " (merged)" : ""), pcmk_strerror(rc), op->id); -+ (op_merged? " (merged)" : ""), crm_exit_str(result->exit_status), -+ pcmk_exec_status_str(result->execution_status), -+ ((result->exit_reason == NULL)? "" : ": "), -+ ((result->exit_reason == NULL)? "" : result->exit_reason), -+ op->id); - -- handle_local_reply_and_notify(op, data, rc); -+ handle_local_reply_and_notify(op, data, pcmk_rc2legacy(stonith__result2rc(result))); - - if (!dup) { -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -- -- pcmk__set_result(&result, -- ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), -- stonith__legacy2status(rc), NULL); -- finalize_op_duplicates(op, data, &result); -+ finalize_op_duplicates(op, data, result); - } - - /* Free non-essential parts of the record -@@ -594,20 +591,27 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup) - g_list_free_full(op->query_results, free_remote_query); - op->query_results = NULL; - } -- - if (op->request) { - free_xml(op->request); - op->request = NULL; - } - -- remote_op_done_cleanup: - free_xml(local_data); - } - -+/*! -+ * \internal -+ * \brief Finalize a watchdog fencer op after the waiting time expires -+ * -+ * \param[in] userdata Fencer operation that completed -+ * -+ * \return G_SOURCE_REMOVE (which tells glib not to restart timer) -+ */ - static gboolean - remote_op_watchdog_done(gpointer userdata) - { - remote_fencing_op_t *op = userdata; -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - op->op_timer_one = 0; - -@@ -615,8 +619,9 @@ remote_op_watchdog_done(gpointer userdata) - CRM_XS " id=%.8s", - op->action, op->target, op->client_name, op->id); - op->state = st_done; -- remote_op_done(op, NULL, pcmk_ok, FALSE); -- return FALSE; -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ finalize_op(op, NULL, &result, false); -+ return G_SOURCE_REMOVE; - } - - static gboolean -@@ -667,7 +672,7 @@ finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) - op->state = st_failed; - pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); - } -- remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); -+ finalize_op(op, NULL, &result, false); - pcmk__reset_result(&result); - } - -@@ -1064,9 +1069,13 @@ fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) - set_fencing_completed(op); - op->delegate = strdup("a human"); - -- // For the fencer's purposes, the fencing operation is done -+ { -+ // For the fencer's purposes, the fencing operation is done -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - -- remote_op_done(op, msg, pcmk_ok, FALSE); -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ finalize_op(op, msg, &result, false); -+ } - - /* For the requester's purposes, the operation is still pending. The - * actual result will be sent asynchronously via the operation's done_cb(). -@@ -1200,6 +1209,16 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) - return op; - } - -+/*! -+ * \internal -+ * \brief Create a peer fencing operation from a request, and initiate it -+ * -+ * \param[in] client IPC client that made request (NULL to get from request) -+ * \param[in] request Request XML -+ * \param[in] manual_ack Whether this is a manual action confirmation -+ * -+ * \return Newly created operation on success, otherwise NULL -+ */ - remote_fencing_op_t * - initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, - gboolean manual_ack) -@@ -1234,9 +1253,17 @@ initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, - - switch (op->state) { - case st_failed: -- crm_warn("Could not request peer fencing (%s) targeting %s " -- CRM_XS " id=%.8s", op->action, op->target, op->id); -- remote_op_done(op, NULL, -EINVAL, FALSE); -+ // advance_topology_level() exhausted levels -+ { -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -+ -+ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_ERROR, -+ "All topology levels failed"); -+ crm_warn("Could not request peer fencing (%s) targeting %s " -+ CRM_XS " id=%.8s", op->action, op->target, op->id); -+ finalize_op(op, NULL, &result, false); -+ pcmk__reset_result(&result); -+ } - return op; - - case st_duplicate: -@@ -1607,7 +1634,7 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - crm_trace("Marking complex fencing op targeting %s as complete", - op->target); - op->state = st_done; -- remote_op_done(op, msg, pcmk_ok, FALSE); -+ finalize_op(op, msg, &result, false); - } - } - -@@ -1805,7 +1832,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, - PCMK_EXEC_NO_FENCE_DEVICE, NULL); - } - /* ... else use result provided by caller -- overwriting it with -- PCMK_EXEC_NO_FENCE_DEVICE would prevent remote_op_done() from -+ PCMK_EXEC_NO_FENCE_DEVICE would prevent finalize_op() from - setting the correct delegate if needed. - */ - -@@ -1816,7 +1843,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, - } - - op->state = st_failed; -- remote_op_done(op, NULL, pcmk_rc2legacy(stonith__result2rc(result)), FALSE); -+ finalize_op(op, NULL, result, false); - - } else { - crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " -@@ -2216,7 +2243,7 @@ fenced_process_fencing_reply(xmlNode *msg) - } else { - op->state = st_failed; - } -- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); -+ finalize_op(op, msg, &result, false); - return; - } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { - /* If this isn't a remote level broadcast, and we are not the -@@ -2241,7 +2268,7 @@ fenced_process_fencing_reply(xmlNode *msg) - /* We own the op, and it is complete. broadcast the result to all nodes - * and notify our local clients. */ - if (op->state == st_done) { -- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); -+ finalize_op(op, msg, &result, false); - return; - } - -@@ -2268,20 +2295,20 @@ fenced_process_fencing_reply(xmlNode *msg) - * levels are available, mark this operation as failed and report results. */ - if (advance_topology_level(op, false) != pcmk_rc_ok) { - op->state = st_failed; -- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); -+ finalize_op(op, msg, &result, false); - return; - } - } - } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { - crm_trace("All done for %s", op->target); - op->state = st_done; -- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); -+ finalize_op(op, msg, &result, false); - return; - } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) - && (op->devices == NULL)) { - /* If the operation timed out don't bother retrying other peers. */ - op->state = st_failed; -- remote_op_done(op, msg, pcmk_rc2legacy(stonith__result2rc(&result)), FALSE); -+ finalize_op(op, msg, &result, false); - return; - } else { - /* fall-through and attempt other fencing action using another peer */ --- -2.27.0 - - -From 8f19c09f1b961ba9aa510b7dcd1875bbabcddcdc Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:39:23 -0600 -Subject: [PATCH 14/23] Refactor: fencer: pass full result when broadcasting - replies - -Rename stonith_bcast_result_to_peers() to fenced_broadcast_op_result() for -consistency, and make it take the full result as an argument instead of a -legacy return code. The full result is not yet used, but that is planned. ---- - daemons/fenced/fenced_history.c | 18 ++++++++++++------ - daemons/fenced/fenced_remote.c | 15 ++++++++++++--- - daemons/fenced/pacemaker-fenced.h | 9 ++------- - 3 files changed, 26 insertions(+), 16 deletions(-) - -diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c -index 9e38ff0a20..1e07a9815a 100644 ---- a/daemons/fenced/fenced_history.c -+++ b/daemons/fenced/fenced_history.c -@@ -359,24 +359,29 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, - } - - if (remote_history) { -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -+ - init_stonith_remote_op_hash_table(&stonith_remote_op_list); - - updated |= g_hash_table_size(remote_history); - - g_hash_table_iter_init(&iter, remote_history); - while (g_hash_table_iter_next(&iter, NULL, (void **)&op)) { -- - if (stonith__op_state_pending(op->state) && - pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { -+ - crm_warn("Failing pending operation %.8s originated by us but " - "known only from peer history", op->id); - op->state = st_failed; - set_fencing_completed(op); -- /* use -EHOSTUNREACH to not introduce a new return-code that might -- trigger unexpected results at other places and to prevent -- finalize_op from setting the delegate if not present -- */ -- stonith_bcast_result_to_peers(op, -EHOSTUNREACH, FALSE); -+ -+ /* CRM_EX_EXPIRED + PCMK_EXEC_INVALID prevents finalize_op() -+ * from setting a delegate -+ */ -+ pcmk__set_result(&result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, -+ "Initiated by earlier fencer " -+ "process and presumed failed"); -+ fenced_broadcast_op_result(op, &result, false); - } - - g_hash_table_iter_steal(&iter); -@@ -391,6 +396,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, - */ - } - -+ pcmk__reset_result(&result); - g_hash_table_destroy(remote_history); /* remove what is left */ - } - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index aefc5f311c..a0f026c790 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -374,12 +374,21 @@ create_op_done_notify(remote_fencing_op_t * op, int rc) - return notify_data; - } - -+/*! -+ * \internal -+ * \brief Broadcast a fence result notification to all CPG peers -+ * -+ * \param[in] op Fencer operation that completed -+ * \param[in] result Full operation result -+ * \param[in] op_merged Whether this operation is a duplicate of another -+ */ - void --stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged) -+fenced_broadcast_op_result(remote_fencing_op_t *op, -+ pcmk__action_result_t *result, bool op_merged) - { - static int count = 0; - xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); -- xmlNode *notify_data = create_op_done_notify(op, rc); -+ xmlNode *notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); - - count++; - crm_trace("Broadcasting result to peers"); -@@ -558,7 +567,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, - subt = crm_element_value(data, F_SUBTYPE); - if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { - /* Defer notification until the bcast message arrives */ -- stonith_bcast_result_to_peers(op, pcmk_rc2legacy(stonith__result2rc(result)), op_merged); -+ fenced_broadcast_op_result(op, result, op_merged); - free_xml(local_data); - return; - } -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index d5f4bc79fd..ed47ab046c 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -153,13 +153,8 @@ typedef struct remote_fencing_op_s { - - } remote_fencing_op_t; - --/*! -- * \internal -- * \brief Broadcast the result of an operation to the peers. -- * \param op, Operation whose result should be broadcast -- * \param rc, Result of the operation -- */ --void stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged); -+void fenced_broadcast_op_result(remote_fencing_op_t *op, -+ pcmk__action_result_t *result, bool op_merged); - - // Fencer-specific client flags - enum st_client_flags { --- -2.27.0 - - -From 3396e66b4c9cca895c7412b66159fd2342de1911 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:42:46 -0600 -Subject: [PATCH 15/23] Feature: fencer: add full result to local replies - -handle_local_reply_and_notify() now takes the full result as an argument -instead of a legacy return code, and adds it to the reply to the local -requester. It does not add it to notifications yet, but that is planned. ---- - daemons/fenced/fenced_remote.c | 26 ++++++++++++++------------ - 1 file changed, 14 insertions(+), 12 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index a0f026c790..329e06c444 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -409,8 +409,17 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, - return; - } - -+/*! -+ * \internal -+ * \brief Reply to a local request originator and notify all subscribed clients -+ * -+ * \param[in] op Fencer operation that completed -+ * \param[in] data Top-level XML to add notification to -+ * \param[in] result Full operation result -+ */ - static void --handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) -+handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, -+ pcmk__action_result_t *result) - { - xmlNode *notify_data = NULL; - xmlNode *reply = NULL; -@@ -421,26 +430,19 @@ handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc) - } - - /* Do notification with a clean data object */ -- notify_data = create_op_done_notify(op, rc); -+ notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); - crm_xml_add_int(data, "state", op->state); - crm_xml_add(data, F_STONITH_TARGET, op->target); - crm_xml_add(data, F_STONITH_OPERATION, op->action); - -- { -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -- -- pcmk__set_result(&result, -- ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), -- stonith__legacy2status(rc), NULL); -- reply = fenced_construct_reply(op->request, data, &result); -- } -+ reply = fenced_construct_reply(op->request, data, result); - crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); - - /* Send fencing OP reply to local client that initiated fencing */ - do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); - - /* bcast to all local clients that the fencing operation happend */ -- do_stonith_notify(T_STONITH_NOTIFY_FENCE, rc, notify_data); -+ do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); - do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); - - /* mark this op as having notify's already sent */ -@@ -587,7 +589,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, - ((result->exit_reason == NULL)? "" : result->exit_reason), - op->id); - -- handle_local_reply_and_notify(op, data, pcmk_rc2legacy(stonith__result2rc(result))); -+ handle_local_reply_and_notify(op, data, result); - - if (!dup) { - finalize_op_duplicates(op, data, result); --- -2.27.0 - - -From 004583f3ef908cbd9dc6305597cb55d5ad22882c Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:47:13 -0600 -Subject: [PATCH 16/23] Refactor: fencer: pass full result when sending device - notifications - -Rename do_stonith_notify_device() to fenced_send_device_notification() for -consistency, and make it take the full result as an argument rather than a -legacy return code. The full result is not used yet, but that is planned. ---- - daemons/fenced/fenced_commands.c | 4 ++-- - daemons/fenced/pacemaker-fenced.c | 15 +++++++++++++-- - daemons/fenced/pacemaker-fenced.h | 4 +++- - 3 files changed, 18 insertions(+), 5 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 84f89e8daf..86a761dfab 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -3190,7 +3190,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - PCMK_EXEC_INVALID, - "Unprivileged users must register device via CIB"); - } -- do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); -+ fenced_send_device_notification(op, &result, device_id); - - } else if (pcmk__str_eq(op, STONITH_OP_DEVICE_DEL, pcmk__str_none)) { - xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR); -@@ -3204,7 +3204,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - PCMK_EXEC_INVALID, - "Unprivileged users must delete device via CIB"); - } -- do_stonith_notify_device(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); -+ fenced_send_device_notification(op, &result, device_id); - - } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_ADD, pcmk__str_none)) { - char *device_id = NULL; -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index 56acc93f31..42e167ce78 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -394,10 +394,21 @@ do_stonith_notify_config(const char *op, int rc, - free_xml(notify_data); - } - -+/*! -+ * \internal -+ * \brief Send notifications for a device change to subscribed clients -+ * -+ * \param[in] op Notification type (STONITH_OP_DEVICE_ADD or -+ * STONITH_OP_DEVICE_DEL) -+ * \param[in] result Operation result -+ * \param[in] desc ID of device that changed -+ */ - void --do_stonith_notify_device(const char *op, int rc, const char *desc) -+fenced_send_device_notification(const char *op, -+ const pcmk__action_result_t *result, -+ const char *desc) - { -- do_stonith_notify_config(op, rc, desc, g_hash_table_size(device_list)); -+ do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); - } - - void -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index ed47ab046c..0b63680171 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -230,7 +230,9 @@ void - do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); - - void do_stonith_notify(const char *type, int result, xmlNode *data); --void do_stonith_notify_device(const char *op, int rc, const char *desc); -+void fenced_send_device_notification(const char *op, -+ const pcmk__action_result_t *result, -+ const char *desc); - void do_stonith_notify_level(const char *op, int rc, const char *desc); - - remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, --- -2.27.0 - - -From ee0777d5ca99d8d2d7805d4a73241ab696c68751 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:51:55 -0600 -Subject: [PATCH 17/23] Refactor: fencer: pass full result when sending - topology notifications - -Rename do_stonith_notify_level() to fenced_send_level_notification() for -consistency, and make it take the full result as an argument rather than a -legacy return code. The full result is not used yet, but that is planned. ---- - daemons/fenced/fenced_commands.c | 4 ++-- - daemons/fenced/pacemaker-fenced.c | 21 +++++++++++++++------ - daemons/fenced/pacemaker-fenced.h | 4 +++- - 3 files changed, 20 insertions(+), 9 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 86a761dfab..2f3dbb035a 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -3216,7 +3216,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - PCMK_EXEC_INVALID, - "Unprivileged users must add level via CIB"); - } -- do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); -+ fenced_send_level_notification(op, &result, device_id); - free(device_id); - - } else if (pcmk__str_eq(op, STONITH_OP_LEVEL_DEL, pcmk__str_none)) { -@@ -3229,7 +3229,7 @@ handle_request(pcmk__client_t *client, uint32_t id, uint32_t flags, - PCMK_EXEC_INVALID, - "Unprivileged users must delete level via CIB"); - } -- do_stonith_notify_level(op, pcmk_rc2legacy(stonith__result2rc(&result)), device_id); -+ fenced_send_level_notification(op, &result, device_id); - - } else if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { - int node_id = 0; -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index 42e167ce78..773cf57f6b 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -411,10 +411,21 @@ fenced_send_device_notification(const char *op, - do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); - } - -+/*! -+ * \internal -+ * \brief Send notifications for a topology level change to subscribed clients -+ * -+ * \param[in] op Notification type (STONITH_OP_LEVEL_ADD or -+ * STONITH_OP_LEVEL_DEL) -+ * \param[in] result Operation result -+ * \param[in] desc String representation of level ([]) -+ */ - void --do_stonith_notify_level(const char *op, int rc, const char *desc) -+fenced_send_level_notification(const char *op, -+ const pcmk__action_result_t *result, -+ const char *desc) - { -- do_stonith_notify_config(op, rc, desc, g_hash_table_size(topology)); -+ do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(topology)); - } - - static void -@@ -429,8 +440,7 @@ topology_remove_helper(const char *node, int level) - crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); - - fenced_unregister_level(data, &desc, &result); -- do_stonith_notify_level(STONITH_OP_LEVEL_DEL, -- pcmk_rc2legacy(stonith__result2rc(&result)), desc); -+ fenced_send_level_notification(STONITH_OP_LEVEL_DEL, &result, desc); - pcmk__reset_result(&result); - free_xml(data); - free(desc); -@@ -480,8 +490,7 @@ handle_topology_change(xmlNode *match, bool remove) - } - - fenced_register_level(match, &desc, &result); -- do_stonith_notify_level(STONITH_OP_LEVEL_ADD, -- pcmk_rc2legacy(stonith__result2rc(&result)), desc); -+ fenced_send_level_notification(STONITH_OP_LEVEL_ADD, &result, desc); - pcmk__reset_result(&result); - free(desc); - } -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index 0b63680171..8503e813bf 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -233,7 +233,9 @@ void do_stonith_notify(const char *type, int result, xmlNode *data); - void fenced_send_device_notification(const char *op, - const pcmk__action_result_t *result, - const char *desc); --void do_stonith_notify_level(const char *op, int rc, const char *desc); -+void fenced_send_level_notification(const char *op, -+ const pcmk__action_result_t *result, -+ const char *desc); - - remote_fencing_op_t *initiate_remote_stonith_op(pcmk__client_t *client, - xmlNode *request, --- -2.27.0 - - -From deec1ea9bcd7e0062755aa8b74358bfd12e4b9f0 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:53:26 -0600 -Subject: [PATCH 18/23] Refactor: fencer: pass full result when sending - configuration notifications - -Rename do_stonith_notify_config() to send_config_notification() for -consistency, and make it take the full result as an argument rather than a -legacy return code. The full result is not used yet, but that is planned. ---- - daemons/fenced/pacemaker-fenced.c | 19 +++++++++++++++---- - 1 file changed, 15 insertions(+), 4 deletions(-) - -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index 773cf57f6b..d64358e07f 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -379,8 +379,19 @@ do_stonith_notify(const char *type, int result, xmlNode *data) - crm_trace("Notify complete"); - } - -+/*! -+ * \internal -+ * \brief Send notifications for a configuration change to subscribed clients -+ * -+ * \param[in] op Notification type (STONITH_OP_DEVICE_ADD, -+ * STONITH_OP_DEVICE_DEL, STONITH_OP_LEVEL_ADD, or -+ * STONITH_OP_LEVEL_DEL) -+ * \param[in] result Operation result -+ * \param[in] desc Description of what changed -+ * \param[in] active Current number of devices or topologies in use -+ */ - static void --do_stonith_notify_config(const char *op, int rc, -+send_config_notification(const char *op, const pcmk__action_result_t *result, - const char *desc, int active) - { - xmlNode *notify_data = create_xml_node(NULL, op); -@@ -390,7 +401,7 @@ do_stonith_notify_config(const char *op, int rc, - crm_xml_add(notify_data, F_STONITH_DEVICE, desc); - crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); - -- do_stonith_notify(op, rc, notify_data); -+ do_stonith_notify(op, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); - free_xml(notify_data); - } - -@@ -408,7 +419,7 @@ fenced_send_device_notification(const char *op, - const pcmk__action_result_t *result, - const char *desc) - { -- do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(device_list)); -+ send_config_notification(op, result, desc, g_hash_table_size(device_list)); - } - - /*! -@@ -425,7 +436,7 @@ fenced_send_level_notification(const char *op, - const pcmk__action_result_t *result, - const char *desc) - { -- do_stonith_notify_config(op, pcmk_rc2legacy(stonith__result2rc(result)), desc, g_hash_table_size(topology)); -+ send_config_notification(op, result, desc, g_hash_table_size(topology)); - } - - static void --- -2.27.0 - - -From 432e4445b630fb158482a5f6de1e0e41697a381f Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:56:12 -0600 -Subject: [PATCH 19/23] Feature: fencer: pass full result when sending - notifications - -Rename do_stonith_notify() to fenced_send_notification() for consistency, and -make it take the full result as an argument rather than a legacy return code, -and add the full result to the notifications. ---- - daemons/fenced/fenced_commands.c | 4 ++-- - daemons/fenced/fenced_history.c | 6 +++--- - daemons/fenced/fenced_remote.c | 6 +++--- - daemons/fenced/pacemaker-fenced.c | 15 ++++++++++++--- - daemons/fenced/pacemaker-fenced.h | 4 +++- - 5 files changed, 23 insertions(+), 12 deletions(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 2f3dbb035a..54ebc12947 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -2489,8 +2489,8 @@ send_async_reply(async_command_t *cmd, const pcmk__action_result_t *result, - crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); - crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); - -- do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); -- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); -+ fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); -+ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); - } - } - -diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c -index 1e07a9815a..44310ed77b 100644 ---- a/daemons/fenced/fenced_history.c -+++ b/daemons/fenced/fenced_history.c -@@ -100,7 +100,7 @@ stonith_fence_history_cleanup(const char *target, - g_hash_table_foreach_remove(stonith_remote_op_list, - stonith_remove_history_entry, - (gpointer) target); -- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); -+ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); - } - } - -@@ -402,7 +402,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, - - if (updated) { - stonith_fence_history_trim(); -- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); -+ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); - } - - if (cnt == 0) { -@@ -473,7 +473,7 @@ stonith_fence_history(xmlNode *msg, xmlNode **output, - is done so send a notification for anything - that smells like history-sync - */ -- do_stonith_notify(T_STONITH_NOTIFY_HISTORY_SYNCED, pcmk_ok, NULL); -+ fenced_send_notification(T_STONITH_NOTIFY_HISTORY_SYNCED, NULL, NULL); - if (crm_element_value(msg, F_STONITH_CALLID)) { - /* this is coming from the stonith-API - * -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 329e06c444..16c181b4b0 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -442,8 +442,8 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, - do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); - - /* bcast to all local clients that the fencing operation happend */ -- do_stonith_notify(T_STONITH_NOTIFY_FENCE, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); -- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); -+ fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); -+ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); - - /* mark this op as having notify's already sent */ - op->notify_sent = TRUE; -@@ -1211,7 +1211,7 @@ create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer) - - if (op->state != st_duplicate) { - /* kick history readers */ -- do_stonith_notify(T_STONITH_NOTIFY_HISTORY, pcmk_ok, NULL); -+ fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); - } - - /* safe to trim as long as that doesn't touch pending ops */ -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index d64358e07f..6b31b814a3 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -356,8 +356,17 @@ do_stonith_async_timeout_update(const char *client_id, const char *call_id, int - free_xml(notify_data); - } - -+/*! -+ * \internal -+ * \brief Notify relevant IPC clients of a fencing operation result -+ * -+ * \param[in] type Notification type -+ * \param[in] result Result of fencing operation (assume success if NULL) -+ * \param[in] data If not NULL, add to notification as call data -+ */ - void --do_stonith_notify(const char *type, int result, xmlNode *data) -+fenced_send_notification(const char *type, const pcmk__action_result_t *result, -+ xmlNode *data) - { - /* TODO: Standardize the contents of data */ - xmlNode *update_msg = create_xml_node(NULL, "notify"); -@@ -367,7 +376,7 @@ do_stonith_notify(const char *type, int result, xmlNode *data) - crm_xml_add(update_msg, F_TYPE, T_STONITH_NOTIFY); - crm_xml_add(update_msg, F_SUBTYPE, type); - crm_xml_add(update_msg, F_STONITH_OPERATION, type); -- crm_xml_add_int(update_msg, F_STONITH_RC, result); -+ stonith__xe_set_result(update_msg, result); - - if (data != NULL) { - add_message_xml(update_msg, F_STONITH_CALLDATA, data); -@@ -401,7 +410,7 @@ send_config_notification(const char *op, const pcmk__action_result_t *result, - crm_xml_add(notify_data, F_STONITH_DEVICE, desc); - crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); - -- do_stonith_notify(op, pcmk_rc2legacy(stonith__result2rc(result)), notify_data); -+ fenced_send_notification(op, result, notify_data); - free_xml(notify_data); - } - -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index 8503e813bf..502fcc9a29 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -229,7 +229,9 @@ xmlNode *fenced_construct_reply(xmlNode *request, xmlNode *data, - void - do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); - --void do_stonith_notify(const char *type, int result, xmlNode *data); -+void fenced_send_notification(const char *type, -+ const pcmk__action_result_t *result, -+ xmlNode *data); - void fenced_send_device_notification(const char *op, - const pcmk__action_result_t *result, - const char *desc); --- -2.27.0 - - -From 86deababe506c2bb8259538e5380b6a78dc4b770 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 16:58:03 -0600 -Subject: [PATCH 20/23] Feature: fencer: pass full result when sending - notifications - -Rename create_op_done_notify() to fencing_result2xml() for readability, -make it take the full result as an argument rather than a legacy return code, -and add the full result to broadcasts and notifications. ---- - daemons/fenced/fenced_remote.c | 20 +++++++++++++++----- - 1 file changed, 15 insertions(+), 5 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 16c181b4b0..4cf723e6df 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -356,13 +356,22 @@ undo_op_remap(remote_fencing_op_t *op) - } - } - -+/*! -+ * \internal -+ * \brief Create notification data XML for a fencing operation result -+ * -+ * \param[in] op Fencer operation that completed -+ * \param[in] result Full operation result -+ * -+ * \return Newly created XML to add as notification data -+ * \note The caller is responsible for freeing the result. -+ */ - static xmlNode * --create_op_done_notify(remote_fencing_op_t * op, int rc) -+fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) - { - xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); - - crm_xml_add_int(notify_data, "state", op->state); -- crm_xml_add_int(notify_data, F_STONITH_RC, rc); - crm_xml_add(notify_data, F_STONITH_TARGET, op->target); - crm_xml_add(notify_data, F_STONITH_ACTION, op->action); - crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate); -@@ -371,6 +380,7 @@ create_op_done_notify(remote_fencing_op_t * op, int rc) - crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id); - crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name); - -+ stonith__xe_set_result(notify_data, result); - return notify_data; - } - -@@ -388,7 +398,7 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, - { - static int count = 0; - xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); -- xmlNode *notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); -+ xmlNode *notify_data = fencing_result2xml(op, result); - - count++; - crm_trace("Broadcasting result to peers"); -@@ -430,7 +440,6 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, - } - - /* Do notification with a clean data object */ -- notify_data = create_op_done_notify(op, pcmk_rc2legacy(stonith__result2rc(result))); - crm_xml_add_int(data, "state", op->state); - crm_xml_add(data, F_STONITH_TARGET, op->target); - crm_xml_add(data, F_STONITH_OPERATION, op->action); -@@ -442,13 +451,14 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, - do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); - - /* bcast to all local clients that the fencing operation happend */ -+ notify_data = fencing_result2xml(op, result); - fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); -+ free_xml(notify_data); - fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); - - /* mark this op as having notify's already sent */ - op->notify_sent = TRUE; - free_xml(reply); -- free_xml(notify_data); - } - - /*! --- -2.27.0 - - -From 2814cde97520b63ca5f9baf3df37d73507e89d34 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 15 Dec 2021 17:40:52 -0600 -Subject: [PATCH 21/23] Low: fencer: restore check for invalid topology level - target - -... per review. b7c7676c mistakenly dropped it ---- - daemons/fenced/fenced_commands.c | 12 +++++++++++- - 1 file changed, 11 insertions(+), 1 deletion(-) - -diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c -index 54ebc12947..1a4a791385 100644 ---- a/daemons/fenced/fenced_commands.c -+++ b/daemons/fenced/fenced_commands.c -@@ -1636,6 +1636,16 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) - *desc = crm_strdup_printf("%s[%d]", target, id); - } - -+ // Ensure a valid target was specified -+ if ((mode < 0) || (mode > 2)) { -+ crm_warn("Ignoring topology level registration without valid target"); -+ free(target); -+ crm_log_xml_warn(level, "Bad level"); -+ pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, -+ "Invalid topology level target"); -+ return; -+ } -+ - // Ensure level ID is in allowed range - if ((id <= 0) || (id >= ST_LEVEL_MAX)) { - crm_warn("Ignoring topology registration for %s with invalid level %d", -@@ -1643,7 +1653,7 @@ fenced_register_level(xmlNode *msg, char **desc, pcmk__action_result_t *result) - free(target); - crm_log_xml_warn(level, "Bad level"); - pcmk__set_result(result, CRM_EX_INVALID_PARAM, PCMK_EXEC_INVALID, -- "Invalid topology level"); -+ "Invalid topology level number"); - return; - } - --- -2.27.0 - - -From c82806f9e16abcea00025fd3a290477aef2d8d83 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 Dec 2021 16:23:29 -0600 -Subject: [PATCH 22/23] Low: fencer: free result memory when processing fencing - replies - -found in review ---- - daemons/fenced/fenced_remote.c | 24 +++++++++++++++--------- - 1 file changed, 15 insertions(+), 9 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 4cf723e6df..9fda9ef060 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -2241,14 +2241,14 @@ fenced_process_fencing_reply(xmlNode *msg) - /* Could be for an event that began before we started */ - /* TODO: Record the op for later querying */ - crm_info("Received peer result of unknown or expired operation %s", id); -- return; -+ goto done; - } - - if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { - crm_err("Received outdated reply for device %s (instead of %s) to " - "fence (%s) %s. Operation already timed out at peer level.", - device, (const char *) op->devices->data, op->action, op->target); -- return; -+ goto done; - } - - if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { -@@ -2265,14 +2265,15 @@ fenced_process_fencing_reply(xmlNode *msg) - op->state = st_failed; - } - finalize_op(op, msg, &result, false); -- return; -+ goto done; -+ - } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { - /* If this isn't a remote level broadcast, and we are not the - * originator of the operation, we should not be receiving this msg. */ - crm_err("Received non-broadcast fencing result for operation %.8s " - "we do not own (device %s targeting %s)", - op->id, device, op->target); -- return; -+ goto done; - } - - if (pcmk_is_set(op->call_options, st_opt_topology)) { -@@ -2290,7 +2291,7 @@ fenced_process_fencing_reply(xmlNode *msg) - * and notify our local clients. */ - if (op->state == st_done) { - finalize_op(op, msg, &result, false); -- return; -+ goto done; - } - - if ((op->phase == 2) && !pcmk__result_ok(&result)) { -@@ -2310,27 +2311,30 @@ fenced_process_fencing_reply(xmlNode *msg) - /* An operation completed successfully. Try another device if - * necessary, otherwise mark the operation as done. */ - advance_topology_device_in_level(op, device, msg); -- return; -+ goto done; - } else { - /* This device failed, time to try another topology level. If no other - * levels are available, mark this operation as failed and report results. */ - if (advance_topology_level(op, false) != pcmk_rc_ok) { - op->state = st_failed; - finalize_op(op, msg, &result, false); -- return; -+ goto done; - } - } -+ - } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { - crm_trace("All done for %s", op->target); - op->state = st_done; - finalize_op(op, msg, &result, false); -- return; -+ goto done; -+ - } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) - && (op->devices == NULL)) { - /* If the operation timed out don't bother retrying other peers. */ - op->state = st_failed; - finalize_op(op, msg, &result, false); -- return; -+ goto done; -+ - } else { - /* fall-through and attempt other fencing action using another peer */ - } -@@ -2340,6 +2344,8 @@ fenced_process_fencing_reply(xmlNode *msg) - op->target, op->originator, op->client_name, - pcmk_exec_status_str(result.execution_status)); - request_peer_fencing(op, NULL, &result); -+done: -+ pcmk__reset_result(&result); - } - - gboolean --- -2.27.0 - - -From 137bf97fdb39043eebb02a0d3ebbe47ee8c7044c Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 Dec 2021 16:26:22 -0600 -Subject: [PATCH 23/23] Log: fencer: clarify timeout message - -... as suggested by review ---- - daemons/fenced/fenced_remote.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 9fda9ef060..1e237150c5 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -656,7 +656,7 @@ remote_op_timeout_one(gpointer userdata) - crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS - " id=%.8s", op->action, op->target, op->client_name, op->id); - pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, -- "Peer did not send fence result within timeout"); -+ "Peer did not return fence result within timeout"); - - - // Try another device, if appropriate --- -2.27.0 - diff --git a/SOURCES/010-probe-failures.patch b/SOURCES/010-probe-failures.patch deleted file mode 100644 index d90fc3c..0000000 --- a/SOURCES/010-probe-failures.patch +++ /dev/null @@ -1,4157 +0,0 @@ -From f2e51898735b5e9990464141fc4aea3dd83f5067 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 4 Nov 2021 14:36:41 -0400 -Subject: [PATCH 01/21] Refactor: scheduler: Use bool in unpack_rsc_op. - -Previously, we were using bool but TRUE/FALSE. Instead, use the actual -values. ---- - lib/pengine/unpack.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index b1e84110a2..ecc7275e15 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3671,7 +3671,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - const char *task = NULL; - const char *task_key = NULL; - const char *exit_reason = NULL; -- bool expired = FALSE; -+ bool expired = false; - pe_resource_t *parent = rsc; - enum action_fail_response failure_strategy = action_fail_recover; - -@@ -3727,7 +3727,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - - if ((status != PCMK_EXEC_NOT_INSTALLED) - && check_operation_expiry(rsc, node, rc, xml_op, data_set)) { -- expired = TRUE; -+ expired = true; - } - - if (!strcmp(task, CRMD_ACTION_STATUS)) { --- -2.27.0 - - -From 4c961b8e670d336a368c7fd1535c247e40c6b48e Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 4 Nov 2021 15:07:01 -0400 -Subject: [PATCH 02/21] Refactor: scheduler: Add functions for determining if - an op is a probe. - ---- - include/crm/common/util.h | 3 + - lib/common/operations.c | 21 +++++++ - lib/common/tests/operations/Makefile.am | 6 +- - .../tests/operations/pcmk_is_probe_test.c | 37 +++++++++++++ - .../tests/operations/pcmk_xe_is_probe_test.c | 55 +++++++++++++++++++ - lib/pengine/unpack.c | 12 ++-- - lib/pengine/utils.c | 5 +- - 7 files changed, 127 insertions(+), 12 deletions(-) - create mode 100644 lib/common/tests/operations/pcmk_is_probe_test.c - create mode 100644 lib/common/tests/operations/pcmk_xe_is_probe_test.c - -diff --git a/include/crm/common/util.h b/include/crm/common/util.h -index 2728b64492..fbea6e560c 100644 ---- a/include/crm/common/util.h -+++ b/include/crm/common/util.h -@@ -72,6 +72,9 @@ xmlNode *crm_create_op_xml(xmlNode *parent, const char *prefix, - const char *timeout); - #define CRM_DEFAULT_OP_TIMEOUT_S "20s" - -+bool pcmk_is_probe(const char *task, guint interval); -+bool pcmk_xe_is_probe(xmlNode *xml_op); -+ - int compare_version(const char *version1, const char *version2); - - /* coverity[+kill] */ -diff --git a/lib/common/operations.c b/lib/common/operations.c -index 366c189702..978df79082 100644 ---- a/lib/common/operations.c -+++ b/lib/common/operations.c -@@ -537,3 +537,24 @@ pcmk__is_fencing_action(const char *action) - { - return pcmk__str_any_of(action, "off", "reboot", "poweroff", NULL); - } -+ -+bool -+pcmk_is_probe(const char *task, guint interval) -+{ -+ if (task == NULL) { -+ return false; -+ } -+ -+ return (interval == 0) && pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none); -+} -+ -+bool -+pcmk_xe_is_probe(xmlNode *xml_op) -+{ -+ const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); -+ const char *interval_ms_s = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL_MS); -+ int interval_ms; -+ -+ pcmk__scan_min_int(interval_ms_s, &interval_ms, 0); -+ return pcmk_is_probe(task, interval_ms); -+} -diff --git a/lib/common/tests/operations/Makefile.am b/lib/common/tests/operations/Makefile.am -index c8814ff0a8..2e3d0b0679 100644 ---- a/lib/common/tests/operations/Makefile.am -+++ b/lib/common/tests/operations/Makefile.am -@@ -1,5 +1,5 @@ - # --# Copyright 2020 the Pacemaker project contributors -+# Copyright 2020-2021 the Pacemaker project contributors - # - # The version control history for this file may have further details. - # -@@ -12,6 +12,8 @@ LDADD = $(top_builddir)/lib/common/libcrmcommon.la -lcmocka - include $(top_srcdir)/mk/tap.mk - - # Add "_test" to the end of all test program names to simplify .gitignore. --check_PROGRAMS = parse_op_key_test -+check_PROGRAMS = parse_op_key_test \ -+ pcmk_is_probe_test \ -+ pcmk_xe_is_probe_test - - TESTS = $(check_PROGRAMS) -diff --git a/lib/common/tests/operations/pcmk_is_probe_test.c b/lib/common/tests/operations/pcmk_is_probe_test.c -new file mode 100644 -index 0000000000..9b449f1a70 ---- /dev/null -+++ b/lib/common/tests/operations/pcmk_is_probe_test.c -@@ -0,0 +1,37 @@ -+/* -+ * Copyright 2021 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU Lesser General Public License -+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. -+ */ -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static void -+is_probe_test(void **state) -+{ -+ assert_false(pcmk_is_probe(NULL, 0)); -+ assert_false(pcmk_is_probe("", 0)); -+ assert_false(pcmk_is_probe("blahblah", 0)); -+ assert_false(pcmk_is_probe("monitor", 1)); -+ assert_true(pcmk_is_probe("monitor", 0)); -+} -+ -+int main(int argc, char **argv) -+{ -+ const struct CMUnitTest tests[] = { -+ cmocka_unit_test(is_probe_test), -+ }; -+ -+ cmocka_set_message_output(CM_OUTPUT_TAP); -+ return cmocka_run_group_tests(tests, NULL, NULL); -+} -diff --git a/lib/common/tests/operations/pcmk_xe_is_probe_test.c b/lib/common/tests/operations/pcmk_xe_is_probe_test.c -new file mode 100644 -index 0000000000..0283d1c145 ---- /dev/null -+++ b/lib/common/tests/operations/pcmk_xe_is_probe_test.c -@@ -0,0 +1,55 @@ -+/* -+ * Copyright 2021 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU Lesser General Public License -+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. -+ */ -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static void -+op_is_probe_test(void **state) -+{ -+ xmlNode *node = NULL; -+ -+ assert_false(pcmk_xe_is_probe(NULL)); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_is_probe(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_is_probe(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_is_probe(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_is_probe(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_is_probe(node)); -+ free_xml(node); -+} -+ -+int main(int argc, char **argv) -+{ -+ const struct CMUnitTest tests[] = { -+ cmocka_unit_test(op_is_probe_test), -+ }; -+ -+ cmocka_set_message_output(CM_OUTPUT_TAP); -+ return cmocka_run_group_tests(tests, NULL, NULL); -+} -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index ecc7275e15..7c0c66e696 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -83,7 +83,6 @@ is_dangling_guest_node(pe_node_t *node) - return FALSE; - } - -- - /*! - * \brief Schedule a fence action for a node - * -@@ -2984,7 +2983,6 @@ static void - unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * xml_op, xmlNode ** last_failure, - enum action_fail_response * on_fail, pe_working_set_t * data_set) - { -- guint interval_ms = 0; - bool is_probe = false; - pe_action_t *action = NULL; - -@@ -2998,10 +2996,7 @@ unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * x - - *last_failure = xml_op; - -- crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); -- if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { -- is_probe = true; -- } -+ is_probe = pcmk_xe_is_probe(xml_op); - - if (exit_reason == NULL) { - exit_reason = ""; -@@ -3163,8 +3158,9 @@ determine_op_status( - } - - crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); -- if ((interval_ms == 0) && !strcmp(task, CRMD_ACTION_STATUS)) { -- is_probe = true; -+ is_probe = pcmk_xe_is_probe(xml_op); -+ -+ if (is_probe) { - task = "probe"; - } - -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index c5eda3898e..07753e173a 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -1066,8 +1066,7 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai - { - int timeout_ms = 0; - const char *value = NULL; -- bool is_probe = pcmk__str_eq(action->task, RSC_STATUS, pcmk__str_casei) -- && (interval_ms == 0); -+ bool is_probe = false; - #if ENABLE_VERSIONED_ATTRS - pe_rsc_action_details_t *rsc_details = NULL; - #endif -@@ -1094,6 +1093,8 @@ unpack_operation(pe_action_t * action, xmlNode * xml_obj, pe_resource_t * contai - - CRM_CHECK(action && action->rsc, return); - -+ is_probe = pcmk_is_probe(action->task, interval_ms); -+ - // Cluster-wide - pe__unpack_dataset_nvpairs(data_set->op_defaults, XML_TAG_META_SETS, &rule_data, - action->meta, NULL, FALSE, data_set); --- -2.27.0 - - -From 09f32df97ab5064a15ba5a1fb3970d5c64ee7b30 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 19 Nov 2021 14:47:22 -0500 -Subject: [PATCH 03/21] Refactor: scheduler: Move setting interval_ms in - determine_op_status. - -This can now happen in the only place it's being used. ---- - lib/pengine/unpack.c | 9 ++++++--- - 1 file changed, 6 insertions(+), 3 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 7c0c66e696..b9986d2462 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3142,7 +3142,6 @@ static int - determine_op_status( - pe_resource_t *rsc, int rc, int target_rc, pe_node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) - { -- guint interval_ms = 0; - bool is_probe = false; - int result = PCMK_EXEC_DONE; - const char *key = get_op_key(xml_op); -@@ -3157,7 +3156,6 @@ determine_op_status( - exit_reason = ""; - } - -- crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); - is_probe = pcmk_xe_is_probe(xml_op); - - if (is_probe) { -@@ -3230,12 +3228,17 @@ determine_op_status( - result = PCMK_EXEC_ERROR_FATAL; - break; - -- case PCMK_OCF_UNIMPLEMENT_FEATURE: -+ case PCMK_OCF_UNIMPLEMENT_FEATURE: { -+ guint interval_ms = 0; -+ crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); -+ - if (interval_ms > 0) { - result = PCMK_EXEC_NOT_SUPPORTED; - break; - } - // fall through -+ } -+ - case PCMK_OCF_NOT_INSTALLED: - case PCMK_OCF_INVALID_PARAM: - case PCMK_OCF_INSUFFICIENT_PRIV: --- -2.27.0 - - -From 6c8f47453afd6c100fddc45187faff17e15f7bfe Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 19 Nov 2021 14:57:57 -0500 -Subject: [PATCH 04/21] Refactor: scheduler: Add pcmk_xe_mask_failed_probe. - -Given an xmlNodePtr for a resource operation, this function will -determine whether it is a failed probe operation that should not be -displayed in crm_mon (or other places, I suppose) or not. ---- - include/crm/common/util.h | 1 + - lib/common/operations.c | 17 ++ - lib/common/tests/operations/Makefile.am | 3 +- - .../pcmk_xe_mask_probe_failure_test.c | 162 ++++++++++++++++++ - 4 files changed, 182 insertions(+), 1 deletion(-) - create mode 100644 lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c - -diff --git a/include/crm/common/util.h b/include/crm/common/util.h -index fbea6e560c..784069ba1b 100644 ---- a/include/crm/common/util.h -+++ b/include/crm/common/util.h -@@ -74,6 +74,7 @@ xmlNode *crm_create_op_xml(xmlNode *parent, const char *prefix, - - bool pcmk_is_probe(const char *task, guint interval); - bool pcmk_xe_is_probe(xmlNode *xml_op); -+bool pcmk_xe_mask_probe_failure(xmlNode *xml_op); - - int compare_version(const char *version1, const char *version2); - -diff --git a/lib/common/operations.c b/lib/common/operations.c -index 978df79082..54482b8863 100644 ---- a/lib/common/operations.c -+++ b/lib/common/operations.c -@@ -558,3 +558,20 @@ pcmk_xe_is_probe(xmlNode *xml_op) - pcmk__scan_min_int(interval_ms_s, &interval_ms, 0); - return pcmk_is_probe(task, interval_ms); - } -+ -+bool -+pcmk_xe_mask_probe_failure(xmlNode *xml_op) -+{ -+ int status = PCMK_EXEC_UNKNOWN; -+ int rc = PCMK_OCF_OK; -+ -+ if (!pcmk_xe_is_probe(xml_op)) { -+ return false; -+ } -+ -+ crm_element_value_int(xml_op, XML_LRM_ATTR_OPSTATUS, &status); -+ crm_element_value_int(xml_op, XML_LRM_ATTR_RC, &rc); -+ -+ return rc == PCMK_OCF_NOT_INSTALLED || rc == PCMK_OCF_INVALID_PARAM || -+ status == PCMK_EXEC_NOT_INSTALLED; -+} -diff --git a/lib/common/tests/operations/Makefile.am b/lib/common/tests/operations/Makefile.am -index 2e3d0b0679..457c5f7c7a 100644 ---- a/lib/common/tests/operations/Makefile.am -+++ b/lib/common/tests/operations/Makefile.am -@@ -14,6 +14,7 @@ include $(top_srcdir)/mk/tap.mk - # Add "_test" to the end of all test program names to simplify .gitignore. - check_PROGRAMS = parse_op_key_test \ - pcmk_is_probe_test \ -- pcmk_xe_is_probe_test -+ pcmk_xe_is_probe_test \ -+ pcmk_xe_mask_probe_failure_test - - TESTS = $(check_PROGRAMS) -diff --git a/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c b/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c -new file mode 100644 -index 0000000000..a13f6d98f4 ---- /dev/null -+++ b/lib/common/tests/operations/pcmk_xe_mask_probe_failure_test.c -@@ -0,0 +1,162 @@ -+/* -+ * Copyright 2021 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU Lesser General Public License -+ * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. -+ */ -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static void -+op_is_not_probe_test(void **state) { -+ xmlNode *node = NULL; -+ -+ /* Not worth testing this thoroughly since it's just a duplicate of whether -+ * pcmk_op_is_probe works or not. -+ */ -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+} -+ -+static void -+op_does_not_have_right_values_test(void **state) { -+ xmlNode *node = NULL; -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+} -+ -+static void -+check_values_test(void **state) { -+ xmlNode *node = NULL; -+ -+ /* PCMK_EXEC_NOT_SUPPORTED */ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ /* PCMK_EXEC_DONE */ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ /* PCMK_EXEC_NOT_INSTALLED */ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ /* PCMK_EXEC_ERROR */ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ /* PCMK_EXEC_ERROR_HARD */ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ /* PCMK_EXEC_ERROR_FATAL */ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_true(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+ -+ node = string2xml(""); -+ assert_false(pcmk_xe_mask_probe_failure(node)); -+ free_xml(node); -+} -+ -+int main(int argc, char **argv) -+{ -+ const struct CMUnitTest tests[] = { -+ cmocka_unit_test(op_is_not_probe_test), -+ cmocka_unit_test(op_does_not_have_right_values_test), -+ cmocka_unit_test(check_values_test), -+ }; -+ -+ cmocka_set_message_output(CM_OUTPUT_TAP); -+ return cmocka_run_group_tests(tests, NULL, NULL); -+} --- -2.27.0 - - -From c9ce1aaf93cd20bb01e80102dda0ffffb07e6472 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 1 Dec 2021 14:26:31 -0500 -Subject: [PATCH 05/21] Refactor: scheduler: Combine op status and rc remapping - into one function. - -Well, not quite. Doing the remapping is complicated enough to where it -makes sense to have them in separate functions. However, they can both -be called from a single new function that takes the place of the -previous two calls in unpack_rsc_op. ---- - lib/pengine/unpack.c | 157 ++++++++++++++++++++----------------------- - 1 file changed, 72 insertions(+), 85 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index b9986d2462..b659f319fb 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3121,36 +3121,68 @@ unpack_rsc_op_failure(pe_resource_t * rsc, pe_node_t * node, int rc, xmlNode * x - - /*! - * \internal -- * \brief Remap operation status based on action result -+ * \brief Remap informational monitor results and operation status - * -- * Given an action result, determine an appropriate operation status for the -- * purposes of responding to the action (the status provided by the executor is -- * not directly usable since the executor does not know what was expected). -+ * For the monitor results, certain OCF codes are for providing extended information -+ * to the user about services that aren't yet failed but not entirely healthy either. -+ * These must be treated as the "normal" result by Pacemaker. -+ * -+ * For operation status, the action result can be used to determine an appropriate -+ * status for the purposes of responding to the action. The status provided by the -+ * executor is not directly usable since the executor does not know what was expected. - * -+ * \param[in] xml_op Operation history entry XML from CIB status - * \param[in,out] rsc Resource that operation history entry is for -- * \param[in] rc Actual return code of operation -- * \param[in] target_rc Expected return code of operation - * \param[in] node Node where operation was executed -- * \param[in] xml_op Operation history entry XML from CIB status -- * \param[in,out] on_fail What should be done about the result - * \param[in] data_set Current cluster working set -+ * \param[in,out] on_fail What should be done about the result -+ * \param[in] target_rc Expected return code of operation -+ * \param[in,out] rc Actual return code of operation -+ * \param[in,out] status Operation execution status -+ * -+ * \note If the result is remapped and the node is not shutting down or failed, -+ * the operation will be recorded in the data set's list of failed operations -+ * to highlight it for the user. - * -- * \return Operation status based on return code and action info - * \note This may update the resource's current and next role. - */ --static int --determine_op_status( -- pe_resource_t *rsc, int rc, int target_rc, pe_node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) --{ -+static void -+remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, -+ pe_working_set_t *data_set, enum action_fail_response *on_fail, -+ int target_rc, int *rc, int *status) { - bool is_probe = false; -- int result = PCMK_EXEC_DONE; -- const char *key = get_op_key(xml_op); - const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); -+ const char *key = get_op_key(xml_op); - const char *exit_reason = crm_element_value(xml_op, - XML_LRM_ATTR_EXIT_REASON); - -+ if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_none)) { -+ int remapped_rc = pcmk__effective_rc(*rc); -+ -+ if (*rc != remapped_rc) { -+ crm_trace("Remapping monitor result %d to %d", *rc, remapped_rc); -+ if (!node->details->shutdown || node->details->online) { -+ record_failed_op(xml_op, node, rsc, data_set); -+ } -+ -+ *rc = remapped_rc; -+ } -+ } -+ -+ /* If the executor reported an operation status of anything but done or -+ * error, consider that final. But for done or error, we know better whether -+ * it should be treated as a failure or not, because we know the expected -+ * result. -+ */ -+ if (*status != PCMK_EXEC_DONE && *status != PCMK_EXEC_ERROR) { -+ return; -+ } -+ - CRM_ASSERT(rsc); -- CRM_CHECK(task != NULL, return PCMK_EXEC_ERROR); -+ CRM_CHECK(task != NULL, -+ *status = PCMK_EXEC_ERROR; return); -+ -+ *status = PCMK_EXEC_DONE; - - if (exit_reason == NULL) { - exit_reason = ""; -@@ -3171,23 +3203,23 @@ determine_op_status( - * those versions or processing of saved CIB files from those versions, - * so we do not need to care much about this case. - */ -- result = PCMK_EXEC_ERROR; -+ *status = PCMK_EXEC_ERROR; - crm_warn("Expected result not found for %s on %s (corrupt or obsolete CIB?)", - key, node->details->uname); - -- } else if (target_rc != rc) { -- result = PCMK_EXEC_ERROR; -+ } else if (target_rc != *rc) { -+ *status = PCMK_EXEC_ERROR; - pe_rsc_debug(rsc, "%s on %s: expected %d (%s), got %d (%s%s%s)", - key, node->details->uname, - target_rc, services_ocf_exitcode_str(target_rc), -- rc, services_ocf_exitcode_str(rc), -+ *rc, services_ocf_exitcode_str(*rc), - (*exit_reason? ": " : ""), exit_reason); - } - -- switch (rc) { -+ switch (*rc) { - case PCMK_OCF_OK: - if (is_probe && (target_rc == PCMK_OCF_NOT_RUNNING)) { -- result = PCMK_EXEC_DONE; -+ *status = PCMK_EXEC_DONE; - pe_rsc_info(rsc, "Probe found %s active on %s at %s", - rsc->id, node->details->uname, - last_change_str(xml_op)); -@@ -3195,10 +3227,10 @@ determine_op_status( - break; - - case PCMK_OCF_NOT_RUNNING: -- if (is_probe || (target_rc == rc) -+ if (is_probe || (target_rc == *rc) - || !pcmk_is_set(rsc->flags, pe_rsc_managed)) { - -- result = PCMK_EXEC_DONE; -+ *status = PCMK_EXEC_DONE; - rsc->role = RSC_ROLE_STOPPED; - - /* clear any previous failure actions */ -@@ -3208,8 +3240,8 @@ determine_op_status( - break; - - case PCMK_OCF_RUNNING_PROMOTED: -- if (is_probe && (rc != target_rc)) { -- result = PCMK_EXEC_DONE; -+ if (is_probe && (*rc != target_rc)) { -+ *status = PCMK_EXEC_DONE; - pe_rsc_info(rsc, - "Probe found %s active and promoted on %s at %s", - rsc->id, node->details->uname, -@@ -3221,11 +3253,11 @@ determine_op_status( - case PCMK_OCF_DEGRADED_PROMOTED: - case PCMK_OCF_FAILED_PROMOTED: - rsc->role = RSC_ROLE_PROMOTED; -- result = PCMK_EXEC_ERROR; -+ *status = PCMK_EXEC_ERROR; - break; - - case PCMK_OCF_NOT_CONFIGURED: -- result = PCMK_EXEC_ERROR_FATAL; -+ *status = PCMK_EXEC_ERROR_FATAL; - break; - - case PCMK_OCF_UNIMPLEMENT_FEATURE: { -@@ -3233,7 +3265,7 @@ determine_op_status( - crm_element_value_ms(xml_op, XML_LRM_ATTR_INTERVAL_MS, &interval_ms); - - if (interval_ms > 0) { -- result = PCMK_EXEC_NOT_SUPPORTED; -+ *status = PCMK_EXEC_NOT_SUPPORTED; - break; - } - // fall through -@@ -3248,26 +3280,27 @@ determine_op_status( - pe_proc_err("No further recovery can be attempted for %s " - "because %s on %s failed (%s%s%s) at %s " - CRM_XS " rc=%d id=%s", rsc->id, task, -- node->details->uname, services_ocf_exitcode_str(rc), -+ node->details->uname, services_ocf_exitcode_str(*rc), - (*exit_reason? ": " : ""), exit_reason, -- last_change_str(xml_op), rc, ID(xml_op)); -+ last_change_str(xml_op), *rc, ID(xml_op)); - pe__clear_resource_flags(rsc, pe_rsc_managed); - pe__set_resource_flags(rsc, pe_rsc_block); - } -- result = PCMK_EXEC_ERROR_HARD; -+ *status = PCMK_EXEC_ERROR_HARD; - break; - - default: -- if (result == PCMK_EXEC_DONE) { -+ if (*status == PCMK_EXEC_DONE) { - crm_info("Treating unknown exit status %d from %s of %s " - "on %s at %s as failure", -- rc, task, rsc->id, node->details->uname, -+ *rc, task, rsc->id, node->details->uname, - last_change_str(xml_op)); -- result = PCMK_EXEC_ERROR; -+ *status = PCMK_EXEC_ERROR; - } - break; - } -- return result; -+ -+ pe_rsc_trace(rsc, "Remapped %s status to %d", key, *status); - } - - // return TRUE if start or monitor last failure but parameters changed -@@ -3622,41 +3655,6 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c - } - } - --/*! -- * \internal -- * \brief Remap informational monitor results to usual values -- * -- * Certain OCF result codes are for providing extended information to the -- * user about services that aren't yet failed but not entirely healthy either. -- * These must be treated as the "normal" result by Pacemaker. -- * -- * \param[in] rc Actual result of a monitor action -- * \param[in] xml_op Operation history XML -- * \param[in] node Node that operation happened on -- * \param[in] rsc Resource that operation happened to -- * \param[in] data_set Cluster working set -- * -- * \return Result code that pacemaker should use -- * -- * \note If the result is remapped, and the node is not shutting down or failed, -- * the operation will be recorded in the data set's list of failed -- * operations, to highlight it for the user. -- */ --static int --remap_monitor_rc(int rc, xmlNode *xml_op, const pe_node_t *node, -- const pe_resource_t *rsc, pe_working_set_t *data_set) --{ -- int remapped_rc = pcmk__effective_rc(rc); -- -- if (rc != remapped_rc) { -- crm_trace("Remapping monitor result %d to %d", rc, remapped_rc); -- if (!node->details->shutdown || node->details->online) { -- record_failed_op(xml_op, node, rsc, data_set); -- } -- } -- return remapped_rc; --} -- - static void - unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - xmlNode **last_failure, enum action_fail_response *on_fail, -@@ -3712,7 +3710,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - node->details->uname, rsc->id); - } - -- /* It should be possible to call remap_monitor_rc() first then call -+ /* It should be possible to call remap_operation() first then call - * check_operation_expiry() only if rc != target_rc, because there should - * never be a fail count without at least one unexpected result in the - * resource history. That would be more efficient by avoiding having to call -@@ -3729,9 +3727,8 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - expired = true; - } - -- if (!strcmp(task, CRMD_ACTION_STATUS)) { -- rc = remap_monitor_rc(rc, xml_op, node, rsc, data_set); -- } -+ remap_operation(xml_op, rsc, node, data_set, on_fail, target_rc, -+ &rc, &status); - - if (expired && (rc != target_rc)) { - const char *magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); -@@ -3761,16 +3758,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - } - } - -- /* If the executor reported an operation status of anything but done or -- * error, consider that final. But for done or error, we know better whether -- * it should be treated as a failure or not, because we know the expected -- * result. -- */ -- if(status == PCMK_EXEC_DONE || status == PCMK_EXEC_ERROR) { -- status = determine_op_status(rsc, rc, target_rc, node, xml_op, on_fail, data_set); -- pe_rsc_trace(rsc, "Remapped %s status to %d", task_key, status); -- } -- - switch (status) { - case PCMK_EXEC_CANCELLED: - // Should never happen --- -2.27.0 - - -From 9fdca1999872b3930cf18b7d807ddb259f23e8a5 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 19 Nov 2021 15:08:16 -0500 -Subject: [PATCH 06/21] Test: cts-cli: Add test output for a native resource - with a failed probe op. - -There are no code changes yet to properly handle displaying these -operations, so the results here just reflect the current handling. ---- - cts/cli/crm_mon-partial.xml | 16 +++++++++++ - cts/cli/regression.crm_mon.exp | 50 ++++++++++++++++++++++++++-------- - 2 files changed, 55 insertions(+), 11 deletions(-) - -diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml -index e6c6894b6f..b7817e4775 100644 ---- a/cts/cli/crm_mon-partial.xml -+++ b/cts/cli/crm_mon-partial.xml -@@ -60,6 +60,16 @@ - - - -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ - - - -@@ -94,6 +104,9 @@ - - - -+ -+ -+ - - - -@@ -135,6 +148,9 @@ - - - -+ -+ -+ - - - -diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp -index 8714f917a9..d12dce3ae8 100644 ---- a/cts/cli/regression.crm_mon.exp -+++ b/cts/cli/regression.crm_mon.exp -@@ -3470,7 +3470,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 13 resource instances configured (1 DISABLED) -+ * 14 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3485,6 +3485,9 @@ Active Resources: - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 - * Resource Group: partially-active-group (1 member inactive): - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 -+ -+Failed Resource Actions: -+ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources - =#=#=#= Begin test: XML output of partially active resources =#=#=#= -@@ -3495,7 +3498,7 @@ Active Resources: - - - -- -+ - - - -@@ -3548,6 +3551,7 @@ Active Resources: - - - -+ - - - -@@ -3574,6 +3578,9 @@ Active Resources: - - - -+ -+ -+ - - - -@@ -3603,6 +3610,9 @@ Active Resources: - - - -+ -+ -+ - - - =#=#=#= End test: XML output of partially active resources - OK (0) =#=#=#= -@@ -3614,7 +3624,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 13 resource instances configured (1 DISABLED) -+ * 14 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3631,6 +3641,10 @@ Full List of Resources: - * Resource Group: partially-active-group: - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) -+ * smart-mon (ocf:pacemaker:HealthSMART): Stopped -+ -+Failed Resource Actions: -+ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources, with inactive resources - =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= -@@ -3640,13 +3654,14 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 13 resource instances configured (1 DISABLED) -+ * 14 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] - * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] - - Full List of Resources: -+ * 0/1 (ocf:pacemaker:HealthSMART): Active - * 1/1 (stonith:fence_xvm): Active cluster01 - * Clone Set: ping-clone [ping]: - * Started: [ cluster01 ] -@@ -3676,6 +3691,8 @@ Operations: - * (3) monitor: interval="30000ms" - * dummy-1: migration-threshold=1000000: - * (2) start -+ * smart-mon: migration-threshold=1000000: -+ * (9) probe - * Node: cluster01: - * Fencing: migration-threshold=1000000: - * (15) start -@@ -3695,6 +3712,9 @@ Operations: - * Node: httpd-bundle-0@cluster02: - * httpd: migration-threshold=1000000: - * (1) start -+ -+Failed Resource Actions: -+ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output, with inactive resources - =#=#=#= Begin test: Text output of partially active group =#=#=#= -@@ -3704,7 +3724,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 13 resource instances configured (1 DISABLED) -+ * 14 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3722,7 +3742,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 13 resource instances configured (1 DISABLED) -+ * 14 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3741,7 +3761,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 13 resource instances configured (1 DISABLED) -+ * 14 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3759,7 +3779,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 13 resource instances configured (1 DISABLED) -+ * 14 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3777,7 +3797,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 13 resource instances configured (1 DISABLED) -+ * 14 resource instances configured (1 DISABLED) - - Node List: - * Node cluster01: online: -@@ -3806,6 +3826,7 @@ Inactive Resources: - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 - * Resource Group: partially-active-group: - * 1/2 (ocf:pacemaker:Dummy): Active cluster02 -+ * smart-mon (ocf:pacemaker:HealthSMART): Stopped - - Node Attributes: - * Node: cluster01: -@@ -3826,6 +3847,8 @@ Operations: - * (3) monitor: interval="30000ms" - * dummy-1: migration-threshold=1000000: - * (2) start -+ * smart-mon: migration-threshold=1000000: -+ * (9) probe - * Node: cluster01: - * Fencing: migration-threshold=1000000: - * (15) start -@@ -3845,6 +3868,9 @@ Operations: - * Node: httpd-bundle-0@cluster02: - * httpd: migration-threshold=1000000: - * (1) start -+ -+Failed Resource Actions: -+ * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources - =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= -@@ -3854,7 +3880,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 13 resource instances configured (1 DISABLED) -+ * 14 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 ] -@@ -3865,6 +3891,7 @@ Full List of Resources: - * Fencing (stonith:fence_xvm): Started cluster01 - * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 -+ * smart-mon (ocf:pacemaker:HealthSMART): Stopped - =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node - =#=#=#= Begin test: Text output of partially active resources, filtered by node =#=#=#= -@@ -3875,7 +3902,7 @@ Full List of Resources: - - - -- -+ - - - -@@ -3905,6 +3932,7 @@ Full List of Resources: - - - -+ - - - --- -2.27.0 - - -From 1c54d0bbb74d066d55a56eae28d1a579b8854604 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 19 Nov 2021 15:17:52 -0500 -Subject: [PATCH 07/21] Test: cts-cli: Add test output for a cloned resource - with a failed probe op. - -There are no code changes yet to properly handle displaying these -operations, so the results here just reflect the current handling. ---- - cts/cli/crm_mon-partial.xml | 3 +++ - cts/cli/regression.crm_mon.exp | 12 ++++++++++++ - 2 files changed, 15 insertions(+) - -diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml -index b7817e4775..1f9dc156aa 100644 ---- a/cts/cli/crm_mon-partial.xml -+++ b/cts/cli/crm_mon-partial.xml -@@ -107,6 +107,9 @@ - - - -+ -+ -+ - - - -diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp -index d12dce3ae8..d093bd8106 100644 ---- a/cts/cli/regression.crm_mon.exp -+++ b/cts/cli/regression.crm_mon.exp -@@ -3488,6 +3488,7 @@ Active Resources: - - Failed Resource Actions: - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms -+ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources - =#=#=#= Begin test: XML output of partially active resources =#=#=#= -@@ -3581,6 +3582,9 @@ Failed Resource Actions: - - - -+ -+ -+ - - - -@@ -3612,6 +3616,7 @@ Failed Resource Actions: - - - -+ - - - -@@ -3645,6 +3650,7 @@ Full List of Resources: - - Failed Resource Actions: - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms -+ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources, with inactive resources - =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= -@@ -3693,6 +3699,8 @@ Operations: - * (2) start - * smart-mon: migration-threshold=1000000: - * (9) probe -+ * ping: migration-threshold=1000000: -+ * (6) probe - * Node: cluster01: - * Fencing: migration-threshold=1000000: - * (15) start -@@ -3715,6 +3723,7 @@ Operations: - - Failed Resource Actions: - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms -+ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output, with inactive resources - =#=#=#= Begin test: Text output of partially active group =#=#=#= -@@ -3849,6 +3858,8 @@ Operations: - * (2) start - * smart-mon: migration-threshold=1000000: - * (9) probe -+ * ping: migration-threshold=1000000: -+ * (6) probe - * Node: cluster01: - * Fencing: migration-threshold=1000000: - * (15) start -@@ -3871,6 +3882,7 @@ Operations: - - Failed Resource Actions: - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms -+ * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources - =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= --- -2.27.0 - - -From 9408f08c07eb531ff84b07bf959f3d681ebf2b78 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 19 Nov 2021 15:48:16 -0500 -Subject: [PATCH 08/21] Test: cts-cli: Change the resources in - partially-active-group. - -dummy-2 is now not running because it failed to start due to an -unimplemented feature. I don't know what could possibly be -unimplemented about a dummy resource, but it's not important. - -There is also a new dummy-3 resource that acts exactly the same as -dummy-2. This preserves checking that the inactive member output can -still be displayed. - -There are no code changes yet to properly handle displaying these -operations, so the results here just reflect the current handling. ---- - cts/cli/crm_mon-partial.xml | 6 +++- - cts/cli/regression.crm_mon.exp | 62 +++++++++++++++++++++++----------- - 2 files changed, 47 insertions(+), 21 deletions(-) - -diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml -index 1f9dc156aa..1ce80ea58a 100644 ---- a/cts/cli/crm_mon-partial.xml -+++ b/cts/cli/crm_mon-partial.xml -@@ -54,7 +54,8 @@ - - - -- -+ -+ - - - -@@ -104,6 +105,9 @@ - - - -+ -+ -+ - - - -diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp -index d093bd8106..8cf3a1215e 100644 ---- a/cts/cli/regression.crm_mon.exp -+++ b/cts/cli/regression.crm_mon.exp -@@ -3470,7 +3470,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 14 resource instances configured (1 DISABLED) -+ * 15 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3485,8 +3485,10 @@ Active Resources: - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 - * Resource Group: partially-active-group (1 member inactive): - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 -+ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - - Failed Resource Actions: -+ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= -@@ -3499,12 +3501,12 @@ Failed Resource Actions: - - - -- -+ - - - - -- -+ - - - -@@ -3546,11 +3548,14 @@ Failed Resource Actions: - - - -- -+ - - - -- -+ -+ -+ -+ - - - -@@ -3579,6 +3584,9 @@ Failed Resource Actions: - - - -+ -+ -+ - - - -@@ -3615,6 +3623,7 @@ Failed Resource Actions: - - - -+ - - - -@@ -3629,7 +3638,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 14 resource instances configured (1 DISABLED) -+ * 15 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3645,10 +3654,12 @@ Full List of Resources: - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 - * Resource Group: partially-active-group: - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 -- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) -+ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 -+ * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) - * smart-mon (ocf:pacemaker:HealthSMART): Stopped - - Failed Resource Actions: -+ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= -@@ -3660,7 +3671,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 14 resource instances configured (1 DISABLED) -+ * 15 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3676,7 +3687,7 @@ Full List of Resources: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 - * Resource Group: partially-active-group: -- * 1/2 (ocf:pacemaker:Dummy): Active cluster02 -+ * 2/3 (ocf:pacemaker:Dummy): Active cluster02 - - Node Attributes: - * Node: cluster01: -@@ -3697,6 +3708,8 @@ Operations: - * (3) monitor: interval="30000ms" - * dummy-1: migration-threshold=1000000: - * (2) start -+ * dummy-2: migration-threshold=1000000: -+ * (2) probe - * smart-mon: migration-threshold=1000000: - * (9) probe - * ping: migration-threshold=1000000: -@@ -3722,6 +3735,7 @@ Operations: - * (1) start - - Failed Resource Actions: -+ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= -@@ -3733,7 +3747,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 14 resource instances configured (1 DISABLED) -+ * 15 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3742,6 +3756,7 @@ Node List: - Active Resources: - * Resource Group: partially-active-group (1 member inactive): - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 -+ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - =#=#=#= End test: Text output of partially active group - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active group - =#=#=#= Begin test: Text output of partially active group, with inactive resources =#=#=#= -@@ -3751,7 +3766,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 14 resource instances configured (1 DISABLED) -+ * 15 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3760,7 +3775,8 @@ Node List: - Full List of Resources: - * Resource Group: partially-active-group: - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 -- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) -+ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 -+ * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) - =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active group, with inactive resources - =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= -@@ -3770,7 +3786,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 14 resource instances configured (1 DISABLED) -+ * 15 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3788,7 +3804,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 14 resource instances configured (1 DISABLED) -+ * 15 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3796,7 +3812,10 @@ Node List: - - Active Resources: - * Resource Group: partially-active-group (1 member inactive): -- * dummy-2 (ocf:pacemaker:Dummy): Stopped (disabled) -+ * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 -+ -+Failed Resource Actions: -+ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= - * Passed: crm_mon - Text output of inactive member of partially active group - =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= -@@ -3806,7 +3825,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 14 resource instances configured (1 DISABLED) -+ * 15 resource instances configured (1 DISABLED) - - Node List: - * Node cluster01: online: -@@ -3820,7 +3839,7 @@ Node List: - * Resources: - * 1 (ocf:heartbeat:IPaddr2): Active - * 1 (ocf:heartbeat:docker): Active -- * 1 (ocf:pacemaker:Dummy): Active -+ * 2 (ocf:pacemaker:Dummy): Active - * 1 (ocf:pacemaker:remote): Active - * GuestNode httpd-bundle-0@cluster02: online: - * Resources: -@@ -3834,7 +3853,7 @@ Inactive Resources: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 - * Resource Group: partially-active-group: -- * 1/2 (ocf:pacemaker:Dummy): Active cluster02 -+ * 2/3 (ocf:pacemaker:Dummy): Active cluster02 - * smart-mon (ocf:pacemaker:HealthSMART): Stopped - - Node Attributes: -@@ -3856,6 +3875,8 @@ Operations: - * (3) monitor: interval="30000ms" - * dummy-1: migration-threshold=1000000: - * (2) start -+ * dummy-2: migration-threshold=1000000: -+ * (2) probe - * smart-mon: migration-threshold=1000000: - * (9) probe - * ping: migration-threshold=1000000: -@@ -3881,6 +3902,7 @@ Operations: - * (1) start - - Failed Resource Actions: -+ * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= -@@ -3892,7 +3914,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 14 resource instances configured (1 DISABLED) -+ * 15 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 ] -@@ -3914,7 +3936,7 @@ Full List of Resources: - - - -- -+ - - - --- -2.27.0 - - -From 85e76b8bdb4de261a9cb4858eeedd49fba0346a1 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 19 Nov 2021 15:55:51 -0500 -Subject: [PATCH 09/21] Test: cts-cli: Add a failed probe on a new dummy-4 - resource. - -This is to verify that these resources which are part of a group are -displayed properly. No code changes will be necessary, since groups are -just several other resources all in the same pile. - -There are no code changes yet to properly handle displaying these -operations, so the results here just reflect the current handling. ---- - cts/cli/crm_mon-partial.xml | 4 +++ - cts/cli/regression.crm_mon.exp | 51 ++++++++++++++++++++++------------ - 2 files changed, 37 insertions(+), 18 deletions(-) - -diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml -index 1ce80ea58a..d4d4a70848 100644 ---- a/cts/cli/crm_mon-partial.xml -+++ b/cts/cli/crm_mon-partial.xml -@@ -60,6 +60,7 @@ - - - -+ - - - -@@ -108,6 +109,9 @@ - - - -+ -+ -+ - - - -diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp -index 8cf3a1215e..c524b199e3 100644 ---- a/cts/cli/regression.crm_mon.exp -+++ b/cts/cli/regression.crm_mon.exp -@@ -3470,7 +3470,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 15 resource instances configured (1 DISABLED) -+ * 16 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3483,12 +3483,13 @@ Active Resources: - * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 -- * Resource Group: partially-active-group (1 member inactive): -+ * Resource Group: partially-active-group (2 members inactive): - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - - Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms -+ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= -@@ -3501,7 +3502,7 @@ Failed Resource Actions: - - - -- -+ - - - -@@ -3548,7 +3549,7 @@ Failed Resource Actions: - - - -- -+ - - - -@@ -3556,6 +3557,7 @@ Failed Resource Actions: - - - -+ - - - -@@ -3587,6 +3589,9 @@ Failed Resource Actions: - - - -+ -+ -+ - - - -@@ -3624,6 +3629,7 @@ Failed Resource Actions: - - - -+ - - - -@@ -3638,7 +3644,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 15 resource instances configured (1 DISABLED) -+ * 16 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3656,10 +3662,12 @@ Full List of Resources: - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) -+ * dummy-4 (ocf:pacemaker:Dummy): Stopped - * smart-mon (ocf:pacemaker:HealthSMART): Stopped - - Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms -+ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= -@@ -3671,7 +3679,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 15 resource instances configured (1 DISABLED) -+ * 16 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3687,7 +3695,7 @@ Full List of Resources: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 - * Resource Group: partially-active-group: -- * 2/3 (ocf:pacemaker:Dummy): Active cluster02 -+ * 2/4 (ocf:pacemaker:Dummy): Active cluster02 - - Node Attributes: - * Node: cluster01: -@@ -3710,6 +3718,8 @@ Operations: - * (2) start - * dummy-2: migration-threshold=1000000: - * (2) probe -+ * dummy-4: migration-threshold=1000000: -+ * (2) probe - * smart-mon: migration-threshold=1000000: - * (9) probe - * ping: migration-threshold=1000000: -@@ -3736,6 +3746,7 @@ Operations: - - Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms -+ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= -@@ -3747,14 +3758,14 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 15 resource instances configured (1 DISABLED) -+ * 16 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] - * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] - - Active Resources: -- * Resource Group: partially-active-group (1 member inactive): -+ * Resource Group: partially-active-group (2 members inactive): - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - =#=#=#= End test: Text output of partially active group - OK (0) =#=#=#= -@@ -3766,7 +3777,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 15 resource instances configured (1 DISABLED) -+ * 16 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] -@@ -3777,6 +3788,7 @@ Full List of Resources: - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) -+ * dummy-4 (ocf:pacemaker:Dummy): Stopped - =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active group, with inactive resources - =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= -@@ -3786,14 +3798,14 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 15 resource instances configured (1 DISABLED) -+ * 16 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] - * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] - - Active Resources: -- * Resource Group: partially-active-group (1 member inactive): -+ * Resource Group: partially-active-group (2 members inactive): - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - =#=#=#= End test: Text output of active member of partially active group - OK (0) =#=#=#= - * Passed: crm_mon - Text output of active member of partially active group -@@ -3804,14 +3816,14 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 15 resource instances configured (1 DISABLED) -+ * 16 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 cluster02 ] - * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] - - Active Resources: -- * Resource Group: partially-active-group (1 member inactive): -+ * Resource Group: partially-active-group (2 members inactive): - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - - Failed Resource Actions: -@@ -3825,7 +3837,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 15 resource instances configured (1 DISABLED) -+ * 16 resource instances configured (1 DISABLED) - - Node List: - * Node cluster01: online: -@@ -3853,7 +3865,7 @@ Inactive Resources: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 - * Resource Group: partially-active-group: -- * 2/3 (ocf:pacemaker:Dummy): Active cluster02 -+ * 2/4 (ocf:pacemaker:Dummy): Active cluster02 - * smart-mon (ocf:pacemaker:HealthSMART): Stopped - - Node Attributes: -@@ -3877,6 +3889,8 @@ Operations: - * (2) start - * dummy-2: migration-threshold=1000000: - * (2) probe -+ * dummy-4: migration-threshold=1000000: -+ * (2) probe - * smart-mon: migration-threshold=1000000: - * (9) probe - * ping: migration-threshold=1000000: -@@ -3903,6 +3917,7 @@ Operations: - - Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms -+ * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 - =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= -@@ -3914,7 +3929,7 @@ Cluster Summary: - * Last updated: - * Last change: - * 4 nodes configured -- * 15 resource instances configured (1 DISABLED) -+ * 16 resource instances configured (1 DISABLED) - - Node List: - * Online: [ cluster01 ] -@@ -3936,7 +3951,7 @@ Full List of Resources: - - - -- -+ - - - --- -2.27.0 - - -From 206d733b6ce8e0ffcad243d282e8baa8c3ff72b4 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 23 Nov 2021 14:33:47 -0500 -Subject: [PATCH 10/21] Test: cts-cli: Add test output for a bundle resource - with a failed probe op. - -This just changes the existing failed bundle resource from not starting -to failing with a reason. - -There are no code changes yet to properly handle displaying these -operations, so the results here just reflect the current handling. ---- - cts/cli/crm_mon-partial.xml | 9 ++++++++ - cts/cli/regression.crm_mon.exp | 40 +++++++++++++++++++++++++--------- - 2 files changed, 39 insertions(+), 10 deletions(-) - -diff --git a/cts/cli/crm_mon-partial.xml b/cts/cli/crm_mon-partial.xml -index d4d4a70848..5981fc653c 100644 ---- a/cts/cli/crm_mon-partial.xml -+++ b/cts/cli/crm_mon-partial.xml -@@ -178,5 +178,14 @@ - - - -+ -+ -+ -+ -+ -+ -+ -+ -+ - - -diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp -index c524b199e3..b690a26fb6 100644 ---- a/cts/cli/regression.crm_mon.exp -+++ b/cts/cli/regression.crm_mon.exp -@@ -3482,7 +3482,7 @@ Active Resources: - * Fencing (stonith:fence_xvm): Started cluster01 - * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 -- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 -+ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 - * Resource Group: partially-active-group (2 members inactive): - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 -@@ -3492,6 +3492,7 @@ Failed Resource Actions: - * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 -+ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 - =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources - =#=#=#= Begin test: XML output of partially active resources =#=#=#= -@@ -3509,7 +3510,7 @@ Failed Resource Actions: - - - -- -+ - - - -@@ -3540,7 +3541,9 @@ Failed Resource Actions: - - - -- -+ -+ -+ - - - -@@ -3626,12 +3629,18 @@ Failed Resource Actions: - - - -+ -+ -+ -+ -+ - - - - - - -+ - - - -@@ -3657,7 +3666,7 @@ Full List of Resources: - * Fencing (stonith:fence_xvm): Started cluster01 - * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 -- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 -+ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 - * Resource Group: partially-active-group: - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 -@@ -3670,6 +3679,7 @@ Failed Resource Actions: - * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 -+ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 - =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources, with inactive resources - =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= -@@ -3693,7 +3703,7 @@ Full List of Resources: - * Stopped: [ cluster02 ] - * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 -- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 -+ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 - * Resource Group: partially-active-group: - * 2/4 (ocf:pacemaker:Dummy): Active cluster02 - -@@ -3743,12 +3753,16 @@ Operations: - * Node: httpd-bundle-0@cluster02: - * httpd: migration-threshold=1000000: - * (1) start -+ * Node: httpd-bundle-1@cluster01: -+ * httpd: migration-threshold=1000000: -+ * (1) probe - - Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 -+ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 - =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output, with inactive resources - =#=#=#= Begin test: Text output of partially active group =#=#=#= -@@ -3856,14 +3870,14 @@ Node List: - * GuestNode httpd-bundle-0@cluster02: online: - * Resources: - * 1 (ocf:heartbeat:apache): Active -+ * GuestNode httpd-bundle-1@cluster01: online: -+ * Resources: -+ * 1 (ocf:heartbeat:apache): Active - - Inactive Resources: - * Clone Set: ping-clone [ping]: - * Started: [ cluster01 ] - * Stopped: [ cluster02 ] -- * Container bundle set: httpd-bundle [pcmk:http]: -- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 -- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 - * Resource Group: partially-active-group: - * 2/4 (ocf:pacemaker:Dummy): Active cluster02 - * smart-mon (ocf:pacemaker:HealthSMART): Stopped -@@ -3914,12 +3928,16 @@ Operations: - * Node: httpd-bundle-0@cluster02: - * httpd: migration-threshold=1000000: - * (1) start -+ * Node: httpd-bundle-1@cluster01: -+ * httpd: migration-threshold=1000000: -+ * (1) probe - - Failed Resource Actions: - * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms - * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 - * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms - * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 -+ * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 - =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources - =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= -@@ -3939,7 +3957,7 @@ Full List of Resources: - * Started: [ cluster01 ] - * Fencing (stonith:fence_xvm): Started cluster01 - * Container bundle set: httpd-bundle [pcmk:http]: -- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): Stopped cluster01 -+ * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 - * smart-mon (ocf:pacemaker:HealthSMART): Stopped - =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node -@@ -3972,7 +3990,9 @@ Full List of Resources: - - - -- -+ -+ -+ - - - --- -2.27.0 - - -From 6240a28d36c0349e3b1d7f52c36106580c53bb01 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 22 Nov 2021 10:59:10 -0500 -Subject: [PATCH 11/21] Test: cts: Add --show-detail to a couple of the crm_mon - tests. - -This straightens out a couple differences in output between running -tests locally (where --enable-compat-2.0 is not given, which would -automatically add --show-detail) and running tests under mock (where -that option is given). - -Note that this only really matters for failed resource actions, which -were not previously output as part of any crm_mon regression test. It -is only the patches in this series that have introduced those, and thus -this difference. ---- - cts/cli/regression.crm_mon.exp | 131 ++++++++++++++++++++------------- - cts/cts-cli.in | 10 +-- - 2 files changed, 83 insertions(+), 58 deletions(-) - -diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp -index b690a26fb6..d7b9d98e2c 100644 ---- a/cts/cli/regression.crm_mon.exp -+++ b/cts/cli/regression.crm_mon.exp -@@ -3466,33 +3466,42 @@ Operations: - =#=#=#= Begin test: Text output of partially active resources =#=#=#= - Cluster Summary: - * Stack: corosync -- * Current DC: cluster02 (version) - partition with quorum -+ * Current DC: cluster02 (2) (version) - partition with quorum - * Last updated: - * Last change: - * 4 nodes configured - * 16 resource instances configured (1 DISABLED) - - Node List: -- * Online: [ cluster01 cluster02 ] -+ * Online: [ cluster01 (1) cluster02 (2) ] - * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] - - Active Resources: - * Clone Set: ping-clone [ping]: -- * Started: [ cluster01 ] -+ * ping (ocf:pacemaker:ping): Started cluster01 -+ * ping (ocf:pacemaker:ping): Stopped - * Fencing (stonith:fence_xvm): Started cluster01 - * Container bundle set: httpd-bundle [pcmk:http]: -- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 -- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 -+ * Replica[0] -+ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 -+ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 -+ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 -+ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 -+ * Replica[1] -+ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 -+ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 -+ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 -+ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 - * Resource Group: partially-active-group (2 members inactive): - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - - Failed Resource Actions: -- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms -- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 -- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms -- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 -- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 -+ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -+ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms -+ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms -+ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms -+ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms - =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources - =#=#=#= Begin test: XML output of partially active resources =#=#=#= -@@ -3649,24 +3658,32 @@ Failed Resource Actions: - =#=#=#= Begin test: Text output of partially active resources, with inactive resources =#=#=#= - Cluster Summary: - * Stack: corosync -- * Current DC: cluster02 (version) - partition with quorum -+ * Current DC: cluster02 (2) (version) - partition with quorum - * Last updated: - * Last change: - * 4 nodes configured - * 16 resource instances configured (1 DISABLED) - - Node List: -- * Online: [ cluster01 cluster02 ] -+ * Online: [ cluster01 (1) cluster02 (2) ] - * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] - - Full List of Resources: - * Clone Set: ping-clone [ping]: -- * Started: [ cluster01 ] -- * Stopped: [ cluster02 ] -+ * ping (ocf:pacemaker:ping): Started cluster01 -+ * ping (ocf:pacemaker:ping): Stopped - * Fencing (stonith:fence_xvm): Started cluster01 - * Container bundle set: httpd-bundle [pcmk:http]: -- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 -- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 -+ * Replica[0] -+ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 -+ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 -+ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 -+ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 -+ * Replica[1] -+ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 -+ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 -+ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 -+ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 - * Resource Group: partially-active-group: - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 -@@ -3675,46 +3692,54 @@ Full List of Resources: - * smart-mon (ocf:pacemaker:HealthSMART): Stopped - - Failed Resource Actions: -- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms -- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 -- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms -- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 -- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 -+ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -+ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms -+ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms -+ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms -+ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms - =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources, with inactive resources - =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= - Cluster Summary: - * Stack: corosync -- * Current DC: cluster02 (version) - partition with quorum -+ * Current DC: cluster02 (2) (version) - partition with quorum - * Last updated: - * Last change: - * 4 nodes configured - * 16 resource instances configured (1 DISABLED) - - Node List: -- * Online: [ cluster01 cluster02 ] -+ * Online: [ cluster01 (1) cluster02 (2) ] - * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] - - Full List of Resources: - * 0/1 (ocf:pacemaker:HealthSMART): Active - * 1/1 (stonith:fence_xvm): Active cluster01 - * Clone Set: ping-clone [ping]: -- * Started: [ cluster01 ] -- * Stopped: [ cluster02 ] -+ * ping (ocf:pacemaker:ping): Started cluster01 -+ * ping (ocf:pacemaker:ping): Stopped - * Container bundle set: httpd-bundle [pcmk:http]: -- * httpd-bundle-0 (192.168.122.131) (ocf:heartbeat:apache): Started cluster02 -- * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 -+ * Replica[0] -+ * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 -+ * httpd (ocf:heartbeat:apache): Started httpd-bundle-0 -+ * httpd-bundle-docker-0 (ocf:heartbeat:docker): Started cluster02 -+ * httpd-bundle-0 (ocf:pacemaker:remote): Started cluster02 -+ * Replica[1] -+ * httpd-bundle-ip-192.168.122.132 (ocf:heartbeat:IPaddr2): Started cluster01 -+ * httpd (ocf:heartbeat:apache): FAILED httpd-bundle-1 -+ * httpd-bundle-docker-1 (ocf:heartbeat:docker): Started cluster01 -+ * httpd-bundle-1 (ocf:pacemaker:remote): Started cluster01 - * Resource Group: partially-active-group: - * 2/4 (ocf:pacemaker:Dummy): Active cluster02 - - Node Attributes: -- * Node: cluster01: -+ * Node: cluster01 (1): - * pingd : 1000 -- * Node: cluster02: -+ * Node: cluster02 (2): - * pingd : 1000 - - Operations: -- * Node: cluster02: -+ * Node: cluster02 (2): - * httpd-bundle-ip-192.168.122.131: migration-threshold=1000000: - * (2) start - * (3) monitor: interval="60000ms" -@@ -3734,7 +3759,7 @@ Operations: - * (9) probe - * ping: migration-threshold=1000000: - * (6) probe -- * Node: cluster01: -+ * Node: cluster01 (1): - * Fencing: migration-threshold=1000000: - * (15) start - * (20) monitor: interval="60000ms" -@@ -3758,11 +3783,11 @@ Operations: - * (1) probe - - Failed Resource Actions: -- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms -- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 -- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms -- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 -- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 -+ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -+ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms -+ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms -+ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms -+ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms - =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output, with inactive resources - =#=#=#= Begin test: Text output of partially active group =#=#=#= -@@ -3826,14 +3851,14 @@ Active Resources: - =#=#=#= Begin test: Text output of inactive member of partially active group =#=#=#= - Cluster Summary: - * Stack: corosync -- * Current DC: cluster02 (version) - partition with quorum -+ * Current DC: cluster02 (2) (version) - partition with quorum - * Last updated: - * Last change: - * 4 nodes configured - * 16 resource instances configured (1 DISABLED) - - Node List: -- * Online: [ cluster01 cluster02 ] -+ * Online: [ cluster01 (1) cluster02 (2) ] - * GuestOnline: [ httpd-bundle-0@cluster02 httpd-bundle-1@cluster01 ] - - Active Resources: -@@ -3841,27 +3866,27 @@ Active Resources: - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - - Failed Resource Actions: -- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms -+ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms - =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= - * Passed: crm_mon - Text output of inactive member of partially active group - =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= - Cluster Summary: - * Stack: corosync -- * Current DC: cluster02 (version) - partition with quorum -+ * Current DC: cluster02 (2) (version) - partition with quorum - * Last updated: - * Last change: - * 4 nodes configured - * 16 resource instances configured (1 DISABLED) - - Node List: -- * Node cluster01: online: -+ * Node cluster01 (1): online: - * Resources: - * 1 (ocf:heartbeat:IPaddr2): Active - * 1 (ocf:heartbeat:docker): Active - * 1 (ocf:pacemaker:ping): Active - * 1 (ocf:pacemaker:remote): Active - * 1 (stonith:fence_xvm): Active -- * Node cluster02: online: -+ * Node cluster02 (2): online: - * Resources: - * 1 (ocf:heartbeat:IPaddr2): Active - * 1 (ocf:heartbeat:docker): Active -@@ -3876,20 +3901,20 @@ Node List: - - Inactive Resources: - * Clone Set: ping-clone [ping]: -- * Started: [ cluster01 ] -- * Stopped: [ cluster02 ] -+ * ping (ocf:pacemaker:ping): Started cluster01 -+ * ping (ocf:pacemaker:ping): Stopped - * Resource Group: partially-active-group: - * 2/4 (ocf:pacemaker:Dummy): Active cluster02 - * smart-mon (ocf:pacemaker:HealthSMART): Stopped - - Node Attributes: -- * Node: cluster01: -+ * Node: cluster01 (1): - * pingd : 1000 -- * Node: cluster02: -+ * Node: cluster02 (2): - * pingd : 1000 - - Operations: -- * Node: cluster02: -+ * Node: cluster02 (2): - * httpd-bundle-ip-192.168.122.131: migration-threshold=1000000: - * (2) start - * (3) monitor: interval="60000ms" -@@ -3909,7 +3934,7 @@ Operations: - * (9) probe - * ping: migration-threshold=1000000: - * (6) probe -- * Node: cluster01: -+ * Node: cluster01 (1): - * Fencing: migration-threshold=1000000: - * (15) start - * (20) monitor: interval="60000ms" -@@ -3933,11 +3958,11 @@ Operations: - * (1) probe - - Failed Resource Actions: -- * dummy-2 probe on cluster02 returned 'unimplemented feature' at Wed Sep 2 12:17:38 2020 after 33ms -- * dummy-4 probe on cluster02 returned 'not installed' at Wed Sep 2 12:17:38 2020 -- * smart-mon probe on cluster02 returned 'not installed' at Tue Nov 9 15:38:55 2021 after 33ms -- * ping probe on cluster02 returned 'not installed' at Thu Nov 18 13:11:42 2021 -- * httpd probe on httpd-bundle-1 returned 'invalid parameter' at Wed May 27 15:43:09 2020 -+ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -+ * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms -+ * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms -+ * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms -+ * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms - =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources - =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= -diff --git a/cts/cts-cli.in b/cts/cts-cli.in -index d32bfb7ed1..457816afab 100755 ---- a/cts/cts-cli.in -+++ b/cts/cts-cli.in -@@ -420,7 +420,7 @@ function test_crm_mon() { - export CIB_file="$test_home/cli/crm_mon-partial.xml" - - desc="Text output of partially active resources" -- cmd="crm_mon -1" -+ cmd="crm_mon -1 --show-detail" - test_assert $CRM_EX_OK 0 - - desc="XML output of partially active resources" -@@ -428,13 +428,13 @@ function test_crm_mon() { - test_assert_validate $CRM_EX_OK 0 - - desc="Text output of partially active resources, with inactive resources" -- cmd="crm_mon -1 -r" -+ cmd="crm_mon -1 -r --show-detail" - test_assert $CRM_EX_OK 0 - - # XML already includes inactive resources - - desc="Complete brief text output, with inactive resources" -- cmd="crm_mon -1 -r --include=all --brief" -+ cmd="crm_mon -1 -r --include=all --brief --show-detail" - test_assert $CRM_EX_OK 0 - - # XML does not have a brief output option -@@ -452,11 +452,11 @@ function test_crm_mon() { - test_assert $CRM_EX_OK 0 - - desc="Text output of inactive member of partially active group" -- cmd="crm_mon -1 --resource=dummy-2" -+ cmd="crm_mon -1 --resource=dummy-2 --show-detail" - test_assert $CRM_EX_OK 0 - - desc="Complete brief text output grouped by node, with inactive resources" -- cmd="crm_mon -1 -r --include=all --group-by-node --brief" -+ cmd="crm_mon -1 -r --include=all --group-by-node --brief --show-detail" - test_assert $CRM_EX_OK 0 - - desc="Text output of partially active resources, with inactive resources, filtered by node" --- -2.27.0 - - -From da14053e5957d84ed0647688d37733adc2f988a3 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 29 Nov 2021 15:05:42 -0500 -Subject: [PATCH 12/21] Test: scheduler: Add tests for failed probe operations. - -This adds identical sets of tests for primitive resources and cloned -resources. For the moment, the output reflects the current state of the -code. No changes have been made to properly handle these operations -yet. - -Each set has three resources, and each is set up with a slightly -different configuration of probe failures: - -(1) - Maskable probe failure on each node. -(2) - Maskable probe failure on one node, successful "not running" probe - on the other node. The resource should be started on the node - where "not running" was returned. -(3) - Maskable probe failure on one node, non-maskable probe failure on - the other node. The resource should not be running anywhere, and - should be stopped on the node with the non-maskable failure. ---- - cts/cts-scheduler.in | 2 + - cts/scheduler/dot/failed-probe-clone.dot | 30 ++++ - cts/scheduler/dot/failed-probe-primitive.dot | 4 + - cts/scheduler/exp/failed-probe-clone.exp | 141 ++++++++++++++++++ - cts/scheduler/exp/failed-probe-primitive.exp | 20 +++ - .../scores/failed-probe-clone.scores | 33 ++++ - .../scores/failed-probe-primitive.scores | 9 ++ - .../summary/failed-probe-clone.summary | 46 ++++++ - .../summary/failed-probe-primitive.summary | 27 ++++ - cts/scheduler/xml/failed-probe-clone.xml | 110 ++++++++++++++ - cts/scheduler/xml/failed-probe-primitive.xml | 71 +++++++++ - 11 files changed, 493 insertions(+) - create mode 100644 cts/scheduler/dot/failed-probe-clone.dot - create mode 100644 cts/scheduler/dot/failed-probe-primitive.dot - create mode 100644 cts/scheduler/exp/failed-probe-clone.exp - create mode 100644 cts/scheduler/exp/failed-probe-primitive.exp - create mode 100644 cts/scheduler/scores/failed-probe-clone.scores - create mode 100644 cts/scheduler/scores/failed-probe-primitive.scores - create mode 100644 cts/scheduler/summary/failed-probe-clone.summary - create mode 100644 cts/scheduler/summary/failed-probe-primitive.summary - create mode 100644 cts/scheduler/xml/failed-probe-clone.xml - create mode 100644 cts/scheduler/xml/failed-probe-primitive.xml - -diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in -index 17fd6cefdf..3abcbc6c9d 100644 ---- a/cts/cts-scheduler.in -+++ b/cts/cts-scheduler.in -@@ -113,6 +113,8 @@ TESTS = [ - [ "probe-3", "Probe (pending node)" ], - [ "probe-4", "Probe (pending node + stopped resource)" ], - [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], -+ [ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ], -+ [ "failed-probe-clone", "Maskable vs. unmaskable probe failures on cloned resources" ], - [ "standby", "Standby" ], - [ "comments", "Comments" ], - ], -diff --git a/cts/scheduler/dot/failed-probe-clone.dot b/cts/scheduler/dot/failed-probe-clone.dot -new file mode 100644 -index 0000000000..90536b46ed ---- /dev/null -+++ b/cts/scheduler/dot/failed-probe-clone.dot -@@ -0,0 +1,30 @@ -+ digraph "g" { -+"ping-1_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] -+"ping-1_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] -+"ping-2-clone_running_0" [ style=bold color="green" fontcolor="orange"] -+"ping-2-clone_start_0" -> "ping-2-clone_running_0" [ style = bold] -+"ping-2-clone_start_0" -> "ping-2_start_0 cluster02" [ style = bold] -+"ping-2-clone_start_0" [ style=bold color="green" fontcolor="orange"] -+"ping-2_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] -+"ping-2_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] -+"ping-2_monitor_10000 cluster02" [ style=bold color="green" fontcolor="black"] -+"ping-2_start_0 cluster02" -> "ping-2-clone_running_0" [ style = bold] -+"ping-2_start_0 cluster02" -> "ping-2_monitor_10000 cluster02" [ style = bold] -+"ping-2_start_0 cluster02" [ style=bold color="green" fontcolor="black"] -+"ping-3-clone_running_0" [ style=dashed color="red" fontcolor="orange"] -+"ping-3-clone_start_0" -> "ping-3-clone_running_0" [ style = dashed] -+"ping-3-clone_start_0" -> "ping-3_start_0 " [ style = dashed] -+"ping-3-clone_start_0" [ style=dashed color="red" fontcolor="orange"] -+"ping-3-clone_stop_0" -> "ping-3-clone_stopped_0" [ style = bold] -+"ping-3-clone_stop_0" -> "ping-3_stop_0 cluster01" [ style = bold] -+"ping-3-clone_stop_0" [ style=bold color="green" fontcolor="orange"] -+"ping-3-clone_stopped_0" -> "ping-3-clone_start_0" [ style = dashed] -+"ping-3-clone_stopped_0" [ style=bold color="green" fontcolor="orange"] -+"ping-3_clear_failcount_0 cluster01" [ style=bold color="green" fontcolor="black"] -+"ping-3_clear_failcount_0 cluster02" [ style=bold color="green" fontcolor="black"] -+"ping-3_start_0 " -> "ping-3-clone_running_0" [ style = dashed] -+"ping-3_start_0 " [ style=dashed color="red" fontcolor="black"] -+"ping-3_stop_0 cluster01" -> "ping-3-clone_stopped_0" [ style = bold] -+"ping-3_stop_0 cluster01" -> "ping-3_start_0 " [ style = dashed] -+"ping-3_stop_0 cluster01" [ style=bold color="green" fontcolor="black"] -+} -diff --git a/cts/scheduler/dot/failed-probe-primitive.dot b/cts/scheduler/dot/failed-probe-primitive.dot -new file mode 100644 -index 0000000000..6e0c83216a ---- /dev/null -+++ b/cts/scheduler/dot/failed-probe-primitive.dot -@@ -0,0 +1,4 @@ -+ digraph "g" { -+"dummy-2_start_0 cluster02" [ style=bold color="green" fontcolor="black"] -+"dummy-3_stop_0 cluster01" [ style=bold color="green" fontcolor="black"] -+} -diff --git a/cts/scheduler/exp/failed-probe-clone.exp b/cts/scheduler/exp/failed-probe-clone.exp -new file mode 100644 -index 0000000000..6be18935bf ---- /dev/null -+++ b/cts/scheduler/exp/failed-probe-clone.exp -@@ -0,0 +1,141 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -diff --git a/cts/scheduler/exp/failed-probe-primitive.exp b/cts/scheduler/exp/failed-probe-primitive.exp -new file mode 100644 -index 0000000000..d0d8aa44dc ---- /dev/null -+++ b/cts/scheduler/exp/failed-probe-primitive.exp -@@ -0,0 +1,20 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -diff --git a/cts/scheduler/scores/failed-probe-clone.scores b/cts/scheduler/scores/failed-probe-clone.scores -new file mode 100644 -index 0000000000..7418b7f153 ---- /dev/null -+++ b/cts/scheduler/scores/failed-probe-clone.scores -@@ -0,0 +1,33 @@ -+ -+pcmk__clone_allocate: ping-1-clone allocation score on cluster01: -INFINITY -+pcmk__clone_allocate: ping-1-clone allocation score on cluster02: -INFINITY -+pcmk__clone_allocate: ping-1:0 allocation score on cluster01: -INFINITY -+pcmk__clone_allocate: ping-1:0 allocation score on cluster02: -INFINITY -+pcmk__clone_allocate: ping-1:1 allocation score on cluster01: -INFINITY -+pcmk__clone_allocate: ping-1:1 allocation score on cluster02: -INFINITY -+pcmk__clone_allocate: ping-2-clone allocation score on cluster01: -INFINITY -+pcmk__clone_allocate: ping-2-clone allocation score on cluster02: 0 -+pcmk__clone_allocate: ping-2:0 allocation score on cluster01: -INFINITY -+pcmk__clone_allocate: ping-2:0 allocation score on cluster02: 0 -+pcmk__clone_allocate: ping-2:1 allocation score on cluster01: -INFINITY -+pcmk__clone_allocate: ping-2:1 allocation score on cluster02: 0 -+pcmk__clone_allocate: ping-3-clone allocation score on cluster01: -INFINITY -+pcmk__clone_allocate: ping-3-clone allocation score on cluster02: -INFINITY -+pcmk__clone_allocate: ping-3:0 allocation score on cluster01: -INFINITY -+pcmk__clone_allocate: ping-3:0 allocation score on cluster02: -INFINITY -+pcmk__clone_allocate: ping-3:1 allocation score on cluster01: -INFINITY -+pcmk__clone_allocate: ping-3:1 allocation score on cluster02: -INFINITY -+pcmk__native_allocate: Fencing allocation score on cluster01: 0 -+pcmk__native_allocate: Fencing allocation score on cluster02: 0 -+pcmk__native_allocate: ping-1:0 allocation score on cluster01: -INFINITY -+pcmk__native_allocate: ping-1:0 allocation score on cluster02: -INFINITY -+pcmk__native_allocate: ping-1:1 allocation score on cluster01: -INFINITY -+pcmk__native_allocate: ping-1:1 allocation score on cluster02: -INFINITY -+pcmk__native_allocate: ping-2:0 allocation score on cluster01: -INFINITY -+pcmk__native_allocate: ping-2:0 allocation score on cluster02: 0 -+pcmk__native_allocate: ping-2:1 allocation score on cluster01: -INFINITY -+pcmk__native_allocate: ping-2:1 allocation score on cluster02: -INFINITY -+pcmk__native_allocate: ping-3:0 allocation score on cluster01: -INFINITY -+pcmk__native_allocate: ping-3:0 allocation score on cluster02: -INFINITY -+pcmk__native_allocate: ping-3:1 allocation score on cluster01: -INFINITY -+pcmk__native_allocate: ping-3:1 allocation score on cluster02: -INFINITY -diff --git a/cts/scheduler/scores/failed-probe-primitive.scores b/cts/scheduler/scores/failed-probe-primitive.scores -new file mode 100644 -index 0000000000..f313029451 ---- /dev/null -+++ b/cts/scheduler/scores/failed-probe-primitive.scores -@@ -0,0 +1,9 @@ -+ -+pcmk__native_allocate: Fencing allocation score on cluster01: 0 -+pcmk__native_allocate: Fencing allocation score on cluster02: 0 -+pcmk__native_allocate: dummy-1 allocation score on cluster01: -INFINITY -+pcmk__native_allocate: dummy-1 allocation score on cluster02: -INFINITY -+pcmk__native_allocate: dummy-2 allocation score on cluster01: -INFINITY -+pcmk__native_allocate: dummy-2 allocation score on cluster02: 0 -+pcmk__native_allocate: dummy-3 allocation score on cluster01: -INFINITY -+pcmk__native_allocate: dummy-3 allocation score on cluster02: -INFINITY -diff --git a/cts/scheduler/summary/failed-probe-clone.summary b/cts/scheduler/summary/failed-probe-clone.summary -new file mode 100644 -index 0000000000..ca15c302aa ---- /dev/null -+++ b/cts/scheduler/summary/failed-probe-clone.summary -@@ -0,0 +1,46 @@ -+Current cluster status: -+ * Node List: -+ * Online: [ cluster01 cluster02 ] -+ -+ * Full List of Resources: -+ * Fencing (stonith:fence_xvm): Started cluster01 -+ * Clone Set: ping-1-clone [ping-1]: -+ * Stopped: [ cluster01 cluster02 ] -+ * Clone Set: ping-2-clone [ping-2]: -+ * Stopped: [ cluster01 cluster02 ] -+ * Clone Set: ping-3-clone [ping-3]: -+ * ping-3 (ocf:pacemaker:ping): FAILED cluster01 -+ * Stopped: [ cluster02 ] -+ -+Transition Summary: -+ * Start ping-2:0 ( cluster02 ) -+ * Stop ping-3:0 ( cluster01 ) due to node availability -+ -+Executing Cluster Transition: -+ * Cluster action: clear_failcount for ping-1 on cluster02 -+ * Cluster action: clear_failcount for ping-1 on cluster01 -+ * Cluster action: clear_failcount for ping-2 on cluster02 -+ * Cluster action: clear_failcount for ping-2 on cluster01 -+ * Pseudo action: ping-2-clone_start_0 -+ * Cluster action: clear_failcount for ping-3 on cluster01 -+ * Cluster action: clear_failcount for ping-3 on cluster02 -+ * Pseudo action: ping-3-clone_stop_0 -+ * Resource action: ping-2 start on cluster02 -+ * Pseudo action: ping-2-clone_running_0 -+ * Resource action: ping-3 stop on cluster01 -+ * Pseudo action: ping-3-clone_stopped_0 -+ * Resource action: ping-2 monitor=10000 on cluster02 -+ -+Revised Cluster Status: -+ * Node List: -+ * Online: [ cluster01 cluster02 ] -+ -+ * Full List of Resources: -+ * Fencing (stonith:fence_xvm): Started cluster01 -+ * Clone Set: ping-1-clone [ping-1]: -+ * Stopped: [ cluster01 cluster02 ] -+ * Clone Set: ping-2-clone [ping-2]: -+ * Started: [ cluster02 ] -+ * Stopped: [ cluster01 ] -+ * Clone Set: ping-3-clone [ping-3]: -+ * Stopped: [ cluster01 cluster02 ] -diff --git a/cts/scheduler/summary/failed-probe-primitive.summary b/cts/scheduler/summary/failed-probe-primitive.summary -new file mode 100644 -index 0000000000..a634e7f00b ---- /dev/null -+++ b/cts/scheduler/summary/failed-probe-primitive.summary -@@ -0,0 +1,27 @@ -+Current cluster status: -+ * Node List: -+ * Online: [ cluster01 cluster02 ] -+ -+ * Full List of Resources: -+ * Fencing (stonith:fence_xvm): Started cluster01 -+ * dummy-1 (ocf:pacemaker:Dummy): Stopped -+ * dummy-2 (ocf:pacemaker:Dummy): Stopped -+ * dummy-3 (ocf:pacemaker:Dummy): FAILED cluster01 -+ -+Transition Summary: -+ * Start dummy-2 ( cluster02 ) -+ * Stop dummy-3 ( cluster01 ) due to node availability -+ -+Executing Cluster Transition: -+ * Resource action: dummy-2 start on cluster02 -+ * Resource action: dummy-3 stop on cluster01 -+ -+Revised Cluster Status: -+ * Node List: -+ * Online: [ cluster01 cluster02 ] -+ -+ * Full List of Resources: -+ * Fencing (stonith:fence_xvm): Started cluster01 -+ * dummy-1 (ocf:pacemaker:Dummy): Stopped -+ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 -+ * dummy-3 (ocf:pacemaker:Dummy): Stopped -diff --git a/cts/scheduler/xml/failed-probe-clone.xml b/cts/scheduler/xml/failed-probe-clone.xml -new file mode 100644 -index 0000000000..f677585bab ---- /dev/null -+++ b/cts/scheduler/xml/failed-probe-clone.xml -@@ -0,0 +1,110 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -diff --git a/cts/scheduler/xml/failed-probe-primitive.xml b/cts/scheduler/xml/failed-probe-primitive.xml -new file mode 100644 -index 0000000000..0c2f6416f5 ---- /dev/null -+++ b/cts/scheduler/xml/failed-probe-primitive.xml -@@ -0,0 +1,71 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -2.27.0 - - -From 271d50e7d6b0ee5ef670b571c6d7aae9272b75ad Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 11 Nov 2021 13:57:05 -0500 -Subject: [PATCH 13/21] Feature: scheduler: Don't output failed resource - probes... - -in the crm_mon "Failed Resource Actions" section. It is expected that -these one-off probes will fail, in which case displaying them in that -section can just come across as confusing to the user. - -And update the crm_mon test output to account for these changes. - -See: rhbz#1506372 ---- - cts/cli/regression.crm_mon.exp | 20 -------------------- - lib/pengine/pe_output.c | 4 ++++ - 2 files changed, 4 insertions(+), 20 deletions(-) - -diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp -index d7b9d98e2c..b1643f8b29 100644 ---- a/cts/cli/regression.crm_mon.exp -+++ b/cts/cli/regression.crm_mon.exp -@@ -3498,10 +3498,6 @@ Active Resources: - - Failed Resource Actions: - * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms -- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms -- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms -- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms - =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources - =#=#=#= Begin test: XML output of partially active resources =#=#=#= -@@ -3646,10 +3642,6 @@ Failed Resource Actions: - - - -- -- -- -- - - - -@@ -3693,10 +3685,6 @@ Full List of Resources: - - Failed Resource Actions: - * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms -- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms -- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms -- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms - =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources, with inactive resources - =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= -@@ -3784,10 +3772,6 @@ Operations: - - Failed Resource Actions: - * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms -- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms -- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms -- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms - =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output, with inactive resources - =#=#=#= Begin test: Text output of partially active group =#=#=#= -@@ -3959,10 +3943,6 @@ Operations: - - Failed Resource Actions: - * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -- * dummy-4_monitor_0 on cluster02 'not installed' (5): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=0ms -- * smart-mon_monitor_0 on cluster02 'not installed' (5): call=9, status='complete', last-rc-change='Tue Nov 9 15:38:55 2021', queued=0ms, exec=33ms -- * ping_monitor_0 on cluster02 'not installed' (5): call=6, status='complete', last-rc-change='Thu Nov 18 13:11:42 2021', queued=0ms, exec=0ms -- * httpd_monitor_0 on httpd-bundle-1 'invalid parameter' (2): call=1, status='complete', last-rc-change='Wed May 27 15:43:09 2020', queued=0ms, exec=0ms - =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources - =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= -diff --git a/lib/pengine/pe_output.c b/lib/pengine/pe_output.c -index 715e001d51..84684598dd 100644 ---- a/lib/pengine/pe_output.c -+++ b/lib/pengine/pe_output.c -@@ -1370,6 +1370,10 @@ failed_action_list(pcmk__output_t *out, va_list args) { - continue; - } - -+ if (pcmk_xe_mask_probe_failure(xml_op)) { -+ continue; -+ } -+ - id = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); - if (parse_op_key(id ? id : ID(xml_op), &rsc, NULL, NULL) == FALSE) { - continue; --- -2.27.0 - - -From 90f641b9223c64701d494297ce3dd3382365acb8 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 9 Nov 2021 10:11:19 -0500 -Subject: [PATCH 14/21] Feature: scheduler: Add a function for finding a failed - probe action... - -for a given resource ID. Optionally, a node ID can also be given to -restrict the failed probe action to one run on the given node. -Otherwise, just the first failed probe action for the resource ID will -be returned. - -See: rhbz#1506372 ---- - include/crm/pengine/internal.h | 2 ++ - lib/pengine/utils.c | 42 ++++++++++++++++++++++++++++++++++ - 2 files changed, 44 insertions(+) - -diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h -index 8c8fbaca90..58dd2e8727 100644 ---- a/include/crm/pengine/internal.h -+++ b/include/crm/pengine/internal.h -@@ -574,4 +574,6 @@ gboolean pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean che - gboolean pe__group_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent); - gboolean pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent); - -+xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); -+ - #endif -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index 07753e173a..3151f0120b 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -2569,3 +2569,45 @@ pe__build_rsc_list(pe_working_set_t *data_set, const char *s) { - - return resources; - } -+ -+xmlNode * -+pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name) -+{ -+ const char *rsc_id = rsc->id; -+ -+ for (xmlNode *xml_op = pcmk__xml_first_child(rsc->cluster->failed); xml_op != NULL; -+ xml_op = pcmk__xml_next(xml_op)) { -+ const char *value = NULL; -+ char *op_id = NULL; -+ -+ /* This resource operation is not a failed probe. */ -+ if (!pcmk_xe_mask_probe_failure(xml_op)) { -+ continue; -+ } -+ -+ /* This resource operation was not run on the given node. Note that if name is -+ * NULL, this will always succeed. -+ */ -+ value = crm_element_value(xml_op, XML_LRM_ATTR_TARGET); -+ if (value == NULL || !pcmk__str_eq(value, name, pcmk__str_casei|pcmk__str_null_matches)) { -+ continue; -+ } -+ -+ /* This resource operation has no operation_key. */ -+ value = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); -+ if (!parse_op_key(value ? value : ID(xml_op), &op_id, NULL, NULL)) { -+ continue; -+ } -+ -+ /* This resource operation's ID does not match the rsc_id we are looking for. */ -+ if (!pcmk__str_eq(op_id, rsc_id, pcmk__str_none)) { -+ free(op_id); -+ continue; -+ } -+ -+ free(op_id); -+ return xml_op; -+ } -+ -+ return NULL; -+} --- -2.27.0 - - -From 2ad9774fe994554243078b131799fed0d1a6dffd Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 9 Nov 2021 15:43:24 -0500 -Subject: [PATCH 15/21] Feature: scheduler: Display the reason why a native rsc - probe failed. - -If inactive resources are being shown, add an extra blurb of text to any -stopped resources that have a failed probe action indicating why the -probe failed. - -And then add a new primitive resource to crm_mon-partial.xml with a -failed probe operation and update the expected test output. - -See: rhbz#1506372 ---- - cts/cli/regression.crm_mon.exp | 10 +++++----- - cts/scheduler/summary/failed-probe-primitive.summary | 8 ++++---- - cts/scheduler/summary/multiply-active-stonith.summary | 2 +- - lib/pengine/native.c | 11 +++++++++++ - 4 files changed, 21 insertions(+), 10 deletions(-) - -diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp -index b1643f8b29..4333caa11c 100644 ---- a/cts/cli/regression.crm_mon.exp -+++ b/cts/cli/regression.crm_mon.exp -@@ -3680,8 +3680,8 @@ Full List of Resources: - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) -- * dummy-4 (ocf:pacemaker:Dummy): Stopped -- * smart-mon (ocf:pacemaker:HealthSMART): Stopped -+ * dummy-4 (ocf:pacemaker:Dummy): Stopped (not installed) -+ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) - - Failed Resource Actions: - * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -@@ -3811,7 +3811,7 @@ Full List of Resources: - * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - * dummy-3 (ocf:pacemaker:Dummy): Stopped (disabled) -- * dummy-4 (ocf:pacemaker:Dummy): Stopped -+ * dummy-4 (ocf:pacemaker:Dummy): Stopped (not installed) - =#=#=#= End test: Text output of partially active group, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active group, with inactive resources - =#=#=#= Begin test: Text output of active member of partially active group =#=#=#= -@@ -3889,7 +3889,7 @@ Inactive Resources: - * ping (ocf:pacemaker:ping): Stopped - * Resource Group: partially-active-group: - * 2/4 (ocf:pacemaker:Dummy): Active cluster02 -- * smart-mon (ocf:pacemaker:HealthSMART): Stopped -+ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) - - Node Attributes: - * Node: cluster01 (1): -@@ -3963,7 +3963,7 @@ Full List of Resources: - * Fencing (stonith:fence_xvm): Started cluster01 - * Container bundle set: httpd-bundle [pcmk:http]: - * httpd-bundle-1 (192.168.122.132) (ocf:heartbeat:apache): FAILED cluster01 -- * smart-mon (ocf:pacemaker:HealthSMART): Stopped -+ * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) - =#=#=#= End test: Text output of partially active resources, with inactive resources, filtered by node - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources, with inactive resources, filtered by node - =#=#=#= Begin test: Text output of partially active resources, filtered by node =#=#=#= -diff --git a/cts/scheduler/summary/failed-probe-primitive.summary b/cts/scheduler/summary/failed-probe-primitive.summary -index a634e7f00b..ea8edae494 100644 ---- a/cts/scheduler/summary/failed-probe-primitive.summary -+++ b/cts/scheduler/summary/failed-probe-primitive.summary -@@ -4,8 +4,8 @@ Current cluster status: - - * Full List of Resources: - * Fencing (stonith:fence_xvm): Started cluster01 -- * dummy-1 (ocf:pacemaker:Dummy): Stopped -- * dummy-2 (ocf:pacemaker:Dummy): Stopped -+ * dummy-1 (ocf:pacemaker:Dummy): Stopped (not installed) -+ * dummy-2 (ocf:pacemaker:Dummy): Stopped (not installed) - * dummy-3 (ocf:pacemaker:Dummy): FAILED cluster01 - - Transition Summary: -@@ -22,6 +22,6 @@ Revised Cluster Status: - - * Full List of Resources: - * Fencing (stonith:fence_xvm): Started cluster01 -- * dummy-1 (ocf:pacemaker:Dummy): Stopped -+ * dummy-1 (ocf:pacemaker:Dummy): Stopped (not installed) - * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 -- * dummy-3 (ocf:pacemaker:Dummy): Stopped -+ * dummy-3 (ocf:pacemaker:Dummy): Stopped (not installed) -diff --git a/cts/scheduler/summary/multiply-active-stonith.summary b/cts/scheduler/summary/multiply-active-stonith.summary -index 8ce21d68ee..ec37de03b0 100644 ---- a/cts/scheduler/summary/multiply-active-stonith.summary -+++ b/cts/scheduler/summary/multiply-active-stonith.summary -@@ -25,4 +25,4 @@ Revised Cluster Status: - - * Full List of Resources: - * fencer (stonith:fence_ipmilan): Started node3 -- * rsc1 (lsb:rsc1): Stopped -+ * rsc1 (lsb:rsc1): Stopped (not installed) -diff --git a/lib/pengine/native.c b/lib/pengine/native.c -index 36121c527f..a95c90c09a 100644 ---- a/lib/pengine/native.c -+++ b/lib/pengine/native.c -@@ -599,6 +599,17 @@ pcmk__native_output_string(pe_resource_t *rsc, const char *name, pe_node_t *node - g_string_append_printf(outstr, " %s", node->details->uname); - } - -+ // Failed probe operation -+ if (native_displayable_role(rsc) == RSC_ROLE_STOPPED) { -+ xmlNode *probe_op = pe__failed_probe_for_rsc(rsc, node ? node->details->uname : NULL); -+ if (probe_op != NULL) { -+ int rc; -+ -+ pcmk__scan_min_int(crm_element_value(probe_op, XML_LRM_ATTR_RC), &rc, 0); -+ g_string_append_printf(outstr, " (%s) ", services_ocf_exitcode_str(rc)); -+ } -+ } -+ - // Flags, as: ( [...]) - if (node && !(node->details->online) && node->details->unclean) { - have_flags = add_output_flag(outstr, "UNCLEAN", have_flags); --- -2.27.0 - - -From b9ca2e834ee01b35c03f153438ef8828b609fb38 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 18 Nov 2021 10:41:42 -0500 -Subject: [PATCH 16/21] Refactor: scheduler: Rearrange pe__clone_default. - -Instead of the single stopped list, maintain a hash table where the keys -are nodes and the values are the status of the node. For now, this is -just "Stopped" or "Stopped (disabled)" but in the future will be -expanded to cover failed probe operations. ---- - lib/pengine/clone.c | 103 +++++++++++++++++++++++++++++++++++--------- - 1 file changed, 82 insertions(+), 21 deletions(-) - -diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c -index 5569c6b6e9..58fb24d24e 100644 ---- a/lib/pengine/clone.c -+++ b/lib/pengine/clone.c -@@ -28,6 +28,55 @@ - #define UNPROMOTED_INSTANCES RSC_ROLE_UNPROMOTED_S - #endif - -+static GList * -+sorted_hash_table_values(GHashTable *table) -+{ -+ GList *retval = NULL; -+ GHashTableIter iter; -+ gpointer key, value; -+ -+ g_hash_table_iter_init(&iter, table); -+ while (g_hash_table_iter_next(&iter, &key, &value)) { -+ if (!g_list_find_custom(retval, value, (GCompareFunc) strcmp)) { -+ retval = g_list_prepend(retval, (char *) value); -+ } -+ } -+ -+ retval = g_list_sort(retval, (GCompareFunc) strcmp); -+ return retval; -+} -+ -+static GList * -+nodes_with_status(GHashTable *table, const char *status) -+{ -+ GList *retval = NULL; -+ GHashTableIter iter; -+ gpointer key, value; -+ -+ g_hash_table_iter_init(&iter, table); -+ while (g_hash_table_iter_next(&iter, &key, &value)) { -+ if (!strcmp((char *) value, status)) { -+ retval = g_list_prepend(retval, key); -+ } -+ } -+ -+ retval = g_list_sort(retval, (GCompareFunc) pcmk__numeric_strcasecmp); -+ return retval; -+} -+ -+static char * -+node_list_to_str(GList *list) -+{ -+ char *retval = NULL; -+ size_t len = 0; -+ -+ for (GList *iter = list; iter != NULL; iter = iter->next) { -+ pcmk__add_word(&retval, &len, (char *) iter->data); -+ } -+ -+ return retval; -+} -+ - static void - clone_header(pcmk__output_t *out, int *rc, pe_resource_t *rsc, clone_variant_data_t *clone_data) - { -@@ -710,10 +759,10 @@ pe__clone_default(pcmk__output_t *out, va_list args) - GList *only_node = va_arg(args, GList *); - GList *only_rsc = va_arg(args, GList *); - -+ GHashTable *stopped = pcmk__strkey_table(free, free); -+ - char *list_text = NULL; -- char *stopped_list = NULL; - size_t list_text_len = 0; -- size_t stopped_list_len = 0; - - GList *promoted_list = NULL; - GList *started_list = NULL; -@@ -768,7 +817,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) - // List stopped instances when requested (except orphans) - if (!pcmk_is_set(child_rsc->flags, pe_rsc_orphan) - && pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { -- pcmk__add_word(&stopped_list, &stopped_list_len, child_rsc->id); -+ g_hash_table_insert(stopped, strdup(child_rsc->id), strdup("Stopped")); - } - - } else if (is_set_recursive(child_rsc, pe_rsc_orphan, TRUE) -@@ -822,7 +871,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) - } - - if (pcmk_is_set(show_opts, pcmk_show_clone_detail)) { -- free(stopped_list); -+ g_hash_table_destroy(stopped); - PCMK__OUTPUT_LIST_FOOTER(out, rc); - return pcmk_rc_ok; - } -@@ -890,23 +939,15 @@ pe__clone_default(pcmk__output_t *out, va_list args) - } - - if (pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { -- const char *state = "Stopped"; -- enum rsc_role_e role = configured_role(rsc); -- -- if (role == RSC_ROLE_STOPPED) { -- state = "Stopped (disabled)"; -- } -- - if (!pcmk_is_set(rsc->flags, pe_rsc_unique) - && (clone_data->clone_max > active_instances)) { - - GList *nIter; - GList *list = g_hash_table_get_values(rsc->allowed_nodes); - -- /* Custom stopped list for non-unique clones */ -- free(stopped_list); -- stopped_list = NULL; -- stopped_list_len = 0; -+ /* Custom stopped table for non-unique clones */ -+ g_hash_table_destroy(stopped); -+ stopped = pcmk__strkey_table(free, free); - - if (list == NULL) { - /* Clusters with symmetrical=false haven't calculated allowed_nodes yet -@@ -922,19 +963,39 @@ pe__clone_default(pcmk__output_t *out, va_list args) - if (pe_find_node(rsc->running_on, node->details->uname) == NULL && - pcmk__str_in_list(node->details->uname, only_node, - pcmk__str_star_matches|pcmk__str_casei)) { -- pcmk__add_word(&stopped_list, &stopped_list_len, -- node->details->uname); -+ const char *state = "Stopped"; -+ -+ if (configured_role(rsc) == RSC_ROLE_STOPPED) { -+ state = "Stopped (disabled)"; -+ } -+ -+ g_hash_table_insert(stopped, strdup(node->details->uname), -+ strdup(state)); - } - } - g_list_free(list); - } - -- if (stopped_list != NULL) { -+ if (g_hash_table_size(stopped) > 0) { -+ GList *list = sorted_hash_table_values(stopped); -+ - clone_header(out, &rc, rsc, clone_data); - -- out->list_item(out, NULL, "%s: [ %s ]", state, stopped_list); -- free(stopped_list); -- stopped_list_len = 0; -+ for (GList *status_iter = list; status_iter != NULL; status_iter = status_iter->next) { -+ const char *status = status_iter->data; -+ GList *nodes = nodes_with_status(stopped, status); -+ char *str = node_list_to_str(nodes); -+ -+ if (str != NULL) { -+ out->list_item(out, NULL, "%s: [ %s ]", status, str); -+ free(str); -+ } -+ -+ g_list_free(nodes); -+ } -+ -+ g_list_free(list); -+ g_hash_table_destroy(stopped); - - /* If there are no instances of this clone (perhaps because there are no - * nodes configured), simply output the clone header by itself. This can --- -2.27.0 - - -From 0228a64cea412936fb8ee91b0f83f9800048d3ba Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Fri, 19 Nov 2021 10:06:18 -0500 -Subject: [PATCH 17/21] Feature: scheduler: Display the reason why a clone rsc - probe failed. - -This is similar to the previous commit that adds reasons for primitive -resources. - -See: rhbz#1506372 ---- - cts/cli/regression.crm_mon.exp | 8 +++---- - .../summary/failed-probe-clone.summary | 14 +++++++------ - include/crm/pengine/internal.h | 2 ++ - lib/pengine/clone.c | 21 +++++++++++++++++-- - lib/pengine/utils.c | 7 +++++++ - 5 files changed, 40 insertions(+), 12 deletions(-) - -diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp -index 4333caa11c..5688500ce5 100644 ---- a/cts/cli/regression.crm_mon.exp -+++ b/cts/cli/regression.crm_mon.exp -@@ -3479,7 +3479,7 @@ Node List: - Active Resources: - * Clone Set: ping-clone [ping]: - * ping (ocf:pacemaker:ping): Started cluster01 -- * ping (ocf:pacemaker:ping): Stopped -+ * ping (ocf:pacemaker:ping): Stopped (not installed) - * Fencing (stonith:fence_xvm): Started cluster01 - * Container bundle set: httpd-bundle [pcmk:http]: - * Replica[0] -@@ -3663,7 +3663,7 @@ Node List: - Full List of Resources: - * Clone Set: ping-clone [ping]: - * ping (ocf:pacemaker:ping): Started cluster01 -- * ping (ocf:pacemaker:ping): Stopped -+ * ping (ocf:pacemaker:ping): Stopped (not installed) - * Fencing (stonith:fence_xvm): Started cluster01 - * Container bundle set: httpd-bundle [pcmk:http]: - * Replica[0] -@@ -3705,7 +3705,7 @@ Full List of Resources: - * 1/1 (stonith:fence_xvm): Active cluster01 - * Clone Set: ping-clone [ping]: - * ping (ocf:pacemaker:ping): Started cluster01 -- * ping (ocf:pacemaker:ping): Stopped -+ * ping (ocf:pacemaker:ping): Stopped (not installed) - * Container bundle set: httpd-bundle [pcmk:http]: - * Replica[0] - * httpd-bundle-ip-192.168.122.131 (ocf:heartbeat:IPaddr2): Started cluster02 -@@ -3886,7 +3886,7 @@ Node List: - Inactive Resources: - * Clone Set: ping-clone [ping]: - * ping (ocf:pacemaker:ping): Started cluster01 -- * ping (ocf:pacemaker:ping): Stopped -+ * ping (ocf:pacemaker:ping): Stopped (not installed) - * Resource Group: partially-active-group: - * 2/4 (ocf:pacemaker:Dummy): Active cluster02 - * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) -diff --git a/cts/scheduler/summary/failed-probe-clone.summary b/cts/scheduler/summary/failed-probe-clone.summary -index ca15c302aa..febee14400 100644 ---- a/cts/scheduler/summary/failed-probe-clone.summary -+++ b/cts/scheduler/summary/failed-probe-clone.summary -@@ -5,12 +5,13 @@ Current cluster status: - * Full List of Resources: - * Fencing (stonith:fence_xvm): Started cluster01 - * Clone Set: ping-1-clone [ping-1]: -- * Stopped: [ cluster01 cluster02 ] -+ * Stopped (not installed): [ cluster01 cluster02 ] - * Clone Set: ping-2-clone [ping-2]: -- * Stopped: [ cluster01 cluster02 ] -+ * Stopped: [ cluster02 ] -+ * Stopped (not installed): [ cluster01 ] - * Clone Set: ping-3-clone [ping-3]: - * ping-3 (ocf:pacemaker:ping): FAILED cluster01 -- * Stopped: [ cluster02 ] -+ * Stopped (not installed): [ cluster02 ] - - Transition Summary: - * Start ping-2:0 ( cluster02 ) -@@ -38,9 +39,10 @@ Revised Cluster Status: - * Full List of Resources: - * Fencing (stonith:fence_xvm): Started cluster01 - * Clone Set: ping-1-clone [ping-1]: -- * Stopped: [ cluster01 cluster02 ] -+ * Stopped (not installed): [ cluster01 cluster02 ] - * Clone Set: ping-2-clone [ping-2]: - * Started: [ cluster02 ] -- * Stopped: [ cluster01 ] -+ * Stopped (not installed): [ cluster01 ] - * Clone Set: ping-3-clone [ping-3]: -- * Stopped: [ cluster01 cluster02 ] -+ * Stopped: [ cluster01 ] -+ * Stopped (not installed): [ cluster02 ] -diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h -index 58dd2e8727..2b20da6e5f 100644 ---- a/include/crm/pengine/internal.h -+++ b/include/crm/pengine/internal.h -@@ -576,4 +576,6 @@ gboolean pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean ch - - xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); - -+const char *pe__clone_child_id(pe_resource_t *rsc); -+ - #endif -diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c -index 58fb24d24e..ef4bdc0edf 100644 ---- a/lib/pengine/clone.c -+++ b/lib/pengine/clone.c -@@ -963,14 +963,23 @@ pe__clone_default(pcmk__output_t *out, va_list args) - if (pe_find_node(rsc->running_on, node->details->uname) == NULL && - pcmk__str_in_list(node->details->uname, only_node, - pcmk__str_star_matches|pcmk__str_casei)) { -+ xmlNode *probe_op = pe__failed_probe_for_rsc(rsc, node->details->uname); - const char *state = "Stopped"; - - if (configured_role(rsc) == RSC_ROLE_STOPPED) { - state = "Stopped (disabled)"; - } - -- g_hash_table_insert(stopped, strdup(node->details->uname), -- strdup(state)); -+ if (probe_op != NULL) { -+ int rc; -+ -+ pcmk__scan_min_int(crm_element_value(probe_op, XML_LRM_ATTR_RC), &rc, 0); -+ g_hash_table_insert(stopped, strdup(node->details->uname), -+ crm_strdup_printf("Stopped (%s)", services_ocf_exitcode_str(rc))); -+ } else { -+ g_hash_table_insert(stopped, strdup(node->details->uname), -+ strdup(state)); -+ } - } - } - g_list_free(list); -@@ -1113,3 +1122,11 @@ pe__clone_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_parent - - return !passes; - } -+ -+const char * -+pe__clone_child_id(pe_resource_t *rsc) -+{ -+ clone_variant_data_t *clone_data = NULL; -+ get_clone_variant_data(clone_data, rsc); -+ return ID(clone_data->xml_obj_child); -+} -diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c -index 3151f0120b..6c4f3b6971 100644 ---- a/lib/pengine/utils.c -+++ b/lib/pengine/utils.c -@@ -2573,8 +2573,15 @@ pe__build_rsc_list(pe_working_set_t *data_set, const char *s) { - xmlNode * - pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name) - { -+ pe_resource_t *parent = uber_parent(rsc); - const char *rsc_id = rsc->id; - -+ if (rsc->variant == pe_clone) { -+ rsc_id = pe__clone_child_id(rsc); -+ } else if (parent->variant == pe_clone) { -+ rsc_id = pe__clone_child_id(parent); -+ } -+ - for (xmlNode *xml_op = pcmk__xml_first_child(rsc->cluster->failed); xml_op != NULL; - xml_op = pcmk__xml_next(xml_op)) { - const char *value = NULL; --- -2.27.0 - - -From cf8b01da93fce87526617fefdcee6eb9f6ecdbd1 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 24 Nov 2021 10:57:05 -0500 -Subject: [PATCH 18/21] Test: cts-cli: Update the last-rc-change sed - expression. - -This can now occur in both the XML output (where it's wrapped in double -quotes) and the text output (where it's wrapped in single quotes and -followed by a comma). In addition, a plus or minus can occur in the -time string. - -The "{0,1}" syntax takes the place of a "?" for marking the optional -comma. In FreeBSD sed, "?" doesn't mean anything special. ---- - cts/cli/regression.crm_mon.exp | 12 ++++++------ - cts/cts-cli.in | 2 +- - 2 files changed, 7 insertions(+), 7 deletions(-) - -diff --git a/cts/cli/regression.crm_mon.exp b/cts/cli/regression.crm_mon.exp -index 5688500ce5..957758832d 100644 ---- a/cts/cli/regression.crm_mon.exp -+++ b/cts/cli/regression.crm_mon.exp -@@ -3497,7 +3497,7 @@ Active Resources: - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - - Failed Resource Actions: -- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -+ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms - =#=#=#= End test: Text output of partially active resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources - =#=#=#= Begin test: XML output of partially active resources =#=#=#= -@@ -3641,7 +3641,7 @@ Failed Resource Actions: - - - -- -+ - - - -@@ -3684,7 +3684,7 @@ Full List of Resources: - * smart-mon (ocf:pacemaker:HealthSMART): Stopped (not installed) - - Failed Resource Actions: -- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -+ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms - =#=#=#= End test: Text output of partially active resources, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Text output of partially active resources, with inactive resources - =#=#=#= Begin test: Complete brief text output, with inactive resources =#=#=#= -@@ -3771,7 +3771,7 @@ Operations: - * (1) probe - - Failed Resource Actions: -- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -+ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms - =#=#=#= End test: Complete brief text output, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output, with inactive resources - =#=#=#= Begin test: Text output of partially active group =#=#=#= -@@ -3850,7 +3850,7 @@ Active Resources: - * dummy-2 (ocf:pacemaker:Dummy): FAILED cluster02 - - Failed Resource Actions: -- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -+ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms - =#=#=#= End test: Text output of inactive member of partially active group - OK (0) =#=#=#= - * Passed: crm_mon - Text output of inactive member of partially active group - =#=#=#= Begin test: Complete brief text output grouped by node, with inactive resources =#=#=#= -@@ -3942,7 +3942,7 @@ Operations: - * (1) probe - - Failed Resource Actions: -- * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', last-rc-change='Wed Sep 2 12:17:38 2020', queued=0ms, exec=33ms -+ * dummy-2_monitor_0 on cluster02 'unimplemented feature' (3): call=2, status='complete', queued=0ms, exec=33ms - =#=#=#= End test: Complete brief text output grouped by node, with inactive resources - OK (0) =#=#=#= - * Passed: crm_mon - Complete brief text output grouped by node, with inactive resources - =#=#=#= Begin test: Text output of partially active resources, with inactive resources, filtered by node =#=#=#= -diff --git a/cts/cts-cli.in b/cts/cts-cli.in -index 457816afab..72e9a1e912 100755 ---- a/cts/cts-cli.in -+++ b/cts/cts-cli.in -@@ -1870,7 +1870,7 @@ for t in $tests; do - -e 's/.*\(unpack_.*\)@.*\.c:[0-9][0-9]*)/\1/g' \ - -e 's/.*\(update_validation\)@.*\.c:[0-9][0-9]*)/\1/g' \ - -e 's/.*\(apply_upgrade\)@.*\.c:[0-9][0-9]*)/\1/g' \ -- -e 's/ last-rc-change=\"[A-Za-z0-9: ]*\"//'\ -+ -e "s/ last-rc-change=['\"][-+A-Za-z0-9: ]*['\"],\{0,1\}//" \ - -e 's|^/tmp/cts-cli\.validity\.bad.xml\.[^:]*:|validity.bad.xml:|'\ - -e 's/^Entity: line [0-9][0-9]*: //'\ - -e 's/\(validation ([0-9][0-9]* of \)[0-9][0-9]*\().*\)/\1X\2/' \ --- -2.27.0 - - -From dea61f1b6507fbc978e040c1555384d8d7ffa9f3 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Wed, 1 Dec 2021 16:23:14 -0500 -Subject: [PATCH 19/21] Fix: include: Bump feature set to 3.12.0. - -This is for the scheduler handling changing regarding maskable probe -failures. - -See: rhbz#1506372. ---- - include/crm/crm.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/crm/crm.h b/include/crm/crm.h -index 04d2324d75..16b35e9c55 100644 ---- a/include/crm/crm.h -+++ b/include/crm/crm.h -@@ -66,7 +66,7 @@ extern "C" { - * >=3.0.13: Fail counts include operation name and interval - * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED - */ --# define CRM_FEATURE_SET "3.11.0" -+# define CRM_FEATURE_SET "3.12.0" - - /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and - * recipient of a CPG message. This imposes an arbitrary limit on cluster node --- -2.27.0 - - -From fef2c61ef462c221809dc91467ea1e96d5478c74 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Mon, 6 Dec 2021 16:42:15 -0500 -Subject: [PATCH 20/21] Feature: scheduler: Handle masked probes in the - scheduler. - -These probe operations get their rc/status codes mapped to not -running/done, but still ensures they end up in the list of failed -operations so tool output continues to display them properly. - -Note that failures on bundled resources do not get masked. - -There are no test case changes for this patch. - -See: rhbz#1506372. ---- - lib/pengine/unpack.c | 42 +++++++++++++++++++++++++++++++++++++----- - 1 file changed, 37 insertions(+), 5 deletions(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index b659f319fb..f3583e97d8 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3169,6 +3169,11 @@ remap_operation(xmlNode *xml_op, pe_resource_t *rsc, pe_node_t *node, - } - } - -+ if (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op)) { -+ *status = PCMK_EXEC_DONE; -+ *rc = PCMK_OCF_NOT_RUNNING; -+ } -+ - /* If the executor reported an operation status of anything but done or - * error, consider that final. But for done or error, we know better whether - * it should be treated as a failure or not, because we know the expected -@@ -3567,12 +3572,12 @@ update_resource_state(pe_resource_t * rsc, pe_node_t * node, xmlNode * xml_op, c - CRM_ASSERT(rsc); - CRM_ASSERT(xml_op); - -- if (rc == PCMK_OCF_NOT_RUNNING) { -- clear_past_failure = TRUE; -- -- } else if (rc == PCMK_OCF_NOT_INSTALLED) { -+ if (rc == PCMK_OCF_NOT_INSTALLED || (!pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op))) { - rsc->role = RSC_ROLE_STOPPED; - -+ } else if (rc == PCMK_OCF_NOT_RUNNING) { -+ clear_past_failure = TRUE; -+ - } else if (pcmk__str_eq(task, CRMD_ACTION_STATUS, pcmk__str_casei)) { - if (last_failure) { - const char *op_key = get_op_key(xml_op); -@@ -3661,8 +3666,10 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - pe_working_set_t *data_set) - { - int rc = 0; -+ int old_rc = 0; - int task_id = 0; - int target_rc = 0; -+ int old_target_rc = 0; - int status = PCMK_EXEC_UNKNOWN; - guint interval_ms = 0; - const char *task = NULL; -@@ -3671,6 +3678,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - bool expired = false; - pe_resource_t *parent = rsc; - enum action_fail_response failure_strategy = action_fail_recover; -+ bool maskable_probe_failure = false; - - CRM_CHECK(rsc && node && xml_op, return); - -@@ -3727,10 +3735,22 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - expired = true; - } - -+ old_rc = rc; -+ old_target_rc = target_rc; -+ - remap_operation(xml_op, rsc, node, data_set, on_fail, target_rc, - &rc, &status); - -- if (expired && (rc != target_rc)) { -+ maskable_probe_failure = !pe_rsc_is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op); -+ -+ if (expired && maskable_probe_failure && old_rc != old_target_rc) { -+ if (rsc->role <= RSC_ROLE_STOPPED) { -+ rsc->role = RSC_ROLE_UNKNOWN; -+ } -+ -+ goto done; -+ -+ } else if (expired && (rc != target_rc)) { - const char *magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); - - if (interval_ms == 0) { -@@ -3758,6 +3778,18 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - } - } - -+ if (maskable_probe_failure) { -+ crm_notice("Treating probe result '%s' for %s on %s as 'not running'", -+ services_ocf_exitcode_str(rc), rsc->id, node->details->uname); -+ update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, -+ on_fail, data_set); -+ crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); -+ -+ record_failed_op(xml_op, node, rsc, data_set); -+ resource_location(parent, node, -INFINITY, "masked-probe-failure", data_set); -+ goto done; -+ } -+ - switch (status) { - case PCMK_EXEC_CANCELLED: - // Should never happen --- -2.27.0 - - -From ccff6eb60598f389008b0621447056457da79671 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Tue, 4 Jan 2022 10:14:48 -0500 -Subject: [PATCH 21/21] Test: scheduler: Add tests for expired, masked probe - failures. - -dummy-1 is a stopped resource with an expired masked probe failure. -This probe should be rescheduled. dummy-2 is a started resource with an -expired masked probe failure. This probe should not be rescheduled. ---- - cts/cts-scheduler.in | 1 + - .../dot/expired-failed-probe-primitive.dot | 8 ++ - .../exp/expired-failed-probe-primitive.exp | 45 ++++++++++++ - .../expired-failed-probe-primitive.scores | 7 ++ - .../expired-failed-probe-primitive.summary | 26 +++++++ - .../xml/expired-failed-probe-primitive.xml | 73 +++++++++++++++++++ - 6 files changed, 160 insertions(+) - create mode 100644 cts/scheduler/dot/expired-failed-probe-primitive.dot - create mode 100644 cts/scheduler/exp/expired-failed-probe-primitive.exp - create mode 100644 cts/scheduler/scores/expired-failed-probe-primitive.scores - create mode 100644 cts/scheduler/summary/expired-failed-probe-primitive.summary - create mode 100644 cts/scheduler/xml/expired-failed-probe-primitive.xml - -diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in -index 3abcbc6c9d..7bc41a0936 100644 ---- a/cts/cts-scheduler.in -+++ b/cts/cts-scheduler.in -@@ -115,6 +115,7 @@ TESTS = [ - [ "probe-pending-node", "Probe (pending node + unmanaged resource)" ], - [ "failed-probe-primitive", "Maskable vs. unmaskable probe failures on primitive resources" ], - [ "failed-probe-clone", "Maskable vs. unmaskable probe failures on cloned resources" ], -+ [ "expired-failed-probe-primitive", "Maskable, expired probe failure on primitive resources" ], - [ "standby", "Standby" ], - [ "comments", "Comments" ], - ], -diff --git a/cts/scheduler/dot/expired-failed-probe-primitive.dot b/cts/scheduler/dot/expired-failed-probe-primitive.dot -new file mode 100644 -index 0000000000..610c2b8047 ---- /dev/null -+++ b/cts/scheduler/dot/expired-failed-probe-primitive.dot -@@ -0,0 +1,8 @@ -+ digraph "g" { -+"dummy-1_monitor_0 cluster01" -> "dummy-1_start_0 cluster02" [ style = bold] -+"dummy-1_monitor_0 cluster01" [ style=bold color="green" fontcolor="black"] -+"dummy-1_monitor_0 cluster02" -> "dummy-1_start_0 cluster02" [ style = bold] -+"dummy-1_monitor_0 cluster02" [ style=bold color="green" fontcolor="black"] -+"dummy-1_start_0 cluster02" [ style=bold color="green" fontcolor="black"] -+"dummy-2_monitor_0 cluster01" [ style=bold color="green" fontcolor="black"] -+} -diff --git a/cts/scheduler/exp/expired-failed-probe-primitive.exp b/cts/scheduler/exp/expired-failed-probe-primitive.exp -new file mode 100644 -index 0000000000..3c2cbfe411 ---- /dev/null -+++ b/cts/scheduler/exp/expired-failed-probe-primitive.exp -@@ -0,0 +1,45 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -diff --git a/cts/scheduler/scores/expired-failed-probe-primitive.scores b/cts/scheduler/scores/expired-failed-probe-primitive.scores -new file mode 100644 -index 0000000000..51ae5510e6 ---- /dev/null -+++ b/cts/scheduler/scores/expired-failed-probe-primitive.scores -@@ -0,0 +1,7 @@ -+ -+pcmk__native_allocate: Fencing allocation score on cluster01: 0 -+pcmk__native_allocate: Fencing allocation score on cluster02: 0 -+pcmk__native_allocate: dummy-1 allocation score on cluster01: 0 -+pcmk__native_allocate: dummy-1 allocation score on cluster02: 0 -+pcmk__native_allocate: dummy-2 allocation score on cluster01: 0 -+pcmk__native_allocate: dummy-2 allocation score on cluster02: 0 -diff --git a/cts/scheduler/summary/expired-failed-probe-primitive.summary b/cts/scheduler/summary/expired-failed-probe-primitive.summary -new file mode 100644 -index 0000000000..ac0604e84f ---- /dev/null -+++ b/cts/scheduler/summary/expired-failed-probe-primitive.summary -@@ -0,0 +1,26 @@ -+Current cluster status: -+ * Node List: -+ * Online: [ cluster01 cluster02 ] -+ -+ * Full List of Resources: -+ * Fencing (stonith:fence_xvm): Started cluster01 -+ * dummy-1 (ocf:pacemaker:Dummy): Stopped -+ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 -+ -+Transition Summary: -+ * Start dummy-1 ( cluster02 ) -+ -+Executing Cluster Transition: -+ * Resource action: dummy-1 monitor on cluster02 -+ * Resource action: dummy-1 monitor on cluster01 -+ * Resource action: dummy-2 monitor on cluster01 -+ * Resource action: dummy-1 start on cluster02 -+ -+Revised Cluster Status: -+ * Node List: -+ * Online: [ cluster01 cluster02 ] -+ -+ * Full List of Resources: -+ * Fencing (stonith:fence_xvm): Started cluster01 -+ * dummy-1 (ocf:pacemaker:Dummy): Started cluster02 -+ * dummy-2 (ocf:pacemaker:Dummy): Started cluster02 -diff --git a/cts/scheduler/xml/expired-failed-probe-primitive.xml b/cts/scheduler/xml/expired-failed-probe-primitive.xml -new file mode 100644 -index 0000000000..684aa73f92 ---- /dev/null -+++ b/cts/scheduler/xml/expired-failed-probe-primitive.xml -@@ -0,0 +1,73 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -2.27.0 - diff --git a/SOURCES/011-fencing-reasons.patch b/SOURCES/011-fencing-reasons.patch deleted file mode 100644 index 4422ca0..0000000 --- a/SOURCES/011-fencing-reasons.patch +++ /dev/null @@ -1,1450 +0,0 @@ -From 6db8e3adef0441953ec18dd0339c0a67c5c26bdf Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 14 Dec 2021 16:25:21 -0600 -Subject: [PATCH 01/17] Doc: Pacemaker Development: update for recent function - renames - ---- - doc/sphinx/Pacemaker_Development/components.rst | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - -diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst -index a51220cac9..68158484ce 100644 ---- a/doc/sphinx/Pacemaker_Development/components.rst -+++ b/doc/sphinx/Pacemaker_Development/components.rst -@@ -106,7 +106,7 @@ or messaging layer callback, which calls: - the number of active peers), and if this is the last expected reply, - calls - -- * ``call_remote_stonith()``, which calculates the timeout and sends -+ * ``request_peer_fencing()``, which calculates the timeout and sends - ``STONITH_OP_FENCE`` request(s) to carry out the fencing. If the target - node has a fencing "topology" (which allows specifications such as - "this node can be fenced either with device A, or devices B and C in -@@ -156,7 +156,7 @@ returns, and calls - * done callback (``st_child_done()``), which calls ``schedule_stonith_command()`` - for a new device if there are further required actions to execute or if the - original action failed, then builds and sends an XML reply to the original -- fencer (via ``stonith_send_async_reply()``), then checks whether any -+ fencer (via ``send_async_reply()``), then checks whether any - pending actions are the same as the one just executed and merges them if so. - - Fencing replies -@@ -169,18 +169,18 @@ messaging layer callback, which calls: - - * ``handle_reply()``, which calls - -- * ``process_remote_stonith_exec()``, which calls either -- ``call_remote_stonith()`` (to retry a failed operation, or try the next -- device in a topology is appropriate, which issues a new -+ * ``fenced_process_fencing_reply()``, which calls either -+ ``request_peer_fencing()`` (to retry a failed operation, or try the next -+ device in a topology is appropriate, which issues a new - ``STONITH_OP_FENCE`` request, proceeding as before) or -- ``remote_op_done()`` (if the operation is definitively failed or -+ ``finalize_op()`` (if the operation is definitively failed or - successful). - -- * remote_op_done() broadcasts the result to all peers. -+ * ``finalize_op()`` broadcasts the result to all peers. - - Finally, all peers receive the broadcast result and call - --* ``remote_op_done()``, which sends the result to all local clients. -+* ``finalize_op()``, which sends the result to all local clients. - - - .. index:: --- -2.27.0 - - -From 47db9e5fb410b1e911710727d646eb7180a70c90 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 12 Nov 2021 09:58:16 -0600 -Subject: [PATCH 02/17] Refactor: fencing: add full result to fence action - callback data - -stonith_callback_data_t previously only contained the legacy return code for -the action. Use its new opaque member to store the full result, along with -accessors (available only internally for now). ---- - include/crm/fencing/internal.h | 3 ++ - lib/fencing/st_client.c | 99 ++++++++++++++++++++++++++-------- - 2 files changed, 81 insertions(+), 21 deletions(-) - -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index f0d294a0b3..eff689e59b 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -187,6 +187,9 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data); - bool stonith__event_state_neq(stonith_history_t *history, void *user_data); - - int stonith__legacy2status(int rc); -+int stonith__exit_status(stonith_callback_data_t *data); -+int stonith__execution_status(stonith_callback_data_t *data); -+const char *stonith__exit_reason(stonith_callback_data_t *data); - - /*! - * \internal -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 2ca094566b..9d93ffd481 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -854,20 +854,23 @@ stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks) - * \param[in] st Fencer API connection - * \param[in] call_id If positive, call ID of completed fence action, otherwise - * legacy return code for early action failure -- * \param[in] rc Legacy return code for action result -+ * \param[in] result Full result for action - * \param[in] userdata User data to pass to callback - * \param[in] callback Fence action callback to invoke - */ - static void --invoke_fence_action_callback(stonith_t *st, int call_id, int rc, void *userdata, -+invoke_fence_action_callback(stonith_t *st, int call_id, -+ pcmk__action_result_t *result, -+ void *userdata, - void (*callback) (stonith_t *st, - stonith_callback_data_t *data)) - { - stonith_callback_data_t data = { 0, }; - - data.call_id = call_id; -- data.rc = rc; -+ data.rc = pcmk_rc2legacy(stonith__result2rc(result)); - data.userdata = userdata; -+ data.opaque = (void *) result; - - callback(st, &data); - } -@@ -888,7 +891,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - { - stonith_private_t *private = NULL; - stonith_callback_client_t *cb_info = NULL; -- int rc = pcmk_ok; -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - CRM_CHECK(stonith != NULL, return); - CRM_CHECK(stonith->st_private != NULL, return); -@@ -897,20 +900,17 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - - if (msg == NULL) { - // Fencer didn't reply in time -- rc = -ETIME; -+ pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, -+ "Timeout waiting for reply from fencer"); - CRM_LOG_ASSERT(call_id > 0); - - } else { - // We have the fencer reply -- -- if (crm_element_value_int(msg, F_STONITH_RC, &rc) != 0) { -- rc = -pcmk_err_generic; -- } -- - if ((crm_element_value_int(msg, F_STONITH_CALLID, &call_id) != 0) - || (call_id <= 0)) { - crm_log_xml_warn(msg, "Bad fencer reply"); - } -+ stonith__xe_get_result(msg, &result); - } - - if (call_id > 0) { -@@ -919,27 +919,29 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - } - - if ((cb_info != NULL) && (cb_info->callback != NULL) -- && (rc == pcmk_ok || !(cb_info->only_success))) { -+ && (pcmk__result_ok(&result) || !(cb_info->only_success))) { - crm_trace("Invoking callback %s for call %d", - crm_str(cb_info->id), call_id); -- invoke_fence_action_callback(stonith, call_id, rc, cb_info->user_data, -- cb_info->callback); -+ invoke_fence_action_callback(stonith, call_id, &result, -+ cb_info->user_data, cb_info->callback); - -- } else if ((private->op_callback == NULL) && (rc != pcmk_ok)) { -- crm_warn("Fencing action without registered callback failed: %s", -- pcmk_strerror(rc)); -+ } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) { -+ crm_warn("Fencing action without registered callback failed: %d (%s)", -+ result.exit_status, -+ pcmk_exec_status_str(result.execution_status)); - crm_log_xml_debug(msg, "Failed fence update"); - } - - if (private->op_callback != NULL) { - crm_trace("Invoking global callback for call %d", call_id); -- invoke_fence_action_callback(stonith, call_id, rc, NULL, -+ invoke_fence_action_callback(stonith, call_id, &result, NULL, - private->op_callback); - } - - if (cb_info != NULL) { - stonith_api_del_callback(stonith, call_id, FALSE); - } -+ pcmk__reset_result(&result); - } - - static gboolean -@@ -1252,14 +1254,18 @@ stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int opti - CRM_CHECK(stonith->st_private != NULL, return -EINVAL); - private = stonith->st_private; - -- if (call_id == 0) { -+ if (call_id == 0) { // Add global callback - private->op_callback = callback; - -- } else if (call_id < 0) { -+ } else if (call_id < 0) { // Call failed immediately, so call callback now - if (!(options & st_opt_report_only_success)) { -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -+ - crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id)); -- invoke_fence_action_callback(stonith, call_id, call_id, user_data, -- callback); -+ pcmk__set_result(&result, CRM_EX_ERROR, -+ stonith__legacy2status(call_id), NULL); -+ invoke_fence_action_callback(stonith, call_id, &result, -+ user_data, callback); - } else { - crm_warn("Fencer call failed: %s", pcmk_strerror(call_id)); - } -@@ -2293,6 +2299,57 @@ stonith__device_parameter_flags(uint32_t *device_flags, const char *device_name, - freeXpathObject(xpath); - } - -+/*! -+ * \internal -+ * \brief Return the exit status from an async action callback -+ * -+ * \param[in] data Callback data -+ * -+ * \return Exit status from callback data -+ */ -+int -+stonith__exit_status(stonith_callback_data_t *data) -+{ -+ if ((data == NULL) || (data->opaque == NULL)) { -+ return CRM_EX_ERROR; -+ } -+ return ((pcmk__action_result_t *) data->opaque)->exit_status; -+} -+ -+/*! -+ * \internal -+ * \brief Return the execution status from an async action callback -+ * -+ * \param[in] data Callback data -+ * -+ * \return Execution status from callback data -+ */ -+int -+stonith__execution_status(stonith_callback_data_t *data) -+{ -+ if ((data == NULL) || (data->opaque == NULL)) { -+ return PCMK_EXEC_UNKNOWN; -+ } -+ return ((pcmk__action_result_t *) data->opaque)->execution_status; -+} -+ -+/*! -+ * \internal -+ * \brief Return the exit reason from an async action callback -+ * -+ * \param[in] data Callback data -+ * -+ * \return Exit reason from callback data -+ */ -+const char * -+stonith__exit_reason(stonith_callback_data_t *data) -+{ -+ if ((data == NULL) || (data->opaque == NULL)) { -+ return NULL; -+ } -+ return ((pcmk__action_result_t *) data->opaque)->exit_reason; -+} -+ - // Deprecated functions kept only for backward API compatibility - // LCOV_EXCL_START - --- -2.27.0 - - -From 1e076370ef4ac7993b5ff21ed1cdfb3c4a494cf0 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 9 Nov 2021 16:16:03 -0600 -Subject: [PATCH 03/17] Log: controller: improve fencing result messages - -Now that fence callbacks get the full result, we can log a better message. -Also check for error conditions better, improve message wording, and ensure -only a single message is logged per result. ---- - daemons/controld/controld_fencing.c | 83 +++++++++++++++++++---------- - 1 file changed, 56 insertions(+), 27 deletions(-) - -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index f5a252c813..f8d2fc13f4 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -714,45 +714,64 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) - int stonith_id = -1; - int transition_id = -1; - crm_action_t *action = NULL; -- int call_id = data->call_id; -- int rc = data->rc; -- char *userdata = data->userdata; -- -- CRM_CHECK(userdata != NULL, return); -- crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, -- pcmk_strerror(rc), rc); -+ const char *target = NULL; - -- if (AM_I_DC == FALSE) { -+ if ((data == NULL) || (data->userdata == NULL)) { -+ crm_err("Ignoring fence operation %d result: " -+ "No transition key given (bug?)", -+ ((data == NULL)? -1 : data->call_id)); - return; - } - -- /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ -- /* op->call_id, op->optype, op->node_name, op->op_result, */ -- /* (char *)op->node_list, op->private_data); */ -+ if (!AM_I_DC) { -+ const char *reason = stonith__exit_reason(data); -+ -+ if (reason == NULL) { -+ reason = pcmk_exec_status_str(stonith__execution_status(data)); -+ } -+ crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s", -+ data->call_id, stonith__exit_status(data), reason, -+ (const char *) data->userdata); -+ return; -+ } - -- /* filter out old STONITH actions */ -- CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL), -+ CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id, -+ &stonith_id, NULL), - goto bail); - -- if (transition_graph->complete || stonith_id < 0 || !pcmk__str_eq(uuid, te_uuid, pcmk__str_casei) -- || transition_graph->id != transition_id) { -- crm_info("Ignoring STONITH action initiated outside of the current transition"); -+ if (transition_graph->complete || (stonith_id < 0) -+ || !pcmk__str_eq(uuid, te_uuid, pcmk__str_none) -+ || (transition_graph->id != transition_id)) { -+ crm_info("Ignoring fence operation %d result: " -+ "Not from current transition " CRM_XS -+ " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)", -+ data->call_id, pcmk__btoa(transition_graph->complete), -+ stonith_id, uuid, te_uuid, transition_id, transition_graph->id); - goto bail; - } - - action = controld_get_action(stonith_id); - if (action == NULL) { -- crm_err("Stonith action not matched"); -+ crm_err("Ignoring fence operation %d result: " -+ "Action %d not found in transition graph (bug?) " -+ CRM_XS " uuid=%s transition=%d", -+ data->call_id, stonith_id, uuid, transition_id); -+ goto bail; -+ } -+ -+ target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); -+ if (target == NULL) { -+ crm_err("Ignoring fence operation %d result: No target given (bug?)", -+ data->call_id); - goto bail; - } - - stop_te_timer(action->timer); -- if (rc == pcmk_ok) { -- const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); -+ if (stonith__exit_status(data) == CRM_EX_OK) { - const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); - const char *op = crm_meta_value(action->params, "stonith_action"); - -- crm_info("Stonith operation %d for %s passed", call_id, target); -+ crm_notice("Fence operation %d for %s passed", data->call_id, target); - if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) { - te_action_confirmed(action, NULL); - if (pcmk__str_eq("on", op, pcmk__str_casei)) { -@@ -791,20 +810,30 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) - st_fail_count_reset(target); - - } else { -- const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); - enum transition_action abort_action = tg_restart; -+ int status = stonith__execution_status(data); -+ const char *reason = stonith__exit_reason(data); - -+ if (reason == NULL) { -+ if (status == PCMK_EXEC_DONE) { -+ reason = "Agent returned error"; -+ } else { -+ reason = pcmk_exec_status_str(status); -+ } -+ } - crm__set_graph_action_flags(action, pcmk__graph_action_failed); -- crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", -- call_id, target, pcmk_strerror(rc)); - - /* If no fence devices were available, there's no use in immediately - * checking again, so don't start a new transition in that case. - */ -- if (rc == -ENODEV) { -- crm_warn("No devices found in cluster to fence %s, giving up", -- target); -+ if (status == PCMK_EXEC_NO_FENCE_DEVICE) { -+ crm_warn("Fence operation %d for %s failed: %s " -+ "(aborting transition and giving up for now)", -+ data->call_id, target, reason); - abort_action = tg_stop; -+ } else { -+ crm_notice("Fence operation %d for %s failed: %s " -+ "(aborting transition)", data->call_id, target, reason); - } - - /* Increment the fail count now, so abort_for_stonith_failure() can -@@ -818,7 +847,7 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) - trigger_graph(); - - bail: -- free(userdata); -+ free(data->userdata); - free(uuid); - return; - } --- -2.27.0 - - -From 25547e3b7e6eb23efad1c359388d6e8d0df62363 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 22 Nov 2021 12:37:16 -0600 -Subject: [PATCH 04/17] Refactor: executor: drop action_get_uniform_rc() - function - -action_get_uniform_rc() called stonith2uniform_rc() or services_result2ocf() as -appropriate to the action standard. However, it was called only from a place -that did not process stonith actions, so that place can just call -services_result2ocf() directly. - -This will simplify planned changes. ---- - daemons/execd/execd_commands.c | 24 ++++++------------------ - 1 file changed, 6 insertions(+), 18 deletions(-) - -diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c -index 5bb2aab692..5e123e322e 100644 ---- a/daemons/execd/execd_commands.c -+++ b/daemons/execd/execd_commands.c -@@ -780,23 +780,6 @@ stonith2uniform_rc(const char *action, int rc) - return rc; - } - --static int --action_get_uniform_rc(svc_action_t *action) --{ -- lrmd_cmd_t *cmd = action->cb_data; -- -- if (pcmk__str_eq(action->standard, PCMK_RESOURCE_CLASS_STONITH, -- pcmk__str_casei)) { -- return stonith2uniform_rc(cmd->action, action->rc); -- } else { -- enum ocf_exitcode code = services_result2ocf(action->standard, -- cmd->action, action->rc); -- -- // Cast variable instead of function return to keep compilers happy -- return (int) code; -- } --} -- - struct notify_new_client_data { - xmlNode *notify; - pcmk__client_t *new_client; -@@ -848,6 +831,7 @@ action_complete(svc_action_t * action) - { - lrmd_rsc_t *rsc; - lrmd_cmd_t *cmd = action->cb_data; -+ enum ocf_exitcode code; - - #ifdef PCMK__TIME_USE_CGT - const char *rclass = NULL; -@@ -867,8 +851,12 @@ action_complete(svc_action_t * action) - #endif - - cmd->last_pid = action->pid; -- pcmk__set_result(&(cmd->result), action_get_uniform_rc(action), -+ -+ // Cast variable instead of function return to keep compilers happy -+ code = services_result2ocf(action->standard, cmd->action, action->rc); -+ pcmk__set_result(&(cmd->result), (int) code, - action->status, services__exit_reason(action)); -+ - rsc = cmd->rsc_id ? g_hash_table_lookup(rsc_list, cmd->rsc_id) : NULL; - - #ifdef PCMK__TIME_USE_CGT --- -2.27.0 - - -From b5e31ba2539da4e94c124c3f0c8c72f7039f9a7a Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 22 Nov 2021 12:39:30 -0600 -Subject: [PATCH 05/17] Feature: executor: use full result from fencer for - fence actions - -Now that fence callbacks get the full result, we can improve the executor -command result for fence actions. stonith_action_complete() now takes a -full result, allowing the executor to use that directly rather than map a -legacy return code. ---- - daemons/execd/execd_commands.c | 140 +++++++++++++++++++-------------- - 1 file changed, 80 insertions(+), 60 deletions(-) - -diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c -index 5e123e322e..e722994012 100644 ---- a/daemons/execd/execd_commands.c -+++ b/daemons/execd/execd_commands.c -@@ -8,6 +8,7 @@ - */ - - #include -+#include - - #include - -@@ -748,38 +749,6 @@ cmd_finalize(lrmd_cmd_t * cmd, lrmd_rsc_t * rsc) - } - } - --static int --stonith2uniform_rc(const char *action, int rc) --{ -- switch (rc) { -- case pcmk_ok: -- rc = PCMK_OCF_OK; -- break; -- -- case -ENODEV: -- /* This should be possible only for probes in practice, but -- * interpret for all actions to be safe. -- */ -- if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { -- rc = PCMK_OCF_NOT_RUNNING; -- } else if (pcmk__str_eq(action, "stop", pcmk__str_casei)) { -- rc = PCMK_OCF_OK; -- } else { -- rc = PCMK_OCF_NOT_INSTALLED; -- } -- break; -- -- case -EOPNOTSUPP: -- rc = PCMK_OCF_UNIMPLEMENT_FEATURE; -- break; -- -- default: -- rc = PCMK_OCF_UNKNOWN_ERROR; -- break; -- } -- return rc; --} -- - struct notify_new_client_data { - xmlNode *notify; - pcmk__client_t *new_client; -@@ -988,46 +957,84 @@ action_complete(svc_action_t * action) - cmd_finalize(cmd, rsc); - } - -+/*! -+ * \internal -+ * \brief Process the result of a fence device action (start, stop, or monitor) -+ * -+ * \param[in] cmd Fence device action that completed -+ * \param[in] exit_status Fencer API exit status for action -+ * \param[in] execution_status Fencer API execution status for action -+ * \param[in] exit_reason Human-friendly detail, if action failed -+ */ - static void --stonith_action_complete(lrmd_cmd_t * cmd, int rc) -+stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, -+ enum pcmk_exec_status execution_status, -+ const char *exit_reason) - { - // This can be NULL if resource was removed before command completed - lrmd_rsc_t *rsc = g_hash_table_lookup(rsc_list, cmd->rsc_id); - -- cmd->result.exit_status = stonith2uniform_rc(cmd->action, rc); -+ // Simplify fencer exit status to uniform exit status -+ if (exit_status != CRM_EX_OK) { -+ exit_status = PCMK_OCF_UNKNOWN_ERROR; -+ } - -- /* This function may be called with status already set to cancelled, if a -- * pending action was aborted. Otherwise, we need to determine status from -- * the fencer return code. -- */ -- if (cmd->result.execution_status != PCMK_EXEC_CANCELLED) { -- cmd->result.execution_status = stonith__legacy2status(rc); -+ if (cmd->result.execution_status == PCMK_EXEC_CANCELLED) { -+ /* An in-flight fence action was cancelled. The execution status is -+ * already correct, so don't overwrite it. -+ */ -+ execution_status = PCMK_EXEC_CANCELLED; - -- // Simplify status codes from fencer -- switch (cmd->result.execution_status) { -+ } else { -+ /* Some execution status codes have specific meanings for the fencer -+ * that executor clients may not expect, so map them to a simple error -+ * status. -+ */ -+ switch (execution_status) { - case PCMK_EXEC_NOT_CONNECTED: - case PCMK_EXEC_INVALID: -- case PCMK_EXEC_NO_FENCE_DEVICE: - case PCMK_EXEC_NO_SECRETS: -- cmd->result.execution_status = PCMK_EXEC_ERROR; -+ execution_status = PCMK_EXEC_ERROR; - break; -- default: -+ -+ case PCMK_EXEC_NO_FENCE_DEVICE: -+ /* This should be possible only for probes in practice, but -+ * interpret for all actions to be safe. -+ */ -+ if (pcmk__str_eq(cmd->action, CRMD_ACTION_STATUS, -+ pcmk__str_none)) { -+ exit_status = PCMK_OCF_NOT_RUNNING; -+ -+ } else if (pcmk__str_eq(cmd->action, CRMD_ACTION_STOP, -+ pcmk__str_none)) { -+ exit_status = PCMK_OCF_OK; -+ -+ } else { -+ exit_status = PCMK_OCF_NOT_INSTALLED; -+ } -+ execution_status = PCMK_EXEC_ERROR; - break; -- } - -- // Certain successful actions change the known state of the resource -- if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { -- if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { -- rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK -- } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { -- rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING -- } -+ case PCMK_EXEC_NOT_SUPPORTED: -+ exit_status = PCMK_OCF_UNIMPLEMENT_FEATURE; -+ break; -+ -+ default: -+ break; - } - } - -- // Give the user more detail than an OCF code -- if (rc != -pcmk_err_generic) { -- cmd->result.exit_reason = strdup(pcmk_strerror(rc)); -+ pcmk__set_result(&cmd->result, exit_status, execution_status, exit_reason); -+ -+ // Certain successful actions change the known state of the resource -+ if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { -+ -+ if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { -+ rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK -+ -+ } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { -+ rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING -+ } - } - - /* The recurring timer should not be running at this point in any case, but -@@ -1050,7 +1057,15 @@ stonith_action_complete(lrmd_cmd_t * cmd, int rc) - static void - lrmd_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data) - { -- stonith_action_complete(data->userdata, data->rc); -+ if ((data == NULL) || (data->userdata == NULL)) { -+ crm_err("Ignoring fence action result: " -+ "Invalid callback arguments (bug?)"); -+ } else { -+ stonith_action_complete((lrmd_cmd_t *) data->userdata, -+ stonith__exit_status(data), -+ stonith__execution_status(data), -+ stonith__exit_reason(data)); -+ } - } - - void -@@ -1097,7 +1112,9 @@ stonith_connection_failed(void) - crm_err("Connection to fencer failed, finalizing %d pending operations", - g_list_length(cmd_list)); - for (cmd_iter = cmd_list; cmd_iter; cmd_iter = cmd_iter->next) { -- stonith_action_complete(cmd_iter->data, -ENOTCONN); -+ stonith_action_complete((lrmd_cmd_t *) cmd_iter->data, -+ CRM_EX_ERROR, PCMK_EXEC_NOT_CONNECTED, -+ "Lost connection to fencer"); - } - g_list_free(cmd_list); - } -@@ -1210,7 +1227,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) - - } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { - rc = execd_stonith_start(stonith_api, rsc, cmd); -- if (rc == 0) { -+ if (rc == pcmk_ok) { - do_monitor = TRUE; - } - -@@ -1233,7 +1250,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) - } - } - -- stonith_action_complete(cmd, rc); -+ stonith_action_complete(cmd, -+ ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), -+ stonith__legacy2status(rc), -+ rc == -pcmk_err_generic? NULL : pcmk_strerror(rc)); - } - - static int --- -2.27.0 - - -From 0cdc8506c2383cf05c2f62ab1ac9438958daf210 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 22 Nov 2021 16:15:05 -0600 -Subject: [PATCH 06/17] Fix: executor,scheduler: treat "no secrets" fence - results as a hard error - -Previously, the executor mapped the fencer's PCMK_EXEC_NO_SECRETS status to -PCMK_EXEC_ERROR to keep handling of that situation the same as before the new -code was added. - -However, the earlier handling was less than ideal -- a resource action that -failed due to missing secrets would be retried on the same node, and almost -certainly fail again for the same reason. Now, the executor passes along -PCMK_EXEC_NO_SECRETS to clients; the controller will record the result in the -CIB status, and the scheduler will treat it as a hard error (i.e. not retrying -on the same node). - -Backward compatibility isn't a problem because the scheduler treats unknown -status codes the same as PCMK_EXEC_ERROR, so an older DC will continue to -handle it as before. The CRM feature set has been bumped so the handling can't -flip back and forth in a mixed-version cluster. ---- - daemons/execd/execd_commands.c | 1 - - include/crm/crm.h | 4 ++-- - lib/pengine/unpack.c | 3 --- - 3 files changed, 2 insertions(+), 6 deletions(-) - -diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c -index e722994012..4ced6d1d5c 100644 ---- a/daemons/execd/execd_commands.c -+++ b/daemons/execd/execd_commands.c -@@ -993,7 +993,6 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, - switch (execution_status) { - case PCMK_EXEC_NOT_CONNECTED: - case PCMK_EXEC_INVALID: -- case PCMK_EXEC_NO_SECRETS: - execution_status = PCMK_EXEC_ERROR; - break; - -diff --git a/include/crm/crm.h b/include/crm/crm.h -index 16b35e9c55..56b07cb12a 100644 ---- a/include/crm/crm.h -+++ b/include/crm/crm.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2021 the Pacemaker project contributors -+ * Copyright 2004-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -66,7 +66,7 @@ extern "C" { - * >=3.0.13: Fail counts include operation name and interval - * >=3.2.0: DC supports PCMK_EXEC_INVALID and PCMK_EXEC_NOT_CONNECTED - */ --# define CRM_FEATURE_SET "3.12.0" -+# define CRM_FEATURE_SET "3.13.0" - - /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and - * recipient of a CPG message. This imposes an arbitrary limit on cluster node -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 3e0384cd2a..8a2d2a6d6d 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3879,9 +3879,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - case PCMK_EXEC_INVALID: - break; // Not done, do error handling - -- /* These should only be possible in fence action results, not operation -- * history, but have some handling in place as a fail-safe. -- */ - case PCMK_EXEC_NO_FENCE_DEVICE: - case PCMK_EXEC_NO_SECRETS: - status = PCMK_EXEC_ERROR_HARD; --- -2.27.0 - - -From 75c1bdcf3ffc406e6fa286fd5fcff83e1e65591a Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 10 Nov 2021 12:05:20 -0600 -Subject: [PATCH 07/17] Low: executor: improve result for fence device probes - -Now that lrmd_rsc_execute_stonith() sets a full result instead of just a legacy -return code, refactor lrmd_rsc_t's st_probe_rc as an execution status (and -rename to fence_probe_result). Set an appropriate exit reason when available. ---- - daemons/execd/execd_commands.c | 57 ++++++++++++++++++++++++++------- - daemons/execd/pacemaker-execd.h | 9 +++++- - 2 files changed, 54 insertions(+), 12 deletions(-) - -diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c -index 4ced6d1d5c..6e5505e973 100644 ---- a/daemons/execd/execd_commands.c -+++ b/daemons/execd/execd_commands.c -@@ -285,7 +285,9 @@ build_rsc_from_xml(xmlNode * msg) - rsc->provider = crm_element_value_copy(rsc_xml, F_LRMD_PROVIDER); - rsc->type = crm_element_value_copy(rsc_xml, F_LRMD_TYPE); - rsc->work = mainloop_add_trigger(G_PRIORITY_HIGH, lrmd_rsc_dispatch, rsc); -- rsc->st_probe_rc = -ENODEV; // if stonith, initialize to "not running" -+ -+ // Initialize fence device probes (to return "not running") -+ rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE; - return rsc; - } - -@@ -1029,10 +1031,10 @@ stonith_action_complete(lrmd_cmd_t *cmd, int exit_status, - if ((rsc != NULL) && pcmk__result_ok(&(cmd->result))) { - - if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { -- rsc->st_probe_rc = pcmk_ok; // maps to PCMK_OCF_OK -+ rsc->fence_probe_result = PCMK_EXEC_DONE; // "running" - - } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { -- rsc->st_probe_rc = -ENODEV; // maps to PCMK_OCF_NOT_RUNNING -+ rsc->fence_probe_result = PCMK_EXEC_NO_FENCE_DEVICE; // "not running" - } - } - -@@ -1081,14 +1083,13 @@ stonith_connection_failed(void) - if (pcmk__str_eq(rsc->class, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { - /* If we registered this fence device, we don't know whether the - * fencer still has the registration or not. Cause future probes to -- * return PCMK_OCF_UNKNOWN_ERROR until the resource is stopped or -- * started successfully. This is especially important if the -- * controller also went away (possibly due to a cluster layer -- * restart) and won't receive our client notification of any -- * monitors finalized below. -+ * return an error until the resource is stopped or started -+ * successfully. This is especially important if the controller also -+ * went away (possibly due to a cluster layer restart) and won't -+ * receive our client notification of any monitors finalized below. - */ -- if (rsc->st_probe_rc == pcmk_ok) { -- rsc->st_probe_rc = pcmk_err_generic; -+ if (rsc->fence_probe_result == PCMK_EXEC_DONE) { -+ rsc->fence_probe_result = PCMK_EXEC_NOT_CONNECTED; - } - - if (rsc->active) { -@@ -1213,6 +1214,39 @@ execd_stonith_monitor(stonith_t *stonith_api, lrmd_rsc_t *rsc, lrmd_cmd_t *cmd) - return rc; - } - -+/*! -+ * \internal -+ * \brief Finalize the result of a fence device probe -+ * -+ * \param[in] cmd Probe action -+ * \param[in] probe_result Probe result -+ */ -+static void -+finalize_fence_device_probe(lrmd_cmd_t *cmd, enum pcmk_exec_status probe_result) -+{ -+ int exit_status = CRM_EX_ERROR; -+ const char *reason = NULL; -+ -+ switch (probe_result) { -+ case PCMK_EXEC_DONE: // Device is "running" -+ exit_status = CRM_EX_OK; -+ break; -+ -+ case PCMK_EXEC_NO_FENCE_DEVICE: // Device is "not running" -+ break; -+ -+ case PCMK_EXEC_NOT_CONNECTED: // stonith_connection_failed() -+ reason = "Lost connection to fencer"; -+ break; -+ -+ default: // Shouldn't be possible -+ probe_result = PCMK_EXEC_ERROR; -+ reason = "Invalid fence device probe result (bug?)"; -+ break; -+ } -+ stonith_action_complete(cmd, exit_status, probe_result, reason); -+} -+ - static void - lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) - { -@@ -1237,7 +1271,8 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) - if (cmd->interval_ms > 0) { - do_monitor = TRUE; - } else { -- rc = rsc->st_probe_rc; -+ finalize_fence_device_probe(cmd, rsc->fence_probe_result); -+ return; - } - } - -diff --git a/daemons/execd/pacemaker-execd.h b/daemons/execd/pacemaker-execd.h -index 51ef8d22e6..057d889584 100644 ---- a/daemons/execd/pacemaker-execd.h -+++ b/daemons/execd/pacemaker-execd.h -@@ -41,7 +41,14 @@ typedef struct lrmd_rsc_s { - * that have been handed off from the pending ops list. */ - GList *recurring_ops; - -- int st_probe_rc; // What value should be returned for a probe if stonith -+ /* If this resource is a fence device, probes are handled internally by the -+ * executor, and this value indicates the result that should currently be -+ * returned for probes. It should be one of: -+ * PCMK_EXEC_DONE (to indicate "running"), -+ * PCMK_EXEC_NO_FENCE_DEVICE ("not running"), or -+ * PCMK_EXEC_NOT_CONNECTED ("unknown because fencer connection was lost"). -+ */ -+ enum pcmk_exec_status fence_probe_result; - - crm_trigger_t *work; - } lrmd_rsc_t; --- -2.27.0 - - -From 1ab799d945171ab8d91bd0aada64e70a71193e5c Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 10 Nov 2021 12:14:48 -0600 -Subject: [PATCH 08/17] Low: executor: don't require a fencer connection for - probes - -For fence devices, probe results are based on earlier state determinations, -so handle them before requiring an active fencer connection. The effect may be -negligible, but it would allow probes to proceed while waiting for a -reconnection. ---- - daemons/execd/execd_commands.c | 15 ++++++++------- - 1 file changed, 8 insertions(+), 7 deletions(-) - -diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c -index 6e5505e973..5999ba19c9 100644 ---- a/daemons/execd/execd_commands.c -+++ b/daemons/execd/execd_commands.c -@@ -1255,7 +1255,13 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) - - stonith_t *stonith_api = get_stonith_connection(); - -- if (!stonith_api) { -+ if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei) -+ && (cmd->interval_ms == 0)) { -+ // Probes don't require a fencer connection -+ finalize_fence_device_probe(cmd, rsc->fence_probe_result); -+ return; -+ -+ } else if (stonith_api == NULL) { - rc = -ENOTCONN; - - } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { -@@ -1268,12 +1274,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) - rc = execd_stonith_stop(stonith_api, rsc); - - } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { -- if (cmd->interval_ms > 0) { -- do_monitor = TRUE; -- } else { -- finalize_fence_device_probe(cmd, rsc->fence_probe_result); -- return; -- } -+ do_monitor = TRUE; - } - - if (do_monitor) { --- -2.27.0 - - -From adf41fb1637bcc9a6e057be52d61a0b26e4535cc Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 10 Nov 2021 12:20:34 -0600 -Subject: [PATCH 09/17] Low: executor: return an error for unsupported fence - device actions - -... and set an exit reason. Previously, it would return success for unsupported -actions. It shouldn't be possible, but it would be nice to have an indication -of what is wrong if a bug is introduced. ---- - daemons/execd/execd_commands.c | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c -index 5999ba19c9..772d6446dc 100644 ---- a/daemons/execd/execd_commands.c -+++ b/daemons/execd/execd_commands.c -@@ -1275,6 +1275,12 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) - - } else if (pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { - do_monitor = TRUE; -+ -+ } else { -+ stonith_action_complete(cmd, PCMK_OCF_UNIMPLEMENT_FEATURE, -+ PCMK_EXEC_ERROR, -+ "Invalid fence device action (bug?)"); -+ return; - } - - if (do_monitor) { --- -2.27.0 - - -From af59dfe85bc83f5609d0a3b3b7939271549cb76f Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 10 Nov 2021 12:24:07 -0600 -Subject: [PATCH 10/17] Low: executor: set exit reason if no fencer connection - ---- - daemons/execd/execd_commands.c | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c -index 772d6446dc..7ae309d94c 100644 ---- a/daemons/execd/execd_commands.c -+++ b/daemons/execd/execd_commands.c -@@ -1262,7 +1262,10 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) - return; - - } else if (stonith_api == NULL) { -- rc = -ENOTCONN; -+ stonith_action_complete(cmd, PCMK_OCF_UNKNOWN_ERROR, -+ PCMK_EXEC_NOT_CONNECTED, -+ "No connection to fencer"); -+ return; - - } else if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { - rc = execd_stonith_start(stonith_api, rsc, cmd); --- -2.27.0 - - -From ad0930b75d5617490c3a0dc3c6b83411b3c4536d Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 10 Nov 2021 14:42:26 -0600 -Subject: [PATCH 11/17] Test: cts-fence-helper: log full result in fence - callback - ---- - daemons/fenced/cts-fence-helper.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c -index 2adb032f24..c2b55d73b9 100644 ---- a/daemons/fenced/cts-fence-helper.c -+++ b/daemons/fenced/cts-fence-helper.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2009-2020 the Pacemaker project contributors -+ * Copyright 2009-2021 the Pacemaker project contributors - * - * This source code is licensed under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. -@@ -132,7 +132,10 @@ st_callback(stonith_t * st, stonith_event_t * e) - static void - st_global_callback(stonith_t * stonith, stonith_callback_data_t * data) - { -- crm_notice("Call id %d completed with rc %d", data->call_id, data->rc); -+ crm_notice("Call %d exited %d: %s (%s)", -+ data->call_id, stonith__exit_status(data), -+ stonith__execution_status(data), -+ crm_str(stonith__exit_reason(data))); - } - - static void --- -2.27.0 - - -From 1b50ff4d83b7a96cd70389891b7b6568812f66f6 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 10 Nov 2021 15:10:14 -0600 -Subject: [PATCH 12/17] Test: cts-fence-helper: track full result instead of - legacy return code - ---- - daemons/fenced/cts-fence-helper.c | 77 +++++++++++++++---------------- - 1 file changed, 37 insertions(+), 40 deletions(-) - -diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c -index c2b55d73b9..2739f57804 100644 ---- a/daemons/fenced/cts-fence-helper.c -+++ b/daemons/fenced/cts-fence-helper.c -@@ -34,23 +34,12 @@ - static GMainLoop *mainloop = NULL; - static crm_trigger_t *trig = NULL; - static int mainloop_iter = 0; --static int callback_rc = 0; -+static pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -+ - typedef void (*mainloop_test_iteration_cb) (int check_event); - - #define MAINLOOP_DEFAULT_TIMEOUT 2 - --#define mainloop_test_done(pass) \ -- if (pass) { \ -- crm_info("SUCCESS - %s", __func__); \ -- mainloop_iter++; \ -- mainloop_set_trigger(trig); \ -- } else { \ -- crm_err("FAILURE = %s async_callback %d", __func__, callback_rc); \ -- crm_exit(CRM_EX_ERROR); \ -- } \ -- callback_rc = 0; \ -- -- - enum test_modes { - test_standard = 0, // test using a specific developer environment - test_passive, // watch notifications only -@@ -93,6 +82,23 @@ static const int st_opts = st_opt_sync_call; - static int expected_notifications = 0; - static int verbose = 0; - -+static void -+mainloop_test_done(const char *origin, bool pass) -+{ -+ if (pass) { -+ crm_info("SUCCESS - %s", origin); -+ mainloop_iter++; -+ mainloop_set_trigger(trig); -+ result.execution_status = PCMK_EXEC_UNKNOWN; -+ result.exit_status = CRM_EX_OK; -+ } else { -+ crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status, -+ pcmk_exec_status_str(result.execution_status)); -+ crm_exit(CRM_EX_ERROR); -+ } -+} -+ -+ - static void - dispatch_helper(int timeout) - { -@@ -385,7 +391,9 @@ static void - static void - mainloop_callback(stonith_t * stonith, stonith_callback_data_t * data) - { -- callback_rc = data->rc; -+ pcmk__set_result(&result, stonith__exit_status(data), -+ stonith__execution_status(data), -+ stonith__exit_reason(data)); - iterate_mainloop_tests(TRUE); - } - -@@ -404,18 +412,14 @@ test_async_fence_pass(int check_event) - int rc = 0; - - if (check_event) { -- if (callback_rc != 0) { -- mainloop_test_done(FALSE); -- } else { -- mainloop_test_done(TRUE); -- } -+ mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); - return; - } - - rc = st->cmds->fence(st, 0, "true_1_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); - if (rc < 0) { - crm_err("fence failed with rc %d", rc); -- mainloop_test_done(FALSE); -+ mainloop_test_done(__func__, false); - } - register_callback_helper(rc); - /* wait for event */ -@@ -431,15 +435,15 @@ test_async_fence_custom_timeout(int check_event) - if (check_event) { - uint32_t diff = (time(NULL) - begin); - -- if (callback_rc != -ETIME) { -- mainloop_test_done(FALSE); -+ if (result.execution_status != PCMK_EXEC_TIMEOUT) { -+ mainloop_test_done(__func__, false); - } else if (diff < CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT) { - crm_err - ("Custom timeout test failed, callback expiration should be updated to %d, actual timeout was %d", - CUSTOM_TIMEOUT_ADDITION + MAINLOOP_DEFAULT_TIMEOUT, diff); -- mainloop_test_done(FALSE); -+ mainloop_test_done(__func__, false); - } else { -- mainloop_test_done(TRUE); -+ mainloop_test_done(__func__, true); - } - return; - } -@@ -448,7 +452,7 @@ test_async_fence_custom_timeout(int check_event) - rc = st->cmds->fence(st, 0, "custom_timeout_node1", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); - if (rc < 0) { - crm_err("fence failed with rc %d", rc); -- mainloop_test_done(FALSE); -+ mainloop_test_done(__func__, false); - } - register_callback_helper(rc); - /* wait for event */ -@@ -460,18 +464,15 @@ test_async_fence_timeout(int check_event) - int rc = 0; - - if (check_event) { -- if (callback_rc != -ENODEV) { -- mainloop_test_done(FALSE); -- } else { -- mainloop_test_done(TRUE); -- } -+ mainloop_test_done(__func__, -+ (result.execution_status == PCMK_EXEC_NO_FENCE_DEVICE)); - return; - } - - rc = st->cmds->fence(st, 0, "false_1_node2", "off", MAINLOOP_DEFAULT_TIMEOUT, 0); - if (rc < 0) { - crm_err("fence failed with rc %d", rc); -- mainloop_test_done(FALSE); -+ mainloop_test_done(__func__, false); - } - register_callback_helper(rc); - /* wait for event */ -@@ -483,18 +484,14 @@ test_async_monitor(int check_event) - int rc = 0; - - if (check_event) { -- if (callback_rc) { -- mainloop_test_done(FALSE); -- } else { -- mainloop_test_done(TRUE); -- } -+ mainloop_test_done(__func__, (result.exit_status == CRM_EX_OK)); - return; - } - - rc = st->cmds->monitor(st, 0, "false_1", MAINLOOP_DEFAULT_TIMEOUT); - if (rc < 0) { - crm_err("monitor failed with rc %d", rc); -- mainloop_test_done(FALSE); -+ mainloop_test_done(__func__, false); - } - - register_callback_helper(rc); -@@ -531,7 +528,7 @@ test_register_async_devices(int check_event) - params); - stonith_key_value_freeall(params, 1, 1); - -- mainloop_test_done(TRUE); -+ mainloop_test_done(__func__, true); - } - - static void -@@ -540,11 +537,11 @@ try_mainloop_connect(int check_event) - int rc = stonith_api_connect_retry(st, crm_system_name, 10); - - if (rc == pcmk_ok) { -- mainloop_test_done(TRUE); -+ mainloop_test_done(__func__, true); - return; - } - crm_err("API CONNECTION FAILURE"); -- mainloop_test_done(FALSE); -+ mainloop_test_done(__func__, false); - } - - static void --- -2.27.0 - - -From 8ff4b384a34828a4a9eebe896324ba8c89e5d66c Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 10 Jan 2022 10:27:45 -0600 -Subject: [PATCH 13/17] Doc: Pacemaker Development: correct typo - -caught in review ---- - doc/sphinx/Pacemaker_Development/components.rst | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst -index 68158484ce..c4d10fc9f5 100644 ---- a/doc/sphinx/Pacemaker_Development/components.rst -+++ b/doc/sphinx/Pacemaker_Development/components.rst -@@ -171,7 +171,7 @@ messaging layer callback, which calls: - - * ``fenced_process_fencing_reply()``, which calls either - ``request_peer_fencing()`` (to retry a failed operation, or try the next -- device in a topology is appropriate, which issues a new -+ device in a topology if appropriate, which issues a new - ``STONITH_OP_FENCE`` request, proceeding as before) or - ``finalize_op()`` (if the operation is definitively failed or - successful). --- -2.27.0 - - -From 822ee6fbd8583a2939c636b3bccceffcc338c567 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 10 Jan 2022 11:05:40 -0600 -Subject: [PATCH 14/17] Doc: Pacemaker Development: add a placeholder for how - fencing history works - ---- - doc/sphinx/Pacemaker_Development/components.rst | 15 +++++++++++++++ - 1 file changed, 15 insertions(+) - -diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst -index c4d10fc9f5..760da77c9b 100644 ---- a/doc/sphinx/Pacemaker_Development/components.rst -+++ b/doc/sphinx/Pacemaker_Development/components.rst -@@ -183,6 +183,21 @@ Finally, all peers receive the broadcast result and call - * ``finalize_op()``, which sends the result to all local clients. - - -+.. index:: -+ single: fence history -+ -+Fencing History -+_______________ -+ -+The fencer keeps a running history of all fencing operations. The bulk of the -+relevant code is in `fenced_history.c` and ensures the history is synchronized -+across all nodes even if a node leaves and rejoins the cluster. -+ -+In libstonithd, this information is represented by `stonith_history_t` and is -+queryable by the `stonith_api_operations_t:history()` method. `crm_mon` and -+`stonith_admin` use this API to display the history. -+ -+ - .. index:: - single: scheduler - single: pacemaker-schedulerd --- -2.27.0 - - -From d9b4060f2dadb40d5ee7535e0b2890a83d216c1e Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 10 Jan 2022 11:25:31 -0600 -Subject: [PATCH 15/17] Log: fencing: add exit reason for results without a - callback - ---- - lib/fencing/st_client.c | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 9d93ffd481..4823751267 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -926,9 +926,11 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - cb_info->user_data, cb_info->callback); - - } else if ((private->op_callback == NULL) && !pcmk__result_ok(&result)) { -- crm_warn("Fencing action without registered callback failed: %d (%s)", -+ crm_warn("Fencing action without registered callback failed: %d (%s%s%s)", - result.exit_status, -- pcmk_exec_status_str(result.execution_status)); -+ pcmk_exec_status_str(result.execution_status), -+ ((result.exit_reason == NULL)? "" : ": "), -+ ((result.exit_reason == NULL)? "" : result.exit_reason)); - crm_log_xml_debug(msg, "Failed fence update"); - } - --- -2.27.0 - - -From 9956b3ad2f1c6fba305252616ad0b35a38ab96da Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 11 Jan 2022 09:28:27 -0600 -Subject: [PATCH 16/17] Refactor: executor: keep formatting consistent - -... even if the line runs a little long ---- - daemons/execd/execd_commands.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/daemons/execd/execd_commands.c b/daemons/execd/execd_commands.c -index 7ae309d94c..bc3b392b2c 100644 ---- a/daemons/execd/execd_commands.c -+++ b/daemons/execd/execd_commands.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2012-2021 the Pacemaker project contributors -+ * Copyright 2012-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -1297,7 +1297,7 @@ lrmd_rsc_execute_stonith(lrmd_rsc_t * rsc, lrmd_cmd_t * cmd) - stonith_action_complete(cmd, - ((rc == pcmk_ok)? CRM_EX_OK : CRM_EX_ERROR), - stonith__legacy2status(rc), -- rc == -pcmk_err_generic? NULL : pcmk_strerror(rc)); -+ ((rc == -pcmk_err_generic)? NULL : pcmk_strerror(rc))); - } - - static int --- -2.27.0 - - -From 69d8ecb17568d6c3ecad0e5735756f58a4bce5a1 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 11 Jan 2022 09:29:03 -0600 -Subject: [PATCH 17/17] Test: cts-fence-helper: use more intuitive execution - status for completed tests - -It doesn't matter since the value is only checked against a couple of specific -failure values, but this is less confusing. ---- - daemons/fenced/cts-fence-helper.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c -index 2739f57804..e222a59f9f 100644 ---- a/daemons/fenced/cts-fence-helper.c -+++ b/daemons/fenced/cts-fence-helper.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2009-2021 the Pacemaker project contributors -+ * Copyright 2009-2022 the Pacemaker project contributors - * - * This source code is licensed under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. -@@ -89,7 +89,7 @@ mainloop_test_done(const char *origin, bool pass) - crm_info("SUCCESS - %s", origin); - mainloop_iter++; - mainloop_set_trigger(trig); -- result.execution_status = PCMK_EXEC_UNKNOWN; -+ result.execution_status = PCMK_EXEC_DONE; - result.exit_status = CRM_EX_OK; - } else { - crm_err("FAILURE - %s (%d: %s)", origin, result.exit_status, --- -2.27.0 - diff --git a/SOURCES/012-notify-crash.patch b/SOURCES/012-notify-crash.patch deleted file mode 100644 index c18e4f5..0000000 --- a/SOURCES/012-notify-crash.patch +++ /dev/null @@ -1,65 +0,0 @@ -From ed8b2c86ab77aaa3d7fd688c049ad5e1b922a9c6 Mon Sep 17 00:00:00 2001 -From: Reid Wahl -Date: Thu, 13 Jan 2022 02:56:55 -0800 -Subject: [PATCH] Fix: liblrmd: Avoid double-free during notify operation - -This commit fixes a regression introduced by 31c7fa8a, causing a -double-free in notify operations. lrmd_dispatch_internal() assigns the -exit_reason string directly from an XML node to a new lrmd_event_data_t -object (without duplicating), and this string gets freed twice. - -Free #1: pcmk__create_history_xml() (reached via callback) calls -lrmd__set_result(), which frees event.exit_reason and sets it to NULL. -Free #2: lrmd_ipc_dispatch() frees the XML node, which contains a -pointer to the exit_reason string just freed, after -lrmd_dispatch_internal() returns. - -Prior to 31c7fa8a, pcmk__create_history_xml reset event.rc and -event.op_status but **not** event.exit_reason. - -In this commit we simply make a copy of event.exit_reason in -lrmd_dispatch_internal() before the callback. This way we don't have to -worry about whatever happens in the callback, and we can continue to -unset the exit_reason alongside the rc and op_status. The added overhead -should be minimal. - -This commit also makes a copy of output. That's not strictly necessary -but adds some futureproofing and allows us to call lrmd__reset_result() -at the end of lrmd_dispatch_internal(). - -Resolves: RHBZ#2039675 - -Signed-off-by: Reid Wahl ---- - lib/lrmd/lrmd_client.c | 8 +++++--- - 1 file changed, 5 insertions(+), 3 deletions(-) - -diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c -index ee31bb5ae9..5131a648b7 100644 ---- a/lib/lrmd/lrmd_client.c -+++ b/lib/lrmd/lrmd_client.c -@@ -305,9 +305,10 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg) - event.user_data = crm_element_value(msg, F_LRMD_RSC_USERDATA_STR); - event.type = lrmd_event_exec_complete; - -- // No need to duplicate the memory, so don't use setter functions -- event.output = crm_element_value(msg, F_LRMD_RSC_OUTPUT); -- event.exit_reason = crm_element_value(msg, F_LRMD_RSC_EXIT_REASON); -+ /* output and exit_reason may be freed by a callback */ -+ event.output = crm_element_value_copy(msg, F_LRMD_RSC_OUTPUT); -+ lrmd__set_result(&event, event.rc, event.op_status, -+ crm_element_value(msg, F_LRMD_RSC_EXIT_REASON)); - - event.params = xml2list(msg); - } else if (pcmk__str_eq(type, LRMD_OP_NEW_CLIENT, pcmk__str_none)) { -@@ -324,6 +325,7 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg) - if (event.params) { - g_hash_table_destroy(event.params); - } -+ lrmd__reset_result(&event); - } - - // \return Always 0, to indicate that IPC mainloop source should be kept --- -2.27.0 - diff --git a/SOURCES/013-probe-failures.patch b/SOURCES/013-probe-failures.patch deleted file mode 100644 index c13867e..0000000 --- a/SOURCES/013-probe-failures.patch +++ /dev/null @@ -1,26 +0,0 @@ -From 186d5a02fba919c455fd6eeb050b4be107f82159 Mon Sep 17 00:00:00 2001 -From: Chris Lumens -Date: Thu, 13 Jan 2022 17:02:47 -0500 -Subject: [PATCH] Low: scheduler: Use the old RC code to log maskable probe - failures. - ---- - lib/pengine/unpack.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 8a2d2a6d6d..b01f86257a 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3780,7 +3780,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - - if (maskable_probe_failure) { - crm_notice("Treating probe result '%s' for %s on %s as 'not running'", -- services_ocf_exitcode_str(rc), rsc->id, node->details->uname); -+ services_ocf_exitcode_str(old_rc), rsc->id, node->details->uname); - update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, - on_fail, data_set); - crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); --- -2.27.0 - diff --git a/SOURCES/014-pcmk_delay_base.patch b/SOURCES/014-pcmk_delay_base.patch deleted file mode 100644 index 8aba265..0000000 --- a/SOURCES/014-pcmk_delay_base.patch +++ /dev/null @@ -1,43 +0,0 @@ -From 9d812b0401d4cedef53a3cc3653ec782a5c49e37 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 13 Jan 2022 10:42:02 -0600 -Subject: [PATCH] Doc: fencer: improve pcmk_delay_base meta-data - -Update its type, since its value can now be a node map as well as a string, -and add more detail to its description. ---- - daemons/fenced/pacemaker-fenced.c | 18 +++++++++++------- - 1 file changed, 11 insertions(+), 7 deletions(-) - -diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c -index 1b954be5a4..12f331496c 100644 ---- a/daemons/fenced/pacemaker-fenced.c -+++ b/daemons/fenced/pacemaker-fenced.c -@@ -1548,13 +1548,17 @@ main(int argc, char **argv) - PCMK_STONITH_DELAY_BASE); - printf(" Enable a base delay for " - "fencing actions and specify base delay value.\n"); -- printf(" This prevents double fencing when " -- "different delays are configured on the nodes.\nUse this to " -- "enable a static delay for fencing actions.\nThe overall delay " -- "is derived from a random delay value adding this static delay " -- "so that the sum is kept below the maximum delay.\nSet to eg. " -- "node1:1s;node2:5 to set different value per node.\n"); -- printf(" \n"); -+ printf(" This enables a static delay for " -+ "fencing actions, which can help avoid \"death matches\" where " -+ "two nodes try to fence each other at the same time. If " -+ PCMK_STONITH_DELAY_MAX " is also used, a random delay will be " -+ "added such that the total delay is kept below that value.\n" -+ "This can be set to a single time value to apply to any node " -+ "targeted by this device (useful if a separate device is " -+ "configured for each target), or to a node map (for example, " -+ "\"node1:1s;node2:5\") to set a different value per target.\n" -+ " \n"); -+ printf(" \n"); - printf(" \n"); - - printf(" \n", --- -2.27.0 - diff --git a/SOURCES/015-fencing-reasons.patch b/SOURCES/015-fencing-reasons.patch deleted file mode 100644 index c53b6c9..0000000 --- a/SOURCES/015-fencing-reasons.patch +++ /dev/null @@ -1,1093 +0,0 @@ -From 87365f49b1bee0baa536783865fbd835a9cacc97 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 2 Dec 2021 16:12:24 -0600 -Subject: [PATCH 01/11] Refactor: libstonithd: functionize getting notification - data XML - -Also, only get the data when needed. ---- - lib/fencing/st_client.c | 32 +++++++++++++++++++++++--------- - 1 file changed, 23 insertions(+), 9 deletions(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 4823751267..72a0a49408 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -1312,6 +1312,23 @@ stonith_dump_pending_callbacks(stonith_t * stonith) - return g_hash_table_foreach(private->stonith_op_callback_table, stonith_dump_pending_op, NULL); - } - -+/*! -+ * \internal -+ * \brief Get the data section of a fencer notification -+ * -+ * \param[in] msg Notification XML -+ * \param[in] ntype Notification type -+ */ -+static xmlNode * -+get_event_data_xml(xmlNode *msg, const char *ntype) -+{ -+ char *data_addr = crm_strdup_printf("//%s", ntype); -+ xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); -+ -+ free(data_addr); -+ return data; -+} -+ - /* - - -@@ -1336,17 +1353,18 @@ xml_to_event(xmlNode * msg) - { - stonith_event_t *event = calloc(1, sizeof(stonith_event_t)); - const char *ntype = crm_element_value(msg, F_SUBTYPE); -- char *data_addr = crm_strdup_printf("//%s", ntype); -- xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); - - crm_log_xml_trace(msg, "stonith_notify"); - - crm_element_value_int(msg, F_STONITH_RC, &(event->result)); - - if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { -- event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); -+ xmlNode *data = get_event_data_xml(msg, ntype); - -- if (data) { -+ if (data == NULL) { -+ crm_err("No data for %s event", ntype); -+ crm_log_xml_notice(msg, "BadEvent"); -+ } else { - event->origin = crm_element_value_copy(data, F_STONITH_ORIGIN); - event->action = crm_element_value_copy(data, F_STONITH_ACTION); - event->target = crm_element_value_copy(data, F_STONITH_TARGET); -@@ -1354,14 +1372,10 @@ xml_to_event(xmlNode * msg) - event->id = crm_element_value_copy(data, F_STONITH_REMOTE_OP_ID); - event->client_origin = crm_element_value_copy(data, F_STONITH_CLIENTNAME); - event->device = crm_element_value_copy(data, F_STONITH_DEVICE); -- -- } else { -- crm_err("No data for %s event", ntype); -- crm_log_xml_notice(msg, "BadEvent"); - } -+ event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); - } - -- free(data_addr); - return event; - } - --- -2.27.0 - - -From 448f86a029d5d7e3c255d813929003a8cc2cffba Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 17:01:23 -0600 -Subject: [PATCH 02/11] Refactor: fencing: parse full result from fencer - notifications - -stonith_event_t previously contained only the legacy return code for the -notification event. Use its new opaque member to store the full result, along -with accessors (available only internally for now). Nothing uses them yet. ---- - include/crm/fencing/internal.h | 5 +++ - lib/fencing/st_client.c | 68 ++++++++++++++++++++++++++++++++-- - 2 files changed, 70 insertions(+), 3 deletions(-) - -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index eff689e59b..acc16d05e9 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -187,10 +187,15 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data); - bool stonith__event_state_neq(stonith_history_t *history, void *user_data); - - int stonith__legacy2status(int rc); -+ - int stonith__exit_status(stonith_callback_data_t *data); - int stonith__execution_status(stonith_callback_data_t *data); - const char *stonith__exit_reason(stonith_callback_data_t *data); - -+int stonith__event_exit_status(stonith_event_t *event); -+int stonith__event_execution_status(stonith_event_t *event); -+const char *stonith__event_exit_reason(stonith_event_t *event); -+ - /*! - * \internal - * \brief Is a fencing operation in pending state? -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 72a0a49408..f58b3a6745 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -1349,15 +1349,23 @@ get_event_data_xml(xmlNode *msg, const char *ntype) - - */ - static stonith_event_t * --xml_to_event(xmlNode * msg) -+xml_to_event(xmlNode *msg, pcmk__action_result_t *result) - { - stonith_event_t *event = calloc(1, sizeof(stonith_event_t)); - const char *ntype = crm_element_value(msg, F_SUBTYPE); - -+ CRM_ASSERT((event != NULL) && (result != NULL)); -+ - crm_log_xml_trace(msg, "stonith_notify"); - -- crm_element_value_int(msg, F_STONITH_RC, &(event->result)); -+ // All notification types have the operation result -+ event->opaque = result; -+ stonith__xe_get_result(msg, result); -+ -+ // @COMPAT The API originally provided the result as a legacy return code -+ event->result = pcmk_rc2legacy(stonith__result2rc(result)); - -+ // Fence notifications have additional information - if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { - xmlNode *data = get_event_data_xml(msg, ntype); - -@@ -1392,6 +1400,7 @@ event_free(stonith_event_t * event) - free(event->executioner); - free(event->device); - free(event->client_origin); -+ pcmk__reset_result((pcmk__action_result_t *) (event->opaque)); - free(event); - } - -@@ -1402,6 +1411,7 @@ stonith_send_notification(gpointer data, gpointer user_data) - stonith_notify_client_t *entry = data; - stonith_event_t *st_event = NULL; - const char *event = NULL; -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - if (blob->xml == NULL) { - crm_warn("Skipping callback - NULL message"); -@@ -1427,7 +1437,7 @@ stonith_send_notification(gpointer data, gpointer user_data) - return; - } - -- st_event = xml_to_event(blob->xml); -+ st_event = xml_to_event(blob->xml, &result); - - crm_trace("Invoking callback for %p/%s event...", entry, event); - entry->notify(blob->stonith, st_event); -@@ -2366,6 +2376,58 @@ stonith__exit_reason(stonith_callback_data_t *data) - return ((pcmk__action_result_t *) data->opaque)->exit_reason; - } - -+/*! -+ * \internal -+ * \brief Return the exit status from an event notification -+ * -+ * \param[in] event Event -+ * -+ * \return Exit status from event -+ */ -+int -+stonith__event_exit_status(stonith_event_t *event) -+{ -+ if ((event == NULL) || (event->opaque == NULL)) { -+ return CRM_EX_ERROR; -+ } -+ return ((pcmk__action_result_t *) event->opaque)->exit_status; -+} -+ -+/*! -+ * \internal -+ * \brief Return the execution status from an event notification -+ * -+ * \param[in] event Event -+ * -+ * \return Execution status from event -+ */ -+int -+stonith__event_execution_status(stonith_event_t *event) -+{ -+ if ((event == NULL) || (event->opaque == NULL)) { -+ return PCMK_EXEC_UNKNOWN; -+ } -+ return ((pcmk__action_result_t *) event->opaque)->execution_status; -+} -+ -+/*! -+ * \internal -+ * \brief Return the exit reason from an event notification -+ * -+ * \param[in] event Event -+ * -+ * \return Exit reason from event -+ */ -+const char * -+stonith__event_exit_reason(stonith_event_t *event) -+{ -+ if ((event == NULL) || (event->opaque == NULL)) { -+ return NULL; -+ } -+ return ((pcmk__action_result_t *) event->opaque)->exit_reason; -+} -+ -+ - // Deprecated functions kept only for backward API compatibility - // LCOV_EXCL_START - --- -2.27.0 - - -From 8dab65e65fe760052d1151749a7bfb2203445813 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 19 Nov 2021 17:02:28 -0600 -Subject: [PATCH 03/11] Refactor: fencing: parse full result from synchronous - fencer replies - -stonith_send_command() now parses the full result from synchronous fencer -replies, and maps that to a legacy return code, rather than parse the legacy -return code directly. - -The full result is not used yet, and won't be until we can break backward API -compatibility, since the API functions that call stonith_send_command() -currently return a legacy code. ---- - lib/fencing/st_client.c | 8 +++++--- - 1 file changed, 5 insertions(+), 3 deletions(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index f58b3a6745..5fec7529e3 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -1537,11 +1537,13 @@ stonith_send_command(stonith_t * stonith, const char *op, xmlNode * data, xmlNod - crm_element_value_int(op_reply, F_STONITH_CALLID, &reply_id); - - if (reply_id == stonith->call_id) { -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -+ - crm_trace("Synchronous reply %d received", reply_id); - -- if (crm_element_value_int(op_reply, F_STONITH_RC, &rc) != 0) { -- rc = -ENOMSG; -- } -+ stonith__xe_get_result(op_reply, &result); -+ rc = pcmk_rc2legacy(stonith__result2rc(&result)); -+ pcmk__reset_result(&result); - - if ((call_options & st_opt_discard_reply) || output_data == NULL) { - crm_trace("Discarding reply"); --- -2.27.0 - - -From 1beb319d8c62ab93b4c08b26a4e03151906c6189 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 6 Dec 2021 17:13:44 -0600 -Subject: [PATCH 04/11] Log: fencing: improve cts-fence-helper result logs - -Use the full result from the fencing event ---- - daemons/fenced/cts-fence-helper.c | 12 ++++++++---- - 1 file changed, 8 insertions(+), 4 deletions(-) - -diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c -index e222a59f9f..858cddc9de 100644 ---- a/daemons/fenced/cts-fence-helper.c -+++ b/daemons/fenced/cts-fence-helper.c -@@ -125,10 +125,14 @@ st_callback(stonith_t * st, stonith_event_t * e) - crm_exit(CRM_EX_DISCONNECT); - } - -- crm_notice("Operation %s requested by %s %s for peer %s. %s reported: %s (ref=%s)", -- e->operation, e->origin, e->result == pcmk_ok ? "completed" : "failed", -- e->target, e->executioner ? e->executioner : "", -- pcmk_strerror(e->result), e->id); -+ crm_notice("Operation '%s' targeting %s by %s for %s: %s (exit=%d, ref=%s)", -+ ((e->operation == NULL)? "unknown" : e->operation), -+ ((e->target == NULL)? "no node" : e->target), -+ ((e->executioner == NULL)? "any node" : e->executioner), -+ ((e->origin == NULL)? "unknown client" : e->origin), -+ pcmk_exec_status_str(stonith__event_execution_status(e)), -+ stonith__event_exit_status(e), -+ ((e->id == NULL)? "none" : e->id)); - - if (expected_notifications) { - expected_notifications--; --- -2.27.0 - - -From b26f701833ade5d7441fba317832d6e827bd16d0 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 14 Dec 2021 16:52:09 -0600 -Subject: [PATCH 05/11] Test: cts-fence-helper: update expected return code - -Before recent changes, libstonithd obtained the fence API's legacy result code -directly from the fencer's XML reply, meaning that the legacy code was the -result of the fencer's mapping of the full result (including the action stderr). - -After those changes, libstonithd now ignores the legacy code in the fencer's -reply, and instead maps the legacy code itself from the full result in the -fencer's reply. - -However, the fencer's reply does not have the action stderr, so failures that -mapped to -pcmk_err_generic on the server side now map to -ENODATA on the -client side. Update cts-fence-helper's expected return code to match (neither -code is particularly useful, so there wouldn't be much benefit from having the -fencer pass the action stderr with replies, which would be considerable -additional work). ---- - daemons/fenced/cts-fence-helper.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c -index 858cddc9de..e3113452ef 100644 ---- a/daemons/fenced/cts-fence-helper.c -+++ b/daemons/fenced/cts-fence-helper.c -@@ -207,10 +207,10 @@ run_fence_failure_test(void) - "Register device1 for failure test", 1, 0); - - single_test(st->cmds->fence(st, st_opts, "false_1_node2", "off", 3, 0), -- "Fence failure results off", 1, -pcmk_err_generic); -+ "Fence failure results off", 1, -ENODATA); - - single_test(st->cmds->fence(st, st_opts, "false_1_node2", "reboot", 3, 0), -- "Fence failure results reboot", 1, -pcmk_err_generic); -+ "Fence failure results reboot", 1, -ENODATA); - - single_test(st->cmds->remove_device(st, st_opts, "test-id1"), - "Remove device1 for failure test", 1, 0); --- -2.27.0 - - -From 123429de229c2148e320c76530b95e6ba458b9f6 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 7 Dec 2021 10:28:48 -0600 -Subject: [PATCH 06/11] Low: controller: compare fencing targets - case-insensitively - -... since they are node names ---- - daemons/controld/controld_fencing.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index f8d2fc13f4..70e141dc28 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -466,7 +466,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) - return; - - } else if ((st_event->result == pcmk_ok) -- && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_none)) { -+ && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_casei)) { - - /* We were notified of our own fencing. Most likely, either fencing was - * misconfigured, or fabric fencing that doesn't cut cluster --- -2.27.0 - - -From 3a067b8e58b3aefb49b2af1c35d0ad28b2de8784 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 7 Dec 2021 10:37:56 -0600 -Subject: [PATCH 07/11] Refactor: controller: best practices for handling - fencing notifications - -Rename tengine_stonith_notify() to handle_fence_notification(), rename its -st_event argument to event, add a doxygen block, and use some new variables and -reformatting to make it easier to follow (and change later). ---- - daemons/controld/controld_fencing.c | 131 ++++++++++++++++------------ - 1 file changed, 75 insertions(+), 56 deletions(-) - -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index 70e141dc28..00626444da 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -435,39 +435,59 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) - } - } - -+/*! -+ * \internal -+ * \brief Handle an event notification from the fencing API -+ * -+ * \param[in] st Fencing API connection -+ * \param[in] event Fencing API event notification -+ */ - static void --tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) -+handle_fence_notification(stonith_t *st, stonith_event_t *event) - { -+ bool succeeded = true; -+ const char *executioner = "the cluster"; -+ const char *client = "a client"; -+ - if (te_client_id == NULL) { - te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, - (unsigned long) getpid()); - } - -- if (st_event == NULL) { -+ if (event == NULL) { - crm_err("Notify data not found"); - return; - } - -- crmd_alert_fencing_op(st_event); -+ if (event->executioner != NULL) { -+ executioner = event->executioner; -+ } -+ if (event->client_origin != NULL) { -+ client = event->client_origin; -+ } - -- if ((st_event->result == pcmk_ok) && pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { -- crm_notice("%s was successfully unfenced by %s (at the request of %s)", -- st_event->target, -- st_event->executioner? st_event->executioner : "", -- st_event->origin); -- /* TODO: Hook up st_event->device */ -- return; -+ if (event->result != pcmk_ok) { -+ succeeded = false; -+ } - -- } else if (pcmk__str_eq("on", st_event->action, pcmk__str_casei)) { -- crm_err("Unfencing of %s by %s failed: %s (%d)", -- st_event->target, -- st_event->executioner? st_event->executioner : "", -- pcmk_strerror(st_event->result), st_event->result); -- return; -+ crmd_alert_fencing_op(event); - -- } else if ((st_event->result == pcmk_ok) -- && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_casei)) { -+ if (pcmk__str_eq("on", event->action, pcmk__str_none)) { -+ // Unfencing doesn't need special handling, just a log message -+ if (succeeded) { -+ crm_notice("%s was successfully unfenced by %s (at the request of %s)", -+ event->target, executioner, event->origin); -+ /* TODO: Hook up event->device */ -+ } else { -+ crm_err("Unfencing of %s by %s failed: %s (%d)", -+ event->target, executioner, -+ pcmk_strerror(st_event->result), st_event->result); -+ } -+ return; -+ } - -+ if (succeeded -+ && pcmk__str_eq(event->target, fsa_our_uname, pcmk__str_casei)) { - /* We were notified of our own fencing. Most likely, either fencing was - * misconfigured, or fabric fencing that doesn't cut cluster - * communication is in use. -@@ -478,44 +498,41 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) - * our subsequent election votes as "not part of our cluster". - */ - crm_crit("We were allegedly just fenced by %s for %s!", -- st_event->executioner? st_event->executioner : "the cluster", -- st_event->origin); /* Dumps blackbox if enabled */ -+ executioner, event->origin); // Dumps blackbox if enabled - if (fence_reaction_panic) { - pcmk__panic(__func__); - } else { - crm_exit(CRM_EX_FATAL); - } -- return; -+ return; // Should never get here - } - -- /* Update the count of stonith failures for this target, in case we become -+ /* Update the count of fencing failures for this target, in case we become - * DC later. The current DC has already updated its fail count in - * tengine_stonith_callback(). - */ -- if (!AM_I_DC && pcmk__str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { -- if (st_event->result == pcmk_ok) { -- st_fail_count_reset(st_event->target); -+ if (!AM_I_DC -+ && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE, -+ pcmk__str_casei)) { -+ -+ if (succeeded) { -+ st_fail_count_reset(event->target); - } else { -- st_fail_count_increment(st_event->target); -+ st_fail_count_increment(event->target); - } - } - - crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " - CRM_XS " initiator=%s ref=%s", -- st_event->target, st_event->result == pcmk_ok ? "" : " not", -- st_event->action, -- st_event->executioner ? st_event->executioner : "", -- (st_event->client_origin? st_event->client_origin : ""), -- pcmk_strerror(st_event->result), -- st_event->origin, st_event->id); -- -- if (st_event->result == pcmk_ok) { -- crm_node_t *peer = pcmk__search_known_node_cache(0, st_event->target, -+ event->target, (succeeded? "" : " not"), -+ event->action, executioner, client, -+ pcmk_strerror(event->result), -+ event->origin, event->id); -+ -+ if (succeeded) { -+ crm_node_t *peer = pcmk__search_known_node_cache(0, event->target, - CRM_GET_PEER_ANY); - const char *uuid = NULL; -- gboolean we_are_executioner = pcmk__str_eq(st_event->executioner, -- fsa_our_uname, -- pcmk__str_casei); - - if (peer == NULL) { - return; -@@ -523,10 +540,9 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) - - uuid = crm_peer_uuid(peer); - -- crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); -- if(AM_I_DC) { -+ if (AM_I_DC) { - /* The DC always sends updates */ -- send_stonith_update(NULL, st_event->target, uuid); -+ send_stonith_update(NULL, event->target, uuid); - - /* @TODO Ideally, at this point, we'd check whether the fenced node - * hosted any guest nodes, and call remote_node_down() for them. -@@ -536,31 +552,33 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) - * on the scheduler creating fence pseudo-events for the guests. - */ - -- if (st_event->client_origin -- && !pcmk__str_eq(st_event->client_origin, te_client_id, pcmk__str_casei)) { -- -- /* Abort the current transition graph if it wasn't us -- * that invoked stonith to fence someone -+ if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) { -+ /* Abort the current transition if it wasn't the cluster that -+ * initiated fencing. - */ -- crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); -- abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); -+ crm_info("External fencing operation from %s fenced %s", -+ client, event->target); -+ abort_transition(INFINITY, tg_restart, -+ "External Fencing Operation", NULL); - } - - /* Assume it was our leader if we don't currently have one */ -- } else if (pcmk__str_eq(fsa_our_dc, st_event->target, pcmk__str_null_matches | pcmk__str_casei) -+ } else if (pcmk__str_eq(fsa_our_dc, event->target, -+ pcmk__str_null_matches|pcmk__str_casei) - && !pcmk_is_set(peer->flags, crm_remote_node)) { - - crm_notice("Fencing target %s %s our leader", -- st_event->target, (fsa_our_dc? "was" : "may have been")); -+ event->target, (fsa_our_dc? "was" : "may have been")); - - /* Given the CIB resyncing that occurs around elections, - * have one node update the CIB now and, if the new DC is different, - * have them do so too after the election - */ -- if (we_are_executioner) { -- send_stonith_update(NULL, st_event->target, uuid); -+ if (pcmk__str_eq(event->executioner, fsa_our_uname, -+ pcmk__str_casei)) { -+ send_stonith_update(NULL, event->target, uuid); - } -- add_stonith_cleanup(st_event->target); -+ add_stonith_cleanup(event->target); - } - - /* If the target is a remote node, and we host its connection, -@@ -569,7 +587,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) - * so the failure might not otherwise be detected until the next poke. - */ - if (pcmk_is_set(peer->flags, crm_remote_node)) { -- remote_ra_fail(st_event->target); -+ remote_ra_fail(event->target); - } - - crmd_peer_down(peer, TRUE); -@@ -632,7 +650,7 @@ te_connect_stonith(gpointer user_data) - tengine_stonith_connection_destroy); - stonith_api->cmds->register_notification(stonith_api, - T_STONITH_NOTIFY_FENCE, -- tengine_stonith_notify); -+ handle_fence_notification); - stonith_api->cmds->register_notification(stonith_api, - T_STONITH_NOTIFY_HISTORY_SYNCED, - tengine_stonith_history_synced); -@@ -837,7 +855,8 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) - } - - /* Increment the fail count now, so abort_for_stonith_failure() can -- * check it. Non-DC nodes will increment it in tengine_stonith_notify(). -+ * check it. Non-DC nodes will increment it in -+ * handle_fence_notification(). - */ - st_fail_count_increment(target); - abort_for_stonith_failure(abort_action, target, NULL); --- -2.27.0 - - -From 5ec9dcbbe1ee7f6252968f87d7df5a5ea17244fb Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 7 Dec 2021 10:40:21 -0600 -Subject: [PATCH 08/11] Log: controller: improve messages when handling fencing - notifications - -Now that the fencing API provides a full result including exit reasons with -fencing event notifications, make the controller logs more useful and -consistent. ---- - daemons/controld/controld_fencing.c | 34 ++++++++++++++++++++--------- - 1 file changed, 24 insertions(+), 10 deletions(-) - -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index 00626444da..0aa9ef083c 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -448,6 +448,8 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) - bool succeeded = true; - const char *executioner = "the cluster"; - const char *client = "a client"; -+ const char *reason = NULL; -+ int exec_status; - - if (te_client_id == NULL) { - te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, -@@ -466,22 +468,31 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) - client = event->client_origin; - } - -- if (event->result != pcmk_ok) { -+ exec_status = stonith__event_execution_status(event); -+ if ((stonith__event_exit_status(event) != CRM_EX_OK) -+ || (exec_status != PCMK_EXEC_DONE)) { - succeeded = false; -+ if (exec_status == PCMK_EXEC_DONE) { -+ exec_status = PCMK_EXEC_ERROR; -+ } - } -+ reason = stonith__event_exit_reason(event); - - crmd_alert_fencing_op(event); - - if (pcmk__str_eq("on", event->action, pcmk__str_none)) { - // Unfencing doesn't need special handling, just a log message - if (succeeded) { -- crm_notice("%s was successfully unfenced by %s (at the request of %s)", -- event->target, executioner, event->origin); -+ crm_notice("%s was unfenced by %s at the request of %s@%s", -+ event->target, executioner, client, event->origin); - /* TODO: Hook up event->device */ - } else { -- crm_err("Unfencing of %s by %s failed: %s (%d)", -+ crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d", - event->target, executioner, -- pcmk_strerror(st_event->result), st_event->result); -+ pcmk_exec_status_str(exec_status), -+ ((reason == NULL)? "" : ": "), -+ ((reason == NULL)? "" : reason), -+ stonith__event_exit_status(event)); - } - return; - } -@@ -522,12 +533,15 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) - } - } - -- crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " -- CRM_XS " initiator=%s ref=%s", -+ crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: " -+ "%s%s%s%s " CRM_XS " event=%s", - event->target, (succeeded? "" : " not"), -- event->action, executioner, client, -- pcmk_strerror(event->result), -- event->origin, event->id); -+ event->action, executioner, client, event->origin, -+ (succeeded? "OK" : pcmk_exec_status_str(exec_status)), -+ ((reason == NULL)? "" : " ("), -+ ((reason == NULL)? "" : reason), -+ ((reason == NULL)? "" : ")"), -+ event->id); - - if (succeeded) { - crm_node_t *peer = pcmk__search_known_node_cache(0, event->target, --- -2.27.0 - - -From fb484933ce7c8f3325300a9e01a114db1bbb5b70 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 7 Dec 2021 11:33:15 -0600 -Subject: [PATCH 09/11] Refactor: controller: move alert functions into own - source file - ---- - daemons/controld/Makefile.am | 1 + - daemons/controld/controld_alerts.c | 92 +++++++++++++++++++++++++ - daemons/controld/controld_execd_state.c | 75 -------------------- - 3 files changed, 93 insertions(+), 75 deletions(-) - create mode 100644 daemons/controld/controld_alerts.c - -diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am -index db45bcba4a..0a29925c0b 100644 ---- a/daemons/controld/Makefile.am -+++ b/daemons/controld/Makefile.am -@@ -43,6 +43,7 @@ pacemaker_controld_LDADD = $(top_builddir)/lib/fencing/libstonithd.la \ - $(CLUSTERLIBS) - - pacemaker_controld_SOURCES = pacemaker-controld.c \ -+ controld_alerts.c \ - controld_attrd.c \ - controld_callbacks.c \ - controld_based.c \ -diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c -new file mode 100644 -index 0000000000..bd92795cf0 ---- /dev/null -+++ b/daemons/controld/controld_alerts.c -@@ -0,0 +1,92 @@ -+/* -+ * Copyright 2012-2021 the Pacemaker project contributors -+ * -+ * The version control history for this file may have further details. -+ * -+ * This source code is licensed under the GNU General Public License version 2 -+ * or later (GPLv2+) WITHOUT ANY WARRANTY. -+ */ -+ -+#include -+ -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+static GList *crmd_alert_list = NULL; -+ -+void -+crmd_unpack_alerts(xmlNode *alerts) -+{ -+ pe_free_alert_list(crmd_alert_list); -+ crmd_alert_list = pe_unpack_alerts(alerts); -+} -+ -+void -+crmd_alert_node_event(crm_node_t *node) -+{ -+ lrm_state_t *lrm_state; -+ -+ if (crmd_alert_list == NULL) { -+ return; -+ } -+ -+ lrm_state = lrm_state_find(fsa_our_uname); -+ if (lrm_state == NULL) { -+ return; -+ } -+ -+ lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, -+ node->uname, node->id, node->state); -+} -+ -+void -+crmd_alert_fencing_op(stonith_event_t * e) -+{ -+ char *desc; -+ lrm_state_t *lrm_state; -+ -+ if (crmd_alert_list == NULL) { -+ return; -+ } -+ -+ lrm_state = lrm_state_find(fsa_our_uname); -+ if (lrm_state == NULL) { -+ return; -+ } -+ -+ desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", -+ e->action, e->target, -+ (e->executioner? e->executioner : ""), -+ e->client_origin, e->origin, -+ pcmk_strerror(e->result), e->id); -+ -+ lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, -+ e->target, e->operation, desc, e->result); -+ free(desc); -+} -+ -+void -+crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) -+{ -+ lrm_state_t *lrm_state; -+ -+ if (crmd_alert_list == NULL) { -+ return; -+ } -+ -+ lrm_state = lrm_state_find(fsa_our_uname); -+ if (lrm_state == NULL) { -+ return; -+ } -+ -+ lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node, -+ op); -+} -diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c -index 67c376a426..5dce6c6d59 100644 ---- a/daemons/controld/controld_execd_state.c -+++ b/daemons/controld/controld_execd_state.c -@@ -777,78 +777,3 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state, - */ - return ((lrmd_t *) lrm_state->conn)->cmds->unregister_rsc(lrm_state->conn, rsc_id, options); - } -- --/* -- * Functions for sending alerts via local executor connection -- */ -- --static GList *crmd_alert_list = NULL; -- --void --crmd_unpack_alerts(xmlNode *alerts) --{ -- pe_free_alert_list(crmd_alert_list); -- crmd_alert_list = pe_unpack_alerts(alerts); --} -- --void --crmd_alert_node_event(crm_node_t *node) --{ -- lrm_state_t *lrm_state; -- -- if (crmd_alert_list == NULL) { -- return; -- } -- -- lrm_state = lrm_state_find(fsa_our_uname); -- if (lrm_state == NULL) { -- return; -- } -- -- lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, -- node->uname, node->id, node->state); --} -- --void --crmd_alert_fencing_op(stonith_event_t * e) --{ -- char *desc; -- lrm_state_t *lrm_state; -- -- if (crmd_alert_list == NULL) { -- return; -- } -- -- lrm_state = lrm_state_find(fsa_our_uname); -- if (lrm_state == NULL) { -- return; -- } -- -- desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", -- e->action, e->target, -- (e->executioner? e->executioner : ""), -- e->client_origin, e->origin, -- pcmk_strerror(e->result), e->id); -- -- lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, -- e->target, e->operation, desc, e->result); -- free(desc); --} -- --void --crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) --{ -- lrm_state_t *lrm_state; -- -- if (crmd_alert_list == NULL) { -- return; -- } -- -- lrm_state = lrm_state_find(fsa_our_uname); -- if (lrm_state == NULL) { -- return; -- } -- -- lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node, -- op); --} --- -2.27.0 - - -From 3d0b57406bcde6682623e9d62c8ee95878345eb1 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 7 Dec 2021 11:25:41 -0600 -Subject: [PATCH 10/11] Feature: controller,tools: improve description for - fencing alerts/traps - -This functionizes creating a description for fencing events, so it can be used -by both the controller for alerts and crm_mon for traps, for consistency. - -Now that we have the full result including exit reason, we can improve the -description, but the format is kept similar to before to minimize the change. - -The alert/trap also includes the legacy return code for the event, but we can't -change that now because lrmd_send_fencing_alert() and the alert/trap -environment variables are public API. ---- - daemons/controld/controld_alerts.c | 8 ++----- - include/crm/fencing/internal.h | 1 + - lib/fencing/st_client.c | 38 ++++++++++++++++++++++++++++++ - tools/crm_mon.c | 5 ++-- - 4 files changed, 43 insertions(+), 9 deletions(-) - -diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c -index bd92795cf0..2e0a67dba2 100644 ---- a/daemons/controld/controld_alerts.c -+++ b/daemons/controld/controld_alerts.c -@@ -12,6 +12,7 @@ - #include - #include - -+#include - #include - #include - #include -@@ -62,12 +63,7 @@ crmd_alert_fencing_op(stonith_event_t * e) - return; - } - -- desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)", -- e->action, e->target, -- (e->executioner? e->executioner : ""), -- e->client_origin, e->origin, -- pcmk_strerror(e->result), e->id); -- -+ desc = stonith__event_description(e); - lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, - e->target, e->operation, desc, e->result); - free(desc); -diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h -index acc16d05e9..d2b49f831a 100644 ---- a/include/crm/fencing/internal.h -+++ b/include/crm/fencing/internal.h -@@ -195,6 +195,7 @@ const char *stonith__exit_reason(stonith_callback_data_t *data); - int stonith__event_exit_status(stonith_event_t *event); - int stonith__event_execution_status(stonith_event_t *event); - const char *stonith__event_exit_reason(stonith_event_t *event); -+char *stonith__event_description(stonith_event_t *event); - - /*! - * \internal -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 5fec7529e3..b1de912b2a 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -2429,6 +2429,44 @@ stonith__event_exit_reason(stonith_event_t *event) - return ((pcmk__action_result_t *) event->opaque)->exit_reason; - } - -+/*! -+ * \internal -+ * \brief Return a human-friendly description of a fencing event -+ * -+ * \param[in] event Event to describe -+ * -+ * \return Newly allocated string with description of \p event -+ * \note The caller is responsible for freeing the return value. -+ * This function asserts on memory errors and never returns NULL. -+ * \note This currently is useful only for events of type -+ * T_STONITH_NOTIFY_FENCE. -+ */ -+char * -+stonith__event_description(stonith_event_t *event) -+{ -+ const char *reason; -+ const char *status; -+ -+ if (stonith__event_execution_status(event) != PCMK_EXEC_DONE) { -+ status = pcmk_exec_status_str(stonith__event_execution_status(event)); -+ } else if (stonith__event_exit_status(event) != CRM_EX_OK) { -+ status = pcmk_exec_status_str(PCMK_EXEC_ERROR); -+ } else { -+ status = crm_exit_str(CRM_EX_OK); -+ } -+ reason = stonith__event_exit_reason(event); -+ -+ return crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s%s%s%s (ref=%s)", -+ event->action, event->target, -+ (event->executioner? event->executioner : "the cluster"), -+ (event->client_origin? event->client_origin : "a client"), -+ event->origin, status, -+ ((reason == NULL)? "" : " ("), -+ ((reason == NULL)? "" : reason), -+ ((reason == NULL)? "" : ")"), -+ event->id); -+} -+ - - // Deprecated functions kept only for backward API compatibility - // LCOV_EXCL_START -diff --git a/tools/crm_mon.c b/tools/crm_mon.c -index a6c459aaf7..e7b4fe2847 100644 ---- a/tools/crm_mon.c -+++ b/tools/crm_mon.c -@@ -2237,9 +2237,8 @@ mon_st_callback_event(stonith_t * st, stonith_event_t * e) - /* disconnect cib as well and have everything reconnect */ - mon_cib_connection_destroy(NULL); - } else if (options.external_agent) { -- char *desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)", -- e->operation, e->origin, e->target, pcmk_strerror(e->result), -- e->id); -+ char *desc = stonith__event_description(e); -+ - send_custom_trap(e->target, NULL, e->operation, pcmk_ok, e->result, 0, desc); - free(desc); - } --- -2.27.0 - - -From 2fe03c2165680c717a1f6106c5150be7d117f1a5 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 14 Jan 2022 10:45:03 -0600 -Subject: [PATCH 11/11] Low: controller: compare case-sensitively where - appropriate - ---- - daemons/controld/controld_fencing.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c -index 0aa9ef083c..15954b2358 100644 ---- a/daemons/controld/controld_fencing.c -+++ b/daemons/controld/controld_fencing.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2021 the Pacemaker project contributors -+ * Copyright 2004-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -524,7 +524,7 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) - */ - if (!AM_I_DC - && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE, -- pcmk__str_casei)) { -+ pcmk__str_none)) { - - if (succeeded) { - st_fail_count_reset(event->target); --- -2.27.0 - diff --git a/SOURCES/016-fencing-crash.patch b/SOURCES/016-fencing-crash.patch deleted file mode 100644 index c514c64..0000000 --- a/SOURCES/016-fencing-crash.patch +++ /dev/null @@ -1,56 +0,0 @@ -From e330568504ec379ea42460d21a2e20b1652d9445 Mon Sep 17 00:00:00 2001 -From: Reid Wahl -Date: Fri, 14 Jan 2022 01:35:35 -0800 -Subject: [PATCH] Fix: fencing: Don't set stonith action to pending if fork - fails - -Currently, we set a stonith action to pending if -services_action_async_fork_notify() returns true. However, "true" means -that the svc_action should not be freed. This might be because the -svc_action forked successfully and is pending, or it might be because -the svc_action has already been freed. - -In the case of stonith actions, if we fail to fork, the stonith_action_t -object stored in svc_action->cb_data gets freed by the done callback, -and services_action_async_fork_notify() returns true. If we try to set -the action to pending, it causes a segfault. - -This commit moves the "set to pending" step to the -stonith_action_async_forked() callback. We avoid the segfault and only -set it to pending if it's actually pending. - -A slight difference in ordering was required to achieve this. Now, the -action gets set to pending immediately before being added to the -mainloop, instead of immediately after. - -Signed-off-by: Reid Wahl ---- - lib/fencing/st_actions.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c -index e4e43225cd..306001af69 100644 ---- a/lib/fencing/st_actions.c -+++ b/lib/fencing/st_actions.c -@@ -550,6 +550,9 @@ stonith_action_async_forked(svc_action_t *svc_action) - (action->fork_cb) (svc_action->pid, action->userdata); - } - -+ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, -+ NULL); -+ - crm_trace("Child process %d performing action '%s' successfully forked", - action->pid, action->action); - } -@@ -619,8 +622,6 @@ internal_stonith_action_execute(stonith_action_t * action) - if (services_action_async_fork_notify(svc_action, - &stonith_action_async_done, - &stonith_action_async_forked)) { -- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, -- PCMK_EXEC_PENDING, NULL); - return pcmk_ok; - } - --- -2.27.0 - diff --git a/SOURCES/017-fencing-reasons.patch b/SOURCES/017-fencing-reasons.patch deleted file mode 100644 index 1e100ec..0000000 --- a/SOURCES/017-fencing-reasons.patch +++ /dev/null @@ -1,875 +0,0 @@ -From 523f62eb235836a01ea039c23ada261a494f7b32 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 10 Nov 2021 15:22:47 -0600 -Subject: [PATCH 01/11] Feature: libpacemaker: improve result for high-level - fencing API - -Previously, pcmk__fencing_action()'s helpers for asynchronous fencing actions -initialized the result to a generic error, and then overrode that only on -success. - -Now, set a detailed result for early failures, and use the full result when -available from the fencing API. - -A standard return code is still returned to callers at this point. ---- - lib/pacemaker/pcmk_fence.c | 31 ++++++++++++++++++------------- - 1 file changed, 18 insertions(+), 13 deletions(-) - -diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c -index 7d6acd0de6..125e1b268b 100644 ---- a/lib/pacemaker/pcmk_fence.c -+++ b/lib/pacemaker/pcmk_fence.c -@@ -32,8 +32,8 @@ static struct { - unsigned int timeout; - unsigned int tolerance; - int delay; -- int rc; --} async_fence_data; -+ pcmk__action_result_t result; -+} async_fence_data = { NULL, }; - - static int - handle_level(stonith_t *st, char *target, int fence_level, -@@ -76,14 +76,13 @@ handle_level(stonith_t *st, char *target, int fence_level, - static void - notify_callback(stonith_t * st, stonith_event_t * e) - { -- if (e->result != pcmk_ok) { -- return; -- } -+ if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) -+ && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { - -- if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) && -- pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { -- -- async_fence_data.rc = e->result; -+ pcmk__set_result(&async_fence_data.result, -+ stonith__event_exit_status(e), -+ stonith__event_execution_status(e), -+ stonith__event_exit_reason(e)); - g_main_loop_quit(mainloop); - } - } -@@ -91,8 +90,9 @@ notify_callback(stonith_t * st, stonith_event_t * e) - static void - fence_callback(stonith_t * stonith, stonith_callback_data_t * data) - { -- async_fence_data.rc = data->rc; -- -+ pcmk__set_result(&async_fence_data.result, stonith__exit_status(data), -+ stonith__execution_status(data), -+ stonith__exit_reason(data)); - g_main_loop_quit(mainloop); - } - -@@ -106,6 +106,8 @@ async_fence_helper(gpointer user_data) - if (rc != pcmk_ok) { - fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc)); - g_main_loop_quit(mainloop); -+ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, -+ PCMK_EXEC_NOT_CONNECTED, NULL); - return TRUE; - } - -@@ -121,6 +123,8 @@ async_fence_helper(gpointer user_data) - - if (call_id < 0) { - g_main_loop_quit(mainloop); -+ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, -+ PCMK_EXEC_ERROR, pcmk_strerror(call_id)); - return TRUE; - } - -@@ -146,7 +150,8 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, - async_fence_data.timeout = timeout; - async_fence_data.tolerance = tolerance; - async_fence_data.delay = delay; -- async_fence_data.rc = pcmk_err_generic; -+ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, PCMK_EXEC_UNKNOWN, -+ NULL); - - trig = mainloop_add_trigger(G_PRIORITY_HIGH, async_fence_helper, NULL); - mainloop_set_trigger(trig); -@@ -156,7 +161,7 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, - - free(async_fence_data.name); - -- return pcmk_legacy2rc(async_fence_data.rc); -+ return stonith__result2rc(&async_fence_data.result); - } - - #ifdef BUILD_PUBLIC_LIBPACEMAKER --- -2.27.0 - - -From 008868fae5d1b0d6d8dc61f7acfb3856801ddd52 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 10 Dec 2021 15:36:10 -0600 -Subject: [PATCH 02/11] Refactor: libpacemaker: add exit reason to high-level - fencing API - -Nothing uses it as of this commit ---- - include/pacemaker.h | 5 ++++- - include/pcmki/pcmki_fence.h | 5 ++++- - lib/pacemaker/pcmk_fence.c | 10 +++++++--- - tools/stonith_admin.c | 6 +++--- - 4 files changed, 18 insertions(+), 8 deletions(-) - -diff --git a/include/pacemaker.h b/include/pacemaker.h -index a8523c969e..0daa4c5945 100644 ---- a/include/pacemaker.h -+++ b/include/pacemaker.h -@@ -189,12 +189,15 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); - * again. - * \param[in] delay Apply a fencing delay. Value -1 means disable also any - * static/random fencing delays from pcmk_delay_base/max. -+ * \param[out] reason If not NULL, where to put descriptive failure reason - * - * \return Standard Pacemaker return code -+ * \note If \p reason is not NULL, the caller is responsible for freeing its -+ * returned value. - */ - int pcmk_fence_action(stonith_t *st, const char *target, const char *action, - const char *name, unsigned int timeout, unsigned int tolerance, -- int delay); -+ int delay, char **reason); - - /*! - * \brief List the fencing operations that have occurred for a specific node. -diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h -index d4cef68f5c..c3da0361d7 100644 ---- a/include/pcmki/pcmki_fence.h -+++ b/include/pcmki/pcmki_fence.h -@@ -28,12 +28,15 @@ - * again. - * \param[in] delay Apply a fencing delay. Value -1 means disable also any - * static/random fencing delays from pcmk_delay_base/max -+ * \param[out] reason If not NULL, where to put descriptive failure reason - * - * \return Standard Pacemaker return code -+ * \note If \p reason is not NULL, the caller is responsible for freeing its -+ * returned value. - */ - int pcmk__fence_action(stonith_t *st, const char *target, const char *action, - const char *name, unsigned int timeout, unsigned int tolerance, -- int delay); -+ int delay, char **reason); - - /*! - * \brief List the fencing operations that have occurred for a specific node. -diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c -index 125e1b268b..dbf084fb6b 100644 ---- a/lib/pacemaker/pcmk_fence.c -+++ b/lib/pacemaker/pcmk_fence.c -@@ -139,7 +139,7 @@ async_fence_helper(gpointer user_data) - int - pcmk__fence_action(stonith_t *st, const char *target, const char *action, - const char *name, unsigned int timeout, unsigned int tolerance, -- int delay) -+ int delay, char **reason) - { - crm_trigger_t *trig; - -@@ -161,6 +161,9 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, - - free(async_fence_data.name); - -+ if ((reason != NULL) && (async_fence_data.result.exit_reason != NULL)) { -+ *reason = strdup(async_fence_data.result.exit_reason); -+ } - return stonith__result2rc(&async_fence_data.result); - } - -@@ -168,9 +171,10 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, - int - pcmk_fence_action(stonith_t *st, const char *target, const char *action, - const char *name, unsigned int timeout, unsigned int tolerance, -- int delay) -+ int delay, char **reason) - { -- return pcmk__fence_action(st, target, action, name, timeout, tolerance, delay); -+ return pcmk__fence_action(st, target, action, name, timeout, tolerance, -+ delay, reason); - } - #endif - -diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c -index 2d48326e1b..fdc7c46d49 100644 ---- a/tools/stonith_admin.c -+++ b/tools/stonith_admin.c -@@ -571,17 +571,17 @@ main(int argc, char **argv) - - case 'B': - rc = pcmk__fence_action(st, target, "reboot", name, options.timeout*1000, -- options.tolerance*1000, options.delay); -+ options.tolerance*1000, options.delay, NULL); - break; - - case 'F': - rc = pcmk__fence_action(st, target, "off", name, options.timeout*1000, -- options.tolerance*1000, options.delay); -+ options.tolerance*1000, options.delay, NULL); - break; - - case 'U': - rc = pcmk__fence_action(st, target, "on", name, options.timeout*1000, -- options.tolerance*1000, options.delay); -+ options.tolerance*1000, options.delay, NULL); - break; - - case 'h': --- -2.27.0 - - -From 7570510f9985ba75ef73fb824f28109e135ace0a Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 10 Dec 2021 15:40:48 -0600 -Subject: [PATCH 03/11] Refactor: libpacemaker: rename high-level fencing API - -Rename pcmk_fence_action() to pcmk_request_fencing(), and its internal -equivalent pcmk__fence_action() to pcmk__request_fencing(). The change is -backward-compatible because pcmk_fence_action() has not been exposed publicly -yet. - -"Fence action" can be easily confused with libcrmservice actions, liblrmd -actions, libstonithd actions, scheduler actions, and so forth. - -Also, the new name makes it clearer that the caller is requesting that the -cluster perform fencing, and not directly performing fencing. ---- - include/pacemaker.h | 20 ++++++++++---------- - include/pcmki/pcmki_fence.h | 16 ++++++++-------- - lib/pacemaker/pcmk_fence.c | 16 ++++++++-------- - tools/stonith_admin.c | 18 ++++++++++++------ - 4 files changed, 38 insertions(+), 32 deletions(-) - -diff --git a/include/pacemaker.h b/include/pacemaker.h -index 0daa4c5945..e581f975a9 100644 ---- a/include/pacemaker.h -+++ b/include/pacemaker.h -@@ -177,27 +177,27 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); - #ifdef BUILD_PUBLIC_LIBPACEMAKER - - /*! -- * \brief Perform a STONITH action. -+ * \brief Ask the cluster to perform fencing - * -- * \param[in] st A connection to the STONITH API. -- * \param[in] target The node receiving the action. -- * \param[in] action The action to perform. -+ * \param[in] st A connection to the fencer API -+ * \param[in] target The node that should be fenced -+ * \param[in] action The fencing action (on, off, reboot) to perform - * \param[in] name Who requested the fence action? -- * \param[in] timeout How long to wait for the operation to complete (in ms). -+ * \param[in] timeout How long to wait for the operation to complete (in ms) - * \param[in] tolerance If a successful action for \p target happened within - * this many ms, return 0 without performing the action -- * again. -+ * again - * \param[in] delay Apply a fencing delay. Value -1 means disable also any -- * static/random fencing delays from pcmk_delay_base/max. -+ * static/random fencing delays from pcmk_delay_base/max - * \param[out] reason If not NULL, where to put descriptive failure reason - * - * \return Standard Pacemaker return code - * \note If \p reason is not NULL, the caller is responsible for freeing its - * returned value. - */ --int pcmk_fence_action(stonith_t *st, const char *target, const char *action, -- const char *name, unsigned int timeout, unsigned int tolerance, -- int delay, char **reason); -+int pcmk_request_fencing(stonith_t *st, const char *target, const char *action, -+ const char *name, unsigned int timeout, -+ unsigned int tolerance, int delay, char **reason); - - /*! - * \brief List the fencing operations that have occurred for a specific node. -diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h -index c3da0361d7..e3a7e27264 100644 ---- a/include/pcmki/pcmki_fence.h -+++ b/include/pcmki/pcmki_fence.h -@@ -13,14 +13,14 @@ - # include - - /*! -- * \brief Perform a STONITH action. -+ * \brief Ask the cluster to perform fencing - * -- * \note This is the internal version of pcmk_fence_action(). External users -+ * \note This is the internal version of pcmk_request_fencing(). External users - * of the pacemaker API should use that function instead. - * -- * \param[in] st A connection to the STONITH API. -- * \param[in] target The node receiving the action. -- * \param[in] action The action to perform. -+ * \param[in] st A connection to the fencer API -+ * \param[in] target The node that should be fenced -+ * \param[in] action The fencing action (on, off, reboot) to perform - * \param[in] name Who requested the fence action? - * \param[in] timeout How long to wait for the operation to complete (in ms). - * \param[in] tolerance If a successful action for \p target happened within -@@ -34,9 +34,9 @@ - * \note If \p reason is not NULL, the caller is responsible for freeing its - * returned value. - */ --int pcmk__fence_action(stonith_t *st, const char *target, const char *action, -- const char *name, unsigned int timeout, unsigned int tolerance, -- int delay, char **reason); -+int pcmk__request_fencing(stonith_t *st, const char *target, const char *action, -+ const char *name, unsigned int timeout, -+ unsigned int tolerance, int delay, char **reason); - - /*! - * \brief List the fencing operations that have occurred for a specific node. -diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c -index dbf084fb6b..1b7feb54b2 100644 ---- a/lib/pacemaker/pcmk_fence.c -+++ b/lib/pacemaker/pcmk_fence.c -@@ -137,9 +137,9 @@ async_fence_helper(gpointer user_data) - } - - int --pcmk__fence_action(stonith_t *st, const char *target, const char *action, -- const char *name, unsigned int timeout, unsigned int tolerance, -- int delay, char **reason) -+pcmk__request_fencing(stonith_t *st, const char *target, const char *action, -+ const char *name, unsigned int timeout, -+ unsigned int tolerance, int delay, char **reason) - { - crm_trigger_t *trig; - -@@ -169,12 +169,12 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action, - - #ifdef BUILD_PUBLIC_LIBPACEMAKER - int --pcmk_fence_action(stonith_t *st, const char *target, const char *action, -- const char *name, unsigned int timeout, unsigned int tolerance, -- int delay, char **reason) -+pcmk_request_fencing(stonith_t *st, const char *target, const char *action, -+ const char *name, unsigned int timeout, -+ unsigned int tolerance, int delay, char **reason) - { -- return pcmk__fence_action(st, target, action, name, timeout, tolerance, -- delay, reason); -+ return pcmk__request_fencing(st, target, action, name, timeout, tolerance, -+ delay, reason); - } - #endif - -diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c -index fdc7c46d49..56948b3875 100644 ---- a/tools/stonith_admin.c -+++ b/tools/stonith_admin.c -@@ -570,18 +570,24 @@ main(int argc, char **argv) - break; - - case 'B': -- rc = pcmk__fence_action(st, target, "reboot", name, options.timeout*1000, -- options.tolerance*1000, options.delay, NULL); -+ rc = pcmk__request_fencing(st, target, "reboot", name, -+ options.timeout * 1000, -+ options.tolerance * 1000, -+ options.delay, NULL); - break; - - case 'F': -- rc = pcmk__fence_action(st, target, "off", name, options.timeout*1000, -- options.tolerance*1000, options.delay, NULL); -+ rc = pcmk__request_fencing(st, target, "off", name, -+ options.timeout * 1000, -+ options.tolerance * 1000, -+ options.delay, NULL); - break; - - case 'U': -- rc = pcmk__fence_action(st, target, "on", name, options.timeout*1000, -- options.tolerance*1000, options.delay, NULL); -+ rc = pcmk__request_fencing(st, target, "on", name, -+ options.timeout * 1000, -+ options.tolerance * 1000, -+ options.delay, NULL); - break; - - case 'h': --- -2.27.0 - - -From 247eb303df934944c0b72b162bb661cee6e0ed8b Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 10 Dec 2021 15:52:37 -0600 -Subject: [PATCH 04/11] Refactor: tools: drop unnecessary string duplication in - stonith_admin - ---- - tools/stonith_admin.c | 11 ++++------- - 1 file changed, 4 insertions(+), 7 deletions(-) - -diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c -index 56948b3875..c11e302e76 100644 ---- a/tools/stonith_admin.c -+++ b/tools/stonith_admin.c -@@ -360,8 +360,6 @@ main(int argc, char **argv) - - pcmk__cli_init_logging("stonith_admin", args->verbosity); - -- name = strdup(crm_system_name); -- - rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); - if (rc != pcmk_rc_ok) { - exit_code = CRM_EX_ERROR; -@@ -496,7 +494,7 @@ main(int argc, char **argv) - if (st == NULL) { - rc = -ENOMEM; - } else if (!no_connect) { -- rc = st->cmds->connect(st, name, NULL); -+ rc = st->cmds->connect(st, crm_system_name, NULL); - } - if (rc < 0) { - out->err(out, "Could not connect to fencer: %s", pcmk_strerror(rc)); -@@ -570,21 +568,21 @@ main(int argc, char **argv) - break; - - case 'B': -- rc = pcmk__request_fencing(st, target, "reboot", name, -+ rc = pcmk__request_fencing(st, target, "reboot", crm_system_name, - options.timeout * 1000, - options.tolerance * 1000, - options.delay, NULL); - break; - - case 'F': -- rc = pcmk__request_fencing(st, target, "off", name, -+ rc = pcmk__request_fencing(st, target, "off", crm_system_name, - options.timeout * 1000, - options.tolerance * 1000, - options.delay, NULL); - break; - - case 'U': -- rc = pcmk__request_fencing(st, target, "on", name, -+ rc = pcmk__request_fencing(st, target, "on", crm_system_name, - options.timeout * 1000, - options.tolerance * 1000, - options.delay, NULL); -@@ -619,7 +617,6 @@ main(int argc, char **argv) - out->finish(out, exit_code, true, NULL); - pcmk__output_free(out); - } -- free(name); - stonith_key_value_freeall(options.params, 1, 1); - - if (st != NULL) { --- -2.27.0 - - -From a7888bf6868d8d9d9c77f65ae9983cf748bb0548 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 10 Dec 2021 15:56:34 -0600 -Subject: [PATCH 05/11] Refactor: tools: functionize requesting fencing in - stonith_admin - -... to reduce code duplication and improve readability ---- - tools/stonith_admin.c | 27 +++++++++++++++------------ - 1 file changed, 15 insertions(+), 12 deletions(-) - -diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c -index c11e302e76..f738a9c888 100644 ---- a/tools/stonith_admin.c -+++ b/tools/stonith_admin.c -@@ -331,6 +331,18 @@ build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { - return context; - } - -+// \return Standard Pacemaker return code -+static int -+request_fencing(stonith_t *st, const char *target, const char *command) -+{ -+ int rc = pcmk__request_fencing(st, target, command, crm_system_name, -+ options.timeout * 1000, -+ options.tolerance * 1000, -+ options.delay, NULL); -+ -+ return rc; -+} -+ - int - main(int argc, char **argv) - { -@@ -568,24 +580,15 @@ main(int argc, char **argv) - break; - - case 'B': -- rc = pcmk__request_fencing(st, target, "reboot", crm_system_name, -- options.timeout * 1000, -- options.tolerance * 1000, -- options.delay, NULL); -+ rc = request_fencing(st, target, "reboot"); - break; - - case 'F': -- rc = pcmk__request_fencing(st, target, "off", crm_system_name, -- options.timeout * 1000, -- options.tolerance * 1000, -- options.delay, NULL); -+ rc = request_fencing(st, target, "off"); - break; - - case 'U': -- rc = pcmk__request_fencing(st, target, "on", crm_system_name, -- options.timeout * 1000, -- options.tolerance * 1000, -- options.delay, NULL); -+ rc = request_fencing(st, target, "on"); - break; - - case 'h': --- -2.27.0 - - -From 2da32df780983ec1197e857eed5eeb5bf1101889 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 10 Dec 2021 16:05:19 -0600 -Subject: [PATCH 06/11] Feature: tools: display failure reasons for - stonith_admin fencing commands - -Previously, stonith_admin's --fence/--unfence/--reboot options did not output -any error message on failure. Now, they do, including the exit reason, if -available. ---- - tools/stonith_admin.c | 30 +++++++++++++++++++++++++----- - 1 file changed, 25 insertions(+), 5 deletions(-) - -diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c -index f738a9c888..5590faf11e 100644 ---- a/tools/stonith_admin.c -+++ b/tools/stonith_admin.c -@@ -333,13 +333,33 @@ build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) { - - // \return Standard Pacemaker return code - static int --request_fencing(stonith_t *st, const char *target, const char *command) -+request_fencing(stonith_t *st, const char *target, const char *command, -+ GError **error) - { -+ char *reason = NULL; - int rc = pcmk__request_fencing(st, target, command, crm_system_name, - options.timeout * 1000, - options.tolerance * 1000, -- options.delay, NULL); -+ options.delay, &reason); - -+ if (rc != pcmk_rc_ok) { -+ const char *rc_str = pcmk_rc_str(rc); -+ -+ // If reason is identical to return code string, don't display it twice -+ if (pcmk__str_eq(rc_str, reason, pcmk__str_none)) { -+ free(reason); -+ reason = NULL; -+ } -+ -+ g_set_error(error, PCMK__RC_ERROR, rc, -+ "Couldn't %sfence %s: %s%s%s%s", -+ ((strcmp(command, "on") == 0)? "un" : ""), -+ target, pcmk_rc_str(rc), -+ ((reason == NULL)? "" : " ("), -+ ((reason == NULL)? "" : reason), -+ ((reason == NULL)? "" : ")")); -+ } -+ free(reason); - return rc; - } - -@@ -580,15 +600,15 @@ main(int argc, char **argv) - break; - - case 'B': -- rc = request_fencing(st, target, "reboot"); -+ rc = request_fencing(st, target, "reboot", &error); - break; - - case 'F': -- rc = request_fencing(st, target, "off"); -+ rc = request_fencing(st, target, "off", &error); - break; - - case 'U': -- rc = request_fencing(st, target, "on"); -+ rc = request_fencing(st, target, "on", &error); - break; - - case 'h': --- -2.27.0 - - -From 2d99eba4c326d3b13dbbe446971ea5febd5d05be Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 10 Dec 2021 16:08:49 -0600 -Subject: [PATCH 07/11] Feature: libpacemaker: return exit reason for fencer - connection failures - -... instead of outputting to stderr directly, so that the caller (i.e. -stonith_admin) can output the error in the correct output format. ---- - lib/pacemaker/pcmk_fence.c | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c -index 1b7feb54b2..d17b07cda2 100644 ---- a/lib/pacemaker/pcmk_fence.c -+++ b/lib/pacemaker/pcmk_fence.c -@@ -104,10 +104,9 @@ async_fence_helper(gpointer user_data) - int rc = stonith_api_connect_retry(st, async_fence_data.name, 10); - - if (rc != pcmk_ok) { -- fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc)); - g_main_loop_quit(mainloop); - pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, -- PCMK_EXEC_NOT_CONNECTED, NULL); -+ PCMK_EXEC_NOT_CONNECTED, pcmk_strerror(rc)); - return TRUE; - } - --- -2.27.0 - - -From 4480ef0602f47450bdddfbde360a6a8327710927 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 17 Jan 2022 09:39:39 -0600 -Subject: [PATCH 08/11] Low: libpacemaker: compare fence action names - case-sensitively - ---- - lib/pacemaker/pcmk_fence.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c -index d17b07cda2..2a8f50a555 100644 ---- a/lib/pacemaker/pcmk_fence.c -+++ b/lib/pacemaker/pcmk_fence.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2009-2021 the Pacemaker project contributors -+ * Copyright 2009-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -77,7 +77,7 @@ static void - notify_callback(stonith_t * st, stonith_event_t * e) - { - if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) -- && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) { -+ && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_none)) { - - pcmk__set_result(&async_fence_data.result, - stonith__event_exit_status(e), -@@ -549,7 +549,7 @@ pcmk__reduce_fence_history(stonith_history_t *history) - if ((hp->state == st_done) || (hp->state == st_failed)) { - /* action not in progress */ - if (pcmk__str_eq(hp->target, np->target, pcmk__str_casei) && -- pcmk__str_eq(hp->action, np->action, pcmk__str_casei) && -+ pcmk__str_eq(hp->action, np->action, pcmk__str_none) && - (hp->state == np->state) && - ((hp->state == st_done) || - pcmk__str_eq(hp->delegate, np->delegate, pcmk__str_casei))) { --- -2.27.0 - - -From fe4c65a3b9e715c2b535709f989f2369d3637b78 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 17 Jan 2022 09:45:24 -0600 -Subject: [PATCH 09/11] Refactor: libpacemaker: avoid unnecessary string - duplication - -... and don't leave any dynamic memory hanging around ---- - lib/pacemaker/pcmk_fence.c | 11 ++++++++--- - 1 file changed, 8 insertions(+), 3 deletions(-) - -diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c -index 2a8f50a555..260fa5ab8e 100644 ---- a/lib/pacemaker/pcmk_fence.c -+++ b/lib/pacemaker/pcmk_fence.c -@@ -141,6 +141,7 @@ pcmk__request_fencing(stonith_t *st, const char *target, const char *action, - unsigned int tolerance, int delay, char **reason) - { - crm_trigger_t *trig; -+ int rc = pcmk_rc_ok; - - async_fence_data.st = st; - async_fence_data.name = strdup(name); -@@ -160,10 +161,14 @@ pcmk__request_fencing(stonith_t *st, const char *target, const char *action, - - free(async_fence_data.name); - -- if ((reason != NULL) && (async_fence_data.result.exit_reason != NULL)) { -- *reason = strdup(async_fence_data.result.exit_reason); -+ if (reason != NULL) { -+ // Give the caller ownership of the exit reason -+ *reason = async_fence_data.result.exit_reason; -+ async_fence_data.result.exit_reason = NULL; - } -- return stonith__result2rc(&async_fence_data.result); -+ rc = stonith__result2rc(&async_fence_data.result); -+ pcmk__reset_result(&async_fence_data.result); -+ return rc; - } - - #ifdef BUILD_PUBLIC_LIBPACEMAKER --- -2.27.0 - - -From 7b7af07796f05a1adabdac655582be2e17106f81 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 17 Jan 2022 10:07:10 -0600 -Subject: [PATCH 10/11] Doc: libpacemaker: improve pcmk__request_fencing() - doxygen block - ---- - include/pacemaker.h | 6 ++++-- - include/pcmki/pcmki_fence.h | 15 +++++++++------ - 2 files changed, 13 insertions(+), 8 deletions(-) - -diff --git a/include/pacemaker.h b/include/pacemaker.h -index e581f975a9..266a844892 100644 ---- a/include/pacemaker.h -+++ b/include/pacemaker.h -@@ -187,8 +187,10 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types); - * \param[in] tolerance If a successful action for \p target happened within - * this many ms, return 0 without performing the action - * again -- * \param[in] delay Apply a fencing delay. Value -1 means disable also any -- * static/random fencing delays from pcmk_delay_base/max -+ * \param[in] delay Apply this delay (in milliseconds) before initiating the -+ * fencing action (a value of -1 applies no delay and also -+ * disables any fencing delay from pcmk_delay_base and -+ * pcmk_delay_max) - * \param[out] reason If not NULL, where to put descriptive failure reason - * - * \return Standard Pacemaker return code -diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h -index e3a7e27264..4a2fe3c481 100644 ---- a/include/pcmki/pcmki_fence.h -+++ b/include/pcmki/pcmki_fence.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2019-2021 the Pacemaker project contributors -+ * Copyright 2019-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -22,17 +22,20 @@ - * \param[in] target The node that should be fenced - * \param[in] action The fencing action (on, off, reboot) to perform - * \param[in] name Who requested the fence action? -- * \param[in] timeout How long to wait for the operation to complete (in ms). -+ * \param[in] timeout How long to wait for the operation to complete (in ms) - * \param[in] tolerance If a successful action for \p target happened within -- * this many ms, return 0 without performing the action -- * again. -- * \param[in] delay Apply a fencing delay. Value -1 means disable also any -- * static/random fencing delays from pcmk_delay_base/max -+ * this many milliseconds, return success without -+ * performing the action again -+ * \param[in] delay Apply this delay (in milliseconds) before initiating the -+ * fencing action (a value of -1 applies no delay and also -+ * disables any fencing delay from pcmk_delay_base and -+ * pcmk_delay_max) - * \param[out] reason If not NULL, where to put descriptive failure reason - * - * \return Standard Pacemaker return code - * \note If \p reason is not NULL, the caller is responsible for freeing its - * returned value. -+ * \todo delay is eventually used with g_timeout_add() and should be guint - */ - int pcmk__request_fencing(stonith_t *st, const char *target, const char *action, - const char *name, unsigned int timeout, --- -2.27.0 - - -From 61fb7271712e1246eb6d9472dc1afc7cd10e0a79 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 17 Jan 2022 10:18:02 -0600 -Subject: [PATCH 11/11] Fix: tools: get stonith_admin -T option working again - -Regression introduced in 2.0.3 by 3910b6fec - -This reverts commit 247eb303df934944c0b72b162bb661cee6e0ed8b -("Refactor: tools: drop unnecessary string duplication in stonith_admin") -and fixes a regression introduced when stonith_admin was converted to use -GOption. - -The -T option is intended to override the client name passed to the fencer API, -but the client name was set to the default (crm_system_name) after option -processing had already been done, so any value for -T was overwritten by the -default, and its memory was leaked. - -This commit sets the default only if -T was not used. ---- - tools/stonith_admin.c | 15 ++++++++++----- - 1 file changed, 10 insertions(+), 5 deletions(-) - -diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c -index 5590faf11e..54774b6fee 100644 ---- a/tools/stonith_admin.c -+++ b/tools/stonith_admin.c -@@ -337,10 +337,10 @@ request_fencing(stonith_t *st, const char *target, const char *command, - GError **error) - { - char *reason = NULL; -- int rc = pcmk__request_fencing(st, target, command, crm_system_name, -- options.timeout * 1000, -- options.tolerance * 1000, -- options.delay, &reason); -+ int rc = pcmk__request_fencing(st, target, command, name, -+ options.timeout * 1000, -+ options.tolerance * 1000, -+ options.delay, &reason); - - if (rc != pcmk_rc_ok) { - const char *rc_str = pcmk_rc_str(rc); -@@ -392,6 +392,10 @@ main(int argc, char **argv) - - pcmk__cli_init_logging("stonith_admin", args->verbosity); - -+ if (name == NULL) { -+ name = strdup(crm_system_name); -+ } -+ - rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv); - if (rc != pcmk_rc_ok) { - exit_code = CRM_EX_ERROR; -@@ -526,7 +530,7 @@ main(int argc, char **argv) - if (st == NULL) { - rc = -ENOMEM; - } else if (!no_connect) { -- rc = st->cmds->connect(st, crm_system_name, NULL); -+ rc = st->cmds->connect(st, name, NULL); - } - if (rc < 0) { - out->err(out, "Could not connect to fencer: %s", pcmk_strerror(rc)); -@@ -640,6 +644,7 @@ main(int argc, char **argv) - out->finish(out, exit_code, true, NULL); - pcmk__output_free(out); - } -+ free(name); - stonith_key_value_freeall(options.params, 1, 1); - - if (st != NULL) { --- -2.27.0 - diff --git a/SOURCES/018-failure-messages.patch b/SOURCES/018-failure-messages.patch deleted file mode 100644 index 3a2f249..0000000 --- a/SOURCES/018-failure-messages.patch +++ /dev/null @@ -1,796 +0,0 @@ -From 08c3420f2c857e7b27cd960f355d787af534da7d Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 18 Jan 2022 16:04:49 -0600 -Subject: [PATCH 01/12] Log: libcrmcommon: improve description for "not - connected" status - -PCMK_EXEC_NOT_CONNECTED was originally added to represent "No executor -connection", but it can also now mean no fencer connection, so change it to -"Internal communication failure" which is probably less mysterious to end users -anyway (especially since it should be accompanied by a more descriptive exit -reason). ---- - include/crm/common/results.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/crm/common/results.h b/include/crm/common/results.h -index 873faf5c43..3d322a7ce6 100644 ---- a/include/crm/common/results.h -+++ b/include/crm/common/results.h -@@ -349,7 +349,7 @@ pcmk_exec_status_str(enum pcmk_exec_status status) - case PCMK_EXEC_ERROR_HARD: return "Hard error"; - case PCMK_EXEC_ERROR_FATAL: return "Fatal error"; - case PCMK_EXEC_NOT_INSTALLED: return "Not installed"; -- case PCMK_EXEC_NOT_CONNECTED: return "No executor connection"; -+ case PCMK_EXEC_NOT_CONNECTED: return "Internal communication failure"; - case PCMK_EXEC_INVALID: return "Cannot execute now"; - case PCMK_EXEC_NO_FENCE_DEVICE: return "No fence device"; - case PCMK_EXEC_NO_SECRETS: return "CIB secrets unavailable"; --- -2.27.0 - - -From 7c345cf8cf0cb054f5634206880df035bfef7311 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 20 Dec 2021 15:12:36 -0600 -Subject: [PATCH 02/12] Refactor: libcrmcommon: drop unnecessary system error - redefinitions - -portability.h defines some system error codes that might not be present on -non-Linux systems. - -This was a bad idea, since there's no way to ensure the defined values don't -conflict with existing system codes. However, we use a number of them, so it's -probably best to keep them, at least until we can make a backward compatibility -break. - -However, we don't use EUNATCH, ENOSR, or ENOSTR, so we can delete those. ---- - include/portability.h | 12 ------------ - lib/common/results.c | 9 ++++++--- - 2 files changed, 6 insertions(+), 15 deletions(-) - -diff --git a/include/portability.h b/include/portability.h -index 9a60c583a7..ee065a376d 100644 ---- a/include/portability.h -+++ b/include/portability.h -@@ -131,10 +131,6 @@ typedef union - # define EREMOTEIO 193 - # endif - --# ifndef EUNATCH --# define EUNATCH 194 --# endif -- - # ifndef ENOKEY - # define ENOKEY 195 - # endif -@@ -147,14 +143,6 @@ typedef union - # define ETIME 197 - # endif - --# ifndef ENOSR --# define ENOSR 198 --# endif -- --# ifndef ENOSTR --# define ENOSTR 199 --# endif -- - # ifndef EKEYREJECTED - # define EKEYREJECTED 200 - # endif -diff --git a/lib/common/results.c b/lib/common/results.c -index 6d120694cd..96cd4e5659 100644 ---- a/lib/common/results.c -+++ b/lib/common/results.c -@@ -118,9 +118,6 @@ pcmk_strerror(int rc) - case EREMOTEIO: - return "Remote I/O error"; - /* coverity[dead_error_condition] False positive on non-Linux */ -- case EUNATCH: -- return "Protocol driver not attached"; -- /* coverity[dead_error_condition] False positive on non-Linux */ - case ENOKEY: - return "Required key not available"; - } -@@ -342,8 +339,12 @@ pcmk_rc_name(int rc) - case ENOMSG: return "ENOMSG"; - case ENOPROTOOPT: return "ENOPROTOOPT"; - case ENOSPC: return "ENOSPC"; -+#ifdef ENOSR - case ENOSR: return "ENOSR"; -+#endif -+#ifdef ENOSTR - case ENOSTR: return "ENOSTR"; -+#endif - case ENOSYS: return "ENOSYS"; - case ENOTBLK: return "ENOTBLK"; - case ENOTCONN: return "ENOTCONN"; -@@ -376,7 +377,9 @@ pcmk_rc_name(int rc) - case ETIME: return "ETIME"; - case ETIMEDOUT: return "ETIMEDOUT"; - case ETXTBSY: return "ETXTBSY"; -+#ifdef EUNATCH - case EUNATCH: return "EUNATCH"; -+#endif - case EUSERS: return "EUSERS"; - /* case EWOULDBLOCK: return "EWOULDBLOCK"; */ - case EXDEV: return "EXDEV"; --- -2.27.0 - - -From eac8d1ca51eac3f437e18584f7e013d976ecee2c Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 20 Dec 2021 15:33:12 -0600 -Subject: [PATCH 03/12] Log: libcrmcommon: improve handling of portability.h - error codes - -portability.h defines some system error codes that might not be present on -non-Linux systems. - -Define a constant for each one (for example, PCMK__ECOMM for ECOMM) when -the system doesn't have the value, so we can detect that when relevant. - -Also, make sure pcmk_rc_name() and pcmk_rc_str() handle all of these values. ---- - include/portability.h | 8 ++++++++ - lib/common/results.c | 32 ++++++++++++++++++++++++++++++-- - 2 files changed, 38 insertions(+), 2 deletions(-) - -diff --git a/include/portability.h b/include/portability.h -index ee065a376d..5d5fbf21cb 100644 ---- a/include/portability.h -+++ b/include/portability.h -@@ -116,34 +116,42 @@ typedef union - # include - - # ifndef ENOTUNIQ -+# define PCMK__ENOTUNIQ - # define ENOTUNIQ 190 - # endif - - # ifndef ECOMM -+# define PCMK__ECOMM - # define ECOMM 191 - # endif - - # ifndef ELIBACC -+# define PCMK__ELIBACC - # define ELIBACC 192 - # endif - - # ifndef EREMOTEIO -+# define PCMK__EREMOTIO - # define EREMOTEIO 193 - # endif - - # ifndef ENOKEY -+# define PCMK__ENOKEY - # define ENOKEY 195 - # endif - - # ifndef ENODATA -+# define PCMK__ENODATA - # define ENODATA 196 - # endif - - # ifndef ETIME -+# define PCMK__ETIME - # define ETIME 197 - # endif - - # ifndef EKEYREJECTED -+# define PCMK__EKEYREJECTED - # define EKEYREJECTED 200 - # endif - -diff --git a/lib/common/results.c b/lib/common/results.c -index 96cd4e5659..bcf289d0d6 100644 ---- a/lib/common/results.c -+++ b/lib/common/results.c -@@ -395,9 +395,9 @@ pcmk_rc_name(int rc) - #ifdef EISNAM // Not available on OS X, Illumos, Solaris - case EISNAM: return "EISNAM"; - case EKEYEXPIRED: return "EKEYEXPIRED"; -- case EKEYREJECTED: return "EKEYREJECTED"; - case EKEYREVOKED: return "EKEYREVOKED"; - #endif -+ case EKEYREJECTED: return "EKEYREJECTED"; - case EL2HLT: return "EL2HLT"; - case EL2NSYNC: return "EL2NSYNC"; - case EL3HLT: return "EL3HLT"; -@@ -443,7 +443,35 @@ pcmk_rc_str(int rc) - if (rc < 0) { - return "Unknown error"; - } -- return strerror(rc); -+ -+ // Handle values that could be defined by system or by portability.h -+ switch (rc) { -+#ifdef PCMK__ENOTUNIQ -+ case ENOTUNIQ: return "Name not unique on network"; -+#endif -+#ifdef PCMK__ECOMM -+ case ECOMM: return "Communication error on send"; -+#endif -+#ifdef PCMK__ELIBACC -+ case ELIBACC: return "Can not access a needed shared library"; -+#endif -+#ifdef PCMK__EREMOTEIO -+ case EREMOTEIO: return "Remote I/O error"; -+#endif -+#ifdef PCMK__ENOKEY -+ case ENOKEY: return "Required key not available"; -+#endif -+#ifdef PCMK__ENODATA -+ case ENODATA: return "No data available"; -+#endif -+#ifdef PCMK__ETIME -+ case ETIME: return "Timer expired"; -+#endif -+#ifdef PCMK__EKEYREJECTED -+ case EKEYREJECTED: return "Key was rejected by service"; -+#endif -+ default: return strerror(rc); -+ } - } - - // This returns negative values for errors --- -2.27.0 - - -From 32a38ac6374f85c43e7f4051f5e519822cc481e6 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 20 Dec 2021 15:39:19 -0600 -Subject: [PATCH 04/12] Log: libcrmcommon: redefine pcmk_strerror() in terms of - pcmk_rc_str() - -... to reduce code duplication. This causes minor differences in the string for -a few values. ---- - lib/common/results.c | 67 +------------------------------------------- - 1 file changed, 1 insertion(+), 66 deletions(-) - -diff --git a/lib/common/results.c b/lib/common/results.c -index bcf289d0d6..b2c6e8d553 100644 ---- a/lib/common/results.c -+++ b/lib/common/results.c -@@ -57,72 +57,7 @@ pcmk_errorname(int rc) - const char * - pcmk_strerror(int rc) - { -- if (rc == 0) { -- return "OK"; -- } -- -- rc = abs(rc); -- -- // Of course rc > 0 ... unless someone passed INT_MIN as rc -- if ((rc > 0) && (rc < PCMK_ERROR_OFFSET)) { -- return strerror(rc); -- } -- -- switch (rc) { -- case pcmk_err_generic: -- return "Generic Pacemaker error"; -- case pcmk_err_no_quorum: -- return "Operation requires quorum"; -- case pcmk_err_schema_validation: -- return "Update does not conform to the configured schema"; -- case pcmk_err_transform_failed: -- return "Schema transform failed"; -- case pcmk_err_old_data: -- return "Update was older than existing configuration"; -- case pcmk_err_diff_failed: -- return "Application of an update diff failed"; -- case pcmk_err_diff_resync: -- return "Application of an update diff failed, requesting a full refresh"; -- case pcmk_err_cib_modified: -- return "The on-disk configuration was manually modified"; -- case pcmk_err_cib_backup: -- return "Could not archive the previous configuration"; -- case pcmk_err_cib_save: -- return "Could not save the new configuration to disk"; -- case pcmk_err_cib_corrupt: -- return "Could not parse on-disk configuration"; -- case pcmk_err_multiple: -- return "Resource active on multiple nodes"; -- case pcmk_err_node_unknown: -- return "Node not found"; -- case pcmk_err_already: -- return "Situation already as requested"; -- case pcmk_err_bad_nvpair: -- return "Bad name/value pair given"; -- case pcmk_err_schema_unchanged: -- return "Schema is already the latest available"; -- case pcmk_err_unknown_format: -- return "Unknown output format"; -- -- /* The following cases will only be hit on systems for which they are non-standard */ -- /* coverity[dead_error_condition] False positive on non-Linux */ -- case ENOTUNIQ: -- return "Name not unique on network"; -- /* coverity[dead_error_condition] False positive on non-Linux */ -- case ECOMM: -- return "Communication error on send"; -- /* coverity[dead_error_condition] False positive on non-Linux */ -- case ELIBACC: -- return "Can not access a needed shared library"; -- /* coverity[dead_error_condition] False positive on non-Linux */ -- case EREMOTEIO: -- return "Remote I/O error"; -- /* coverity[dead_error_condition] False positive on non-Linux */ -- case ENOKEY: -- return "Required key not available"; -- } -- crm_err("Unknown error code: %d", rc); -- return "Unknown error"; -+ return pcmk_rc_str(pcmk_legacy2rc(rc)); - } - - // Standard Pacemaker API return codes --- -2.27.0 - - -From 7c331d7e2275ffebbfd5e2f6432a6137a66ee5db Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 20 Dec 2021 15:41:24 -0600 -Subject: [PATCH 05/12] Log: libcrmcommon: don't say "Unknown error" - -... which is unhelpful and annoying to users ---- - lib/common/results.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/lib/common/results.c b/lib/common/results.c -index b2c6e8d553..5ffac76549 100644 ---- a/lib/common/results.c -+++ b/lib/common/results.c -@@ -376,7 +376,7 @@ pcmk_rc_str(int rc) - return pcmk__rcs[pcmk_rc_error - rc].desc; - } - if (rc < 0) { -- return "Unknown error"; -+ return "Error"; - } - - // Handle values that could be defined by system or by portability.h -@@ -768,7 +768,7 @@ bz2_strerror(int rc) - case BZ_OUTBUFF_FULL: - return "output data will not fit into the buffer provided"; - } -- return "Unknown error"; -+ return "Data compression error"; - } - - crm_exit_t --- -2.27.0 - - -From 26883b4edda7d81bfcb79bd7b33bb3210beff110 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 20 Dec 2021 16:01:39 -0600 -Subject: [PATCH 06/12] Log: fencing: don't warn if cluster has no watchdog - device - ---- - lib/fencing/st_client.c | 7 ++++++- - 1 file changed, 6 insertions(+), 1 deletion(-) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index b1de912b2a..a0f3119f3b 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -187,7 +187,12 @@ stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) - * we drop in here - so as not to make remote nodes - * panic on that answer - */ -- crm_warn("watchdog-fencing-query failed"); -+ if (rc == -ENODEV) { -+ crm_notice("Cluster does not have watchdog fencing device"); -+ } else { -+ crm_warn("Could not check for watchdog fencing device: %s", -+ pcmk_strerror(rc)); -+ } - } else if (list[0] == '\0') { - rv = TRUE; - } else { --- -2.27.0 - - -From 72b3c42232deaca64ffba9582598c59331203761 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 20 Dec 2021 16:22:49 -0600 -Subject: [PATCH 07/12] Test: libcrmcommon: update pcmk_rc_str() unit test for - recent change - ---- - lib/common/tests/results/pcmk__results_test.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/lib/common/tests/results/pcmk__results_test.c b/lib/common/tests/results/pcmk__results_test.c -index 57a520c501..e08d4b6261 100644 ---- a/lib/common/tests/results/pcmk__results_test.c -+++ b/lib/common/tests/results/pcmk__results_test.c -@@ -30,7 +30,7 @@ static void - test_for_pcmk_rc_str(void **state) { - assert_string_equal(pcmk_rc_str(pcmk_rc_error-1), "Unknown output format"); - assert_string_equal(pcmk_rc_str(pcmk_rc_ok), "OK"); -- assert_string_equal(pcmk_rc_str(-1), "Unknown error"); -+ assert_string_equal(pcmk_rc_str(-1), "Error"); - } - - static void --- -2.27.0 - - -From c1ad3d6640f695321a83183c95fae2f105adc429 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 Dec 2021 10:20:38 -0600 -Subject: [PATCH 08/12] Test: cts-lab: update expected patterns for recent - changes - ---- - cts/lab/CTStests.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py -index 62c832eb45..f4be998cfb 100644 ---- a/cts/lab/CTStests.py -+++ b/cts/lab/CTStests.py -@@ -3055,7 +3055,7 @@ class RemoteStonithd(RemoteDriver): - r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor", - r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*", - r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)", -- r"error: Result of monitor operation for .* on remote-.*: No executor connection", -+ r"error: Result of monitor operation for .* on remote-.*: Internal communication failure", - ] - - ignore_pats.extend(RemoteDriver.errorstoignore(self)) --- -2.27.0 - - -From f272e2f526633c707e894b39c7c7bce3c14de898 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 Dec 2021 15:40:49 -0600 -Subject: [PATCH 09/12] Log: controller,libpacemaker: make history XML creation - less chatty - -Other messages with the same info will already be logged at higher severity ---- - daemons/controld/controld_execd.c | 3 +-- - daemons/controld/controld_te_actions.c | 7 ++----- - include/pcmki/pcmki_sched_utils.h | 3 +-- - lib/pacemaker/pcmk_injections.c | 3 +-- - lib/pacemaker/pcmk_sched_actions.c | 12 +++++------- - 5 files changed, 10 insertions(+), 18 deletions(-) - -diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c -index 15784e7687..52157fa5d4 100644 ---- a/daemons/controld/controld_execd.c -+++ b/daemons/controld/controld_execd.c -@@ -693,9 +693,8 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_ - caller_version = CRM_FEATURE_SET; - } - -- crm_trace("Building %s operation update with originator version: %s", op->rsc_id, caller_version); - xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc, -- fsa_our_uname, src, LOG_DEBUG); -+ fsa_our_uname, src); - if (xml_op == NULL) { - return TRUE; - } -diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c -index 63b7c72359..b0bcb8b2e4 100644 ---- a/daemons/controld/controld_te_actions.c -+++ b/daemons/controld/controld_te_actions.c -@@ -181,7 +181,6 @@ controld_record_action_timeout(crm_action_t *action) - lrmd_event_data_t *op = NULL; - xmlNode *state = NULL; - xmlNode *rsc = NULL; -- xmlNode *xml_op = NULL; - xmlNode *action_rsc = NULL; - - int rc = pcmk_ok; -@@ -245,12 +244,10 @@ controld_record_action_timeout(crm_action_t *action) - op->user_data = pcmk__transition_key(transition_graph->id, action->id, - target_rc, te_uuid); - -- xml_op = pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, -- target, __func__, LOG_INFO); -+ pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target, -+ __func__); - lrmd_free_event(op); - -- crm_log_xml_trace(xml_op, "Action timeout"); -- - rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options); - fsa_register_cib_callback(rc, FALSE, NULL, cib_action_updated); - free_xml(state); -diff --git a/include/pcmki/pcmki_sched_utils.h b/include/pcmki/pcmki_sched_utils.h -index 68d60fc7db..144424a609 100644 ---- a/include/pcmki/pcmki_sched_utils.h -+++ b/include/pcmki/pcmki_sched_utils.h -@@ -52,8 +52,7 @@ extern void process_utilization(pe_resource_t * rsc, pe_node_t ** prefer, pe_wor - - xmlNode *pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *event, - const char *caller_version, int target_rc, -- const char *node, const char *origin, -- int level); -+ const char *node, const char *origin); - - # define LOAD_STOPPED "load_stopped" - -diff --git a/lib/pacemaker/pcmk_sched_transition.c b/lib/pacemaker/pcmk_sched_transition.c -index 678c3f5dd2..1aa90a5a0b 100644 ---- a/lib/pacemaker/pcmk_sched_transition.c -+++ b/lib/pacemaker/pcmk_sched_transition.c -@@ -201,8 +201,7 @@ inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc) - inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc) - { - return pcmk__create_history_xml(cib_resource, op, CRM_FEATURE_SET, -- target_rc, NULL, crm_system_name, -- LOG_TRACE); -+ target_rc, NULL, crm_system_name); - } - - static xmlNode * -diff --git a/lib/pacemaker/pcmk_sched_actions.c b/lib/pacemaker/pcmk_sched_actions.c -index f8200b0efc..4f63d3374d 100644 ---- a/lib/pacemaker/pcmk_sched_utils.c -+++ b/lib/pacemaker/pcmk_sched_utils.c -@@ -892,14 +892,13 @@ add_op_digest_to_xml(lrmd_event_data_t *op, xmlNode *update) - * \param[in] target_rc Expected result of operation - * \param[in] node Name of node on which operation was performed - * \param[in] origin Arbitrary description of update source -- * \param[in] level A log message will be logged at this level - * - * \return Newly created XML node for history update - */ - xmlNode * - pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op, - const char *caller_version, int target_rc, -- const char *node, const char *origin, int level) -+ const char *node, const char *origin) - { - char *key = NULL; - char *magic = NULL; -@@ -912,11 +911,10 @@ pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op, - const char *task = NULL; - - CRM_CHECK(op != NULL, return NULL); -- do_crm_log(level, "%s: Updating resource %s after %s op %s (interval=%u)", -- origin, op->rsc_id, op->op_type, -- pcmk_exec_status_str(op->op_status), op->interval_ms); -- -- crm_trace("DC version: %s", caller_version); -+ crm_trace("Creating history XML for %s-interval %s action for %s on %s " -+ "(DC version: %s, origin: %s)", -+ pcmk__readable_interval(op->interval_ms), op->op_type, op->rsc_id, -+ ((node == NULL)? "no node" : node), caller_version, origin); - - task = op->op_type; - --- -2.27.0 - - -From 06b1da9e5345e0d1571042c11646fd7157961279 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 21 Dec 2021 17:09:44 -0600 -Subject: [PATCH 10/12] Feature: controller: improve exit reason for internal - timeouts - -Functionize the part of controld_record_action_timeout() that creates a fake -executor event, into a new function synthesize_timeout_event(), and have it set -a more detailed exit reason describing what timed out. ---- - daemons/controld/controld_te_actions.c | 61 ++++++++++++++++++++------ - 1 file changed, 48 insertions(+), 13 deletions(-) - -diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c -index b0bcb8b2e4..de2fbb82bf 100644 ---- a/daemons/controld/controld_te_actions.c -+++ b/daemons/controld/controld_te_actions.c -@@ -175,6 +175,53 @@ te_crm_command(crm_graph_t * graph, crm_action_t * action) - return TRUE; - } - -+/*! -+ * \internal -+ * \brief Synthesize an executor event for a resource action timeout -+ * -+ * \param[in] action Resource action that timed out -+ * \param[in] target_rc Expected result of action that timed out -+ * -+ * Synthesize an executor event for a resource action timeout. (If the executor -+ * gets a timeout while waiting for a resource action to complete, that will be -+ * reported via the usual callback. This timeout means we didn't hear from the -+ * executor itself or the controller that relayed the action to the executor.) -+ * -+ * \return Newly created executor event for result of \p action -+ * \note The caller is responsible for freeing the return value using -+ * lrmd_free_event(). -+ */ -+static lrmd_event_data_t * -+synthesize_timeout_event(crm_action_t *action, int target_rc) -+{ -+ lrmd_event_data_t *op = NULL; -+ const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); -+ const char *reason = NULL; -+ char *dynamic_reason = NULL; -+ -+ if (pcmk__str_eq(target, get_local_node_name(), pcmk__str_casei)) { -+ reason = "Local executor did not return result in time"; -+ } else { -+ const char *router_node = NULL; -+ -+ router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); -+ if (router_node == NULL) { -+ router_node = target; -+ } -+ dynamic_reason = crm_strdup_printf("Controller on %s did not return " -+ "result in time", router_node); -+ reason = dynamic_reason; -+ } -+ -+ op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT, -+ PCMK_OCF_UNKNOWN_ERROR, reason); -+ op->call_id = -1; -+ op->user_data = pcmk__transition_key(transition_graph->id, action->id, -+ target_rc, te_uuid); -+ free(dynamic_reason); -+ return op; -+} -+ - void - controld_record_action_timeout(crm_action_t *action) - { -@@ -231,19 +278,7 @@ controld_record_action_timeout(crm_action_t *action) - crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_CLASS); - crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_PROVIDER); - -- /* If the executor gets a timeout while waiting for the action to complete, -- * that will be reported via the usual callback. This timeout means that we -- * didn't hear from the executor or the controller that relayed the action -- * to the executor. -- */ -- op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT, -- PCMK_OCF_UNKNOWN_ERROR, -- "Cluster communication timeout " -- "(no response from executor)"); -- op->call_id = -1; -- op->user_data = pcmk__transition_key(transition_graph->id, action->id, -- target_rc, te_uuid); -- -+ op = synthesize_timeout_event(action, target_rc); - pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target, - __func__); - lrmd_free_event(op); --- -2.27.0 - - -From be620d206faefab967d4c8567d6554d10c9e72ba Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 22 Dec 2021 16:35:06 -0600 -Subject: [PATCH 11/12] Feature: fencing: improve exit reason for fencing - timeouts - -Troubleshooting timeouts is one of the more difficult aspects of cluster -maintenance. We want to give as much of a hint as possible, but for fencing in -particular it is difficult because an operation might involve multiple retries -of multiple devices. - -Barring another major project to track exactly which devices, retries, etc., -were used in a given operation, these changes in wording are probably the best -we can do. ---- - daemons/fenced/fenced_remote.c | 8 +++++--- - lib/fencing/st_client.c | 2 +- - 2 files changed, 6 insertions(+), 4 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 1e237150c5..6eebb7381e 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2009-2021 the Pacemaker project contributors -+ * Copyright 2009-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -715,8 +715,10 @@ remote_op_timeout(gpointer userdata) - CRM_XS " id=%.8s", - op->action, op->target, op->client_name, op->id); - } else { -- finalize_timed_out_op(userdata, "Fencing could not be completed " -- "within overall timeout"); -+ finalize_timed_out_op(userdata, "Fencing did not complete within a " -+ "total timeout based on the " -+ "configured timeout and retries for " -+ "any devices attempted"); - } - return G_SOURCE_REMOVE; - } -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index a0f3119f3b..718739b321 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -906,7 +906,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id) - if (msg == NULL) { - // Fencer didn't reply in time - pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, -- "Timeout waiting for reply from fencer"); -+ "Fencer accepted request but did not reply in time"); - CRM_LOG_ASSERT(call_id > 0); - - } else { --- -2.27.0 - - -From 0fe8ede2f8e838e335fe42846bdf147111ce9955 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 22 Dec 2021 17:09:09 -0600 -Subject: [PATCH 12/12] Feature: libcrmservice: improve exit reason for - timeouts - -The services library doesn't have enough information about an action to say -(for example) what configuration parameters might be relevant, but we can at -least distinguish what kind of agent timed out. ---- - lib/services/services_linux.c | 12 +++++++++++- - lib/services/systemd.c | 2 +- - 2 files changed, 12 insertions(+), 2 deletions(-) - -diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c -index f15eee860e..d6aafcfe46 100644 ---- a/lib/services/services_linux.c -+++ b/lib/services/services_linux.c -@@ -677,9 +677,19 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo, - parse_exit_reason_from_stderr(op); - - } else if (mainloop_child_timeout(p)) { -+ const char *reason = NULL; -+ -+ if (op->rsc != NULL) { -+ reason = "Resource agent did not complete in time"; -+ } else if (pcmk__str_eq(op->standard, PCMK_RESOURCE_CLASS_STONITH, -+ pcmk__str_none)) { -+ reason = "Fence agent did not complete in time"; -+ } else { -+ reason = "Process did not complete in time"; -+ } - crm_info("%s[%d] timed out after %dms", op->id, op->pid, op->timeout); - services__set_result(op, services__generic_error(op), PCMK_EXEC_TIMEOUT, -- "Process did not exit within specified timeout"); -+ reason); - - } else if (op->cancel) { - /* If an in-flight recurring operation was killed because it was -diff --git a/lib/services/systemd.c b/lib/services/systemd.c -index 27a3b376db..d87b287424 100644 ---- a/lib/services/systemd.c -+++ b/lib/services/systemd.c -@@ -995,7 +995,7 @@ systemd_timeout_callback(gpointer p) - crm_info("%s action for systemd unit %s named '%s' timed out", - op->action, op->agent, op->rsc); - services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT, -- "Systemd action did not complete within specified timeout"); -+ "Systemd unit action did not complete in time"); - services__finalize_async_op(op); - return FALSE; - } --- -2.27.0 - diff --git a/SOURCES/019-corosync-tracking.patch b/SOURCES/019-corosync-tracking.patch deleted file mode 100644 index ac3ca96..0000000 --- a/SOURCES/019-corosync-tracking.patch +++ /dev/null @@ -1,29 +0,0 @@ -From e8bf0161b872267f1bb7143a9866fdc15ec218f2 Mon Sep 17 00:00:00 2001 -From: Jan Friesse -Date: Tue, 18 Jan 2022 16:35:24 +0100 -Subject: [PATCH] Fix: corosync: Repeat corosync_cfg_trackstart - -corosync_cfg_trackstart can fail with CS_ERR_TRY_AGAIN failure so -(similarly as for corosync_cfg_local_get, ...) handle failure with -using cs_repeat macro. ---- - daemons/pacemakerd/pcmkd_corosync.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/daemons/pacemakerd/pcmkd_corosync.c b/daemons/pacemakerd/pcmkd_corosync.c -index 7990bc43c5..cd7a40321d 100644 ---- a/daemons/pacemakerd/pcmkd_corosync.c -+++ b/daemons/pacemakerd/pcmkd_corosync.c -@@ -186,7 +186,8 @@ cluster_connect_cfg(void) - crm_debug("Corosync reports local node ID is %lu", (unsigned long) nodeid); - - #ifdef HAVE_COROSYNC_CFG_TRACKSTART -- rc = corosync_cfg_trackstart(cfg_handle, 0); -+ retries = 0; -+ cs_repeat(retries, 30, rc = corosync_cfg_trackstart(cfg_handle, 0)); - if (rc != CS_OK) { - crm_crit("Could not enable Corosync CFG shutdown tracker: %s " CRM_XS " rc=%d", - cs_strerror(rc), rc); --- -2.27.0 - diff --git a/SOURCES/020-systemd-unit.patch b/SOURCES/020-systemd-unit.patch deleted file mode 100644 index a425ae3..0000000 --- a/SOURCES/020-systemd-unit.patch +++ /dev/null @@ -1,41 +0,0 @@ -From e316840a7e1d2a72e3089ee194334244c959905a Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 19 Jan 2022 09:53:53 -0600 -Subject: [PATCH] Fix: pacemakerd: tweak systemd unit respawn settings - -If pacemaker exits immediately after starting, wait 1 second before trying to -respawn, since the default of 100ms is a bit aggressive for a Pacemaker -cluster. - -Also, allow 5 attempts in 25 seconds before giving up. ---- - daemons/pacemakerd/pacemaker.service.in | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/daemons/pacemakerd/pacemaker.service.in b/daemons/pacemakerd/pacemaker.service.in -index 0363a2259c..3fd53d9ffb 100644 ---- a/daemons/pacemakerd/pacemaker.service.in -+++ b/daemons/pacemakerd/pacemaker.service.in -@@ -31,6 +31,9 @@ After=rsyslog.service - After=corosync.service - Requires=corosync.service - -+# If Pacemaker respawns repeatedly, give up after this many tries in this time -+StartLimitBurst=5 -+StartLimitIntervalSec=25s - - [Install] - WantedBy=multi-user.target -@@ -57,6 +60,9 @@ TasksMax=infinity - # resource. Sending -KILL will just get the node fenced - SendSIGKILL=no - -+# Systemd's default of respawning a failed service after 100ms is too aggressive -+RestartSec=1s -+ - # If we ever hit the StartLimitInterval/StartLimitBurst limit, and the - # admin wants to stop the cluster while pacemakerd is not running, it - # might be a good idea to enable the ExecStopPost directive below. --- -2.27.0 - diff --git a/SOURCES/021-failure-messages.patch b/SOURCES/021-failure-messages.patch deleted file mode 100644 index fab1013..0000000 --- a/SOURCES/021-failure-messages.patch +++ /dev/null @@ -1,1338 +0,0 @@ -From 9ee3d6c9b0aba6aae022cc152a3b3472fe388fa3 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jan 2022 16:44:32 -0600 -Subject: [PATCH 01/15] Refactor: fencer: add exit reason to fencing operation - object - -In order to pass a fencing action's exit reason with the action history, -we need the exit reason in remote_fencing_op_t. Nothing sets or uses it as of -this commit. ---- - daemons/fenced/fenced_remote.c | 2 ++ - daemons/fenced/pacemaker-fenced.h | 4 +++- - 2 files changed, 5 insertions(+), 1 deletion(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 6eebb7381e..0fa9706140 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -260,6 +260,8 @@ free_remote_op(gpointer data) - } - g_list_free_full(op->automatic_list, free); - g_list_free(op->duplicates); -+ -+ pcmk__reset_result(&op->result); - free(op); - } - -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index 502fcc9a29..1a5c933ea7 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2009-2021 the Pacemaker project contributors -+ * Copyright 2009-2022 the Pacemaker project contributors - * - * This source code is licensed under the GNU General Public License version 2 - * or later (GPLv2+) WITHOUT ANY WARRANTY. -@@ -151,6 +151,8 @@ typedef struct remote_fencing_op_s { - /*! The point at which the remote operation completed(nsec) */ - long long completed_nsec; - -+ /*! The (potentially intermediate) result of the operation */ -+ pcmk__action_result_t result; - } remote_fencing_op_t; - - void fenced_broadcast_op_result(remote_fencing_op_t *op, --- -2.27.0 - - -From 97a2c318866adc5ef5e426c5c3b753df1fa3ab66 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jan 2022 17:08:42 -0600 -Subject: [PATCH 02/15] Refactor: fencer: track full result in - remote_fencing_op_t - -Now that remote_fencing_op_t has a place for the full result, -set it before calling finalize_op(), instead of passing a separate result -object to finalize_op(). - -As a bonus, this simplifies the memory management, reducing the chance of -mistakes. ---- - daemons/fenced/fenced_remote.c | 161 ++++++++++++++++----------------- - 1 file changed, 77 insertions(+), 84 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 0fa9706140..30edbff890 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -82,8 +82,7 @@ extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op - static void request_peer_fencing(remote_fencing_op_t *op, - peer_device_info_t *peer, - pcmk__action_result_t *result); --static void finalize_op(remote_fencing_op_t *op, xmlNode *data, -- pcmk__action_result_t *result, bool dup); -+static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup); - static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); - static int get_op_total_timeout(const remote_fencing_op_t *op, - const peer_device_info_t *chosen_peer); -@@ -485,7 +484,9 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, - other->client_name, other->originator, - pcmk_exec_status_str(result->execution_status), - other->id); -- finalize_op(other, data, result, true); -+ pcmk__set_result(&other->result, result->exit_status, -+ result->execution_status, result->exit_reason); -+ finalize_op(other, data, true); - - } else { - // Possible if (for example) it timed out already -@@ -520,20 +521,20 @@ delegate_from_xml(xmlNode *xml) - * - * \param[in] op Fencer operation that completed - * \param[in] data If not NULL, XML reply of last delegated fencing operation -- * \param[in] result Full operation result - * \param[in] dup Whether this operation is a duplicate of another - * (in which case, do not broadcast the result) -+ * -+ * \note The operation result should be set before calling this function. - */ - static void --finalize_op(remote_fencing_op_t *op, xmlNode *data, -- pcmk__action_result_t *result, bool dup) -+finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) - { - int level = LOG_ERR; - const char *subt = NULL; - xmlNode *local_data = NULL; - gboolean op_merged = FALSE; - -- CRM_CHECK((op != NULL) && (result != NULL), return); -+ CRM_CHECK((op != NULL), return); - - if (op->notify_sent) { - // Most likely, this is a timed-out action that eventually completed -@@ -557,11 +558,11 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, - local_data = data; - - } else if (op->delegate == NULL) { -- switch (result->execution_status) { -+ switch (op->result.execution_status) { - case PCMK_EXEC_NO_FENCE_DEVICE: - break; - case PCMK_EXEC_INVALID: -- if (result->exit_status == CRM_EX_EXPIRED) { -+ if (op->result.exit_status == CRM_EX_EXPIRED) { - break; - } - // else fall through -@@ -581,12 +582,12 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, - subt = crm_element_value(data, F_SUBTYPE); - if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { - /* Defer notification until the bcast message arrives */ -- fenced_broadcast_op_result(op, result, op_merged); -+ fenced_broadcast_op_result(op, &op->result, op_merged); - free_xml(local_data); - return; - } - -- if (pcmk__result_ok(result) || dup -+ if (pcmk__result_ok(&op->result) || dup - || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { - level = LOG_NOTICE; - } -@@ -595,16 +596,17 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, - (op->target? op->target : ""), - (op->delegate? op->delegate : "unknown node"), - op->client_name, op->originator, -- (op_merged? " (merged)" : ""), crm_exit_str(result->exit_status), -- pcmk_exec_status_str(result->execution_status), -- ((result->exit_reason == NULL)? "" : ": "), -- ((result->exit_reason == NULL)? "" : result->exit_reason), -+ (op_merged? " (merged)" : ""), -+ crm_exit_str(op->result.exit_status), -+ pcmk_exec_status_str(op->result.execution_status), -+ ((op->result.exit_reason == NULL)? "" : ": "), -+ ((op->result.exit_reason == NULL)? "" : op->result.exit_reason), - op->id); - -- handle_local_reply_and_notify(op, data, result); -+ handle_local_reply_and_notify(op, data, &op->result); - - if (!dup) { -- finalize_op_duplicates(op, data, result); -+ finalize_op_duplicates(op, data, &op->result); - } - - /* Free non-essential parts of the record -@@ -634,7 +636,6 @@ static gboolean - remote_op_watchdog_done(gpointer userdata) - { - remote_fencing_op_t *op = userdata; -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - op->op_timer_one = 0; - -@@ -642,8 +643,8 @@ remote_op_watchdog_done(gpointer userdata) - CRM_XS " id=%.8s", - op->action, op->target, op->client_name, op->id); - op->state = st_done; -- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -- finalize_op(op, NULL, &result, false); -+ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ finalize_op(op, NULL, false); - return G_SOURCE_REMOVE; - } - -@@ -676,8 +677,6 @@ remote_op_timeout_one(gpointer userdata) - static void - finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) - { -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -- - op->op_timer_total = 0; - - crm_debug("Action '%s' targeting %s for client %s timed out " -@@ -690,13 +689,12 @@ finalize_timed_out_op(remote_fencing_op_t *op, const char *reason) - * devices, and return success. - */ - op->state = st_done; -- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - } else { - op->state = st_failed; -- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); -+ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason); - } -- finalize_op(op, NULL, &result, false); -- pcmk__reset_result(&result); -+ finalize_op(op, NULL, false); - } - - /*! -@@ -1094,13 +1092,9 @@ fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg) - set_fencing_completed(op); - op->delegate = strdup("a human"); - -- { -- // For the fencer's purposes, the fencing operation is done -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -- -- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -- finalize_op(op, msg, &result, false); -- } -+ // For the fencer's purposes, the fencing operation is done -+ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ finalize_op(op, msg, false); - - /* For the requester's purposes, the operation is still pending. The - * actual result will be sent asynchronously via the operation's done_cb(). -@@ -1279,16 +1273,11 @@ initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request, - switch (op->state) { - case st_failed: - // advance_topology_level() exhausted levels -- { -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -- -- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_ERROR, -- "All topology levels failed"); -- crm_warn("Could not request peer fencing (%s) targeting %s " -- CRM_XS " id=%.8s", op->action, op->target, op->id); -- finalize_op(op, NULL, &result, false); -- pcmk__reset_result(&result); -- } -+ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR, -+ "All topology levels failed"); -+ crm_warn("Could not request peer fencing (%s) targeting %s " -+ CRM_XS " id=%.8s", op->action, op->target, op->id); -+ finalize_op(op, NULL, false); - return op; - - case st_duplicate: -@@ -1613,10 +1602,6 @@ static void - advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - xmlNode *msg) - { -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -- -- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -- - /* Advance to the next device at this topology level, if any */ - if (op->devices) { - op->devices = op->devices->next; -@@ -1644,6 +1629,10 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - } - - if (op->devices) { -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -+ -+ pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ - /* Necessary devices remain, so execute the next one */ - crm_trace("Next targeting %s on behalf of %s@%s", - op->target, op->client_name, op->originator); -@@ -1659,7 +1648,8 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - crm_trace("Marking complex fencing op targeting %s as complete", - op->target); - op->state = st_done; -- finalize_op(op, msg, &result, false); -+ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ finalize_op(op, msg, false); - } - } - -@@ -1868,7 +1858,9 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, - } - - op->state = st_failed; -- finalize_op(op, NULL, result, false); -+ pcmk__set_result(&op->result, result->exit_status, -+ result->execution_status, result->exit_reason); -+ finalize_op(op, NULL, false); - - } else { - crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s " -@@ -2245,31 +2237,34 @@ fenced_process_fencing_reply(xmlNode *msg) - /* Could be for an event that began before we started */ - /* TODO: Record the op for later querying */ - crm_info("Received peer result of unknown or expired operation %s", id); -- goto done; -+ pcmk__reset_result(&result); -+ return; - } - -+ op->result = result; // The operation takes ownership of the result -+ - if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { - crm_err("Received outdated reply for device %s (instead of %s) to " - "fence (%s) %s. Operation already timed out at peer level.", - device, (const char *) op->devices->data, op->action, op->target); -- goto done; -+ return; - } - - if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) { - crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s%s%s%s " - CRM_XS " id=%.8s", - op->action, op->target, op->client_name, op->originator, -- pcmk_exec_status_str(result.execution_status), -- (result.exit_reason == NULL)? "" : " (", -- (result.exit_reason == NULL)? "" : result.exit_reason, -- (result.exit_reason == NULL)? "" : ")", op->id); -- if (pcmk__result_ok(&result)) { -+ pcmk_exec_status_str(op->result.execution_status), -+ (op->result.exit_reason == NULL)? "" : " (", -+ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, -+ (op->result.exit_reason == NULL)? "" : ")", op->id); -+ if (pcmk__result_ok(&op->result)) { - op->state = st_done; - } else { - op->state = st_failed; - } -- finalize_op(op, msg, &result, false); -- goto done; -+ finalize_op(op, msg, false); -+ return; - - } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) { - /* If this isn't a remote level broadcast, and we are not the -@@ -2277,7 +2272,7 @@ fenced_process_fencing_reply(xmlNode *msg) - crm_err("Received non-broadcast fencing result for operation %.8s " - "we do not own (device %s targeting %s)", - op->id, device, op->target); -- goto done; -+ return; - } - - if (pcmk_is_set(op->call_options, st_opt_topology)) { -@@ -2286,58 +2281,58 @@ fenced_process_fencing_reply(xmlNode *msg) - crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s%s%s%s", - op->action, op->target, device, op->client_name, - op->originator, -- pcmk_exec_status_str(result.execution_status), -- (result.exit_reason == NULL)? "" : " (", -- (result.exit_reason == NULL)? "" : result.exit_reason, -- (result.exit_reason == NULL)? "" : ")"); -+ pcmk_exec_status_str(op->result.execution_status), -+ (op->result.exit_reason == NULL)? "" : " (", -+ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, -+ (op->result.exit_reason == NULL)? "" : ")"); - - /* We own the op, and it is complete. broadcast the result to all nodes - * and notify our local clients. */ - if (op->state == st_done) { -- finalize_op(op, msg, &result, false); -- goto done; -+ finalize_op(op, msg, false); -+ return; - } - -- if ((op->phase == 2) && !pcmk__result_ok(&result)) { -+ if ((op->phase == 2) && !pcmk__result_ok(&op->result)) { - /* A remapped "on" failed, but the node was already turned off - * successfully, so ignore the error and continue. - */ - crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s " - "after successful 'off'", -- device, pcmk_exec_status_str(result.execution_status), -- (result.exit_reason == NULL)? "" : ": ", -- (result.exit_reason == NULL)? "" : result.exit_reason, -+ device, pcmk_exec_status_str(op->result.execution_status), -+ (op->result.exit_reason == NULL)? "" : ": ", -+ (op->result.exit_reason == NULL)? "" : op->result.exit_reason, - op->target); -- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - } - -- if (pcmk__result_ok(&result)) { -+ if (pcmk__result_ok(&op->result)) { - /* An operation completed successfully. Try another device if - * necessary, otherwise mark the operation as done. */ - advance_topology_device_in_level(op, device, msg); -- goto done; -+ return; - } else { - /* This device failed, time to try another topology level. If no other - * levels are available, mark this operation as failed and report results. */ - if (advance_topology_level(op, false) != pcmk_rc_ok) { - op->state = st_failed; -- finalize_op(op, msg, &result, false); -- goto done; -+ finalize_op(op, msg, false); -+ return; - } - } - -- } else if (pcmk__result_ok(&result) && (op->devices == NULL)) { -+ } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) { - crm_trace("All done for %s", op->target); - op->state = st_done; -- finalize_op(op, msg, &result, false); -- goto done; -+ finalize_op(op, msg, false); -+ return; - -- } else if ((result.execution_status == PCMK_EXEC_TIMEOUT) -+ } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT) - && (op->devices == NULL)) { - /* If the operation timed out don't bother retrying other peers. */ - op->state = st_failed; -- finalize_op(op, msg, &result, false); -- goto done; -+ finalize_op(op, msg, false); -+ return; - - } else { - /* fall-through and attempt other fencing action using another peer */ -@@ -2346,10 +2341,8 @@ fenced_process_fencing_reply(xmlNode *msg) - /* Retry on failure */ - crm_trace("Next for %s on behalf of %s@%s (result was: %s)", - op->target, op->originator, op->client_name, -- pcmk_exec_status_str(result.execution_status)); -- request_peer_fencing(op, NULL, &result); --done: -- pcmk__reset_result(&result); -+ pcmk_exec_status_str(op->result.execution_status)); -+ request_peer_fencing(op, NULL, &op->result); - } - - gboolean --- -2.27.0 - - -From c59d062154f7c9e15e90929a20ea244d7efd7247 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jan 2022 17:11:12 -0600 -Subject: [PATCH 03/15] Refactor: fencer: drop redundant argument from - finalize_op_duplicates() - -... now that the result is in the op ---- - daemons/fenced/fenced_remote.c | 13 ++++++------- - 1 file changed, 6 insertions(+), 7 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 30edbff890..8b496e1042 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -468,11 +468,9 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, - * - * \param[in] op Fencer operation that completed - * \param[in] data Top-level XML to add notification to -- * \param[in] result Full operation result - */ - static void --finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, -- pcmk__action_result_t *result) -+finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data) - { - for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) { - remote_fencing_op_t *other = iter->data; -@@ -482,10 +480,11 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data, - crm_debug("Performing duplicate notification for %s@%s: %s " - CRM_XS " id=%.8s", - other->client_name, other->originator, -- pcmk_exec_status_str(result->execution_status), -+ pcmk_exec_status_str(op->result.execution_status), - other->id); -- pcmk__set_result(&other->result, result->exit_status, -- result->execution_status, result->exit_reason); -+ pcmk__set_result(&other->result, op->result.exit_status, -+ op->result.execution_status, -+ op->result.exit_reason); - finalize_op(other, data, true); - - } else { -@@ -606,7 +605,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) - handle_local_reply_and_notify(op, data, &op->result); - - if (!dup) { -- finalize_op_duplicates(op, data, &op->result); -+ finalize_op_duplicates(op, data); - } - - /* Free non-essential parts of the record --- -2.27.0 - - -From 6c49675855323a52a534afa112a0861ba2e3b1ad Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jan 2022 17:15:17 -0600 -Subject: [PATCH 04/15] Refactor: fencer: drop redundant argument from - fenced_broadcast_op_result() - -... now that the op includes the result ---- - daemons/fenced/fenced_history.c | 9 +++------ - daemons/fenced/fenced_remote.c | 8 +++----- - daemons/fenced/pacemaker-fenced.h | 3 +-- - 3 files changed, 7 insertions(+), 13 deletions(-) - -diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c -index 0157deadb3..5cacf36ca8 100644 ---- a/daemons/fenced/fenced_history.c -+++ b/daemons/fenced/fenced_history.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2009-2021 the Pacemaker project contributors -+ * Copyright 2009-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -359,8 +359,6 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, - } - - if (remote_history) { -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -- - init_stonith_remote_op_hash_table(&stonith_remote_op_list); - - updated |= g_hash_table_size(remote_history); -@@ -378,10 +376,10 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, - /* CRM_EX_EXPIRED + PCMK_EXEC_INVALID prevents finalize_op() - * from setting a delegate - */ -- pcmk__set_result(&result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, -+ pcmk__set_result(&op->result, CRM_EX_EXPIRED, PCMK_EXEC_INVALID, - "Initiated by earlier fencer " - "process and presumed failed"); -- fenced_broadcast_op_result(op, &result, false); -+ fenced_broadcast_op_result(op, false); - } - - g_hash_table_iter_steal(&iter); -@@ -396,7 +394,6 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, - */ - } - -- pcmk__reset_result(&result); - g_hash_table_destroy(remote_history); /* remove what is left */ - } - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 8b496e1042..fb5a5e980e 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -390,16 +390,14 @@ fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) - * \brief Broadcast a fence result notification to all CPG peers - * - * \param[in] op Fencer operation that completed -- * \param[in] result Full operation result - * \param[in] op_merged Whether this operation is a duplicate of another - */ - void --fenced_broadcast_op_result(remote_fencing_op_t *op, -- pcmk__action_result_t *result, bool op_merged) -+fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) - { - static int count = 0; - xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); -- xmlNode *notify_data = fencing_result2xml(op, result); -+ xmlNode *notify_data = fencing_result2xml(op, &op->result); - - count++; - crm_trace("Broadcasting result to peers"); -@@ -581,7 +579,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) - subt = crm_element_value(data, F_SUBTYPE); - if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) { - /* Defer notification until the bcast message arrives */ -- fenced_broadcast_op_result(op, &op->result, op_merged); -+ fenced_broadcast_op_result(op, op_merged); - free_xml(local_data); - return; - } -diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h -index 1a5c933ea7..6213407da3 100644 ---- a/daemons/fenced/pacemaker-fenced.h -+++ b/daemons/fenced/pacemaker-fenced.h -@@ -155,8 +155,7 @@ typedef struct remote_fencing_op_s { - pcmk__action_result_t result; - } remote_fencing_op_t; - --void fenced_broadcast_op_result(remote_fencing_op_t *op, -- pcmk__action_result_t *result, bool op_merged); -+void fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged); - - // Fencer-specific client flags - enum st_client_flags { --- -2.27.0 - - -From 73994fc740b8833457b130368db479502d49f285 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jan 2022 17:17:33 -0600 -Subject: [PATCH 05/15] Refactor: fencer: drop redundant argument from - handle_local_reply_and_notify() - -... now that the op includes the result ---- - daemons/fenced/fenced_remote.c | 12 +++++------- - 1 file changed, 5 insertions(+), 7 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index fb5a5e980e..2621cb2f19 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -424,11 +424,9 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) - * - * \param[in] op Fencer operation that completed - * \param[in] data Top-level XML to add notification to -- * \param[in] result Full operation result - */ - static void --handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, -- pcmk__action_result_t *result) -+handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data) - { - xmlNode *notify_data = NULL; - xmlNode *reply = NULL; -@@ -443,15 +441,15 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, - crm_xml_add(data, F_STONITH_TARGET, op->target); - crm_xml_add(data, F_STONITH_OPERATION, op->action); - -- reply = fenced_construct_reply(op->request, data, result); -+ reply = fenced_construct_reply(op->request, data, &op->result); - crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); - - /* Send fencing OP reply to local client that initiated fencing */ - do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); - - /* bcast to all local clients that the fencing operation happend */ -- notify_data = fencing_result2xml(op, result); -- fenced_send_notification(T_STONITH_NOTIFY_FENCE, result, notify_data); -+ notify_data = fencing_result2xml(op, &op->result); -+ fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data); - free_xml(notify_data); - fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); - -@@ -600,7 +598,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup) - ((op->result.exit_reason == NULL)? "" : op->result.exit_reason), - op->id); - -- handle_local_reply_and_notify(op, data, &op->result); -+ handle_local_reply_and_notify(op, data); - - if (!dup) { - finalize_op_duplicates(op, data); --- -2.27.0 - - -From 194056d18d3b550d3a53b94d558ceed03b5e5442 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jan 2022 17:18:27 -0600 -Subject: [PATCH 06/15] Refactor: fencer: drop redundant argument from - fencing_result2xml() - -... now that the op includes the result ---- - daemons/fenced/fenced_remote.c | 9 ++++----- - 1 file changed, 4 insertions(+), 5 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 2621cb2f19..8d4f53eef6 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -362,13 +362,12 @@ undo_op_remap(remote_fencing_op_t *op) - * \brief Create notification data XML for a fencing operation result - * - * \param[in] op Fencer operation that completed -- * \param[in] result Full operation result - * - * \return Newly created XML to add as notification data - * \note The caller is responsible for freeing the result. - */ - static xmlNode * --fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) -+fencing_result2xml(remote_fencing_op_t *op) - { - xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); - -@@ -381,7 +380,7 @@ fencing_result2xml(remote_fencing_op_t *op, pcmk__action_result_t *result) - crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id); - crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name); - -- stonith__xe_set_result(notify_data, result); -+ stonith__xe_set_result(notify_data, &op->result); - return notify_data; - } - -@@ -397,7 +396,7 @@ fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged) - { - static int count = 0; - xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); -- xmlNode *notify_data = fencing_result2xml(op, &op->result); -+ xmlNode *notify_data = fencing_result2xml(op); - - count++; - crm_trace("Broadcasting result to peers"); -@@ -448,7 +447,7 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data) - do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); - - /* bcast to all local clients that the fencing operation happend */ -- notify_data = fencing_result2xml(op, &op->result); -+ notify_data = fencing_result2xml(op); - fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data); - free_xml(notify_data); - fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL); --- -2.27.0 - - -From c5d38cb201a1219ca95127cba9c3a778e31966a2 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jan 2022 17:35:43 -0600 -Subject: [PATCH 07/15] Refactor: fencer: drop redundant argument from - request_peer_fencing() - -... now that the op includes the result ---- - daemons/fenced/fenced_remote.c | 66 +++++++++++++--------------------- - 1 file changed, 25 insertions(+), 41 deletions(-) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 8d4f53eef6..7fb7695fba 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -80,8 +80,7 @@ extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op - int call_options); - - static void request_peer_fencing(remote_fencing_op_t *op, -- peer_device_info_t *peer, -- pcmk__action_result_t *result); -+ peer_device_info_t *peer); - static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup); - static void report_timeout_period(remote_fencing_op_t * op, int op_timeout); - static int get_op_total_timeout(const remote_fencing_op_t *op, -@@ -646,18 +645,16 @@ static gboolean - remote_op_timeout_one(gpointer userdata) - { - remote_fencing_op_t *op = userdata; -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - op->op_timer_one = 0; - - crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS - " id=%.8s", op->action, op->target, op->client_name, op->id); -- pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, -+ pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, - "Peer did not return fence result within timeout"); - -- - // Try another device, if appropriate -- request_peer_fencing(op, NULL, &result); -+ request_peer_fencing(op, NULL); - return FALSE; - } - -@@ -730,13 +727,10 @@ remote_op_query_timeout(gpointer data) - crm_debug("Operation %.8s targeting %s already in progress", - op->id, op->target); - } else if (op->query_results) { -- // Result won't be used in this case, but we need to pass something -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -- - // Query succeeded, so attempt the actual fencing - crm_debug("Query %.8s targeting %s complete (state=%s)", - op->id, op->target, stonith_op_state_str(op->state)); -- request_peer_fencing(op, NULL, &result); -+ request_peer_fencing(op, NULL); - } else { - crm_debug("Query %.8s targeting %s timed out (state=%s)", - op->id, op->target, stonith_op_state_str(op->state)); -@@ -1622,11 +1616,10 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - op_phase_on(op); - } - -- if (op->devices) { -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; -- -- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ // This function is only called if the previous device succeeded -+ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - -+ if (op->devices) { - /* Necessary devices remain, so execute the next one */ - crm_trace("Next targeting %s on behalf of %s@%s", - op->target, op->client_name, op->originator); -@@ -1636,13 +1629,12 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, - op->delay = 0; - } - -- request_peer_fencing(op, NULL, &result); -+ request_peer_fencing(op, NULL); - } else { - /* We're done with all devices and phases, so finalize operation */ - crm_trace("Marking complex fencing op targeting %s as complete", - op->target); - op->state = st_done; -- pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - finalize_op(op, msg, false); - } - } -@@ -1673,13 +1665,9 @@ check_watchdog_fencing_and_wait(remote_fencing_op_t * op) - * \param[in] op Fencing operation to be executed - * \param[in] peer If NULL or topology is in use, choose best peer to execute - * the fencing, otherwise use this peer -- * \param[in] result Full result of previous failed attempt, if any (used as -- * final result only if a previous attempt failed, topology -- * is not in use, and no devices remain to be attempted) - */ - static void --request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, -- pcmk__action_result_t *result) -+request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer) - { - const char *device = NULL; - int timeout; -@@ -1822,27 +1810,26 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, - } - } - -- // This is the only case in which result will be used -- CRM_CHECK(result != NULL, return); -- - if (op->state == st_query) { - crm_info("No peers (out of %d) have devices capable of fencing " - "(%s) %s for client %s " CRM_XS " state=%s", - op->replies, op->action, op->target, op->client_name, - stonith_op_state_str(op->state)); - -- pcmk__reset_result(result); -- pcmk__set_result(result, CRM_EX_ERROR, PCMK_EXEC_NO_FENCE_DEVICE, -- NULL); -+ pcmk__reset_result(&op->result); -+ pcmk__set_result(&op->result, CRM_EX_ERROR, -+ PCMK_EXEC_NO_FENCE_DEVICE, NULL); - } else { - if (pcmk_is_set(op->call_options, st_opt_topology)) { -- pcmk__reset_result(result); -- pcmk__set_result(result, CRM_EX_ERROR, -+ pcmk__reset_result(&op->result); -+ pcmk__set_result(&op->result, CRM_EX_ERROR, - PCMK_EXEC_NO_FENCE_DEVICE, NULL); - } -- /* ... else use result provided by caller -- overwriting it with -- PCMK_EXEC_NO_FENCE_DEVICE would prevent finalize_op() from -- setting the correct delegate if needed. -+ /* ... else use existing result from previous failed attempt -+ * (topology is not in use, and no devices remain to be attempted). -+ * Overwriting the result with PCMK_EXEC_NO_FENCE_DEVICE would -+ * prevent finalize_op() from setting the correct delegate if -+ * needed. - */ - - crm_info("No peers (out of %d) are capable of fencing (%s) %s " -@@ -1852,8 +1839,6 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer, - } - - op->state = st_failed; -- pcmk__set_result(&op->result, result->exit_status, -- result->execution_status, result->exit_reason); - finalize_op(op, NULL, false); - - } else { -@@ -2104,7 +2089,6 @@ process_remote_stonith_query(xmlNode * msg) - peer_device_info_t *peer = NULL; - uint32_t replies_expected; - xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); -- pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - CRM_CHECK(dev != NULL, return -EPROTO); - -@@ -2139,7 +2123,7 @@ process_remote_stonith_query(xmlNode * msg) - peer = add_result(op, host, ndevices, dev); - } - -- pcmk__set_result(&result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); -+ pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); - - if (pcmk_is_set(op->call_options, st_opt_topology)) { - /* If we start the fencing before all the topology results are in, -@@ -2148,12 +2132,12 @@ process_remote_stonith_query(xmlNode * msg) - if (op->state == st_query && all_topology_devices_found(op)) { - /* All the query results are in for the topology, start the fencing ops. */ - crm_trace("All topology devices found"); -- request_peer_fencing(op, peer, &result); -+ request_peer_fencing(op, peer); - - } else if (have_all_replies) { - crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ", - replies_expected, op->replies); -- request_peer_fencing(op, NULL, &result); -+ request_peer_fencing(op, NULL); - } - - } else if (op->state == st_query) { -@@ -2165,12 +2149,12 @@ process_remote_stonith_query(xmlNode * msg) - /* we have a verified device living on a peer that is not the target */ - crm_trace("Found %d verified device%s", - nverified, pcmk__plural_s(nverified)); -- request_peer_fencing(op, peer, &result); -+ request_peer_fencing(op, peer); - - } else if (have_all_replies) { - crm_info("All query replies have arrived, continuing (%d expected/%d received) ", - replies_expected, op->replies); -- request_peer_fencing(op, NULL, &result); -+ request_peer_fencing(op, NULL); - - } else { - crm_trace("Waiting for more peer results before launching fencing operation"); -@@ -2336,7 +2320,7 @@ fenced_process_fencing_reply(xmlNode *msg) - crm_trace("Next for %s on behalf of %s@%s (result was: %s)", - op->target, op->originator, op->client_name, - pcmk_exec_status_str(op->result.execution_status)); -- request_peer_fencing(op, NULL, &op->result); -+ request_peer_fencing(op, NULL); - } - - gboolean --- -2.27.0 - - -From be0a0b652c13161a82b05d3104449b7bfc06e8ac Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jan 2022 17:56:24 -0600 -Subject: [PATCH 08/15] Feature: fencer: track full result in fencing history - -Add fencing operation results when creating XML in -stonith_local_history_diff_and_merge(), and parse the results from the received -XML in stonith_xml_history_to_list(). - -With this, the fencer now always has full results in its op list, and returns -them in the reply for STONITH_OP_FENCE_HISTORY requests (though nothing uses -that as of this commit). ---- - daemons/fenced/fenced_history.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/daemons/fenced/fenced_history.c b/daemons/fenced/fenced_history.c -index 5cacf36ca8..3ebf016e67 100644 ---- a/daemons/fenced/fenced_history.c -+++ b/daemons/fenced/fenced_history.c -@@ -257,6 +257,7 @@ stonith_xml_history_to_list(xmlNode *history) - op->completed_nsec = completed_nsec; - crm_element_value_int(xml_op, F_STONITH_STATE, &state); - op->state = (enum op_state) state; -+ stonith__xe_get_result(xml_op, &op->result); - - g_hash_table_replace(rv, id, op); - CRM_LOG_ASSERT(g_hash_table_lookup(rv, id) != NULL); -@@ -355,6 +356,7 @@ stonith_local_history_diff_and_merge(GHashTable *remote_history, - crm_xml_add_ll(entry, F_STONITH_DATE, op->completed); - crm_xml_add_ll(entry, F_STONITH_DATE_NSEC, op->completed_nsec); - crm_xml_add_int(entry, F_STONITH_STATE, op->state); -+ stonith__xe_set_result(entry, &op->result); - } - } - --- -2.27.0 - - -From afc5292036e212bcfc7475893e0b326b2a69ac58 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 22 Dec 2021 17:17:21 -0600 -Subject: [PATCH 09/15] API: libstonithd: add exit_reason member to - stonith_history_t - -not yet used, but will be ---- - include/crm/stonith-ng.h | 3 ++- - lib/fencing/st_client.c | 3 ++- - 2 files changed, 4 insertions(+), 2 deletions(-) - -diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h -index 3fe9cf54f8..2c79bfa579 100644 ---- a/include/crm/stonith-ng.h -+++ b/include/crm/stonith-ng.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2021 the Pacemaker project contributors -+ * Copyright 2004-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -111,6 +111,7 @@ typedef struct stonith_history_s { - time_t completed; - struct stonith_history_s *next; - long completed_nsec; -+ char *exit_reason; - } stonith_history_t; - - typedef struct stonith_s stonith_t; -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 718739b321..57a2e03361 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2021 the Pacemaker project contributors -+ * Copyright 2004-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -735,6 +735,7 @@ void stonith_history_free(stonith_history_t *history) - free(hp->origin); - free(hp->delegate); - free(hp->client); -+ free(hp->exit_reason); - } - } - --- -2.27.0 - - -From 1b9e2896322849002a5c0a3a34c9375ea32571d6 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 6 Jan 2022 18:04:15 -0600 -Subject: [PATCH 10/15] Feature: fencing: return exit reason with fencing - history - -libstonithd's stonith_t:cmds->history() method now parses exit reasons from the -fencer reply, and returns them in the stonith_history_t results. ---- - lib/fencing/st_client.c | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c -index 57a2e03361..d229b34805 100644 ---- a/lib/fencing/st_client.c -+++ b/lib/fencing/st_client.c -@@ -698,6 +698,7 @@ stonith_api_history(stonith_t * stonith, int call_options, const char *node, - stonith_history_t *kvp; - long long completed; - long long completed_nsec = 0L; -+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; - - kvp = calloc(1, sizeof(stonith_history_t)); - kvp->target = crm_element_value_copy(op, F_STONITH_TARGET); -@@ -711,6 +712,11 @@ stonith_api_history(stonith_t * stonith, int call_options, const char *node, - kvp->completed_nsec = completed_nsec; - crm_element_value_int(op, F_STONITH_STATE, &kvp->state); - -+ stonith__xe_get_result(op, &result); -+ kvp->exit_reason = result.exit_reason; -+ result.exit_reason = NULL; -+ pcmk__reset_result(&result); -+ - if (last) { - last->next = kvp; - } else { --- -2.27.0 - - -From ba4e77242e9be4ebeb2843b444ee4afad43c29f3 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 7 Jan 2022 09:44:39 -0600 -Subject: [PATCH 11/15] Feature: fencing: display exit reasons with failed - fencing events - -... when available ---- - lib/fencing/st_output.c | 20 ++++++++++++++++---- - tools/crm_mon_curses.c | 9 +++++++-- - 2 files changed, 23 insertions(+), 6 deletions(-) - -diff --git a/lib/fencing/st_output.c b/lib/fencing/st_output.c -index e484278867..18924d795d 100644 ---- a/lib/fencing/st_output.c -+++ b/lib/fencing/st_output.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2019-2021 the Pacemaker project contributors -+ * Copyright 2019-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -11,6 +11,7 @@ - #include - - #include -+#include - #include - #include - #include -@@ -263,8 +264,12 @@ stonith_event_html(pcmk__output_t *out, va_list args) { - char *failed_s = time_t_string(event->completed); - - out->list_item(out, "failed-stonith-event", -- "%s of %s failed : delegate=%s, client=%s, origin=%s, %s='%s' %s", -+ "%s of %s failed%s%s%s: " -+ "delegate=%s, client=%s, origin=%s, %s='%s' %s", - stonith_action_str(event->action), event->target, -+ (event->exit_reason == NULL)? "" : " (", -+ (event->exit_reason == NULL)? "" : event->exit_reason, -+ (event->exit_reason == NULL)? "" : ")", - event->delegate ? event->delegate : "", - event->client, event->origin, - full_history ? "completed" : "last-failed", -@@ -296,8 +301,13 @@ stonith_event_text(pcmk__output_t *out, va_list args) { - - switch (event->state) { - case st_failed: -- pcmk__indented_printf(out, "%s of %s failed: delegate=%s, client=%s, origin=%s, %s='%s' %s\n", -+ pcmk__indented_printf(out, -+ "%s of %s failed%s%s%s: " -+ "delegate=%s, client=%s, origin=%s, %s='%s' %s\n", - stonith_action_str(event->action), event->target, -+ (event->exit_reason == NULL)? "" : " (", -+ (event->exit_reason == NULL)? "" : event->exit_reason, -+ (event->exit_reason == NULL)? "" : ")", - event->delegate ? event->delegate : "", - event->client, event->origin, - full_history ? "completed" : "last-failed", buf, -@@ -341,7 +351,9 @@ stonith_event_xml(pcmk__output_t *out, va_list args) { - - switch (event->state) { - case st_failed: -- crm_xml_add(node, "status", "failed"); -+ pcmk__xe_set_props(node, "status", "failed", -+ XML_LRM_ATTR_EXIT_REASON, event->exit_reason, -+ NULL); - break; - - case st_done: -diff --git a/tools/crm_mon_curses.c b/tools/crm_mon_curses.c -index bae3710c44..73c8516a8c 100644 ---- a/tools/crm_mon_curses.c -+++ b/tools/crm_mon_curses.c -@@ -1,5 +1,5 @@ - /* -- * Copyright 2019-2021 the Pacemaker project contributors -+ * Copyright 2019-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -463,8 +463,13 @@ stonith_event_console(pcmk__output_t *out, va_list args) { - - switch (event->state) { - case st_failed: -- curses_indented_printf(out, "%s of %s failed: delegate=%s, client=%s, origin=%s, %s='%s'%s\n", -+ curses_indented_printf(out, -+ "%s of %s failed%s%s%s: " -+ "delegate=%s, client=%s, origin=%s, %s='%s' %s\n", - stonith_action_str(event->action), event->target, -+ (event->exit_reason == NULL)? "" : " (", -+ (event->exit_reason == NULL)? "" : event->exit_reason, -+ (event->exit_reason == NULL)? "" : ")", - event->delegate ? event->delegate : "", - event->client, event->origin, - full_history ? "completed" : "last-failed", buf, --- -2.27.0 - - -From 8105fb4a3a786780fdf85b3d0308eaf6df1ea434 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 7 Jan 2022 09:45:22 -0600 -Subject: [PATCH 12/15] Low: schemas: copy fence-event API schema in - preparation for changes - ---- - include/crm/common/output_internal.h | 2 +- - xml/api/fence-event-2.15.rng | 33 ++++++++++++++++++++++++++++ - 2 files changed, 34 insertions(+), 1 deletion(-) - create mode 100644 xml/api/fence-event-2.15.rng - -diff --git a/include/crm/common/output_internal.h b/include/crm/common/output_internal.h -index 479f0e4b43..8c5dcee17c 100644 ---- a/include/crm/common/output_internal.h -+++ b/include/crm/common/output_internal.h -@@ -27,7 +27,7 @@ extern "C" { - # include - # include - --# define PCMK__API_VERSION "2.14" -+# define PCMK__API_VERSION "2.15" - - #if defined(PCMK__WITH_ATTRIBUTE_OUTPUT_ARGS) - # define PCMK__OUTPUT_ARGS(ARGS...) __attribute__((output_args(ARGS))) -diff --git a/xml/api/fence-event-2.15.rng b/xml/api/fence-event-2.15.rng -new file mode 100644 -index 0000000000..e54687cd25 ---- /dev/null -+++ b/xml/api/fence-event-2.15.rng -@@ -0,0 +1,33 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ failed -+ success -+ pending -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -2.27.0 - - -From 46dd9b74d2ee8f7ab70a0c7fe3a998954d4029e8 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 7 Jan 2022 09:47:16 -0600 -Subject: [PATCH 13/15] Low: schemas: update fence-event API schema for recent - change - ---- - xml/api/fence-event-2.15.rng | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/xml/api/fence-event-2.15.rng b/xml/api/fence-event-2.15.rng -index e54687cd25..8e000cafa5 100644 ---- a/xml/api/fence-event-2.15.rng -+++ b/xml/api/fence-event-2.15.rng -@@ -18,6 +18,9 @@ - - - -+ -+ -+ - - - --- -2.27.0 - - -From 350e71772f67f28af6b67f864cbabc481730035c Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 7 Jan 2022 11:32:09 -0600 -Subject: [PATCH 14/15] Build: libstonithd: bump shared library version - -... for stonith_history_t change since 2.1.2. - -The struct should only ever be returned by the library as a pointer, so the -changes can be considered backward-compatible. Normally we wouldn't bump shared -library versions mid-cycle, but this will simplify expected backports of this -change. ---- - lib/fencing/Makefile.am | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/lib/fencing/Makefile.am b/lib/fencing/Makefile.am -index 1ffa3e051b..a10ddb88ec 100644 ---- a/lib/fencing/Makefile.am -+++ b/lib/fencing/Makefile.am -@@ -2,7 +2,7 @@ - # Original Author: Sun Jiang Dong - # Copyright 2004 International Business Machines - # --# with later changes copyright 2004-2021 the Pacemaker project contributors. -+# with later changes copyright 2004-2022 the Pacemaker project contributors. - # The version control history for this file may have further details. - # - # This source code is licensed under the GNU General Public License version 2 -@@ -14,7 +14,7 @@ noinst_HEADERS = fencing_private.h - - lib_LTLIBRARIES = libstonithd.la - --libstonithd_la_LDFLAGS = -version-info 33:0:7 -+libstonithd_la_LDFLAGS = -version-info 34:0:8 - - libstonithd_la_CFLAGS = $(CFLAGS_HARDENED_LIB) - libstonithd_la_LDFLAGS += $(LDFLAGS_HARDENED_LIB) --- -2.27.0 - - -From 63ea88620a62ff0759560a02bb5e284ebdd03eb6 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 19 Jan 2022 16:53:45 -0600 -Subject: [PATCH 15/15] Low: fencer: reset op result before grabbing new one - -just in case ---- - daemons/fenced/fenced_remote.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c -index 7fb7695fba..dc4649e0fc 100644 ---- a/daemons/fenced/fenced_remote.c -+++ b/daemons/fenced/fenced_remote.c -@@ -2219,6 +2219,7 @@ fenced_process_fencing_reply(xmlNode *msg) - return; - } - -+ pcmk__reset_result(&op->result); - op->result = result; // The operation takes ownership of the result - - if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) { --- -2.27.0 - diff --git a/SOURCES/022-memory-leak.patch b/SOURCES/022-memory-leak.patch deleted file mode 100644 index 3970dd3..0000000 --- a/SOURCES/022-memory-leak.patch +++ /dev/null @@ -1,82 +0,0 @@ -From 8034a203bbff0aa3b53f2946dc58e409bd7246c9 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 20 Jan 2022 15:03:31 -0600 -Subject: [PATCH] Fix: scheduler: avoid memory leak when displaying clones - -Previously, pe__clone_default() unconditionally created a hash table for -stopped instances, but didn't free it in every code path. - -Now, only create the table when we have something to put in it and might -actually use it, and ensure it always gets freed. ---- - lib/pengine/clone.c | 18 +++++++++++++----- - 1 file changed, 13 insertions(+), 5 deletions(-) - -diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c -index 742e2920b0..920a04c32c 100644 ---- a/lib/pengine/clone.c -+++ b/lib/pengine/clone.c -@@ -761,7 +761,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) - GList *only_node = va_arg(args, GList *); - GList *only_rsc = va_arg(args, GList *); - -- GHashTable *stopped = pcmk__strkey_table(free, free); -+ GHashTable *stopped = NULL; - - char *list_text = NULL; - size_t list_text_len = 0; -@@ -818,7 +818,11 @@ pe__clone_default(pcmk__output_t *out, va_list args) - } else if (partially_active == FALSE) { - // List stopped instances when requested (except orphans) - if (!pcmk_is_set(child_rsc->flags, pe_rsc_orphan) -+ && !pcmk_is_set(show_opts, pcmk_show_clone_detail) - && pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) { -+ if (stopped == NULL) { -+ stopped = pcmk__strkey_table(free, free); -+ } - g_hash_table_insert(stopped, strdup(child_rsc->id), strdup("Stopped")); - } - -@@ -873,7 +877,6 @@ pe__clone_default(pcmk__output_t *out, va_list args) - } - - if (pcmk_is_set(show_opts, pcmk_show_clone_detail)) { -- g_hash_table_destroy(stopped); - PCMK__OUTPUT_LIST_FOOTER(out, rc); - return pcmk_rc_ok; - } -@@ -948,8 +951,10 @@ pe__clone_default(pcmk__output_t *out, va_list args) - GList *list = g_hash_table_get_values(rsc->allowed_nodes); - - /* Custom stopped table for non-unique clones */ -- g_hash_table_destroy(stopped); -- stopped = pcmk__strkey_table(free, free); -+ if (stopped != NULL) { -+ g_hash_table_destroy(stopped); -+ stopped = NULL; -+ } - - if (list == NULL) { - /* Clusters with symmetrical=false haven't calculated allowed_nodes yet -@@ -972,6 +977,9 @@ pe__clone_default(pcmk__output_t *out, va_list args) - state = "Stopped (disabled)"; - } - -+ if (stopped == NULL) { -+ stopped = pcmk__strkey_table(free, free); -+ } - if (probe_op != NULL) { - int rc; - -@@ -987,7 +995,7 @@ pe__clone_default(pcmk__output_t *out, va_list args) - g_list_free(list); - } - -- if (g_hash_table_size(stopped) > 0) { -+ if (stopped != NULL) { - GList *list = sorted_hash_table_values(stopped); - - clone_header(out, &rc, rsc, clone_data); --- -2.27.0 - diff --git a/SOURCES/023-regression.patch b/SOURCES/023-regression.patch deleted file mode 100644 index 62d2a46..0000000 --- a/SOURCES/023-regression.patch +++ /dev/null @@ -1,30 +0,0 @@ -From 16928cfc69136bc56b1574bee9966e0d5de73abd Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 26 Jan 2022 09:15:43 -0600 -Subject: [PATCH] Fix: controller: correctly match "node down" events - -regression introduced in 2.1.2 by 03ce7376e - -The symptom that led to this was that removing a remote node connection -resource would lead to the remote node getting fenced when the connection stop -was not recognized as an expected down event. ---- - daemons/controld/controld_te_events.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c -index 36fd832ba0..1fd7129922 100644 ---- a/daemons/controld/controld_te_events.c -+++ b/daemons/controld/controld_te_events.c -@@ -304,7 +304,7 @@ match_down_event(const char *target) - gIter2 = gIter2->next) { - - match = (crm_action_t*)gIter2->data; -- if (pcmk_is_set(match->flags, pcmk__graph_action_confirmed)) { -+ if (pcmk_is_set(match->flags, pcmk__graph_action_executed)) { - xpath_ret = xpath_search(match->xml, xpath); - if (numXpathResults(xpath_ret) < 1) { - match = NULL; --- -2.27.0 - diff --git a/SOURCES/024-stop_unexpected.patch b/SOURCES/024-stop_unexpected.patch deleted file mode 100644 index 0fcf75b..0000000 --- a/SOURCES/024-stop_unexpected.patch +++ /dev/null @@ -1,806 +0,0 @@ -From 767b5552ab49850204692c2c990dfb41d37589f3 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 28 Mar 2022 18:11:52 -0500 -Subject: [PATCH 1/9] Refactor: libpacemaker: drop unnecessary argument from - "rsc-action" message - -9875cab129 moved the setting of the "moving" variable from LogActions() to a -new "rsc-action" message, but continued to pass the variable unnecessarily - -Also simplify how it's set ---- - lib/pacemaker/pcmk_output.c | 10 ++++------ - lib/pacemaker/pcmk_sched_native.c | 4 +--- - 2 files changed, 5 insertions(+), 9 deletions(-) - -diff --git a/lib/pacemaker/pcmk_output.c b/lib/pacemaker/pcmk_output.c -index d864c8bd2..56963a93f 100644 ---- a/lib/pacemaker/pcmk_output.c -+++ b/lib/pacemaker/pcmk_output.c -@@ -873,19 +873,18 @@ digests_xml(pcmk__output_t *out, va_list args) - } \ - } while(0) - --PCMK__OUTPUT_ARGS("rsc-action", "pe_resource_t *", "pe_node_t *", "pe_node_t *", -- "gboolean") -+PCMK__OUTPUT_ARGS("rsc-action", "pe_resource_t *", "pe_node_t *", "pe_node_t *") - static int - rsc_action_default(pcmk__output_t *out, va_list args) - { - pe_resource_t *rsc = va_arg(args, pe_resource_t *); - pe_node_t *current = va_arg(args, pe_node_t *); - pe_node_t *next = va_arg(args, pe_node_t *); -- gboolean moving = va_arg(args, gboolean); - - GList *possible_matches = NULL; - char *key = NULL; - int rc = pcmk_rc_no_output; -+ bool moving = false; - - pe_node_t *start_node = NULL; - pe_action_t *start = NULL; -@@ -901,9 +900,8 @@ rsc_action_default(pcmk__output_t *out, va_list args) - return rc; - } - -- if (current != NULL && next != NULL && !pcmk__str_eq(current->details->id, next->details->id, pcmk__str_casei)) { -- moving = TRUE; -- } -+ moving = (current != NULL) && (next != NULL) -+ && (current->details != next->details); - - possible_matches = pe__resource_actions(rsc, next, RSC_START, FALSE); - if (possible_matches) { -diff --git a/lib/pacemaker/pcmk_sched_resource.c b/lib/pacemaker/pcmk_sched_resource.c -index a3d646775..41631da3d 100644 ---- a/lib/pacemaker/pcmk_sched_native.c -+++ b/lib/pacemaker/pcmk_sched_native.c -@@ -2037,8 +2037,6 @@ LogActions(pe_resource_t * rsc, pe_working_set_t * data_set) - pe_node_t *next = NULL; - pe_node_t *current = NULL; - -- gboolean moving = FALSE; -- - if(rsc->variant == pe_container) { - pcmk__bundle_log_actions(rsc, data_set); - return; -@@ -2066,7 +2064,7 @@ LogActions(pe_resource_t * rsc, pe_working_set_t * data_set) - return; - } - -- out->message(out, "rsc-action", rsc, current, next, moving); -+ out->message(out, "rsc-action", rsc, current, next); - } - - gboolean --- -2.27.0 - - -From 870fb19715618c4ceab9ed4ae13a99658440b662 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 22 Mar 2022 15:22:23 -0500 -Subject: [PATCH 2/9] Refactor: scheduler: functionize scheduling restart - actions - -native_create_actions() is already overlarge, and more needs to be added to it ---- - lib/pacemaker/pcmk_sched_native.c | 85 ++++++++++++++++++++----------- - 1 file changed, 54 insertions(+), 31 deletions(-) - -diff --git a/lib/pacemaker/pcmk_sched_native.c b/lib/pacemaker/pcmk_sched_native.c -index 808e97540..b8a1c1e1a 100644 ---- a/lib/pacemaker/pcmk_sched_native.c -+++ b/lib/pacemaker/pcmk_sched_native.c -@@ -1185,6 +1185,58 @@ handle_migration_actions(pe_resource_t * rsc, pe_node_t *current, pe_node_t *cho - } - } - -+/*! -+ * \internal -+ * \brief Schedule actions to bring resource down and back to current role -+ * -+ * \param[in] rsc Resource to restart -+ * \param[in] current Node that resource should be brought down on -+ * \param[in] chosen Node that resource should be brought up on -+ * \param[in] need_stop Whether the resource must be stopped -+ * \param[in] need_promote Whether the resource must be promoted -+ * -+ * \return Role that resource would have after scheduled actions are taken -+ */ -+static void -+schedule_restart_actions(pe_resource_t *rsc, pe_node_t *current, -+ pe_node_t *chosen, bool need_stop, bool need_promote) -+{ -+ enum rsc_role_e role = rsc->role; -+ enum rsc_role_e next_role; -+ -+ // Bring resource down to a stop on its current node -+ while (role != RSC_ROLE_STOPPED) { -+ next_role = rsc_state_matrix[role][RSC_ROLE_STOPPED]; -+ pe_rsc_trace(rsc, "Creating %s action to take %s down from %s to %s", -+ (need_stop? "required" : "optional"), rsc->id, -+ role2text(role), role2text(next_role)); -+ if (!rsc_action_matrix[role][next_role](rsc, current, !need_stop, -+ rsc->cluster)) { -+ break; -+ } -+ role = next_role; -+ } -+ -+ // Bring resource up to its next role on its next node -+ while ((rsc->role <= rsc->next_role) && (role != rsc->role) -+ && !pcmk_is_set(rsc->flags, pe_rsc_block)) { -+ bool required = need_stop; -+ -+ next_role = rsc_state_matrix[role][rsc->role]; -+ if ((next_role == RSC_ROLE_PROMOTED) && need_promote) { -+ required = true; -+ } -+ pe_rsc_trace(rsc, "Creating %s action to take %s up from %s to %s", -+ (required? "required" : "optional"), rsc->id, -+ role2text(role), role2text(next_role)); -+ if (!rsc_action_matrix[role][next_role](rsc, chosen, !required, -+ rsc->cluster)) { -+ break; -+ } -+ role = next_role; -+ } -+} -+ - void - native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) - { -@@ -1332,39 +1384,10 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) - /* Create any additional actions required when bringing resource down and - * back up to same level. - */ -- role = rsc->role; -- while (role != RSC_ROLE_STOPPED) { -- next_role = rsc_state_matrix[role][RSC_ROLE_STOPPED]; -- pe_rsc_trace(rsc, "Creating %s action to take %s down from %s to %s", -- (need_stop? "required" : "optional"), rsc->id, -- role2text(role), role2text(next_role)); -- if (rsc_action_matrix[role][next_role] (rsc, current, !need_stop, data_set) == FALSE) { -- break; -- } -- role = next_role; -- } -- -- -- while ((rsc->role <= rsc->next_role) && (role != rsc->role) -- && !pcmk_is_set(rsc->flags, pe_rsc_block)) { -- bool required = need_stop; -- -- next_role = rsc_state_matrix[role][rsc->role]; -- if ((next_role == RSC_ROLE_PROMOTED) && need_promote) { -- required = true; -- } -- pe_rsc_trace(rsc, "Creating %s action to take %s up from %s to %s", -- (required? "required" : "optional"), rsc->id, -- role2text(role), role2text(next_role)); -- if (rsc_action_matrix[role][next_role](rsc, chosen, !required, -- data_set) == FALSE) { -- break; -- } -- role = next_role; -- } -- role = rsc->role; -+ schedule_restart_actions(rsc, current, chosen, need_stop, need_promote); - - /* Required steps from this role to the next */ -+ role = rsc->role; - while (role != rsc->next_role) { - next_role = rsc_state_matrix[role][rsc->next_role]; - pe_rsc_trace(rsc, "Creating action to take %s from %s to %s (ending at %s)", --- -2.27.0 - - -From 736d4d8f5e432acf12e577d137e9165904c71b3b Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Mon, 28 Mar 2022 17:42:26 -0500 -Subject: [PATCH 3/9] Log: scheduler: improve trace messages when creating - actions - ---- - lib/pacemaker/pcmk_sched_native.c | 22 ++++++++++++++++------ - 1 file changed, 16 insertions(+), 6 deletions(-) - -diff --git a/lib/pacemaker/pcmk_sched_native.c b/lib/pacemaker/pcmk_sched_native.c -index b8a1c1e1a..8b651ebd2 100644 ---- a/lib/pacemaker/pcmk_sched_native.c -+++ b/lib/pacemaker/pcmk_sched_native.c -@@ -1997,7 +1997,6 @@ StopRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_set - GList *gIter = NULL; - - CRM_ASSERT(rsc); -- pe_rsc_trace(rsc, "%s", rsc->id); - - for (gIter = rsc->running_on; gIter != NULL; gIter = gIter->next) { - pe_node_t *current = (pe_node_t *) gIter->data; -@@ -2005,16 +2004,23 @@ StopRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_set - - if (rsc->partial_migration_target) { - if (rsc->partial_migration_target->details == current->details) { -- pe_rsc_trace(rsc, "Filtered %s -> %s %s", current->details->uname, -- next->details->uname, rsc->id); -+ pe_rsc_trace(rsc, -+ "Skipping stop of %s on %s " -+ "because migration to %s in progress", -+ rsc->id, current->details->uname, -+ next->details->uname); - continue; - } else { -- pe_rsc_trace(rsc, "Forced on %s %s", current->details->uname, rsc->id); -+ pe_rsc_trace(rsc, -+ "Forcing stop of %s on %s " -+ "because migration target changed", -+ rsc->id, current->details->uname); - optional = FALSE; - } - } - -- pe_rsc_trace(rsc, "%s on %s", rsc->id, current->details->uname); -+ pe_rsc_trace(rsc, "Scheduling stop of %s on %s", -+ rsc->id, current->details->uname); - stop = stop_action(rsc, current, optional); - - if(rsc->allocated_to == NULL) { -@@ -2048,7 +2054,11 @@ StartRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_se - pe_action_t *start = NULL; - - CRM_ASSERT(rsc); -- pe_rsc_trace(rsc, "%s on %s %d %d", rsc->id, next ? next->details->uname : "N/A", optional, next ? next->weight : 0); -+ -+ pe_rsc_trace(rsc, "Scheduling %s start of %s on %s (weight=%d)", -+ (optional? "optional" : "required"), rsc->id, -+ ((next == NULL)? "N/A" : next->details->uname), -+ ((next == NULL)? 0 : next->weight)); - start = start_action(rsc, next, TRUE); - - pcmk__order_vs_unfence(rsc, next, start, pe_order_implies_then, data_set); --- -2.27.0 - - -From 6f987234d5246ed50f4fe2db90e5edb6a23e877d Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 1 Mar 2022 16:42:06 -0600 -Subject: [PATCH 4/9] Log: scheduler: log a warning if invalid value is given - for multiple-active - ---- - lib/pengine/complex.c | 7 ++++++- - 1 file changed, 6 insertions(+), 1 deletion(-) - -diff --git a/lib/pengine/complex.c b/lib/pengine/complex.c -index e82af2aae..f2caef831 100644 ---- a/lib/pengine/complex.c -+++ b/lib/pengine/complex.c -@@ -694,7 +694,12 @@ common_unpack(xmlNode * xml_obj, pe_resource_t ** rsc, - (*rsc)->recovery_type = recovery_block; - pe_rsc_trace((*rsc), "\tMultiple running resource recovery: block"); - -- } else { -+ } else { // "stop_start" -+ if (!pcmk__str_eq(value, "stop_start", -+ pcmk__str_casei|pcmk__str_null_matches)) { -+ pe_warn("%s is not a valid value for " XML_RSC_ATTR_MULTIPLE -+ ", using default of \"stop_start\"", value); -+ } - (*rsc)->recovery_type = recovery_stop_start; - pe_rsc_trace((*rsc), "\tMultiple running resource recovery: stop/start"); - } --- -2.27.0 - - -From 50456c3e229a6021ca0ba7346af41cd234abcc16 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 1 Mar 2022 16:49:31 -0600 -Subject: [PATCH 5/9] API: libpe_status: add recovery_stop_unexpected to enum - rsc_recovery_type - -The behavior is not implemented as of this commit ---- - include/crm/pengine/common.h | 14 ++++++++++++-- - lib/pengine/complex.c | 5 +++++ - lib/pengine/native.c | 7 +++++-- - 3 files changed, 22 insertions(+), 4 deletions(-) - -diff --git a/include/crm/pengine/common.h b/include/crm/pengine/common.h -index efe89a171..9b9f38f3b 100644 ---- a/include/crm/pengine/common.h -+++ b/include/crm/pengine/common.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2021 the Pacemaker project contributors -+ * Copyright 2004-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -78,7 +78,8 @@ enum action_tasks { - enum rsc_recovery_type { - recovery_stop_start, - recovery_stop_only, -- recovery_block -+ recovery_block, -+ recovery_stop_unexpected, - }; - - enum rsc_start_requirement { -@@ -143,6 +144,13 @@ const char *fail2text(enum action_fail_response fail); - const char *pe_pref(GHashTable * options, const char *name); - void calculate_active_ops(GList * sorted_op_list, int *start_index, int *stop_index); - -+/*! -+ * \brief Get readable description of a recovery type -+ * -+ * \param[in] type Recovery type -+ * -+ * \return Static string describing \p type -+ */ - static inline const char * - recovery2text(enum rsc_recovery_type type) - { -@@ -153,6 +161,8 @@ recovery2text(enum rsc_recovery_type type) - return "attempting recovery"; - case recovery_block: - return "waiting for an administrator"; -+ case recovery_stop_unexpected: -+ return "stopping unexpected instances"; - } - return "Unknown"; - } -diff --git a/lib/pengine/complex.c b/lib/pengine/complex.c -index f2caef831..fc9028e81 100644 ---- a/lib/pengine/complex.c -+++ b/lib/pengine/complex.c -@@ -694,6 +694,11 @@ common_unpack(xmlNode * xml_obj, pe_resource_t ** rsc, - (*rsc)->recovery_type = recovery_block; - pe_rsc_trace((*rsc), "\tMultiple running resource recovery: block"); - -+ } else if (pcmk__str_eq(value, "stop_unexpected", pcmk__str_casei)) { -+ (*rsc)->recovery_type = recovery_stop_unexpected; -+ pe_rsc_trace((*rsc), "\tMultiple running resource recovery: " -+ "stop unexpected instances"); -+ - } else { // "stop_start" - if (!pcmk__str_eq(value, "stop_start", - pcmk__str_casei|pcmk__str_null_matches)) { -diff --git a/lib/pengine/native.c b/lib/pengine/native.c -index e16e54bae..fa7dc8960 100644 ---- a/lib/pengine/native.c -+++ b/lib/pengine/native.c -@@ -149,8 +149,6 @@ native_add_running(pe_resource_t * rsc, pe_node_t * node, pe_working_set_t * dat - } - } - break; -- case recovery_stop_start: -- break; - case recovery_block: - pe__clear_resource_flags(rsc, pe_rsc_managed); - pe__set_resource_flags(rsc, pe_rsc_block); -@@ -171,6 +169,11 @@ native_add_running(pe_resource_t * rsc, pe_node_t * node, pe_working_set_t * dat - } - } - break; -+ default: // recovery_stop_start, recovery_stop_unexpected -+ /* The scheduler will do the right thing because the relevant -+ * variables and flags are set when unpacking the history. -+ */ -+ break; - } - crm_debug("%s is active on multiple nodes including %s: %s", - rsc->id, node->details->uname, --- -2.27.0 - - -From 5e994f0633b27e7a53701e0954466739c8f1acf7 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 30 Mar 2022 16:26:19 -0500 -Subject: [PATCH 6/9] API: libpe_status: add pe_rsc_stop_unexpected flag - ---- - include/crm/pengine/pe_types.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/include/crm/pengine/pe_types.h b/include/crm/pengine/pe_types.h -index e3ecaa823..7d5394bff 100644 ---- a/include/crm/pengine/pe_types.h -+++ b/include/crm/pengine/pe_types.h -@@ -277,6 +277,7 @@ struct pe_node_s { - - # define pe_rsc_starting 0x00100000ULL - # define pe_rsc_stopping 0x00200000ULL -+# define pe_rsc_stop_unexpected 0x00400000ULL - # define pe_rsc_allow_migrate 0x00800000ULL - - # define pe_rsc_failure_ignored 0x01000000ULL --- -2.27.0 - - -From c1acf05be853d99c17761759b8c961f2ec4a55c2 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 31 Mar 2022 09:56:34 -0500 -Subject: [PATCH 7/9] API: libpe_status: add pe_rsc_restarting flag - -This is used to indicate that any actions currently being scheduled are part of -the resource's restart actions (i.e. we are in schedule_restart_actions()). ---- - include/crm/pengine/pe_types.h | 1 + - lib/pacemaker/pcmk_sched_native.c | 4 ++++ - 2 files changed, 5 insertions(+) - -diff --git a/include/crm/pengine/pe_types.h b/include/crm/pengine/pe_types.h -index 7d5394bff..77d28e900 100644 ---- a/include/crm/pengine/pe_types.h -+++ b/include/crm/pengine/pe_types.h -@@ -265,6 +265,7 @@ struct pe_node_s { - # define pe_rsc_provisional 0x00000100ULL - # define pe_rsc_allocating 0x00000200ULL - # define pe_rsc_merging 0x00000400ULL -+# define pe_rsc_restarting 0x00000800ULL - - # define pe_rsc_stop 0x00001000ULL - # define pe_rsc_reload 0x00002000ULL -diff --git a/lib/pacemaker/pcmk_sched_native.c b/lib/pacemaker/pcmk_sched_native.c -index 8b651ebd2..8002938b5 100644 ---- a/lib/pacemaker/pcmk_sched_native.c -+++ b/lib/pacemaker/pcmk_sched_native.c -@@ -1204,6 +1204,8 @@ schedule_restart_actions(pe_resource_t *rsc, pe_node_t *current, - enum rsc_role_e role = rsc->role; - enum rsc_role_e next_role; - -+ pe__set_resource_flags(rsc, pe_rsc_restarting); -+ - // Bring resource down to a stop on its current node - while (role != RSC_ROLE_STOPPED) { - next_role = rsc_state_matrix[role][RSC_ROLE_STOPPED]; -@@ -1235,6 +1237,8 @@ schedule_restart_actions(pe_resource_t *rsc, pe_node_t *current, - } - role = next_role; - } -+ -+ pe__clear_resource_flags(rsc, pe_rsc_restarting); - } - - void --- -2.27.0 - - -From 871e2201d92520039df45062afc9120fd1fb0f30 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Tue, 1 Mar 2022 17:46:39 -0600 -Subject: [PATCH 8/9] Refactor: scheduler: add expected node to primitive - variant data - -Nothing uses it yet ---- - include/crm/pengine/internal.h | 4 ++++ - lib/pengine/native.c | 38 ++++++++++++++++++++++++++++++++++ - lib/pengine/variant.h | 8 +++++-- - 3 files changed, 48 insertions(+), 2 deletions(-) - -diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h -index f949684b4..f69e6bcce 100644 ---- a/include/crm/pengine/internal.h -+++ b/include/crm/pengine/internal.h -@@ -579,4 +579,8 @@ xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); - - const char *pe__clone_child_id(pe_resource_t *rsc); - -+void pe__update_expected_node(pe_resource_t *rsc, pe_node_t *node, -+ int execution_status, int exit_status, -+ int expected_exit_status); -+ - #endif -diff --git a/lib/pengine/native.c b/lib/pengine/native.c -index fa7dc8960..591d1c6f5 100644 ---- a/lib/pengine/native.c -+++ b/lib/pengine/native.c -@@ -1376,3 +1376,41 @@ pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_paren - - return TRUE; - } -+ -+/*! -+ * \internal -+ * \brief Set a resource's expected node if appropriate for a history result -+ * -+ * \param[in] rsc Resource to set expected node for -+ * \param[in] node Node to set as expected node -+ * \param[in] execution_status History entry's execution status -+ * \param[in] exit_status History entry's actual exit status -+ * \param[in] expected_status History entry's expected exit status -+ */ -+void -+pe__update_expected_node(pe_resource_t *rsc, pe_node_t *node, -+ int execution_status, int exit_status, -+ int expected_exit_status) -+{ -+ native_variant_data_t *native_data = NULL; -+ -+ get_native_variant_data(native_data, rsc); -+ -+ if ((rsc->recovery_type == recovery_stop_unexpected) -+ && (rsc->role > RSC_ROLE_STOPPED) -+ && (execution_status == PCMK_EXEC_DONE) -+ && (exit_status == expected_exit_status)) { -+ // Resource is active and was expected on this node -+ pe_rsc_trace(rsc, "Found expected node %s for %s", -+ node->details->uname, rsc->id); -+ native_data->expected_node = node; -+ pe__set_resource_flags(rsc, pe_rsc_stop_unexpected); -+ -+ } else if ((native_data->expected_node != NULL) -+ && (native_data->expected_node->details == node->details)) { -+ // Resource is not cleanly active here -+ pe_rsc_trace(rsc, "Clearing expected node for %s", rsc->id); -+ native_data->expected_node = NULL; -+ pe__clear_resource_flags(rsc, pe_rsc_stop_unexpected); -+ } -+} -diff --git a/lib/pengine/variant.h b/lib/pengine/variant.h -index cabfbe81f..d8fefa9d6 100644 ---- a/lib/pengine/variant.h -+++ b/lib/pengine/variant.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2021 the Pacemaker project contributors -+ * Copyright 2004-2022 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -139,7 +139,11 @@ typedef struct group_variant_data_s { - # elif VARIANT_NATIVE - - typedef struct native_variant_data_s { -- int dummy; -+ /* If the resource is multiply active, and has multiple-active set to -+ * stop_unexpected, this will be set to the node where the resource was -+ * found active by an operation with a expected result. -+ */ -+ pe_node_t *expected_node; - } native_variant_data_t; - - # define get_native_variant_data(data, rsc) \ --- -2.27.0 - - -From 0e4e17e972f1c3663389f18d8f8c527bd819b3c5 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Thu, 7 Apr 2022 10:20:00 -0500 -Subject: [PATCH 9/9] Feature: scheduler: implement - multiple-active=stop_unexpected - -The default multiple-active policy of restarting the resource on all nodes -requires no special handling, because at least one of the locations will have -an unexpected rc, causing the resource to be marked as failed and restarted, -and StopRsc() creates stops on all nodes running the resource. - -The new stop_unexpected behavior relies on most of the same handling, but -the action creation functions need to skip the node where the resource had the -expected result. For that, we set the new rsc->expected_node when unpacking a -successful result, to be checked by those functions. - -Note that this still schedules a start for the resource, which is a pseudo-op -for the resource itself, but (properly) causes any dependent resources to be -restarted. - -Fixes T23 ---- - lib/pacemaker/pcmk_output.c | 10 ++++ - lib/pacemaker/pcmk_sched_native.c | 94 ++++++++++++++++++++++++++++++- - lib/pengine/unpack.c | 1 + - 3 files changed, 103 insertions(+), 2 deletions(-) - -diff --git a/lib/pacemaker/pcmk_output.c b/lib/pacemaker/pcmk_output.c -index 56963a93f..9a522a3e5 100644 ---- a/lib/pacemaker/pcmk_output.c -+++ b/lib/pacemaker/pcmk_output.c -@@ -918,6 +918,16 @@ rsc_action_default(pcmk__output_t *out, va_list args) - if (possible_matches) { - stop = possible_matches->data; - g_list_free(possible_matches); -+ } else if (pcmk_is_set(rsc->flags, pe_rsc_stop_unexpected)) { -+ /* The resource is multiply active with multiple-active set to -+ * stop_unexpected, and not stopping on its current node, but it should -+ * be stopping elsewhere. -+ */ -+ possible_matches = pe__resource_actions(rsc, NULL, RSC_STOP, FALSE); -+ if (possible_matches != NULL) { -+ stop = possible_matches->data; -+ g_list_free(possible_matches); -+ } - } - - possible_matches = pe__resource_actions(rsc, next, RSC_PROMOTE, FALSE); -diff --git a/lib/pacemaker/pcmk_sched_native.c b/lib/pacemaker/pcmk_sched_native.c -index 8002938b5..c0224849f 100644 ---- a/lib/pacemaker/pcmk_sched_native.c -+++ b/lib/pacemaker/pcmk_sched_native.c -@@ -1259,7 +1259,10 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) - enum rsc_role_e role = RSC_ROLE_UNKNOWN; - enum rsc_role_e next_role = RSC_ROLE_UNKNOWN; - -- CRM_ASSERT(rsc); -+ native_variant_data_t *native_data = NULL; -+ -+ get_native_variant_data(native_data, rsc); -+ - chosen = rsc->allocated_to; - next_role = rsc->next_role; - if (next_role == RSC_ROLE_UNKNOWN) { -@@ -1323,6 +1326,7 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) - "(will stop on both nodes)", - rsc->id, rsc->partial_migration_source->details->uname, - rsc->partial_migration_target->details->uname); -+ multiply_active = false; - - } else { - const char *class = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); -@@ -1345,6 +1349,11 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) - allow_migrate = FALSE; - } - -+ if (!multiply_active) { -+ native_data->expected_node = NULL; -+ pe__clear_resource_flags(rsc, pe_rsc_stop_unexpected); -+ } -+ - if (pcmk_is_set(rsc->flags, pe_rsc_start_pending)) { - pe_rsc_trace(rsc, "Creating start action for %s to represent already pending start", - rsc->id); -@@ -1995,6 +2004,32 @@ native_expand(pe_resource_t * rsc, pe_working_set_t * data_set) - out->message(out, "rsc-action", rsc, current, next); - } - -+/*! -+ * \internal -+ * \brief Check whether a node is a multiply active resource's expected node -+ * -+ * \param[in] rsc Resource to check -+ * \param[in] node Node to check -+ * -+ * \return true if \p rsc is multiply active with multiple-active set to -+ * stop_unexpected, and \p node is the node where it will remain active -+ * \note This assumes that the resource's next role cannot be changed to stopped -+ * after this is called, which should be reasonable if status has already -+ * been unpacked and resources have been assigned to nodes. -+ */ -+static bool -+is_expected_node(const pe_resource_t *rsc, const pe_node_t *node) -+{ -+ native_variant_data_t *native_data = NULL; -+ -+ get_native_variant_data(native_data, rsc); -+ return pcmk_all_flags_set(rsc->flags, -+ pe_rsc_stop_unexpected|pe_rsc_restarting) -+ && (rsc->next_role > RSC_ROLE_STOPPED) -+ && (native_data->expected_node != NULL) && (node != NULL) -+ && (native_data->expected_node->details == node->details); -+} -+ - gboolean - StopRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_set_t * data_set) - { -@@ -2006,6 +2041,18 @@ StopRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_set - pe_node_t *current = (pe_node_t *) gIter->data; - pe_action_t *stop; - -+ if (is_expected_node(rsc, current)) { -+ /* We are scheduling restart actions for a multiply active resource -+ * with multiple-active=stop_unexpected, and this is where it should -+ * not be stopped. -+ */ -+ pe_rsc_trace(rsc, -+ "Skipping stop of multiply active resource %s " -+ "on expected node %s", -+ rsc->id, current->details->uname); -+ continue; -+ } -+ - if (rsc->partial_migration_target) { - if (rsc->partial_migration_target->details == current->details) { - pe_rsc_trace(rsc, -@@ -2029,6 +2076,17 @@ StopRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_set - - if(rsc->allocated_to == NULL) { - pe_action_set_reason(stop, "node availability", TRUE); -+ } else if (pcmk_is_set(rsc->flags, pe_rsc_restarting)) { -+ native_variant_data_t *native_data = NULL; -+ -+ get_native_variant_data(native_data, rsc); -+ if (native_data->expected_node != NULL) { -+ /* We are stopping a multiply active resource on a node that is -+ * not its expected node, and we are still scheduling restart -+ * actions, so the stop is for being multiply active. -+ */ -+ pe_action_set_reason(stop, "being multiply active", TRUE); -+ } - } - - if (!pcmk_is_set(rsc->flags, pe_rsc_managed)) { -@@ -2071,6 +2129,16 @@ StartRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_se - pe__clear_action_flags(start, pe_action_optional); - } - -+ if (is_expected_node(rsc, next)) { -+ /* This could be a problem if the start becomes necessary for other -+ * reasons later. -+ */ -+ pe_rsc_trace(rsc, -+ "Start of multiply active resouce %s " -+ "on expected node %s will be a pseudo-action", -+ rsc->id, next->details->uname); -+ pe__set_action_flags(start, pe_action_pseudo); -+ } - - return TRUE; - } -@@ -2084,6 +2152,7 @@ PromoteRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_ - - CRM_ASSERT(rsc); - CRM_CHECK(next != NULL, return FALSE); -+ - pe_rsc_trace(rsc, "%s on %s", rsc->id, next->details->uname); - - action_list = pe__resource_actions(rsc, next, RSC_START, TRUE); -@@ -2098,7 +2167,19 @@ PromoteRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_ - g_list_free(action_list); - - if (runnable) { -- promote_action(rsc, next, optional); -+ pe_action_t *promote = promote_action(rsc, next, optional); -+ -+ if (is_expected_node(rsc, next)) { -+ /* This could be a problem if the promote becomes necessary for -+ * other reasons later. -+ */ -+ pe_rsc_trace(rsc, -+ "Promotion of multiply active resouce %s " -+ "on expected node %s will be a pseudo-action", -+ rsc->id, next->details->uname); -+ pe__set_action_flags(promote, pe_action_pseudo); -+ } -+ - return TRUE; - } - -@@ -2122,6 +2203,15 @@ DemoteRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_s - GList *gIter = NULL; - - CRM_ASSERT(rsc); -+ -+ if (is_expected_node(rsc, next)) { -+ pe_rsc_trace(rsc, -+ "Skipping demote of multiply active resource %s " -+ "on expected node %s", -+ rsc->id, next->details->uname); -+ return TRUE; -+ } -+ - pe_rsc_trace(rsc, "%s", rsc->id); - - /* CRM_CHECK(rsc->next_role == RSC_ROLE_UNPROMOTED, return FALSE); */ -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index d218f523f..edaa9de48 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3974,6 +3974,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - } - - done: -+ pe__update_expected_node(rsc, node, status, rc, target_rc); - pe_rsc_trace(rsc, "Resource %s after %s: role=%s, next=%s", - rsc->id, task, role2text(rsc->role), - role2text(rsc->next_role)); --- -2.27.0 - diff --git a/SOURCES/025-stop_unexpected-test.patch b/SOURCES/025-stop_unexpected-test.patch deleted file mode 100644 index 65b74dc..0000000 --- a/SOURCES/025-stop_unexpected-test.patch +++ /dev/null @@ -1,495 +0,0 @@ -From 8a0a16c8ed72c74d656664694ebe36b76ff22498 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Wed, 30 Mar 2022 17:14:33 -0500 -Subject: [PATCH] Test: cts-scheduler: add test for - multiple-active=stop_unexpected - ---- - cts/cts-scheduler.in | 1 + - cts/scheduler/dot/stop-unexpected.dot | 40 ++++ - cts/scheduler/exp/stop-unexpected.exp | 201 ++++++++++++++++++ - cts/scheduler/scores/stop-unexpected.scores | 17 ++ - cts/scheduler/summary/stop-unexpected.summary | 41 ++++ - cts/scheduler/xml/stop-unexpected.xml | 131 ++++++++++++ - 6 files changed, 431 insertions(+) - create mode 100644 cts/scheduler/dot/stop-unexpected.dot - create mode 100644 cts/scheduler/exp/stop-unexpected.exp - create mode 100644 cts/scheduler/scores/stop-unexpected.scores - create mode 100644 cts/scheduler/summary/stop-unexpected.summary - create mode 100644 cts/scheduler/xml/stop-unexpected.xml - -diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in -index 3a8aeaca5..8c04687da 100644 ---- a/cts/cts-scheduler.in -+++ b/cts/cts-scheduler.in -@@ -273,6 +273,7 @@ TESTS = [ - [ "rec-rsc-6", "Resource Recover - multiple - restart" ], - [ "rec-rsc-7", "Resource Recover - multiple - stop" ], - [ "rec-rsc-8", "Resource Recover - multiple - block" ], -+ [ "stop-unexpected", "Resource Recover - multiple - stop unexpected" ], - [ "rec-rsc-9", "Resource Recover - group/group" ], - [ "monitor-recovery", "on-fail=block + resource recovery detected by recurring monitor" ], - [ "stop-failure-no-quorum", "Stop failure without quorum" ], -diff --git a/cts/scheduler/dot/stop-unexpected.dot b/cts/scheduler/dot/stop-unexpected.dot -new file mode 100644 -index 000000000..0f67eec54 ---- /dev/null -+++ b/cts/scheduler/dot/stop-unexpected.dot -@@ -0,0 +1,40 @@ -+ digraph "g" { -+"dgroup_running_0" [ style=bold color="green" fontcolor="orange"] -+"dgroup_start_0" -> "dgroup_running_0" [ style = bold] -+"dgroup_start_0" -> "dummy2_start_0 node2" [ style = bold] -+"dgroup_start_0" -> "dummy3_start_0 node2" [ style = bold] -+"dgroup_start_0" -> "dummy_start_0 node2" [ style = bold] -+"dgroup_start_0" [ style=bold color="green" fontcolor="orange"] -+"dgroup_stop_0" -> "dgroup_stopped_0" [ style = bold] -+"dgroup_stop_0" -> "dummy2_stop_0 node2" [ style = bold] -+"dgroup_stop_0" -> "dummy3_stop_0 node2" [ style = bold] -+"dgroup_stop_0" -> "dummy_stop_0 node3" [ style = bold] -+"dgroup_stop_0" [ style=bold color="green" fontcolor="orange"] -+"dgroup_stopped_0" -> "dgroup_start_0" [ style = bold] -+"dgroup_stopped_0" [ style=bold color="green" fontcolor="orange"] -+"dummy2_monitor_10000 node2" [ style=bold color="green" fontcolor="black"] -+"dummy2_start_0 node2" -> "dgroup_running_0" [ style = bold] -+"dummy2_start_0 node2" -> "dummy2_monitor_10000 node2" [ style = bold] -+"dummy2_start_0 node2" -> "dummy3_start_0 node2" [ style = bold] -+"dummy2_start_0 node2" [ style=bold color="green" fontcolor="black"] -+"dummy2_stop_0 node2" -> "dgroup_stopped_0" [ style = bold] -+"dummy2_stop_0 node2" -> "dummy2_start_0 node2" [ style = bold] -+"dummy2_stop_0 node2" -> "dummy_stop_0 node3" [ style = bold] -+"dummy2_stop_0 node2" [ style=bold color="green" fontcolor="black"] -+"dummy3_monitor_10000 node2" [ style=bold color="green" fontcolor="black"] -+"dummy3_start_0 node2" -> "dgroup_running_0" [ style = bold] -+"dummy3_start_0 node2" -> "dummy3_monitor_10000 node2" [ style = bold] -+"dummy3_start_0 node2" [ style=bold color="green" fontcolor="black"] -+"dummy3_stop_0 node2" -> "dgroup_stopped_0" [ style = bold] -+"dummy3_stop_0 node2" -> "dummy2_stop_0 node2" [ style = bold] -+"dummy3_stop_0 node2" -> "dummy3_start_0 node2" [ style = bold] -+"dummy3_stop_0 node2" [ style=bold color="green" fontcolor="black"] -+"dummy_monitor_10000 node2" [ style=bold color="green" fontcolor="black"] -+"dummy_start_0 node2" -> "dgroup_running_0" [ style = bold] -+"dummy_start_0 node2" -> "dummy2_start_0 node2" [ style = bold] -+"dummy_start_0 node2" -> "dummy_monitor_10000 node2" [ style = bold] -+"dummy_start_0 node2" [ style=bold color="green" fontcolor="orange"] -+"dummy_stop_0 node3" -> "dgroup_stopped_0" [ style = bold] -+"dummy_stop_0 node3" -> "dummy_start_0 node2" [ style = bold] -+"dummy_stop_0 node3" [ style=bold color="green" fontcolor="black"] -+} -diff --git a/cts/scheduler/exp/stop-unexpected.exp b/cts/scheduler/exp/stop-unexpected.exp -new file mode 100644 -index 000000000..1f94532f7 ---- /dev/null -+++ b/cts/scheduler/exp/stop-unexpected.exp -@@ -0,0 +1,201 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -diff --git a/cts/scheduler/scores/stop-unexpected.scores b/cts/scheduler/scores/stop-unexpected.scores -new file mode 100644 -index 000000000..68f98e837 ---- /dev/null -+++ b/cts/scheduler/scores/stop-unexpected.scores -@@ -0,0 +1,17 @@ -+ -+pcmk__group_allocate: dgroup allocation score on node2: 0 -+pcmk__group_allocate: dgroup allocation score on node3: 0 -+pcmk__group_allocate: dummy allocation score on node2: 0 -+pcmk__group_allocate: dummy allocation score on node3: 0 -+pcmk__group_allocate: dummy2 allocation score on node2: 100 -+pcmk__group_allocate: dummy2 allocation score on node3: 0 -+pcmk__group_allocate: dummy3 allocation score on node2: 100 -+pcmk__group_allocate: dummy3 allocation score on node3: 0 -+pcmk__native_allocate: dummy allocation score on node2: 200 -+pcmk__native_allocate: dummy allocation score on node3: 0 -+pcmk__native_allocate: dummy2 allocation score on node2: 200 -+pcmk__native_allocate: dummy2 allocation score on node3: -INFINITY -+pcmk__native_allocate: dummy3 allocation score on node2: 100 -+pcmk__native_allocate: dummy3 allocation score on node3: -INFINITY -+pcmk__native_allocate: st-sbd allocation score on node2: 100 -+pcmk__native_allocate: st-sbd allocation score on node3: 0 -diff --git a/cts/scheduler/summary/stop-unexpected.summary b/cts/scheduler/summary/stop-unexpected.summary -new file mode 100644 -index 000000000..7c7fc68b6 ---- /dev/null -+++ b/cts/scheduler/summary/stop-unexpected.summary -@@ -0,0 +1,41 @@ -+Current cluster status: -+ * Node List: -+ * Online: [ node2 node3 ] -+ -+ * Full List of Resources: -+ * st-sbd (stonith:external/sbd): Started node2 -+ * Resource Group: dgroup: -+ * dummy (ocf:heartbeat:DummyTimeout): FAILED [ node2 node3 ] -+ * dummy2 (ocf:heartbeat:Dummy): Started node2 -+ * dummy3 (ocf:heartbeat:Dummy): Started node2 -+ -+Transition Summary: -+ * Recover dummy ( node2 ) due to being multiply active -+ * Restart dummy2 ( node2 ) due to required dummy start -+ * Restart dummy3 ( node2 ) due to required dummy2 start -+ -+Executing Cluster Transition: -+ * Pseudo action: dgroup_stop_0 -+ * Resource action: dummy3 stop on node2 -+ * Resource action: dummy2 stop on node2 -+ * Resource action: dummy stop on node3 -+ * Pseudo action: dgroup_stopped_0 -+ * Pseudo action: dgroup_start_0 -+ * Pseudo action: dummy_start_0 -+ * Resource action: dummy monitor=10000 on node2 -+ * Resource action: dummy2 start on node2 -+ * Resource action: dummy2 monitor=10000 on node2 -+ * Resource action: dummy3 start on node2 -+ * Resource action: dummy3 monitor=10000 on node2 -+ * Pseudo action: dgroup_running_0 -+ -+Revised Cluster Status: -+ * Node List: -+ * Online: [ node2 node3 ] -+ -+ * Full List of Resources: -+ * st-sbd (stonith:external/sbd): Started node2 -+ * Resource Group: dgroup: -+ * dummy (ocf:heartbeat:DummyTimeout): Started node2 -+ * dummy2 (ocf:heartbeat:Dummy): Started node2 -+ * dummy3 (ocf:heartbeat:Dummy): Started node2 -diff --git a/cts/scheduler/xml/stop-unexpected.xml b/cts/scheduler/xml/stop-unexpected.xml -new file mode 100644 -index 000000000..6e61aeba3 ---- /dev/null -+++ b/cts/scheduler/xml/stop-unexpected.xml -@@ -0,0 +1,131 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -2.27.0 - diff --git a/SOURCES/026-stop_unexpected-fix.patch b/SOURCES/026-stop_unexpected-fix.patch deleted file mode 100644 index 69dd95d..0000000 --- a/SOURCES/026-stop_unexpected-fix.patch +++ /dev/null @@ -1,589 +0,0 @@ -From 4a5dcc5210160f7d167bc68142635c1b5a6d4af2 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 22 Apr 2022 10:47:29 -0500 -Subject: [PATCH 1/3] Fix: scheduler: make multiple-active="stop_unexpected" - actually work - -The previous implementation covered the scenario in the regression test and not -much else. It unnecessarily added an expected_node member to the native variant -data, when the resource's allocated_to is sufficient to know the expected node. ---- - lib/pacemaker/pcmk_sched_native.c | 45 +++++++++++++++---------------- - lib/pengine/unpack.c | 1 - - 2 files changed, 22 insertions(+), 24 deletions(-) - -diff --git a/lib/pacemaker/pcmk_sched_native.c b/lib/pacemaker/pcmk_sched_native.c -index c0224849f..a1a51721e 100644 ---- a/lib/pacemaker/pcmk_sched_native.c -+++ b/lib/pacemaker/pcmk_sched_native.c -@@ -1250,7 +1250,7 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) - gboolean need_stop = FALSE; - bool need_promote = FALSE; - gboolean is_moving = FALSE; -- gboolean allow_migrate = pcmk_is_set(rsc->flags, pe_rsc_allow_migrate)? TRUE : FALSE; -+ gboolean allow_migrate = FALSE; - - GList *gIter = NULL; - unsigned int num_all_active = 0; -@@ -1259,9 +1259,8 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) - enum rsc_role_e role = RSC_ROLE_UNKNOWN; - enum rsc_role_e next_role = RSC_ROLE_UNKNOWN; - -- native_variant_data_t *native_data = NULL; -- -- get_native_variant_data(native_data, rsc); -+ CRM_ASSERT(rsc != NULL); -+ allow_migrate = pcmk_is_set(rsc->flags, pe_rsc_allow_migrate)? TRUE : FALSE; - - chosen = rsc->allocated_to; - next_role = rsc->next_role; -@@ -1338,8 +1337,16 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) - crm_notice("See https://wiki.clusterlabs.org/wiki/FAQ#Resource_is_Too_Active for more information"); - } - -- if (rsc->recovery_type == recovery_stop_start) { -- need_stop = TRUE; -+ switch (rsc->recovery_type) { -+ case recovery_stop_start: -+ need_stop = TRUE; -+ break; -+ case recovery_stop_unexpected: -+ need_stop = TRUE; // StopRsc() will skip expected node -+ pe__set_resource_flags(rsc, pe_rsc_stop_unexpected); -+ break; -+ default: -+ break; - } - - /* If by chance a partial migration is in process, but the migration -@@ -1350,7 +1357,6 @@ native_create_actions(pe_resource_t * rsc, pe_working_set_t * data_set) - } - - if (!multiply_active) { -- native_data->expected_node = NULL; - pe__clear_resource_flags(rsc, pe_rsc_stop_unexpected); - } - -@@ -2020,14 +2026,11 @@ native_expand(pe_resource_t * rsc, pe_working_set_t * data_set) - static bool - is_expected_node(const pe_resource_t *rsc, const pe_node_t *node) - { -- native_variant_data_t *native_data = NULL; -- -- get_native_variant_data(native_data, rsc); - return pcmk_all_flags_set(rsc->flags, - pe_rsc_stop_unexpected|pe_rsc_restarting) - && (rsc->next_role > RSC_ROLE_STOPPED) -- && (native_data->expected_node != NULL) && (node != NULL) -- && (native_data->expected_node->details == node->details); -+ && (rsc->allocated_to != NULL) && (node != NULL) -+ && (rsc->allocated_to->details == node->details); - } - - gboolean -@@ -2076,17 +2079,13 @@ StopRsc(pe_resource_t * rsc, pe_node_t * next, gboolean optional, pe_working_set - - if(rsc->allocated_to == NULL) { - pe_action_set_reason(stop, "node availability", TRUE); -- } else if (pcmk_is_set(rsc->flags, pe_rsc_restarting)) { -- native_variant_data_t *native_data = NULL; -- -- get_native_variant_data(native_data, rsc); -- if (native_data->expected_node != NULL) { -- /* We are stopping a multiply active resource on a node that is -- * not its expected node, and we are still scheduling restart -- * actions, so the stop is for being multiply active. -- */ -- pe_action_set_reason(stop, "being multiply active", TRUE); -- } -+ } else if (pcmk_all_flags_set(rsc->flags, pe_rsc_restarting -+ |pe_rsc_stop_unexpected)) { -+ /* We are stopping a multiply active resource on a node that is -+ * not its expected node, and we are still scheduling restart -+ * actions, so the stop is for being multiply active. -+ */ -+ pe_action_set_reason(stop, "being multiply active", TRUE); - } - - if (!pcmk_is_set(rsc->flags, pe_rsc_managed)) { -diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c -index 17dea0d7a..426022013 100644 ---- a/lib/pengine/unpack.c -+++ b/lib/pengine/unpack.c -@@ -3945,7 +3945,6 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op, - } - - done: -- pe__update_expected_node(rsc, node, status, rc, target_rc); - pe_rsc_trace(rsc, "Resource %s after %s: role=%s, next=%s", - rsc->id, task, role2text(rsc->role), - role2text(rsc->next_role)); --- -2.27.0 - - -From 703d3a09bce389afb4e095e1ac7af29eb5edd189 Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 22 Apr 2022 14:02:34 -0500 -Subject: [PATCH 2/3] Test: scheduler: add a second regression test for - multiple-active=stop_unexpected - ---- - cts/cts-scheduler.in | 3 +- - cts/scheduler/dot/stop-unexpected-2.dot | 7 + - cts/scheduler/exp/stop-unexpected-2.exp | 36 ++++ - cts/scheduler/scores/stop-unexpected-2.scores | 21 ++ - .../summary/stop-unexpected-2.summary | 29 +++ - cts/scheduler/xml/stop-unexpected-2.xml | 204 ++++++++++++++++++ - 6 files changed, 299 insertions(+), 1 deletion(-) - create mode 100644 cts/scheduler/dot/stop-unexpected-2.dot - create mode 100644 cts/scheduler/exp/stop-unexpected-2.exp - create mode 100644 cts/scheduler/scores/stop-unexpected-2.scores - create mode 100644 cts/scheduler/summary/stop-unexpected-2.summary - create mode 100644 cts/scheduler/xml/stop-unexpected-2.xml - -diff --git a/cts/cts-scheduler.in b/cts/cts-scheduler.in -index 8c04687da..7fc76cce4 100644 ---- a/cts/cts-scheduler.in -+++ b/cts/cts-scheduler.in -@@ -273,8 +273,9 @@ TESTS = [ - [ "rec-rsc-6", "Resource Recover - multiple - restart" ], - [ "rec-rsc-7", "Resource Recover - multiple - stop" ], - [ "rec-rsc-8", "Resource Recover - multiple - block" ], -- [ "stop-unexpected", "Resource Recover - multiple - stop unexpected" ], - [ "rec-rsc-9", "Resource Recover - group/group" ], -+ [ "stop-unexpected", "Recover multiply active group with stop_unexpected" ], -+ [ "stop-unexpected-2", "Resource multiply active primitve with stop_unexpected" ], - [ "monitor-recovery", "on-fail=block + resource recovery detected by recurring monitor" ], - [ "stop-failure-no-quorum", "Stop failure without quorum" ], - [ "stop-failure-no-fencing", "Stop failure without fencing available" ], -diff --git a/cts/scheduler/dot/stop-unexpected-2.dot b/cts/scheduler/dot/stop-unexpected-2.dot -new file mode 100644 -index 000000000..cdaebf551 ---- /dev/null -+++ b/cts/scheduler/dot/stop-unexpected-2.dot -@@ -0,0 +1,7 @@ -+ digraph "g" { -+"test_monitor_10000 rhel8-4" [ style=bold color="green" fontcolor="black"] -+"test_start_0 rhel8-4" -> "test_monitor_10000 rhel8-4" [ style = bold] -+"test_start_0 rhel8-4" [ style=bold color="green" fontcolor="orange"] -+"test_stop_0 rhel8-3" -> "test_start_0 rhel8-4" [ style = bold] -+"test_stop_0 rhel8-3" [ style=bold color="green" fontcolor="black"] -+} -diff --git a/cts/scheduler/exp/stop-unexpected-2.exp b/cts/scheduler/exp/stop-unexpected-2.exp -new file mode 100644 -index 000000000..258053c08 ---- /dev/null -+++ b/cts/scheduler/exp/stop-unexpected-2.exp -@@ -0,0 +1,36 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -diff --git a/cts/scheduler/scores/stop-unexpected-2.scores b/cts/scheduler/scores/stop-unexpected-2.scores -new file mode 100644 -index 000000000..0eb549f5e ---- /dev/null -+++ b/cts/scheduler/scores/stop-unexpected-2.scores -@@ -0,0 +1,21 @@ -+ -+pcmk__native_allocate: Fencing allocation score on rhel8-1: 0 -+pcmk__native_allocate: Fencing allocation score on rhel8-2: 0 -+pcmk__native_allocate: Fencing allocation score on rhel8-3: 0 -+pcmk__native_allocate: Fencing allocation score on rhel8-4: 0 -+pcmk__native_allocate: Fencing allocation score on rhel8-5: 0 -+pcmk__native_allocate: FencingFail allocation score on rhel8-1: 0 -+pcmk__native_allocate: FencingFail allocation score on rhel8-2: 0 -+pcmk__native_allocate: FencingFail allocation score on rhel8-3: 0 -+pcmk__native_allocate: FencingFail allocation score on rhel8-4: 0 -+pcmk__native_allocate: FencingFail allocation score on rhel8-5: 0 -+pcmk__native_allocate: FencingPass allocation score on rhel8-1: 0 -+pcmk__native_allocate: FencingPass allocation score on rhel8-2: 0 -+pcmk__native_allocate: FencingPass allocation score on rhel8-3: 0 -+pcmk__native_allocate: FencingPass allocation score on rhel8-4: 0 -+pcmk__native_allocate: FencingPass allocation score on rhel8-5: 0 -+pcmk__native_allocate: test allocation score on rhel8-1: 0 -+pcmk__native_allocate: test allocation score on rhel8-2: 0 -+pcmk__native_allocate: test allocation score on rhel8-3: 0 -+pcmk__native_allocate: test allocation score on rhel8-4: 0 -+pcmk__native_allocate: test allocation score on rhel8-5: 0 -diff --git a/cts/scheduler/summary/stop-unexpected-2.summary b/cts/scheduler/summary/stop-unexpected-2.summary -new file mode 100644 -index 000000000..d6b0c15dc ---- /dev/null -+++ b/cts/scheduler/summary/stop-unexpected-2.summary -@@ -0,0 +1,29 @@ -+Using the original execution date of: 2022-04-22 14:15:37Z -+Current cluster status: -+ * Node List: -+ * Online: [ rhel8-1 rhel8-2 rhel8-3 rhel8-4 rhel8-5 ] -+ -+ * Full List of Resources: -+ * Fencing (stonith:fence_xvm): Started rhel8-1 -+ * FencingPass (stonith:fence_dummy): Started rhel8-2 -+ * FencingFail (stonith:fence_dummy): Started rhel8-3 -+ * test (ocf:pacemaker:Dummy): Started [ rhel8-4 rhel8-3 ] -+ -+Transition Summary: -+ * Restart test ( rhel8-4 ) -+ -+Executing Cluster Transition: -+ * Resource action: test stop on rhel8-3 -+ * Pseudo action: test_start_0 -+ * Resource action: test monitor=10000 on rhel8-4 -+Using the original execution date of: 2022-04-22 14:15:37Z -+ -+Revised Cluster Status: -+ * Node List: -+ * Online: [ rhel8-1 rhel8-2 rhel8-3 rhel8-4 rhel8-5 ] -+ -+ * Full List of Resources: -+ * Fencing (stonith:fence_xvm): Started rhel8-1 -+ * FencingPass (stonith:fence_dummy): Started rhel8-2 -+ * FencingFail (stonith:fence_dummy): Started rhel8-3 -+ * test (ocf:pacemaker:Dummy): Started rhel8-4 -diff --git a/cts/scheduler/xml/stop-unexpected-2.xml b/cts/scheduler/xml/stop-unexpected-2.xml -new file mode 100644 -index 000000000..e103629e9 ---- /dev/null -+++ b/cts/scheduler/xml/stop-unexpected-2.xml -@@ -0,0 +1,204 @@ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ -+ --- -2.27.0 - - -From 60d8bb01ba73dfd1cb25c6764ee2b923dcfc4e8c Mon Sep 17 00:00:00 2001 -From: Ken Gaillot -Date: Fri, 22 Apr 2022 14:09:43 -0500 -Subject: [PATCH 3/3] Revert "Refactor: scheduler: add expected node to - primitive variant data" - -This reverts commit 871e2201d92520039df45062afc9120fd1fb0f30. ---- - include/crm/pengine/internal.h | 4 ---- - lib/pengine/native.c | 38 ---------------------------------- - lib/pengine/variant.h | 8 ++----- - 3 files changed, 2 insertions(+), 48 deletions(-) - -diff --git a/include/crm/pengine/internal.h b/include/crm/pengine/internal.h -index a2e4b5bf7..fe9a23b7e 100644 ---- a/include/crm/pengine/internal.h -+++ b/include/crm/pengine/internal.h -@@ -580,8 +580,4 @@ xmlNode *pe__failed_probe_for_rsc(pe_resource_t *rsc, const char *name); - - const char *pe__clone_child_id(pe_resource_t *rsc); - --void pe__update_expected_node(pe_resource_t *rsc, pe_node_t *node, -- int execution_status, int exit_status, -- int expected_exit_status); -- - #endif -diff --git a/lib/pengine/native.c b/lib/pengine/native.c -index 591d1c6f5..fa7dc8960 100644 ---- a/lib/pengine/native.c -+++ b/lib/pengine/native.c -@@ -1376,41 +1376,3 @@ pe__native_is_filtered(pe_resource_t *rsc, GList *only_rsc, gboolean check_paren - - return TRUE; - } -- --/*! -- * \internal -- * \brief Set a resource's expected node if appropriate for a history result -- * -- * \param[in] rsc Resource to set expected node for -- * \param[in] node Node to set as expected node -- * \param[in] execution_status History entry's execution status -- * \param[in] exit_status History entry's actual exit status -- * \param[in] expected_status History entry's expected exit status -- */ --void --pe__update_expected_node(pe_resource_t *rsc, pe_node_t *node, -- int execution_status, int exit_status, -- int expected_exit_status) --{ -- native_variant_data_t *native_data = NULL; -- -- get_native_variant_data(native_data, rsc); -- -- if ((rsc->recovery_type == recovery_stop_unexpected) -- && (rsc->role > RSC_ROLE_STOPPED) -- && (execution_status == PCMK_EXEC_DONE) -- && (exit_status == expected_exit_status)) { -- // Resource is active and was expected on this node -- pe_rsc_trace(rsc, "Found expected node %s for %s", -- node->details->uname, rsc->id); -- native_data->expected_node = node; -- pe__set_resource_flags(rsc, pe_rsc_stop_unexpected); -- -- } else if ((native_data->expected_node != NULL) -- && (native_data->expected_node->details == node->details)) { -- // Resource is not cleanly active here -- pe_rsc_trace(rsc, "Clearing expected node for %s", rsc->id); -- native_data->expected_node = NULL; -- pe__clear_resource_flags(rsc, pe_rsc_stop_unexpected); -- } --} -diff --git a/lib/pengine/variant.h b/lib/pengine/variant.h -index d8fefa9d6..cabfbe81f 100644 ---- a/lib/pengine/variant.h -+++ b/lib/pengine/variant.h -@@ -1,5 +1,5 @@ - /* -- * Copyright 2004-2022 the Pacemaker project contributors -+ * Copyright 2004-2021 the Pacemaker project contributors - * - * The version control history for this file may have further details. - * -@@ -139,11 +139,7 @@ typedef struct group_variant_data_s { - # elif VARIANT_NATIVE - - typedef struct native_variant_data_s { -- /* If the resource is multiply active, and has multiple-active set to -- * stop_unexpected, this will be set to the node where the resource was -- * found active by an operation with a expected result. -- */ -- pe_node_t *expected_node; -+ int dummy; - } native_variant_data_t; - - # define get_native_variant_data(data, rsc) \ --- -2.27.0 - diff --git a/SPECS/pacemaker.spec b/SPECS/pacemaker.spec index 5033db5..1d523e4 100644 --- a/SPECS/pacemaker.spec +++ b/SPECS/pacemaker.spec @@ -35,11 +35,11 @@ ## Upstream pacemaker version, and its package version (specversion ## can be incremented to build packages reliably considered "newer" ## than previously built packages with the same pcmkversion) -%global pcmkversion 2.1.2 -%global specversion 4 +%global pcmkversion 2.1.3 +%global specversion 1 ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build -%global commit ada5c3b36e2adf1703d54d39f40a4b8628eca175 +%global commit dff7c3a7265f02e37804d6302dd1bf1f4e4c1f17 ## Since git v2.11, the extent of abbreviation is autoscaled by default ## (used to be constant of 7), so we need to convey it for non-tags, too. @@ -63,14 +63,13 @@ ## Add option for whether to support storing sensitive information outside CIB %bcond_without cibsecrets +## Add option to enable Native Language Support (experimental) +%bcond_with nls + ## Add option to create binaries suitable for use with profiling tools %bcond_with profiling -## Add option to create binaries with coverage analysis -%bcond_with coverage - -## Add option to skip (or enable, on RHEL) generating documentation -## (the build tools aren't available everywhere) +## Allow deprecated option to skip (or enable, on RHEL) documentation %if 0%{?rhel} %bcond_with doc %else @@ -159,6 +158,7 @@ %if 0%{?suse_version} > 0 %global pkgname_bzip2_devel libbz2-devel %global pkgname_docbook_xsl docbook-xsl-stylesheets +%global pkgname_gettext gettext-tools %global pkgname_gnutls_devel libgnutls-devel %global pkgname_shadow_utils shadow %global pkgname_procps procps @@ -170,6 +170,7 @@ %global pkgname_libtool_devel_arch libtool-ltdl-devel%{?_isa} %global pkgname_bzip2_devel bzip2-devel %global pkgname_docbook_xsl docbook-style-xsl +%global pkgname_gettext gettext-devel %global pkgname_gnutls_devel gnutls-devel %global pkgname_shadow_utils shadow-utils %global pkgname_procps procps-ng @@ -242,7 +243,7 @@ Name: pacemaker Summary: Scalable High-Availability cluster resource manager Version: %{pcmkversion} -Release: %{pcmk_release}%{?dist}.2 +Release: %{pcmk_release}%{?dist} %if %{defined _unitdir} License: GPLv2+ and LGPLv2+ %else @@ -263,32 +264,7 @@ Source0: https://codeload.github.com/%{github_owner}/%{name}/tar.gz/%{arch Source1: nagios-agents-metadata-%{nagios_hash}.tar.gz # upstream commits -Patch1: 001-acl-group-schema.patch -Patch2: 002-fencing-reasons.patch -Patch3: 003-fencing-reasons.patch -Patch4: 004-systemd-metadata.patch -Patch5: 005-fencing-reasons.patch -Patch6: 006-stateful-metadata.patch -Patch7: 007-memory-leak.patch -Patch8: 008-fencing-history.patch -Patch9: 009-fencing-reasons.patch -Patch10: 010-probe-failures.patch -Patch11: 011-fencing-reasons.patch -Patch12: 012-notify-crash.patch -Patch13: 013-probe-failures.patch -Patch14: 014-pcmk_delay_base.patch -Patch15: 015-fencing-reasons.patch -Patch16: 016-fencing-crash.patch -Patch17: 017-fencing-reasons.patch -Patch18: 018-failure-messages.patch -Patch19: 019-corosync-tracking.patch -Patch20: 020-systemd-unit.patch -Patch21: 021-failure-messages.patch -Patch22: 022-memory-leak.patch -Patch23: 023-regression.patch -Patch24: 024-stop_unexpected.patch -Patch25: 025-stop_unexpected-test.patch -Patch26: 026-stop_unexpected-fix.patch +#Patch001: 001-xxxx.patch # downstream-only commits #Patch1xx: 1xx-xxxx.patch @@ -347,6 +323,7 @@ BuildRequires: %{pkgname_gnutls_devel} BuildRequires: help2man BuildRequires: ncurses-devel BuildRequires: pam-devel +BuildRequires: %{pkgname_gettext} >= 0.18 # Required for "make check" BuildRequires: libcmocka-devel @@ -395,7 +372,7 @@ when related resources fail and can be configured to periodically check resource health. Available rpmbuild rebuild options: - --with(out) : cibsecrets coverage doc hardening pre_release profiling stonithd + --with(out) : cibsecrets doc hardening nls pre_release profiling stonithd %package cli License: GPLv2+ and LGPLv2+ @@ -584,8 +561,8 @@ export LDFLAGS_HARDENED_LIB="%{?_hardening_ldflags}" %{!?with_hardening: --disable-hardening} \ %{?with_legacy_links: --enable-legacy-links} \ %{?with_profiling: --with-profiling} \ - %{?with_coverage: --with-coverage} \ %{?with_cibsecrets: --with-cibsecrets} \ + %{?with_nls: --enable-nls} \ %{?with_sbd_sync: --with-sbd-sync-default="true"} \ %{?gnutls_priorities: --with-gnutls-priorities="%{gnutls_priorities}"} \ %{?bug_url: --with-bug-url=%{bug_url}} \ @@ -644,10 +621,14 @@ done mkdir -p ${RPM_BUILD_ROOT}%{_localstatedir}/lib/rpm-state/%{name} %endif +%if %{with nls} +%find_lang %{name} +%endif + # Don't package libtool archives find %{buildroot} -name '*.la' -type f -print0 | xargs -0 rm -f -# Do not package these either +# Do not package these either on RHEL rm -f %{buildroot}/%{_sbindir}/fence_legacy rm -f %{buildroot}/%{_mandir}/man8/fence_legacy.* find %{buildroot} -name '*o2cb*' -type f -print0 | xargs -0 rm -f @@ -666,16 +647,6 @@ rm -f %{buildroot}/%{_sbindir}/ipmiservicelogd %endif %endif -%if %{with coverage} -GCOV_BASE=%{buildroot}/%{_var}/lib/pacemaker/gcov -mkdir -p $GCOV_BASE -find . -name '*.gcno' -type f | while read F ; do - D=`dirname $F` - mkdir -p ${GCOV_BASE}/$D - cp $F ${GCOV_BASE}/$D -done -%endif - %post %if %{defined _unitdir} %systemd_post pacemaker.service @@ -808,7 +779,6 @@ exit 0 %exclude %{_datadir}/pacemaker/nagios %{_libexecdir}/pacemaker/* -%{_sbindir}/crm_attribute %{_sbindir}/crm_master %{_sbindir}/fence_watchdog @@ -817,7 +787,6 @@ exit 0 %doc %{_mandir}/man7/pacemaker-fenced.* %doc %{_mandir}/man7/ocf_pacemaker_controld.* %doc %{_mandir}/man7/ocf_pacemaker_remote.* -%doc %{_mandir}/man8/crm_attribute.* %doc %{_mandir}/man8/crm_master.* %doc %{_mandir}/man8/fence_watchdog.* %doc %{_mandir}/man8/pacemakerd.* @@ -856,6 +825,7 @@ exit 0 %if %{with cibsecrets} %{_sbindir}/cibsecret %endif +%{_sbindir}/crm_attribute %{_sbindir}/crm_diff %{_sbindir}/crm_error %{_sbindir}/crm_failcount @@ -892,7 +862,6 @@ exit 0 %exclude %{_mandir}/man7/ocf_pacemaker_controld.* %exclude %{_mandir}/man7/ocf_pacemaker_remote.* %doc %{_mandir}/man8/* -%exclude %{_mandir}/man8/crm_attribute.* %exclude %{_mandir}/man8/crm_master.* %exclude %{_mandir}/man8/fence_watchdog.* %exclude %{_mandir}/man8/pacemakerd.* @@ -908,7 +877,7 @@ exit 0 %dir %attr (770, %{uname}, %{gname}) %{_var}/log/pacemaker %dir %attr (770, %{uname}, %{gname}) %{_var}/log/pacemaker/bundles -%files -n %{pkgname_pcmk_libs} +%files -n %{pkgname_pcmk_libs} %{?with_nls:-f %{name}.lang} %{_libdir}/libcib.so.* %{_libdir}/liblrmd.so.* %{_libdir}/libcrmservice.so.* @@ -964,9 +933,6 @@ exit 0 %files -n %{pkgname_pcmk_libs}-devel %{_includedir}/pacemaker %{_libdir}/*.so -%if %{with coverage} -%{_var}/lib/pacemaker/gcov -%endif %{_libdir}/pkgconfig/*.pc %license licenses/LGPLv2.1 %doc COPYING @@ -978,6 +944,7 @@ exit 0 %{_datadir}/pacemaker/*.rng %{_datadir}/pacemaker/*.xsl %{_datadir}/pacemaker/api +%{_datadir}/pacemaker/base %{_datadir}/pkgconfig/pacemaker-schemas.pc %files nagios-plugins-metadata @@ -986,13 +953,23 @@ exit 0 %license %{nagios_name}-%{nagios_hash}/COPYING %changelog -* Fri Apr 22 2022 Ken Gaillot - 2.1.2-4.2 -- Fix issue with "stop_unexpected" value for "multiple-active" meta-attribute -- Resolves: rhbz2062848 - -* Fri Apr 8 2022 Ken Gaillot - 2.1.2-4.1 -- Support "stop_unexpected" value for "multiple-active" meta-attribute -- Resolves: rhbz2062848 +* Wed May 18 2022 Ken Gaillot - 2.1.3-1 +- crm_resource --restart fails to restart clone instances except instance 0 +- Add new multiple-active option for "stop unexpected instances" +- Unable to show metadata for "service" agents with "@" and "." in the name +- Resource ocf:pacemaker:attribute does not comply with the OCF 1.1 standard +- Allow resource meta-attribute to exempt resource from node health restrictions +- Show node health states in crm_mon +- Rebase pacemaker on upstream 2.1.3-rc2 release +- crm_mon API result does not validate against schema if fence event has exit-reason +- Resolves: rhbz1930578 +- Resolves: rhbz2036815 +- Resolves: rhbz2045096 +- Resolves: rhbz2049722 +- Resolves: rhbz2059638 +- Resolves: rhbz2065812 +- Resolves: rhbz2072107 +- Resolves: rhbz2086230 * Wed Jan 26 2022 Ken Gaillot - 2.1.2-4 - Fix regression in down event detection that affects remote nodes