pacemaker/018-failure-messages.patch

797 lines
29 KiB
Diff
Raw Normal View History

From 08c3420f2c857e7b27cd960f355d787af534da7d Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 18 Jan 2022 16:04:49 -0600
Subject: [PATCH 01/12] Log: libcrmcommon: improve description for "not
connected" status
PCMK_EXEC_NOT_CONNECTED was originally added to represent "No executor
connection", but it can also now mean no fencer connection, so change it to
"Internal communication failure" which is probably less mysterious to end users
anyway (especially since it should be accompanied by a more descriptive exit
reason).
---
include/crm/common/results.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/crm/common/results.h b/include/crm/common/results.h
index 873faf5c43..3d322a7ce6 100644
--- a/include/crm/common/results.h
+++ b/include/crm/common/results.h
@@ -349,7 +349,7 @@ pcmk_exec_status_str(enum pcmk_exec_status status)
case PCMK_EXEC_ERROR_HARD: return "Hard error";
case PCMK_EXEC_ERROR_FATAL: return "Fatal error";
case PCMK_EXEC_NOT_INSTALLED: return "Not installed";
- case PCMK_EXEC_NOT_CONNECTED: return "No executor connection";
+ case PCMK_EXEC_NOT_CONNECTED: return "Internal communication failure";
case PCMK_EXEC_INVALID: return "Cannot execute now";
case PCMK_EXEC_NO_FENCE_DEVICE: return "No fence device";
case PCMK_EXEC_NO_SECRETS: return "CIB secrets unavailable";
--
2.27.0
From 7c345cf8cf0cb054f5634206880df035bfef7311 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 15:12:36 -0600
Subject: [PATCH 02/12] Refactor: libcrmcommon: drop unnecessary system error
redefinitions
portability.h defines some system error codes that might not be present on
non-Linux systems.
This was a bad idea, since there's no way to ensure the defined values don't
conflict with existing system codes. However, we use a number of them, so it's
probably best to keep them, at least until we can make a backward compatibility
break.
However, we don't use EUNATCH, ENOSR, or ENOSTR, so we can delete those.
---
include/portability.h | 12 ------------
lib/common/results.c | 9 ++++++---
2 files changed, 6 insertions(+), 15 deletions(-)
diff --git a/include/portability.h b/include/portability.h
index 9a60c583a7..ee065a376d 100644
--- a/include/portability.h
+++ b/include/portability.h
@@ -131,10 +131,6 @@ typedef union
# define EREMOTEIO 193
# endif
-# ifndef EUNATCH
-# define EUNATCH 194
-# endif
-
# ifndef ENOKEY
# define ENOKEY 195
# endif
@@ -147,14 +143,6 @@ typedef union
# define ETIME 197
# endif
-# ifndef ENOSR
-# define ENOSR 198
-# endif
-
-# ifndef ENOSTR
-# define ENOSTR 199
-# endif
-
# ifndef EKEYREJECTED
# define EKEYREJECTED 200
# endif
diff --git a/lib/common/results.c b/lib/common/results.c
index 6d120694cd..96cd4e5659 100644
--- a/lib/common/results.c
+++ b/lib/common/results.c
@@ -118,9 +118,6 @@ pcmk_strerror(int rc)
case EREMOTEIO:
return "Remote I/O error";
/* coverity[dead_error_condition] False positive on non-Linux */
- case EUNATCH:
- return "Protocol driver not attached";
- /* coverity[dead_error_condition] False positive on non-Linux */
case ENOKEY:
return "Required key not available";
}
@@ -342,8 +339,12 @@ pcmk_rc_name(int rc)
case ENOMSG: return "ENOMSG";
case ENOPROTOOPT: return "ENOPROTOOPT";
case ENOSPC: return "ENOSPC";
+#ifdef ENOSR
case ENOSR: return "ENOSR";
+#endif
+#ifdef ENOSTR
case ENOSTR: return "ENOSTR";
+#endif
case ENOSYS: return "ENOSYS";
case ENOTBLK: return "ENOTBLK";
case ENOTCONN: return "ENOTCONN";
@@ -376,7 +377,9 @@ pcmk_rc_name(int rc)
case ETIME: return "ETIME";
case ETIMEDOUT: return "ETIMEDOUT";
case ETXTBSY: return "ETXTBSY";
+#ifdef EUNATCH
case EUNATCH: return "EUNATCH";
+#endif
case EUSERS: return "EUSERS";
/* case EWOULDBLOCK: return "EWOULDBLOCK"; */
case EXDEV: return "EXDEV";
--
2.27.0
From eac8d1ca51eac3f437e18584f7e013d976ecee2c Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 15:33:12 -0600
Subject: [PATCH 03/12] Log: libcrmcommon: improve handling of portability.h
error codes
portability.h defines some system error codes that might not be present on
non-Linux systems.
Define a constant for each one (for example, PCMK__ECOMM for ECOMM) when
the system doesn't have the value, so we can detect that when relevant.
Also, make sure pcmk_rc_name() and pcmk_rc_str() handle all of these values.
---
include/portability.h | 8 ++++++++
lib/common/results.c | 32 ++++++++++++++++++++++++++++++--
2 files changed, 38 insertions(+), 2 deletions(-)
diff --git a/include/portability.h b/include/portability.h
index ee065a376d..5d5fbf21cb 100644
--- a/include/portability.h
+++ b/include/portability.h
@@ -116,34 +116,42 @@ typedef union
# include <errno.h>
# ifndef ENOTUNIQ
+# define PCMK__ENOTUNIQ
# define ENOTUNIQ 190
# endif
# ifndef ECOMM
+# define PCMK__ECOMM
# define ECOMM 191
# endif
# ifndef ELIBACC
+# define PCMK__ELIBACC
# define ELIBACC 192
# endif
# ifndef EREMOTEIO
+# define PCMK__EREMOTIO
# define EREMOTEIO 193
# endif
# ifndef ENOKEY
+# define PCMK__ENOKEY
# define ENOKEY 195
# endif
# ifndef ENODATA
+# define PCMK__ENODATA
# define ENODATA 196
# endif
# ifndef ETIME
+# define PCMK__ETIME
# define ETIME 197
# endif
# ifndef EKEYREJECTED
+# define PCMK__EKEYREJECTED
# define EKEYREJECTED 200
# endif
diff --git a/lib/common/results.c b/lib/common/results.c
index 96cd4e5659..bcf289d0d6 100644
--- a/lib/common/results.c
+++ b/lib/common/results.c
@@ -395,9 +395,9 @@ pcmk_rc_name(int rc)
#ifdef EISNAM // Not available on OS X, Illumos, Solaris
case EISNAM: return "EISNAM";
case EKEYEXPIRED: return "EKEYEXPIRED";
- case EKEYREJECTED: return "EKEYREJECTED";
case EKEYREVOKED: return "EKEYREVOKED";
#endif
+ case EKEYREJECTED: return "EKEYREJECTED";
case EL2HLT: return "EL2HLT";
case EL2NSYNC: return "EL2NSYNC";
case EL3HLT: return "EL3HLT";
@@ -443,7 +443,35 @@ pcmk_rc_str(int rc)
if (rc < 0) {
return "Unknown error";
}
- return strerror(rc);
+
+ // Handle values that could be defined by system or by portability.h
+ switch (rc) {
+#ifdef PCMK__ENOTUNIQ
+ case ENOTUNIQ: return "Name not unique on network";
+#endif
+#ifdef PCMK__ECOMM
+ case ECOMM: return "Communication error on send";
+#endif
+#ifdef PCMK__ELIBACC
+ case ELIBACC: return "Can not access a needed shared library";
+#endif
+#ifdef PCMK__EREMOTEIO
+ case EREMOTEIO: return "Remote I/O error";
+#endif
+#ifdef PCMK__ENOKEY
+ case ENOKEY: return "Required key not available";
+#endif
+#ifdef PCMK__ENODATA
+ case ENODATA: return "No data available";
+#endif
+#ifdef PCMK__ETIME
+ case ETIME: return "Timer expired";
+#endif
+#ifdef PCMK__EKEYREJECTED
+ case EKEYREJECTED: return "Key was rejected by service";
+#endif
+ default: return strerror(rc);
+ }
}
// This returns negative values for errors
--
2.27.0
From 32a38ac6374f85c43e7f4051f5e519822cc481e6 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 15:39:19 -0600
Subject: [PATCH 04/12] Log: libcrmcommon: redefine pcmk_strerror() in terms of
pcmk_rc_str()
... to reduce code duplication. This causes minor differences in the string for
a few values.
---
lib/common/results.c | 67 +-------------------------------------------
1 file changed, 1 insertion(+), 66 deletions(-)
diff --git a/lib/common/results.c b/lib/common/results.c
index bcf289d0d6..b2c6e8d553 100644
--- a/lib/common/results.c
+++ b/lib/common/results.c
@@ -57,72 +57,7 @@ pcmk_errorname(int rc)
const char *
pcmk_strerror(int rc)
{
- if (rc == 0) {
- return "OK";
- }
-
- rc = abs(rc);
-
- // Of course rc > 0 ... unless someone passed INT_MIN as rc
- if ((rc > 0) && (rc < PCMK_ERROR_OFFSET)) {
- return strerror(rc);
- }
-
- switch (rc) {
- case pcmk_err_generic:
- return "Generic Pacemaker error";
- case pcmk_err_no_quorum:
- return "Operation requires quorum";
- case pcmk_err_schema_validation:
- return "Update does not conform to the configured schema";
- case pcmk_err_transform_failed:
- return "Schema transform failed";
- case pcmk_err_old_data:
- return "Update was older than existing configuration";
- case pcmk_err_diff_failed:
- return "Application of an update diff failed";
- case pcmk_err_diff_resync:
- return "Application of an update diff failed, requesting a full refresh";
- case pcmk_err_cib_modified:
- return "The on-disk configuration was manually modified";
- case pcmk_err_cib_backup:
- return "Could not archive the previous configuration";
- case pcmk_err_cib_save:
- return "Could not save the new configuration to disk";
- case pcmk_err_cib_corrupt:
- return "Could not parse on-disk configuration";
- case pcmk_err_multiple:
- return "Resource active on multiple nodes";
- case pcmk_err_node_unknown:
- return "Node not found";
- case pcmk_err_already:
- return "Situation already as requested";
- case pcmk_err_bad_nvpair:
- return "Bad name/value pair given";
- case pcmk_err_schema_unchanged:
- return "Schema is already the latest available";
- case pcmk_err_unknown_format:
- return "Unknown output format";
-
- /* The following cases will only be hit on systems for which they are non-standard */
- /* coverity[dead_error_condition] False positive on non-Linux */
- case ENOTUNIQ:
- return "Name not unique on network";
- /* coverity[dead_error_condition] False positive on non-Linux */
- case ECOMM:
- return "Communication error on send";
- /* coverity[dead_error_condition] False positive on non-Linux */
- case ELIBACC:
- return "Can not access a needed shared library";
- /* coverity[dead_error_condition] False positive on non-Linux */
- case EREMOTEIO:
- return "Remote I/O error";
- /* coverity[dead_error_condition] False positive on non-Linux */
- case ENOKEY:
- return "Required key not available";
- }
- crm_err("Unknown error code: %d", rc);
- return "Unknown error";
+ return pcmk_rc_str(pcmk_legacy2rc(rc));
}
// Standard Pacemaker API return codes
--
2.27.0
From 7c331d7e2275ffebbfd5e2f6432a6137a66ee5db Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 15:41:24 -0600
Subject: [PATCH 05/12] Log: libcrmcommon: don't say "Unknown error"
... which is unhelpful and annoying to users
---
lib/common/results.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/common/results.c b/lib/common/results.c
index b2c6e8d553..5ffac76549 100644
--- a/lib/common/results.c
+++ b/lib/common/results.c
@@ -376,7 +376,7 @@ pcmk_rc_str(int rc)
return pcmk__rcs[pcmk_rc_error - rc].desc;
}
if (rc < 0) {
- return "Unknown error";
+ return "Error";
}
// Handle values that could be defined by system or by portability.h
@@ -768,7 +768,7 @@ bz2_strerror(int rc)
case BZ_OUTBUFF_FULL:
return "output data will not fit into the buffer provided";
}
- return "Unknown error";
+ return "Data compression error";
}
crm_exit_t
--
2.27.0
From 26883b4edda7d81bfcb79bd7b33bb3210beff110 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 16:01:39 -0600
Subject: [PATCH 06/12] Log: fencing: don't warn if cluster has no watchdog
device
---
lib/fencing/st_client.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index b1de912b2a..a0f3119f3b 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -187,7 +187,12 @@ stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node)
* we drop in here - so as not to make remote nodes
* panic on that answer
*/
- crm_warn("watchdog-fencing-query failed");
+ if (rc == -ENODEV) {
+ crm_notice("Cluster does not have watchdog fencing device");
+ } else {
+ crm_warn("Could not check for watchdog fencing device: %s",
+ pcmk_strerror(rc));
+ }
} else if (list[0] == '\0') {
rv = TRUE;
} else {
--
2.27.0
From 72b3c42232deaca64ffba9582598c59331203761 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 16:22:49 -0600
Subject: [PATCH 07/12] Test: libcrmcommon: update pcmk_rc_str() unit test for
recent change
---
lib/common/tests/results/pcmk__results_test.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/common/tests/results/pcmk__results_test.c b/lib/common/tests/results/pcmk__results_test.c
index 57a520c501..e08d4b6261 100644
--- a/lib/common/tests/results/pcmk__results_test.c
+++ b/lib/common/tests/results/pcmk__results_test.c
@@ -30,7 +30,7 @@ static void
test_for_pcmk_rc_str(void **state) {
assert_string_equal(pcmk_rc_str(pcmk_rc_error-1), "Unknown output format");
assert_string_equal(pcmk_rc_str(pcmk_rc_ok), "OK");
- assert_string_equal(pcmk_rc_str(-1), "Unknown error");
+ assert_string_equal(pcmk_rc_str(-1), "Error");
}
static void
--
2.27.0
From c1ad3d6640f695321a83183c95fae2f105adc429 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 21 Dec 2021 10:20:38 -0600
Subject: [PATCH 08/12] Test: cts-lab: update expected patterns for recent
changes
---
cts/lab/CTStests.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py
index 62c832eb45..f4be998cfb 100644
--- a/cts/lab/CTStests.py
+++ b/cts/lab/CTStests.py
@@ -3055,7 +3055,7 @@ class RemoteStonithd(RemoteDriver):
r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)",
- r"error: Result of monitor operation for .* on remote-.*: No executor connection",
+ r"error: Result of monitor operation for .* on remote-.*: Internal communication failure",
]
ignore_pats.extend(RemoteDriver.errorstoignore(self))
--
2.27.0
From f272e2f526633c707e894b39c7c7bce3c14de898 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 21 Dec 2021 15:40:49 -0600
Subject: [PATCH 09/12] Log: controller,libpacemaker: make history XML creation
less chatty
Other messages with the same info will already be logged at higher severity
---
daemons/controld/controld_execd.c | 3 +--
daemons/controld/controld_te_actions.c | 7 ++-----
include/pcmki/pcmki_sched_utils.h | 3 +--
lib/pacemaker/pcmk_injections.c | 3 +--
lib/pacemaker/pcmk_sched_actions.c | 12 +++++-------
5 files changed, 10 insertions(+), 18 deletions(-)
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 15784e7687..52157fa5d4 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -693,9 +693,8 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_
caller_version = CRM_FEATURE_SET;
}
- crm_trace("Building %s operation update with originator version: %s", op->rsc_id, caller_version);
xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc,
- fsa_our_uname, src, LOG_DEBUG);
+ fsa_our_uname, src);
if (xml_op == NULL) {
return TRUE;
}
diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c
index 63b7c72359..b0bcb8b2e4 100644
--- a/daemons/controld/controld_te_actions.c
+++ b/daemons/controld/controld_te_actions.c
@@ -181,7 +181,6 @@ controld_record_action_timeout(crm_action_t *action)
lrmd_event_data_t *op = NULL;
xmlNode *state = NULL;
xmlNode *rsc = NULL;
- xmlNode *xml_op = NULL;
xmlNode *action_rsc = NULL;
int rc = pcmk_ok;
@@ -245,12 +244,10 @@ controld_record_action_timeout(crm_action_t *action)
op->user_data = pcmk__transition_key(transition_graph->id, action->id,
target_rc, te_uuid);
- xml_op = pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc,
- target, __func__, LOG_INFO);
+ pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target,
+ __func__);
lrmd_free_event(op);
- crm_log_xml_trace(xml_op, "Action timeout");
-
rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options);
fsa_register_cib_callback(rc, FALSE, NULL, cib_action_updated);
free_xml(state);
diff --git a/include/pcmki/pcmki_sched_utils.h b/include/pcmki/pcmki_sched_utils.h
index 68d60fc7db..144424a609 100644
--- a/include/pcmki/pcmki_sched_utils.h
+++ b/include/pcmki/pcmki_sched_utils.h
@@ -52,8 +52,7 @@ extern void process_utilization(pe_resource_t * rsc, pe_node_t ** prefer, pe_wor
xmlNode *pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *event,
const char *caller_version, int target_rc,
- const char *node, const char *origin,
- int level);
+ const char *node, const char *origin);
# define LOAD_STOPPED "load_stopped"
diff --git a/lib/pacemaker/pcmk_sched_transition.c b/lib/pacemaker/pcmk_sched_transition.c
index 678c3f5dd2..1aa90a5a0b 100644
--- a/lib/pacemaker/pcmk_sched_transition.c
+++ b/lib/pacemaker/pcmk_sched_transition.c
@@ -201,8 +201,7 @@ inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc)
inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc)
{
return pcmk__create_history_xml(cib_resource, op, CRM_FEATURE_SET,
- target_rc, NULL, crm_system_name,
- LOG_TRACE);
+ target_rc, NULL, crm_system_name);
}
static xmlNode *
diff --git a/lib/pacemaker/pcmk_sched_actions.c b/lib/pacemaker/pcmk_sched_actions.c
index f8200b0efc..4f63d3374d 100644
--- a/lib/pacemaker/pcmk_sched_utils.c
+++ b/lib/pacemaker/pcmk_sched_utils.c
@@ -892,14 +892,13 @@ add_op_digest_to_xml(lrmd_event_data_t *op, xmlNode *update)
* \param[in] target_rc Expected result of operation
* \param[in] node Name of node on which operation was performed
* \param[in] origin Arbitrary description of update source
- * \param[in] level A log message will be logged at this level
*
* \return Newly created XML node for history update
*/
xmlNode *
pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op,
const char *caller_version, int target_rc,
- const char *node, const char *origin, int level)
+ const char *node, const char *origin)
{
char *key = NULL;
char *magic = NULL;
@@ -912,11 +911,10 @@ pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op,
const char *task = NULL;
CRM_CHECK(op != NULL, return NULL);
- do_crm_log(level, "%s: Updating resource %s after %s op %s (interval=%u)",
- origin, op->rsc_id, op->op_type,
- pcmk_exec_status_str(op->op_status), op->interval_ms);
-
- crm_trace("DC version: %s", caller_version);
+ crm_trace("Creating history XML for %s-interval %s action for %s on %s "
+ "(DC version: %s, origin: %s)",
+ pcmk__readable_interval(op->interval_ms), op->op_type, op->rsc_id,
+ ((node == NULL)? "no node" : node), caller_version, origin);
task = op->op_type;
--
2.27.0
From 06b1da9e5345e0d1571042c11646fd7157961279 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 21 Dec 2021 17:09:44 -0600
Subject: [PATCH 10/12] Feature: controller: improve exit reason for internal
timeouts
Functionize the part of controld_record_action_timeout() that creates a fake
executor event, into a new function synthesize_timeout_event(), and have it set
a more detailed exit reason describing what timed out.
---
daemons/controld/controld_te_actions.c | 61 ++++++++++++++++++++------
1 file changed, 48 insertions(+), 13 deletions(-)
diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c
index b0bcb8b2e4..de2fbb82bf 100644
--- a/daemons/controld/controld_te_actions.c
+++ b/daemons/controld/controld_te_actions.c
@@ -175,6 +175,53 @@ te_crm_command(crm_graph_t * graph, crm_action_t * action)
return TRUE;
}
+/*!
+ * \internal
+ * \brief Synthesize an executor event for a resource action timeout
+ *
+ * \param[in] action Resource action that timed out
+ * \param[in] target_rc Expected result of action that timed out
+ *
+ * Synthesize an executor event for a resource action timeout. (If the executor
+ * gets a timeout while waiting for a resource action to complete, that will be
+ * reported via the usual callback. This timeout means we didn't hear from the
+ * executor itself or the controller that relayed the action to the executor.)
+ *
+ * \return Newly created executor event for result of \p action
+ * \note The caller is responsible for freeing the return value using
+ * lrmd_free_event().
+ */
+static lrmd_event_data_t *
+synthesize_timeout_event(crm_action_t *action, int target_rc)
+{
+ lrmd_event_data_t *op = NULL;
+ const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
+ const char *reason = NULL;
+ char *dynamic_reason = NULL;
+
+ if (pcmk__str_eq(target, get_local_node_name(), pcmk__str_casei)) {
+ reason = "Local executor did not return result in time";
+ } else {
+ const char *router_node = NULL;
+
+ router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
+ if (router_node == NULL) {
+ router_node = target;
+ }
+ dynamic_reason = crm_strdup_printf("Controller on %s did not return "
+ "result in time", router_node);
+ reason = dynamic_reason;
+ }
+
+ op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT,
+ PCMK_OCF_UNKNOWN_ERROR, reason);
+ op->call_id = -1;
+ op->user_data = pcmk__transition_key(transition_graph->id, action->id,
+ target_rc, te_uuid);
+ free(dynamic_reason);
+ return op;
+}
+
void
controld_record_action_timeout(crm_action_t *action)
{
@@ -231,19 +278,7 @@ controld_record_action_timeout(crm_action_t *action)
crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_CLASS);
crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_PROVIDER);
- /* If the executor gets a timeout while waiting for the action to complete,
- * that will be reported via the usual callback. This timeout means that we
- * didn't hear from the executor or the controller that relayed the action
- * to the executor.
- */
- op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT,
- PCMK_OCF_UNKNOWN_ERROR,
- "Cluster communication timeout "
- "(no response from executor)");
- op->call_id = -1;
- op->user_data = pcmk__transition_key(transition_graph->id, action->id,
- target_rc, te_uuid);
-
+ op = synthesize_timeout_event(action, target_rc);
pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target,
__func__);
lrmd_free_event(op);
--
2.27.0
From be620d206faefab967d4c8567d6554d10c9e72ba Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 22 Dec 2021 16:35:06 -0600
Subject: [PATCH 11/12] Feature: fencing: improve exit reason for fencing
timeouts
Troubleshooting timeouts is one of the more difficult aspects of cluster
maintenance. We want to give as much of a hint as possible, but for fencing in
particular it is difficult because an operation might involve multiple retries
of multiple devices.
Barring another major project to track exactly which devices, retries, etc.,
were used in a given operation, these changes in wording are probably the best
we can do.
---
daemons/fenced/fenced_remote.c | 8 +++++---
lib/fencing/st_client.c | 2 +-
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index 1e237150c5..6eebb7381e 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2009-2021 the Pacemaker project contributors
+ * Copyright 2009-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -715,8 +715,10 @@ remote_op_timeout(gpointer userdata)
CRM_XS " id=%.8s",
op->action, op->target, op->client_name, op->id);
} else {
- finalize_timed_out_op(userdata, "Fencing could not be completed "
- "within overall timeout");
+ finalize_timed_out_op(userdata, "Fencing did not complete within a "
+ "total timeout based on the "
+ "configured timeout and retries for "
+ "any devices attempted");
}
return G_SOURCE_REMOVE;
}
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index a0f3119f3b..718739b321 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -906,7 +906,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
if (msg == NULL) {
// Fencer didn't reply in time
pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
- "Timeout waiting for reply from fencer");
+ "Fencer accepted request but did not reply in time");
CRM_LOG_ASSERT(call_id > 0);
} else {
--
2.27.0
From 0fe8ede2f8e838e335fe42846bdf147111ce9955 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 22 Dec 2021 17:09:09 -0600
Subject: [PATCH 12/12] Feature: libcrmservice: improve exit reason for
timeouts
The services library doesn't have enough information about an action to say
(for example) what configuration parameters might be relevant, but we can at
least distinguish what kind of agent timed out.
---
lib/services/services_linux.c | 12 +++++++++++-
lib/services/systemd.c | 2 +-
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c
index f15eee860e..d6aafcfe46 100644
--- a/lib/services/services_linux.c
+++ b/lib/services/services_linux.c
@@ -677,9 +677,19 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo,
parse_exit_reason_from_stderr(op);
} else if (mainloop_child_timeout(p)) {
+ const char *reason = NULL;
+
+ if (op->rsc != NULL) {
+ reason = "Resource agent did not complete in time";
+ } else if (pcmk__str_eq(op->standard, PCMK_RESOURCE_CLASS_STONITH,
+ pcmk__str_none)) {
+ reason = "Fence agent did not complete in time";
+ } else {
+ reason = "Process did not complete in time";
+ }
crm_info("%s[%d] timed out after %dms", op->id, op->pid, op->timeout);
services__set_result(op, services__generic_error(op), PCMK_EXEC_TIMEOUT,
- "Process did not exit within specified timeout");
+ reason);
} else if (op->cancel) {
/* If an in-flight recurring operation was killed because it was
diff --git a/lib/services/systemd.c b/lib/services/systemd.c
index 27a3b376db..d87b287424 100644
--- a/lib/services/systemd.c
+++ b/lib/services/systemd.c
@@ -995,7 +995,7 @@ systemd_timeout_callback(gpointer p)
crm_info("%s action for systemd unit %s named '%s' timed out",
op->action, op->agent, op->rsc);
services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
- "Systemd action did not complete within specified timeout");
+ "Systemd unit action did not complete in time");
services__finalize_async_op(op);
return FALSE;
}
--
2.27.0