pacemaker/SOURCES/015-fencing-reasons.patch
2022-03-01 13:30:06 +00:00

1094 lines
40 KiB
Diff

From 87365f49b1bee0baa536783865fbd835a9cacc97 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 2 Dec 2021 16:12:24 -0600
Subject: [PATCH 01/11] Refactor: libstonithd: functionize getting notification
data XML
Also, only get the data when needed.
---
lib/fencing/st_client.c | 32 +++++++++++++++++++++++---------
1 file changed, 23 insertions(+), 9 deletions(-)
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 4823751267..72a0a49408 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -1312,6 +1312,23 @@ stonith_dump_pending_callbacks(stonith_t * stonith)
return g_hash_table_foreach(private->stonith_op_callback_table, stonith_dump_pending_op, NULL);
}
+/*!
+ * \internal
+ * \brief Get the data section of a fencer notification
+ * \param[in] msg Notification XML to search
+ * \param[in] ntype Notification type (the XPath searched is "//" + \p ntype)
+ * \return Result of get_xpath_object() for that path (the caller checks for NULL)
+ */
+static xmlNode *
+get_event_data_xml(xmlNode *msg, const char *ntype)
+{
+ char *data_addr = crm_strdup_printf("//%s", ntype);
+ xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG);
+
+ free(data_addr);
+ return data;
+}
+
/*
<notify t="st_notify" subt="st_device_register" st_op="st_device_register" st_rc="0" >
<st_calldata >
@@ -1336,17 +1353,18 @@ xml_to_event(xmlNode * msg)
{
stonith_event_t *event = calloc(1, sizeof(stonith_event_t));
const char *ntype = crm_element_value(msg, F_SUBTYPE);
- char *data_addr = crm_strdup_printf("//%s", ntype);
- xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG);
crm_log_xml_trace(msg, "stonith_notify");
crm_element_value_int(msg, F_STONITH_RC, &(event->result));
if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) {
- event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION);
+ xmlNode *data = get_event_data_xml(msg, ntype);
- if (data) {
+ if (data == NULL) {
+ crm_err("No data for %s event", ntype);
+ crm_log_xml_notice(msg, "BadEvent");
+ } else {
event->origin = crm_element_value_copy(data, F_STONITH_ORIGIN);
event->action = crm_element_value_copy(data, F_STONITH_ACTION);
event->target = crm_element_value_copy(data, F_STONITH_TARGET);
@@ -1354,14 +1372,10 @@ xml_to_event(xmlNode * msg)
event->id = crm_element_value_copy(data, F_STONITH_REMOTE_OP_ID);
event->client_origin = crm_element_value_copy(data, F_STONITH_CLIENTNAME);
event->device = crm_element_value_copy(data, F_STONITH_DEVICE);
-
- } else {
- crm_err("No data for %s event", ntype);
- crm_log_xml_notice(msg, "BadEvent");
}
+ event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION);
}
- free(data_addr);
return event;
}
--
2.27.0
From 448f86a029d5d7e3c255d813929003a8cc2cffba Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 19 Nov 2021 17:01:23 -0600
Subject: [PATCH 02/11] Refactor: fencing: parse full result from fencer
notifications
stonith_event_t previously contained only the legacy return code for the
notification event. Use its new opaque member to store the full result, along
with accessors (available only internally for now). Nothing uses them yet.
---
include/crm/fencing/internal.h | 5 +++
lib/fencing/st_client.c | 68 ++++++++++++++++++++++++++++++++--
2 files changed, 70 insertions(+), 3 deletions(-)
diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
index eff689e59b..acc16d05e9 100644
--- a/include/crm/fencing/internal.h
+++ b/include/crm/fencing/internal.h
@@ -187,10 +187,15 @@ bool stonith__event_state_eq(stonith_history_t *history, void *user_data);
bool stonith__event_state_neq(stonith_history_t *history, void *user_data);
int stonith__legacy2status(int rc);
+
int stonith__exit_status(stonith_callback_data_t *data);
int stonith__execution_status(stonith_callback_data_t *data);
const char *stonith__exit_reason(stonith_callback_data_t *data);
+int stonith__event_exit_status(stonith_event_t *event);
+int stonith__event_execution_status(stonith_event_t *event);
+const char *stonith__event_exit_reason(stonith_event_t *event);
+
/*!
* \internal
* \brief Is a fencing operation in pending state?
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 72a0a49408..f58b3a6745 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -1349,15 +1349,23 @@ get_event_data_xml(xmlNode *msg, const char *ntype)
</notify>
*/
static stonith_event_t *
-xml_to_event(xmlNode * msg)
+xml_to_event(xmlNode *msg, pcmk__action_result_t *result)
{
stonith_event_t *event = calloc(1, sizeof(stonith_event_t));
const char *ntype = crm_element_value(msg, F_SUBTYPE);
+ CRM_ASSERT((event != NULL) && (result != NULL));
+
crm_log_xml_trace(msg, "stonith_notify");
- crm_element_value_int(msg, F_STONITH_RC, &(event->result));
+ // All notification types have the operation result
+ event->opaque = result;
+ stonith__xe_get_result(msg, result);
+
+ // @COMPAT The API originally provided the result as a legacy return code
+ event->result = pcmk_rc2legacy(stonith__result2rc(result));
+ // Fence notifications have additional information
if (pcmk__str_eq(ntype, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) {
xmlNode *data = get_event_data_xml(msg, ntype);
@@ -1392,6 +1400,7 @@ event_free(stonith_event_t * event)
free(event->executioner);
free(event->device);
free(event->client_origin);
+ pcmk__reset_result((pcmk__action_result_t *) (event->opaque));
free(event);
}
@@ -1402,6 +1411,7 @@ stonith_send_notification(gpointer data, gpointer user_data)
stonith_notify_client_t *entry = data;
stonith_event_t *st_event = NULL;
const char *event = NULL;
+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
if (blob->xml == NULL) {
crm_warn("Skipping callback - NULL message");
@@ -1427,7 +1437,7 @@ stonith_send_notification(gpointer data, gpointer user_data)
return;
}
- st_event = xml_to_event(blob->xml);
+ st_event = xml_to_event(blob->xml, &result);
crm_trace("Invoking callback for %p/%s event...", entry, event);
entry->notify(blob->stonith, st_event);
@@ -2366,6 +2376,58 @@ stonith__exit_reason(stonith_callback_data_t *data)
return ((pcmk__action_result_t *) data->opaque)->exit_reason;
}
+/*!
+ * \internal
+ * \brief Return the exit status from an event notification
+ *
+ * \param[in] event Fencing event notification (may be NULL)
+ * \return Exit status from the result stored in \p event, or CRM_EX_ERROR if
+ *         \p event or its opaque result pointer is NULL
+ */
+int
+stonith__event_exit_status(stonith_event_t *event)
+{
+ if ((event == NULL) || (event->opaque == NULL)) {
+ return CRM_EX_ERROR;
+ }
+ return ((pcmk__action_result_t *) event->opaque)->exit_status;
+}
+
+/*!
+ * \internal
+ * \brief Return the execution status from an event notification
+ *
+ * \param[in] event Fencing event notification (may be NULL)
+ * \return Execution status from the result stored in \p event, or
+ *         PCMK_EXEC_UNKNOWN if \p event or its opaque result pointer is NULL
+ */
+int
+stonith__event_execution_status(stonith_event_t *event)
+{
+ if ((event == NULL) || (event->opaque == NULL)) {
+ return PCMK_EXEC_UNKNOWN;
+ }
+ return ((pcmk__action_result_t *) event->opaque)->execution_status;
+}
+
+/*!
+ * \internal
+ * \brief Return the exit reason from an event notification
+ * \param[in] event Fencing event notification (may be NULL)
+ * \return Exit reason from the result stored in \p event, or NULL if
+ *         \p event or its opaque result pointer is NULL
+ * \note The stored pointer itself is returned, not a copy of the string.
+ */
+const char *
+stonith__event_exit_reason(stonith_event_t *event)
+{
+ if ((event == NULL) || (event->opaque == NULL)) {
+ return NULL;
+ }
+ return ((pcmk__action_result_t *) event->opaque)->exit_reason;
+}
+
+
// Deprecated functions kept only for backward API compatibility
// LCOV_EXCL_START
--
2.27.0
From 8dab65e65fe760052d1151749a7bfb2203445813 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 19 Nov 2021 17:02:28 -0600
Subject: [PATCH 03/11] Refactor: fencing: parse full result from synchronous
fencer replies
stonith_send_command() now parses the full result from synchronous fencer
replies, and maps that to a legacy return code, rather than parse the legacy
return code directly.
The full result is not used yet, and won't be until we can break backward API
compatibility, since the API functions that call stonith_send_command()
currently return a legacy code.
---
lib/fencing/st_client.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index f58b3a6745..5fec7529e3 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -1537,11 +1537,13 @@ stonith_send_command(stonith_t * stonith, const char *op, xmlNode * data, xmlNod
crm_element_value_int(op_reply, F_STONITH_CALLID, &reply_id);
if (reply_id == stonith->call_id) {
+ pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
+
crm_trace("Synchronous reply %d received", reply_id);
- if (crm_element_value_int(op_reply, F_STONITH_RC, &rc) != 0) {
- rc = -ENOMSG;
- }
+ stonith__xe_get_result(op_reply, &result);
+ rc = pcmk_rc2legacy(stonith__result2rc(&result));
+ pcmk__reset_result(&result);
if ((call_options & st_opt_discard_reply) || output_data == NULL) {
crm_trace("Discarding reply");
--
2.27.0
From 1beb319d8c62ab93b4c08b26a4e03151906c6189 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 6 Dec 2021 17:13:44 -0600
Subject: [PATCH 04/11] Log: fencing: improve cts-fence-helper result logs
Use the full result from the fencing event
---
daemons/fenced/cts-fence-helper.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c
index e222a59f9f..858cddc9de 100644
--- a/daemons/fenced/cts-fence-helper.c
+++ b/daemons/fenced/cts-fence-helper.c
@@ -125,10 +125,14 @@ st_callback(stonith_t * st, stonith_event_t * e)
crm_exit(CRM_EX_DISCONNECT);
}
- crm_notice("Operation %s requested by %s %s for peer %s. %s reported: %s (ref=%s)",
- e->operation, e->origin, e->result == pcmk_ok ? "completed" : "failed",
- e->target, e->executioner ? e->executioner : "<none>",
- pcmk_strerror(e->result), e->id);
+ crm_notice("Operation '%s' targeting %s by %s for %s: %s (exit=%d, ref=%s)",
+ ((e->operation == NULL)? "unknown" : e->operation),
+ ((e->target == NULL)? "no node" : e->target),
+ ((e->executioner == NULL)? "any node" : e->executioner),
+ ((e->origin == NULL)? "unknown client" : e->origin),
+ pcmk_exec_status_str(stonith__event_execution_status(e)),
+ stonith__event_exit_status(e),
+ ((e->id == NULL)? "none" : e->id));
if (expected_notifications) {
expected_notifications--;
--
2.27.0
From b26f701833ade5d7441fba317832d6e827bd16d0 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 14 Dec 2021 16:52:09 -0600
Subject: [PATCH 05/11] Test: cts-fence-helper: update expected return code
Before recent changes, libstonithd obtained the fence API's legacy result code
directly from the fencer's XML reply, meaning that the legacy code was the
result of the fencer's mapping of the full result (including the action stderr).
After those changes, libstonithd now ignores the legacy code in the fencer's
reply, and instead maps the legacy code itself from the full result in the
fencer's reply.
However, the fencer's reply does not have the action stderr, so failures that
mapped to -pcmk_err_generic on the server side now map to -ENODATA on the
client side. Update cts-fence-helper's expected return code to match (neither
code is particularly useful, so there wouldn't be much benefit from having the
fencer pass the action stderr with replies, which would be considerable
additional work).
---
daemons/fenced/cts-fence-helper.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c
index 858cddc9de..e3113452ef 100644
--- a/daemons/fenced/cts-fence-helper.c
+++ b/daemons/fenced/cts-fence-helper.c
@@ -207,10 +207,10 @@ run_fence_failure_test(void)
"Register device1 for failure test", 1, 0);
single_test(st->cmds->fence(st, st_opts, "false_1_node2", "off", 3, 0),
- "Fence failure results off", 1, -pcmk_err_generic);
+ "Fence failure results off", 1, -ENODATA);
single_test(st->cmds->fence(st, st_opts, "false_1_node2", "reboot", 3, 0),
- "Fence failure results reboot", 1, -pcmk_err_generic);
+ "Fence failure results reboot", 1, -ENODATA);
single_test(st->cmds->remove_device(st, st_opts, "test-id1"),
"Remove device1 for failure test", 1, 0);
--
2.27.0
From 123429de229c2148e320c76530b95e6ba458b9f6 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 7 Dec 2021 10:28:48 -0600
Subject: [PATCH 06/11] Low: controller: compare fencing targets
case-insensitively
... since they are node names
---
daemons/controld/controld_fencing.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
index f8d2fc13f4..70e141dc28 100644
--- a/daemons/controld/controld_fencing.c
+++ b/daemons/controld/controld_fencing.c
@@ -466,7 +466,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
return;
} else if ((st_event->result == pcmk_ok)
- && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_none)) {
+ && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_casei)) {
/* We were notified of our own fencing. Most likely, either fencing was
* misconfigured, or fabric fencing that doesn't cut cluster
--
2.27.0
From 3a067b8e58b3aefb49b2af1c35d0ad28b2de8784 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 7 Dec 2021 10:37:56 -0600
Subject: [PATCH 07/11] Refactor: controller: best practices for handling
fencing notifications
Rename tengine_stonith_notify() to handle_fence_notification(), rename its
st_event argument to event, add a doxygen block, and use some new variables and
reformatting to make it easier to follow (and change later).
---
daemons/controld/controld_fencing.c | 131 ++++++++++++++++------------
1 file changed, 75 insertions(+), 56 deletions(-)
diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
index 70e141dc28..00626444da 100644
--- a/daemons/controld/controld_fencing.c
+++ b/daemons/controld/controld_fencing.c
@@ -435,39 +435,59 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
}
}
+/*!
+ * \internal
+ * \brief Handle an event notification from the fencing API
+ *
+ * \param[in] st Fencing API connection
+ * \param[in] event Fencing API event notification
+ */
static void
-tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
+handle_fence_notification(stonith_t *st, stonith_event_t *event)
{
+ bool succeeded = true;
+ const char *executioner = "the cluster";
+ const char *client = "a client";
+
if (te_client_id == NULL) {
te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
(unsigned long) getpid());
}
- if (st_event == NULL) {
+ if (event == NULL) {
crm_err("Notify data not found");
return;
}
- crmd_alert_fencing_op(st_event);
+ if (event->executioner != NULL) {
+ executioner = event->executioner;
+ }
+ if (event->client_origin != NULL) {
+ client = event->client_origin;
+ }
- if ((st_event->result == pcmk_ok) && pcmk__str_eq("on", st_event->action, pcmk__str_casei)) {
- crm_notice("%s was successfully unfenced by %s (at the request of %s)",
- st_event->target,
- st_event->executioner? st_event->executioner : "<anyone>",
- st_event->origin);
- /* TODO: Hook up st_event->device */
- return;
+ if (event->result != pcmk_ok) {
+ succeeded = false;
+ }
- } else if (pcmk__str_eq("on", st_event->action, pcmk__str_casei)) {
- crm_err("Unfencing of %s by %s failed: %s (%d)",
- st_event->target,
- st_event->executioner? st_event->executioner : "<anyone>",
- pcmk_strerror(st_event->result), st_event->result);
- return;
+ crmd_alert_fencing_op(event);
- } else if ((st_event->result == pcmk_ok)
- && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_casei)) {
+ if (pcmk__str_eq("on", event->action, pcmk__str_none)) {
+ // Unfencing doesn't need special handling, just a log message
+ if (succeeded) {
+ crm_notice("%s was successfully unfenced by %s (at the request of %s)",
+ event->target, executioner, event->origin);
+ /* TODO: Hook up event->device */
+ } else {
+ crm_err("Unfencing of %s by %s failed: %s (%d)",
+ event->target, executioner,
+ pcmk_strerror(st_event->result), st_event->result);
+ }
+ return;
+ }
+ if (succeeded
+ && pcmk__str_eq(event->target, fsa_our_uname, pcmk__str_casei)) {
/* We were notified of our own fencing. Most likely, either fencing was
* misconfigured, or fabric fencing that doesn't cut cluster
* communication is in use.
@@ -478,44 +498,41 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
* our subsequent election votes as "not part of our cluster".
*/
crm_crit("We were allegedly just fenced by %s for %s!",
- st_event->executioner? st_event->executioner : "the cluster",
- st_event->origin); /* Dumps blackbox if enabled */
+ executioner, event->origin); // Dumps blackbox if enabled
if (fence_reaction_panic) {
pcmk__panic(__func__);
} else {
crm_exit(CRM_EX_FATAL);
}
- return;
+ return; // Should never get here
}
- /* Update the count of stonith failures for this target, in case we become
+ /* Update the count of fencing failures for this target, in case we become
* DC later. The current DC has already updated its fail count in
* tengine_stonith_callback().
*/
- if (!AM_I_DC && pcmk__str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) {
- if (st_event->result == pcmk_ok) {
- st_fail_count_reset(st_event->target);
+ if (!AM_I_DC
+ && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE,
+ pcmk__str_casei)) {
+
+ if (succeeded) {
+ st_fail_count_reset(event->target);
} else {
- st_fail_count_increment(st_event->target);
+ st_fail_count_increment(event->target);
}
}
crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s "
CRM_XS " initiator=%s ref=%s",
- st_event->target, st_event->result == pcmk_ok ? "" : " not",
- st_event->action,
- st_event->executioner ? st_event->executioner : "<anyone>",
- (st_event->client_origin? st_event->client_origin : "<unknown>"),
- pcmk_strerror(st_event->result),
- st_event->origin, st_event->id);
-
- if (st_event->result == pcmk_ok) {
- crm_node_t *peer = pcmk__search_known_node_cache(0, st_event->target,
+ event->target, (succeeded? "" : " not"),
+ event->action, executioner, client,
+ pcmk_strerror(event->result),
+ event->origin, event->id);
+
+ if (succeeded) {
+ crm_node_t *peer = pcmk__search_known_node_cache(0, event->target,
CRM_GET_PEER_ANY);
const char *uuid = NULL;
- gboolean we_are_executioner = pcmk__str_eq(st_event->executioner,
- fsa_our_uname,
- pcmk__str_casei);
if (peer == NULL) {
return;
@@ -523,10 +540,9 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
uuid = crm_peer_uuid(peer);
- crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
- if(AM_I_DC) {
+ if (AM_I_DC) {
/* The DC always sends updates */
- send_stonith_update(NULL, st_event->target, uuid);
+ send_stonith_update(NULL, event->target, uuid);
/* @TODO Ideally, at this point, we'd check whether the fenced node
* hosted any guest nodes, and call remote_node_down() for them.
@@ -536,31 +552,33 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
* on the scheduler creating fence pseudo-events for the guests.
*/
- if (st_event->client_origin
- && !pcmk__str_eq(st_event->client_origin, te_client_id, pcmk__str_casei)) {
-
- /* Abort the current transition graph if it wasn't us
- * that invoked stonith to fence someone
+ if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) {
+ /* Abort the current transition if it wasn't the cluster that
+ * initiated fencing.
*/
- crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target);
- abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
+ crm_info("External fencing operation from %s fenced %s",
+ client, event->target);
+ abort_transition(INFINITY, tg_restart,
+ "External Fencing Operation", NULL);
}
/* Assume it was our leader if we don't currently have one */
- } else if (pcmk__str_eq(fsa_our_dc, st_event->target, pcmk__str_null_matches | pcmk__str_casei)
+ } else if (pcmk__str_eq(fsa_our_dc, event->target,
+ pcmk__str_null_matches|pcmk__str_casei)
&& !pcmk_is_set(peer->flags, crm_remote_node)) {
crm_notice("Fencing target %s %s our leader",
- st_event->target, (fsa_our_dc? "was" : "may have been"));
+ event->target, (fsa_our_dc? "was" : "may have been"));
/* Given the CIB resyncing that occurs around elections,
* have one node update the CIB now and, if the new DC is different,
* have them do so too after the election
*/
- if (we_are_executioner) {
- send_stonith_update(NULL, st_event->target, uuid);
+ if (pcmk__str_eq(event->executioner, fsa_our_uname,
+ pcmk__str_casei)) {
+ send_stonith_update(NULL, event->target, uuid);
}
- add_stonith_cleanup(st_event->target);
+ add_stonith_cleanup(event->target);
}
/* If the target is a remote node, and we host its connection,
@@ -569,7 +587,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
* so the failure might not otherwise be detected until the next poke.
*/
if (pcmk_is_set(peer->flags, crm_remote_node)) {
- remote_ra_fail(st_event->target);
+ remote_ra_fail(event->target);
}
crmd_peer_down(peer, TRUE);
@@ -632,7 +650,7 @@ te_connect_stonith(gpointer user_data)
tengine_stonith_connection_destroy);
stonith_api->cmds->register_notification(stonith_api,
T_STONITH_NOTIFY_FENCE,
- tengine_stonith_notify);
+ handle_fence_notification);
stonith_api->cmds->register_notification(stonith_api,
T_STONITH_NOTIFY_HISTORY_SYNCED,
tengine_stonith_history_synced);
@@ -837,7 +855,8 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
}
/* Increment the fail count now, so abort_for_stonith_failure() can
- * check it. Non-DC nodes will increment it in tengine_stonith_notify().
+ * check it. Non-DC nodes will increment it in
+ * handle_fence_notification().
*/
st_fail_count_increment(target);
abort_for_stonith_failure(abort_action, target, NULL);
--
2.27.0
From 5ec9dcbbe1ee7f6252968f87d7df5a5ea17244fb Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 7 Dec 2021 10:40:21 -0600
Subject: [PATCH 08/11] Log: controller: improve messages when handling fencing
notifications
Now that the fencing API provides a full result including exit reasons with
fencing event notifications, make the controller logs more useful and
consistent.
---
daemons/controld/controld_fencing.c | 34 ++++++++++++++++++++---------
1 file changed, 24 insertions(+), 10 deletions(-)
diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
index 00626444da..0aa9ef083c 100644
--- a/daemons/controld/controld_fencing.c
+++ b/daemons/controld/controld_fencing.c
@@ -448,6 +448,8 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event)
bool succeeded = true;
const char *executioner = "the cluster";
const char *client = "a client";
+ const char *reason = NULL;
+ int exec_status;
if (te_client_id == NULL) {
te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
@@ -466,22 +468,31 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event)
client = event->client_origin;
}
- if (event->result != pcmk_ok) {
+ exec_status = stonith__event_execution_status(event);
+ if ((stonith__event_exit_status(event) != CRM_EX_OK)
+ || (exec_status != PCMK_EXEC_DONE)) {
succeeded = false;
+ if (exec_status == PCMK_EXEC_DONE) {
+ exec_status = PCMK_EXEC_ERROR;
+ }
}
+ reason = stonith__event_exit_reason(event);
crmd_alert_fencing_op(event);
if (pcmk__str_eq("on", event->action, pcmk__str_none)) {
// Unfencing doesn't need special handling, just a log message
if (succeeded) {
- crm_notice("%s was successfully unfenced by %s (at the request of %s)",
- event->target, executioner, event->origin);
+ crm_notice("%s was unfenced by %s at the request of %s@%s",
+ event->target, executioner, client, event->origin);
/* TODO: Hook up event->device */
} else {
- crm_err("Unfencing of %s by %s failed: %s (%d)",
+ crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d",
event->target, executioner,
- pcmk_strerror(st_event->result), st_event->result);
+ pcmk_exec_status_str(exec_status),
+ ((reason == NULL)? "" : ": "),
+ ((reason == NULL)? "" : reason),
+ stonith__event_exit_status(event));
}
return;
}
@@ -522,12 +533,15 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event)
}
}
- crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s "
- CRM_XS " initiator=%s ref=%s",
+ crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: "
+ "%s%s%s%s " CRM_XS " event=%s",
event->target, (succeeded? "" : " not"),
- event->action, executioner, client,
- pcmk_strerror(event->result),
- event->origin, event->id);
+ event->action, executioner, client, event->origin,
+ (succeeded? "OK" : pcmk_exec_status_str(exec_status)),
+ ((reason == NULL)? "" : " ("),
+ ((reason == NULL)? "" : reason),
+ ((reason == NULL)? "" : ")"),
+ event->id);
if (succeeded) {
crm_node_t *peer = pcmk__search_known_node_cache(0, event->target,
--
2.27.0
From fb484933ce7c8f3325300a9e01a114db1bbb5b70 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 7 Dec 2021 11:33:15 -0600
Subject: [PATCH 09/11] Refactor: controller: move alert functions into own
source file
---
daemons/controld/Makefile.am | 1 +
daemons/controld/controld_alerts.c | 92 +++++++++++++++++++++++++
daemons/controld/controld_execd_state.c | 75 --------------------
3 files changed, 93 insertions(+), 75 deletions(-)
create mode 100644 daemons/controld/controld_alerts.c
diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am
index db45bcba4a..0a29925c0b 100644
--- a/daemons/controld/Makefile.am
+++ b/daemons/controld/Makefile.am
@@ -43,6 +43,7 @@ pacemaker_controld_LDADD = $(top_builddir)/lib/fencing/libstonithd.la \
$(CLUSTERLIBS)
pacemaker_controld_SOURCES = pacemaker-controld.c \
+ controld_alerts.c \
controld_attrd.c \
controld_callbacks.c \
controld_based.c \
diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c
new file mode 100644
index 0000000000..bd92795cf0
--- /dev/null
+++ b/daemons/controld/controld_alerts.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2012-2021 the Pacemaker project contributors
+ *
+ * The version control history for this file may have further details.
+ *
+ * This source code is licensed under the GNU General Public License version 2
+ * or later (GPLv2+) WITHOUT ANY WARRANTY.
+ */
+
+#include <crm_internal.h>
+
+#include <glib.h>
+#include <libxml/tree.h>
+
+#include <crm/lrmd.h>
+#include <crm/lrmd_internal.h>
+#include <crm/pengine/rules_internal.h>
+#include <crm/pengine/status.h>
+#include <crm/stonith-ng.h>
+
+#include <pacemaker-controld.h>
+
+static GList *crmd_alert_list = NULL;
+
+void
+crmd_unpack_alerts(xmlNode *alerts)
+{
+ pe_free_alert_list(crmd_alert_list);
+ crmd_alert_list = pe_unpack_alerts(alerts);
+}
+
+void
+crmd_alert_node_event(crm_node_t *node)
+{
+ lrm_state_t *lrm_state;
+
+ if (crmd_alert_list == NULL) {
+ return;
+ }
+
+ lrm_state = lrm_state_find(fsa_our_uname);
+ if (lrm_state == NULL) {
+ return;
+ }
+
+ lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list,
+ node->uname, node->id, node->state);
+}
+
+void
+crmd_alert_fencing_op(stonith_event_t * e)
+{
+ char *desc;
+ lrm_state_t *lrm_state;
+
+ if (crmd_alert_list == NULL) {
+ return;
+ }
+
+ lrm_state = lrm_state_find(fsa_our_uname);
+ if (lrm_state == NULL) {
+ return;
+ }
+
+ desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)",
+ e->action, e->target,
+ (e->executioner? e->executioner : "<no-one>"),
+ e->client_origin, e->origin,
+ pcmk_strerror(e->result), e->id);
+
+ lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list,
+ e->target, e->operation, desc, e->result);
+ free(desc);
+}
+
+void
+crmd_alert_resource_op(const char *node, lrmd_event_data_t * op)
+{
+ lrm_state_t *lrm_state;
+
+ if (crmd_alert_list == NULL) {
+ return;
+ }
+
+ lrm_state = lrm_state_find(fsa_our_uname);
+ if (lrm_state == NULL) {
+ return;
+ }
+
+ lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node,
+ op);
+}
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
index 67c376a426..5dce6c6d59 100644
--- a/daemons/controld/controld_execd_state.c
+++ b/daemons/controld/controld_execd_state.c
@@ -777,78 +777,3 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state,
*/
return ((lrmd_t *) lrm_state->conn)->cmds->unregister_rsc(lrm_state->conn, rsc_id, options);
}
-
-/*
- * Functions for sending alerts via local executor connection
- */
-
-static GList *crmd_alert_list = NULL;
-
-void
-crmd_unpack_alerts(xmlNode *alerts)
-{
- pe_free_alert_list(crmd_alert_list);
- crmd_alert_list = pe_unpack_alerts(alerts);
-}
-
-void
-crmd_alert_node_event(crm_node_t *node)
-{
- lrm_state_t *lrm_state;
-
- if (crmd_alert_list == NULL) {
- return;
- }
-
- lrm_state = lrm_state_find(fsa_our_uname);
- if (lrm_state == NULL) {
- return;
- }
-
- lrmd_send_node_alert((lrmd_t *) lrm_state->conn, crmd_alert_list,
- node->uname, node->id, node->state);
-}
-
-void
-crmd_alert_fencing_op(stonith_event_t * e)
-{
- char *desc;
- lrm_state_t *lrm_state;
-
- if (crmd_alert_list == NULL) {
- return;
- }
-
- lrm_state = lrm_state_find(fsa_our_uname);
- if (lrm_state == NULL) {
- return;
- }
-
- desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)",
- e->action, e->target,
- (e->executioner? e->executioner : "<no-one>"),
- e->client_origin, e->origin,
- pcmk_strerror(e->result), e->id);
-
- lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list,
- e->target, e->operation, desc, e->result);
- free(desc);
-}
-
-void
-crmd_alert_resource_op(const char *node, lrmd_event_data_t * op)
-{
- lrm_state_t *lrm_state;
-
- if (crmd_alert_list == NULL) {
- return;
- }
-
- lrm_state = lrm_state_find(fsa_our_uname);
- if (lrm_state == NULL) {
- return;
- }
-
- lrmd_send_resource_alert((lrmd_t *) lrm_state->conn, crmd_alert_list, node,
- op);
-}
--
2.27.0
From 3d0b57406bcde6682623e9d62c8ee95878345eb1 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 7 Dec 2021 11:25:41 -0600
Subject: [PATCH 10/11] Feature: controller,tools: improve description for
fencing alerts/traps
This functionizes creating a description for fencing events, so it can be used
by both the controller for alerts and crm_mon for traps, for consistency.
Now that we have the full result including exit reason, we can improve the
description, but the format is kept similar to before to minimize the change.
The alert/trap also includes the legacy return code for the event, but we can't
change that now because lrmd_send_fencing_alert() and the alert/trap
environment variables are public API.
---
daemons/controld/controld_alerts.c | 8 ++-----
include/crm/fencing/internal.h | 1 +
lib/fencing/st_client.c | 38 ++++++++++++++++++++++++++++++
tools/crm_mon.c | 5 ++--
4 files changed, 43 insertions(+), 9 deletions(-)
diff --git a/daemons/controld/controld_alerts.c b/daemons/controld/controld_alerts.c
index bd92795cf0..2e0a67dba2 100644
--- a/daemons/controld/controld_alerts.c
+++ b/daemons/controld/controld_alerts.c
@@ -12,6 +12,7 @@
#include <glib.h>
#include <libxml/tree.h>
+#include <crm/fencing/internal.h>
#include <crm/lrmd.h>
#include <crm/lrmd_internal.h>
#include <crm/pengine/rules_internal.h>
@@ -62,12 +63,7 @@ crmd_alert_fencing_op(stonith_event_t * e)
return;
}
- desc = crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s (ref=%s)",
- e->action, e->target,
- (e->executioner? e->executioner : "<no-one>"),
- e->client_origin, e->origin,
- pcmk_strerror(e->result), e->id);
-
+ desc = stonith__event_description(e);
lrmd_send_fencing_alert((lrmd_t *) lrm_state->conn, crmd_alert_list,
e->target, e->operation, desc, e->result);
free(desc);
diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
index acc16d05e9..d2b49f831a 100644
--- a/include/crm/fencing/internal.h
+++ b/include/crm/fencing/internal.h
@@ -195,6 +195,7 @@ const char *stonith__exit_reason(stonith_callback_data_t *data);
int stonith__event_exit_status(stonith_event_t *event);
int stonith__event_execution_status(stonith_event_t *event);
const char *stonith__event_exit_reason(stonith_event_t *event);
+char *stonith__event_description(stonith_event_t *event);
/*!
* \internal
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 5fec7529e3..b1de912b2a 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -2429,6 +2429,44 @@ stonith__event_exit_reason(stonith_event_t *event)
return ((pcmk__action_result_t *) event->opaque)->exit_reason;
}
+/*!
+ * \internal
+ * \brief Return a human-friendly description of a fencing event
+ *
+ * \param[in] event Event to describe (must not be NULL; NULL members are OK)
+ *
+ * \return Newly allocated string with description of \p event
+ * \note The caller is responsible for freeing the return value.
+ *       This function asserts on memory errors and never returns NULL.
+ * \note This currently is useful only for events of type
+ *       T_STONITH_NOTIFY_FENCE.
+ */
+char *
+stonith__event_description(stonith_event_t *event)
+{
+    // Passing NULL for a printf-style %s is undefined, so default each member
+    const char *action = (event->action? event->action : "(unknown)");
+    const char *target = (event->target? event->target : "no node");
+    const char *executioner = (event->executioner? event->executioner : "the cluster");
+    const char *client = (event->client_origin? event->client_origin : "a client");
+    const char *origin = (event->origin? event->origin : "a node");
+    const char *id = (event->id? event->id : "(none)");
+    const char *reason = stonith__event_exit_reason(event);
+    const char *status = crm_exit_str(CRM_EX_OK);
+    int exec_status = stonith__event_execution_status(event);
+
+    if (exec_status != PCMK_EXEC_DONE) {
+        status = pcmk_exec_status_str(exec_status);
+    } else if (stonith__event_exit_status(event) != CRM_EX_OK) {
+        status = pcmk_exec_status_str(PCMK_EXEC_ERROR);
+    }
+    return crm_strdup_printf("Operation %s of %s by %s for %s@%s: %s%s%s%s (ref=%s)",
+                             action, target, executioner, client, origin, status,
+                             ((reason == NULL)? "" : " ("),
+                             ((reason == NULL)? "" : reason),
+                             ((reason == NULL)? "" : ")"), id);
+}
+
// Deprecated functions kept only for backward API compatibility
// LCOV_EXCL_START
diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index a6c459aaf7..e7b4fe2847 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -2237,9 +2237,8 @@ mon_st_callback_event(stonith_t * st, stonith_event_t * e)
/* disconnect cib as well and have everything reconnect */
mon_cib_connection_destroy(NULL);
} else if (options.external_agent) {
- char *desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)",
- e->operation, e->origin, e->target, pcmk_strerror(e->result),
- e->id);
+ char *desc = stonith__event_description(e);
+
send_custom_trap(e->target, NULL, e->operation, pcmk_ok, e->result, 0, desc);
free(desc);
}
--
2.27.0
From 2fe03c2165680c717a1f6106c5150be7d117f1a5 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 14 Jan 2022 10:45:03 -0600
Subject: [PATCH 11/11] Low: controller: compare case-sensitively where
appropriate
---
daemons/controld/controld_fencing.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c
index 0aa9ef083c..15954b2358 100644
--- a/daemons/controld/controld_fencing.c
+++ b/daemons/controld/controld_fencing.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2021 the Pacemaker project contributors
+ * Copyright 2004-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -524,7 +524,7 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event)
*/
if (!AM_I_DC
&& pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE,
- pcmk__str_casei)) {
+ pcmk__str_none)) {
if (succeeded) {
st_fail_count_reset(event->target);
--
2.27.0