pacemaker/SOURCES/003-fencer-logs.patch
2021-10-08 14:23:53 +00:00

653 lines
23 KiB
Diff

From 0a884f325e1049febc28bf0419ab307dd0bce5af Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 16 May 2019 20:04:57 -0500
Subject: [PATCH] Log: various: improve fencer connection messages
Previously, log messages around fencer connections were inconsistent.
This attempts to make them more consistent by: having stonith_api_signon() log
only at debug level, letting the callers log at a level appropriate to the
situation using the return code; functionizing retrying a connection; and
using similar wording across clients.
This also does a bit of refactoring for better error checking and improved
efficiency.
---
daemons/controld/controld_control.c | 7 +-
daemons/controld/controld_te_utils.c | 59 ++++++-----
daemons/execd/pacemaker-execd.c | 28 ++---
daemons/fenced/cts-fence-helper.c | 38 +++----
include/crm/stonith-ng.h | 4 +
lib/fencing/st_client.c | 195 ++++++++++++++++++++---------------
tools/crm_mon.c | 1 -
tools/stonith_admin.c | 29 +-----
8 files changed, 181 insertions(+), 180 deletions(-)
diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c
index 89b5b5d..6d9f335 100644
--- a/daemons/controld/controld_control.c
+++ b/daemons/controld/controld_control.c
@@ -628,10 +628,11 @@ do_started(long long action,
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
+ // Try connecting to fencer (retrying later in mainloop if failed)
if (stonith_reconnect == NULL) {
- int dummy;
-
- stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, &dummy);
+ stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
+ te_connect_stonith,
+ GINT_TO_POINTER(TRUE));
}
set_bit(fsa_input_register, R_ST_REQUIRED);
mainloop_set_trigger(stonith_reconnect);
diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c
index 5606ed6..22f83ad 100644
--- a/daemons/controld/controld_te_utils.c
+++ b/daemons/controld/controld_te_utils.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
+ * Copyright 2004-2019 the Pacemaker project contributors
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
@@ -385,10 +385,18 @@ te_trigger_stonith_history_sync(void)
mainloop_timer_start(stonith_history_sync_timer);
}
+/*!
+ * \brief Connect to fencer
+ *
+ * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop
+ *
+ * \return TRUE
+ * \note If user_data is NULL, this will wait 2s between attempts, for up to
+ * 30 attempts, meaning the controller could be blocked as long as 58s.
+ */
gboolean
te_connect_stonith(gpointer user_data)
{
- int lpc = 0;
int rc = pcmk_ok;
if (stonith_api == NULL) {
@@ -396,42 +404,41 @@ te_connect_stonith(gpointer user_data)
}
if (stonith_api->state != stonith_disconnected) {
- crm_trace("Still connected");
+ crm_trace("Already connected to fencer, no need to retry");
return TRUE;
}
- for (lpc = 0; lpc < 30; lpc++) {
- crm_debug("Attempting connection to fencing daemon...");
-
- sleep(1);
- rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
-
- if (rc == pcmk_ok) {
- break;
+ if (user_data == NULL) {
+ // Blocking (retry failures now until successful)
+ rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
+ if (rc != pcmk_ok) {
+ crm_err("Could not connect to fencer in 30 attempts: %s "
+ CRM_XS " rc=%d", pcmk_strerror(rc), rc);
}
-
- if (user_data != NULL) {
+ } else {
+ // Non-blocking (retry failures later in main loop)
+ rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
+ if (rc != pcmk_ok) {
if (is_set(fsa_input_register, R_ST_REQUIRED)) {
- crm_err("Sign-in failed: triggered a retry");
+ crm_err("Fencer connection failed (will retry): %s "
+ CRM_XS " rc=%d", pcmk_strerror(rc), rc);
mainloop_set_trigger(stonith_reconnect);
} else {
- crm_info("Sign-in failed, but no longer required");
+ crm_info("Fencer connection failed (ignoring because no longer required): %s "
+ CRM_XS " rc=%d", pcmk_strerror(rc), rc);
}
return TRUE;
}
-
- crm_err("Sign-in failed: pausing and trying again in 2s...");
- sleep(1);
}
- CRM_CHECK(rc == pcmk_ok, return TRUE); /* If not, we failed 30 times... just get out */
- stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT,
- tengine_stonith_connection_destroy);
-
- stonith_api->cmds->register_notification(stonith_api, T_STONITH_NOTIFY_FENCE,
- tengine_stonith_notify);
-
- crm_trace("Connected");
+ if (rc == pcmk_ok) {
+ stonith_api->cmds->register_notification(stonith_api,
+ T_STONITH_NOTIFY_DISCONNECT,
+ tengine_stonith_connection_destroy);
+ stonith_api->cmds->register_notification(stonith_api,
+ T_STONITH_NOTIFY_FENCE,
+ tengine_stonith_notify);
+ }
return TRUE;
}
diff --git a/daemons/execd/pacemaker-execd.c b/daemons/execd/pacemaker-execd.c
index 21bb0ed..e2fdfca 100644
--- a/daemons/execd/pacemaker-execd.c
+++ b/daemons/execd/pacemaker-execd.c
@@ -65,28 +65,20 @@ get_stonith_connection(void)
stonith_api = NULL;
}
- if (!stonith_api) {
- int rc = 0;
- int tries = 10;
+ if (stonith_api == NULL) {
+ int rc = pcmk_ok;
stonith_api = stonith_api_new();
- do {
- rc = stonith_api->cmds->connect(stonith_api, "pacemaker-execd", NULL);
- if (rc == pcmk_ok) {
- stonith_api->cmds->register_notification(stonith_api,
- T_STONITH_NOTIFY_DISCONNECT,
- stonith_connection_destroy_cb);
- break;
- }
- sleep(1);
- tries--;
- } while (tries);
-
- if (rc) {
- crm_err("Unable to connect to stonith daemon to execute command. error: %s",
- pcmk_strerror(rc));
+ rc = stonith_api_connect_retry(stonith_api, crm_system_name, 10);
+ if (rc != pcmk_ok) {
+ crm_err("Could not connect to fencer in 10 attempts: %s "
+ CRM_XS " rc=%d", pcmk_strerror(rc), rc);
stonith_api_delete(stonith_api);
stonith_api = NULL;
+ } else {
+ stonith_api->cmds->register_notification(stonith_api,
+ T_STONITH_NOTIFY_DISCONNECT,
+ stonith_connection_destroy_cb);
}
}
return stonith_api;
diff --git a/daemons/fenced/cts-fence-helper.c b/daemons/fenced/cts-fence-helper.c
index c5ce1ab..4552fc1 100644
--- a/daemons/fenced/cts-fence-helper.c
+++ b/daemons/fenced/cts-fence-helper.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2009-2018 Andrew Beekhof <andrew@beekhof.net>
+ * Copyright 2009-2019 the Pacemaker project contributors
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
@@ -124,8 +124,10 @@ passive_test(void)
int rc = 0;
rc = st->cmds->connect(st, crm_system_name, &pollfd.fd);
- crm_debug("Connect: %d", rc);
-
+ if (rc != pcmk_ok) {
+ stonith_api_delete(st);
+ crm_exit(CRM_EX_DISCONNECT);
+ }
st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, st_callback);
st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, st_callback);
st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback);
@@ -271,8 +273,10 @@ sanity_tests(void)
int rc = 0;
rc = st->cmds->connect(st, crm_system_name, &pollfd.fd);
- crm_debug("Connect: %d", rc);
-
+ if (rc != pcmk_ok) {
+ stonith_api_delete(st);
+ crm_exit(CRM_EX_DISCONNECT);
+ }
st->cmds->register_notification(st, T_STONITH_NOTIFY_DISCONNECT, st_callback);
st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, st_callback);
st->cmds->register_notification(st, STONITH_OP_DEVICE_ADD, st_callback);
@@ -295,7 +299,10 @@ standard_dev_test(void)
stonith_key_value_t *params = NULL;
rc = st->cmds->connect(st, crm_system_name, &pollfd.fd);
- crm_debug("Connect: %d", rc);
+ if (rc != pcmk_ok) {
+ stonith_api_delete(st);
+ crm_exit(CRM_EX_DISCONNECT);
+ }
params = stonith_key_value_add(params, "pcmk_host_map", "some-host=pcmk-7 true_1_node1=3,4");
@@ -502,23 +509,12 @@ test_register_async_devices(int check_event)
static void
try_mainloop_connect(int check_event)
{
- int tries = 10;
- int i = 0;
- int rc = 0;
+ int rc = stonith_api_connect_retry(st, crm_system_name, 10);
- for (i = 0; i < tries; i++) {
- rc = st->cmds->connect(st, crm_system_name, NULL);
-
- if (!rc) {
- crm_info("stonith client connection established");
- mainloop_test_done(TRUE);
- return;
- } else {
- crm_info("stonith client connection failed");
- }
- sleep(1);
+ if (rc == pcmk_ok) {
+ mainloop_test_done(TRUE);
+ return;
}
-
crm_err("API CONNECTION FAILURE");
mainloop_test_done(FALSE);
}
diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h
index b7365a9..b640732 100644
--- a/include/crm/stonith-ng.h
+++ b/include/crm/stonith-ng.h
@@ -430,6 +430,10 @@ void stonith_key_value_freeall(stonith_key_value_t * kvp, int keys, int values);
void stonith_history_free(stonith_history_t *history);
+// Convenience functions
+int stonith_api_connect_retry(stonith_t *st, const char *name,
+ int max_attempts);
+
/* Basic helpers that allows nodes to be fenced and the history to be
* queried without mainloop or the caller understanding the full API
*
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 270ef8d..ceee944 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2004-2018 Andrew Beekhof <andrew@beekhof.net>
+ * Copyright 2004-2019 the Pacemaker project contributors
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
@@ -1415,14 +1415,21 @@ static int
stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
{
int rc = pcmk_ok;
- stonith_private_t *native = stonith->st_private;
+ stonith_private_t *native = NULL;
+ const char *display_name = name? name : "client";
static struct ipc_client_callbacks st_callbacks = {
.dispatch = stonith_dispatch_internal,
.destroy = stonith_connection_destroy
};
- crm_trace("Connecting command channel");
+ CRM_CHECK(stonith != NULL, return -EINVAL);
+
+ native = stonith->st_private;
+ CRM_ASSERT(native != NULL);
+
+ crm_debug("Attempting fencer connection by %s with%s mainloop",
+ display_name, (stonith_fd? "out" : ""));
stonith->state = stonith_connected_command;
if (stonith_fd) {
@@ -1432,8 +1439,9 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
if (native->ipc && crm_ipc_connect(native->ipc)) {
*stonith_fd = crm_ipc_get_fd(native->ipc);
} else if (native->ipc) {
- crm_perror(LOG_ERR, "Connection to fencer failed");
- rc = -ENOTCONN;
+ crm_ipc_close(native->ipc);
+ crm_ipc_destroy(native->ipc);
+ native->ipc = NULL;
}
} else {
@@ -1444,11 +1452,8 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
}
if (native->ipc == NULL) {
- crm_debug("Could not connect to the Stonith API");
rc = -ENOTCONN;
- }
-
- if (rc == pcmk_ok) {
+ } else {
xmlNode *reply = NULL;
xmlNode *hello = create_xml_node(NULL, "stonith_command");
@@ -1458,11 +1463,12 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
rc = crm_ipc_send(native->ipc, hello, crm_ipc_client_response, -1, &reply);
if (rc < 0) {
- crm_perror(LOG_DEBUG, "Couldn't complete registration with the fencing API: %d", rc);
+ crm_debug("Couldn't register with the fencer: %s "
+ CRM_XS " rc=%d", pcmk_strerror(rc), rc);
rc = -ECOMM;
} else if (reply == NULL) {
- crm_err("Did not receive registration reply");
+ crm_debug("Couldn't register with the fencer: no reply");
rc = -EPROTO;
} else {
@@ -1470,18 +1476,23 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
const char *tmp_ticket = crm_element_value(reply, F_STONITH_CLIENTID);
if (safe_str_neq(msg_type, CRM_OP_REGISTER)) {
- crm_err("Invalid registration message: %s", msg_type);
- crm_log_xml_err(reply, "Bad reply");
+ crm_debug("Couldn't register with the fencer: invalid reply type '%s'",
+ (msg_type? msg_type : "(missing)"));
+ crm_log_xml_debug(reply, "Invalid fencer reply");
rc = -EPROTO;
} else if (tmp_ticket == NULL) {
- crm_err("No registration token provided");
- crm_log_xml_err(reply, "Bad reply");
+ crm_debug("Couldn't register with the fencer: no token in reply");
+ crm_log_xml_debug(reply, "Invalid fencer reply");
rc = -EPROTO;
} else {
- crm_trace("Obtained registration token: %s", tmp_ticket);
native->token = strdup(tmp_ticket);
+#if HAVE_MSGFROMIPC_TIMEOUT
+ stonith->call_timeout = MAX_IPC_DELAY;
+#endif
+ crm_debug("Connection to fencer by %s succeeded (registration token: %s)",
+ display_name, native->token);
rc = pcmk_ok;
}
}
@@ -1490,16 +1501,11 @@ stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd)
free_xml(hello);
}
- if (rc == pcmk_ok) {
-#if HAVE_MSGFROMIPC_TIMEOUT
- stonith->call_timeout = MAX_IPC_DELAY;
-#endif
- crm_debug("Connection to fencer successful");
- return pcmk_ok;
+ if (rc != pcmk_ok) {
+ crm_debug("Connection attempt to fencer by %s failed: %s "
+ CRM_XS " rc=%d", display_name, pcmk_strerror(rc), rc);
+ stonith->cmds->disconnect(stonith);
}
-
- crm_debug("Connection to fencer failed: %s", pcmk_strerror(rc));
- stonith->cmds->disconnect(stonith);
return rc;
}
@@ -2071,6 +2077,36 @@ stonith_api_new(void)
return new_stonith;
}
+/*!
+ * \brief Make a blocking connection attempt to the fencer
+ *
+ * \param[in,out] st Fencer API object
+ * \param[in] name Client name to use with fencer
+ * \param[in] max_attempts Return error if this many attempts fail
+ *
+ * \return pcmk_ok on success, result of last attempt otherwise
+ */
+int
+stonith_api_connect_retry(stonith_t *st, const char *name, int max_attempts)
+{
+ int rc = -EINVAL; // if max_attempts is not positive
+
+ for (int attempt = 1; attempt <= max_attempts; attempt++) {
+ rc = st->cmds->connect(st, name, NULL);
+ if (rc == pcmk_ok) {
+ return pcmk_ok;
+ } else if (attempt < max_attempts) {
+ crm_notice("Fencer connection attempt %d of %d failed (retrying in 2s): %s "
+ CRM_XS " rc=%d",
+ attempt, max_attempts, pcmk_strerror(rc), rc);
+ sleep(2);
+ }
+ }
+ crm_notice("Could not connect to fencer: %s " CRM_XS " rc=%d",
+ pcmk_strerror(rc), rc);
+ return rc;
+}
+
stonith_key_value_t *
stonith_key_value_add(stonith_key_value_t * head, const char *key, const char *value)
{
@@ -2122,85 +2158,78 @@ stonith_key_value_freeall(stonith_key_value_t * head, int keys, int values)
int
stonith_api_kick(uint32_t nodeid, const char *uname, int timeout, bool off)
{
- char *name = NULL;
- const char *action = "reboot";
-
- int rc = -EPROTO;
- stonith_t *st = NULL;
- enum stonith_call_options opts = st_opt_sync_call | st_opt_allow_suicide;
+ int rc = pcmk_ok;
+ stonith_t *st = stonith_api_new();
+ const char *action = off? "off" : "reboot";
api_log_open();
- st = stonith_api_new();
- if (st) {
- rc = st->cmds->connect(st, "stonith-api", NULL);
- if(rc != pcmk_ok) {
- api_log(LOG_ERR, "Connection failed, could not kick (%s) node %u/%s : %s (%d)", action, nodeid, uname, pcmk_strerror(rc), rc);
- }
+ if (st == NULL) {
+ api_log(LOG_ERR, "API initialization failed, could not kick (%s) node %u/%s",
+ action, nodeid, uname);
+ return -EPROTO;
}
- if (uname != NULL) {
- name = strdup(uname);
-
- } else if (nodeid > 0) {
- opts |= st_opt_cs_nodeid;
- name = crm_itoa(nodeid);
- }
-
- if (off) {
- action = "off";
- }
-
- if (rc == pcmk_ok) {
+ rc = st->cmds->connect(st, "stonith-api", NULL);
+ if (rc != pcmk_ok) {
+ api_log(LOG_ERR, "Connection failed, could not kick (%s) node %u/%s : %s (%d)",
+ action, nodeid, uname, pcmk_strerror(rc), rc);
+ } else {
+ char *name = NULL;
+ enum stonith_call_options opts = st_opt_sync_call | st_opt_allow_suicide;
+
+ if (uname != NULL) {
+ name = strdup(uname);
+ } else if (nodeid > 0) {
+ opts |= st_opt_cs_nodeid;
+ name = crm_itoa(nodeid);
+ }
rc = st->cmds->fence(st, opts, name, action, timeout, 0);
- if(rc != pcmk_ok) {
- api_log(LOG_ERR, "Could not kick (%s) node %u/%s : %s (%d)", action, nodeid, uname, pcmk_strerror(rc), rc);
+ free(name);
+
+ if (rc != pcmk_ok) {
+ api_log(LOG_ERR, "Could not kick (%s) node %u/%s : %s (%d)",
+ action, nodeid, uname, pcmk_strerror(rc), rc);
} else {
- api_log(LOG_NOTICE, "Node %u/%s kicked: %s ", nodeid, uname, action);
+ api_log(LOG_NOTICE, "Node %u/%s kicked: %s", nodeid, uname, action);
}
}
- if (st) {
- st->cmds->disconnect(st);
- stonith_api_delete(st);
- }
-
- free(name);
+ stonith_api_delete(st);
return rc;
}
time_t
stonith_api_time(uint32_t nodeid, const char *uname, bool in_progress)
{
- int rc = 0;
- char *name = NULL;
-
+ int rc = pcmk_ok;
time_t when = 0;
- stonith_t *st = NULL;
+ stonith_t *st = stonith_api_new();
stonith_history_t *history = NULL, *hp = NULL;
- enum stonith_call_options opts = st_opt_sync_call;
-
- st = stonith_api_new();
- if (st) {
- rc = st->cmds->connect(st, "stonith-api", NULL);
- if(rc != pcmk_ok) {
- api_log(LOG_NOTICE, "Connection failed: %s (%d)", pcmk_strerror(rc), rc);
- }
- }
-
- if (uname != NULL) {
- name = strdup(uname);
- } else if (nodeid > 0) {
- opts |= st_opt_cs_nodeid;
- name = crm_itoa(nodeid);
+ if (st == NULL) {
+ api_log(LOG_ERR, "Could not retrieve fence history for %u/%s: "
+ "API initialization failed", nodeid, uname);
+ return when;
}
- if (st && rc == pcmk_ok) {
+ rc = st->cmds->connect(st, "stonith-api", NULL);
+ if (rc != pcmk_ok) {
+ api_log(LOG_NOTICE, "Connection failed: %s (%d)", pcmk_strerror(rc), rc);
+ } else {
int entries = 0;
int progress = 0;
int completed = 0;
-
+ char *name = NULL;
+ enum stonith_call_options opts = st_opt_sync_call;
+
+ if (uname != NULL) {
+ name = strdup(uname);
+ } else if (nodeid > 0) {
+ opts |= st_opt_cs_nodeid;
+ name = crm_itoa(nodeid);
+ }
rc = st->cmds->history(st, opts, name, &history, 120);
+ free(name);
for (hp = history; hp; hp = hp->next) {
entries++;
@@ -2227,15 +2256,11 @@ stonith_api_time(uint32_t nodeid, const char *uname, bool in_progress)
}
}
- if (st) {
- st->cmds->disconnect(st);
- stonith_api_delete(st);
- }
+ stonith_api_delete(st);
if(when) {
api_log(LOG_INFO, "Node %u/%s last kicked at: %ld", nodeid, uname, (long int)when);
}
- free(name);
return when;
}
diff --git a/tools/crm_mon.c b/tools/crm_mon.c
index e101b62..bed0796 100644
--- a/tools/crm_mon.c
+++ b/tools/crm_mon.c
@@ -298,7 +298,6 @@ cib_connect(gboolean full)
}
if ((fence_connect) && (st->state == stonith_disconnected)) {
- crm_trace("Connecting to stonith");
rc = st->cmds->connect(st, crm_system_name, NULL);
if (rc == pcmk_ok) {
crm_trace("Setting up stonith callbacks");
diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c
index d960fb1..6be66c6 100644
--- a/tools/stonith_admin.c
+++ b/tools/stonith_admin.c
@@ -198,31 +198,6 @@ struct {
int rc;
} async_fence_data;
-static int
-try_mainloop_connect(void)
-{
- stonith_t *st = async_fence_data.st;
- int tries = 10;
- int i = 0;
- int rc = 0;
-
- for (i = 0; i < tries; i++) {
- crm_debug("Connecting as %s", async_fence_data.name);
- rc = st->cmds->connect(st, async_fence_data.name, NULL);
-
- if (!rc) {
- crm_debug("stonith client connection established");
- return 0;
- } else {
- crm_debug("stonith client connection failed");
- }
- sleep(1);
- }
-
- crm_err("Could not connect to the fencer");
- return -1;
-}
-
static void
notify_callback(stonith_t * st, stonith_event_t * e)
{
@@ -251,8 +226,10 @@ async_fence_helper(gpointer user_data)
{
stonith_t *st = async_fence_data.st;
int call_id = 0;
+ int rc = stonith_api_connect_retry(st, async_fence_data.name, 10);
- if (try_mainloop_connect()) {
+ if (rc != pcmk_ok) {
+ fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc));
g_main_loop_quit(mainloop);
return TRUE;
}
--
1.8.3.1