import pacemaker-2.1.2-4.el9

This commit is contained in:
CentOS Sources 2022-03-01 08:29:50 -05:00 committed by Stepan Oksanichenko
parent f997a33b7b
commit 7d8673c267
18 changed files with 13567 additions and 3 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,65 @@
From ed8b2c86ab77aaa3d7fd688c049ad5e1b922a9c6 Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Thu, 13 Jan 2022 02:56:55 -0800
Subject: [PATCH] Fix: liblrmd: Avoid double-free during notify operation
This commit fixes a regression introduced by 31c7fa8a, which caused a
double-free in notify operations. lrmd_dispatch_internal() assigns the
exit_reason string directly from an XML node to a new lrmd_event_data_t
object (without duplicating it), and this string gets freed twice.
Free #1: pcmk__create_history_xml() (reached via callback) calls
lrmd__set_result(), which frees event.exit_reason and sets it to NULL.
Free #2: lrmd_ipc_dispatch() frees the XML node, which contains a
pointer to the exit_reason string just freed, after
lrmd_dispatch_internal() returns.
Prior to 31c7fa8a, pcmk__create_history_xml reset event.rc and
event.op_status but **not** event.exit_reason.
In this commit we simply make a copy of event.exit_reason in
lrmd_dispatch_internal() before the callback. This way we don't have to
worry about whatever happens in the callback, and we can continue to
unset the exit_reason alongside the rc and op_status. The added overhead
should be minimal.
This commit also makes a copy of output. That's not strictly necessary
but adds some futureproofing and allows us to call lrmd__reset_result()
at the end of lrmd_dispatch_internal().
Resolves: RHBZ#2039675
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
lib/lrmd/lrmd_client.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c
index ee31bb5ae9..5131a648b7 100644
--- a/lib/lrmd/lrmd_client.c
+++ b/lib/lrmd/lrmd_client.c
@@ -305,9 +305,10 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg)
event.user_data = crm_element_value(msg, F_LRMD_RSC_USERDATA_STR);
event.type = lrmd_event_exec_complete;
- // No need to duplicate the memory, so don't use setter functions
- event.output = crm_element_value(msg, F_LRMD_RSC_OUTPUT);
- event.exit_reason = crm_element_value(msg, F_LRMD_RSC_EXIT_REASON);
+ /* output and exit_reason may be freed by a callback */
+ event.output = crm_element_value_copy(msg, F_LRMD_RSC_OUTPUT);
+ lrmd__set_result(&event, event.rc, event.op_status,
+ crm_element_value(msg, F_LRMD_RSC_EXIT_REASON));
event.params = xml2list(msg);
} else if (pcmk__str_eq(type, LRMD_OP_NEW_CLIENT, pcmk__str_none)) {
@@ -324,6 +325,7 @@ lrmd_dispatch_internal(lrmd_t * lrmd, xmlNode * msg)
if (event.params) {
g_hash_table_destroy(event.params);
}
+ lrmd__reset_result(&event);
}
// \return Always 0, to indicate that IPC mainloop source should be kept
--
2.27.0

View File

@@ -0,0 +1,26 @@
From 186d5a02fba919c455fd6eeb050b4be107f82159 Mon Sep 17 00:00:00 2001
From: Chris Lumens <clumens@redhat.com>
Date: Thu, 13 Jan 2022 17:02:47 -0500
Subject: [PATCH] Low: scheduler: Use the old RC code to log maskable probe
failures.
---
lib/pengine/unpack.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 8a2d2a6d6d..b01f86257a 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -3780,7 +3780,7 @@ unpack_rsc_op(pe_resource_t *rsc, pe_node_t *node, xmlNode *xml_op,
if (maskable_probe_failure) {
crm_notice("Treating probe result '%s' for %s on %s as 'not running'",
- services_ocf_exitcode_str(rc), rsc->id, node->details->uname);
+ services_ocf_exitcode_str(old_rc), rsc->id, node->details->uname);
update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure,
on_fail, data_set);
crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname);
--
2.27.0

View File

@@ -0,0 +1,43 @@
From 9d812b0401d4cedef53a3cc3653ec782a5c49e37 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 13 Jan 2022 10:42:02 -0600
Subject: [PATCH] Doc: fencer: improve pcmk_delay_base meta-data
Update its type, since its value can now be a node map as well as a string,
and add more detail to its description.
---
daemons/fenced/pacemaker-fenced.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c
index 1b954be5a4..12f331496c 100644
--- a/daemons/fenced/pacemaker-fenced.c
+++ b/daemons/fenced/pacemaker-fenced.c
@@ -1548,13 +1548,17 @@ main(int argc, char **argv)
PCMK_STONITH_DELAY_BASE);
printf(" <shortdesc lang=\"en\">Enable a base delay for "
"fencing actions and specify base delay value.</shortdesc>\n");
- printf(" <longdesc lang=\"en\">This prevents double fencing when "
- "different delays are configured on the nodes.\nUse this to "
- "enable a static delay for fencing actions.\nThe overall delay "
- "is derived from a random delay value adding this static delay "
- "so that the sum is kept below the maximum delay.\nSet to eg. "
- "node1:1s;node2:5 to set different value per node.</longdesc>\n");
- printf(" <content type=\"time\" default=\"0s\"/>\n");
+ printf(" <longdesc lang=\"en\">This enables a static delay for "
+ "fencing actions, which can help avoid \"death matches\" where "
+ "two nodes try to fence each other at the same time. If "
+ PCMK_STONITH_DELAY_MAX " is also used, a random delay will be "
+ "added such that the total delay is kept below that value.\n"
+ "This can be set to a single time value to apply to any node "
+ "targeted by this device (useful if a separate device is "
+ "configured for each target), or to a node map (for example, "
+ "\"node1:1s;node2:5\") to set a different value per target.\n"
+ " </longdesc>\n");
+ printf(" <content type=\"string\" default=\"0s\"/>\n");
printf(" </parameter>\n");
printf(" <parameter name=\"%s\" unique=\"0\">\n",
--
2.27.0

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,56 @@
From e330568504ec379ea42460d21a2e20b1652d9445 Mon Sep 17 00:00:00 2001
From: Reid Wahl <nrwahl@protonmail.com>
Date: Fri, 14 Jan 2022 01:35:35 -0800
Subject: [PATCH] Fix: fencing: Don't set stonith action to pending if fork
fails
Currently, we set a stonith action to pending if
services_action_async_fork_notify() returns true. However, "true" means
that the svc_action should not be freed. This might be because the
svc_action forked successfully and is pending, or it might be because
the svc_action has already been freed.
In the case of stonith actions, if we fail to fork, the stonith_action_t
object stored in svc_action->cb_data gets freed by the done callback,
and services_action_async_fork_notify() returns true. If we try to set
the action to pending, it causes a segfault.
This commit moves the "set to pending" step to the
stonith_action_async_forked() callback. We avoid the segfault and only
set it to pending if it's actually pending.
A slight difference in ordering was required to achieve this. Now, the
action gets set to pending immediately before being added to the
mainloop, instead of immediately after.
Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
---
lib/fencing/st_actions.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/lib/fencing/st_actions.c b/lib/fencing/st_actions.c
index e4e43225cd..306001af69 100644
--- a/lib/fencing/st_actions.c
+++ b/lib/fencing/st_actions.c
@@ -550,6 +550,9 @@ stonith_action_async_forked(svc_action_t *svc_action)
(action->fork_cb) (svc_action->pid, action->userdata);
}
+ pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING,
+ NULL);
+
crm_trace("Child process %d performing action '%s' successfully forked",
action->pid, action->action);
}
@@ -619,8 +622,6 @@ internal_stonith_action_execute(stonith_action_t * action)
if (services_action_async_fork_notify(svc_action,
&stonith_action_async_done,
&stonith_action_async_forked)) {
- pcmk__set_result(&(action->result), PCMK_OCF_UNKNOWN,
- PCMK_EXEC_PENDING, NULL);
return pcmk_ok;
}
--
2.27.0

View File

@@ -0,0 +1,875 @@
From 523f62eb235836a01ea039c23ada261a494f7b32 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 10 Nov 2021 15:22:47 -0600
Subject: [PATCH 01/11] Feature: libpacemaker: improve result for high-level
fencing API
Previously, pcmk__fence_action()'s helpers for asynchronous fencing actions
initialized the result to a generic error, and then overrode that only on
success.
Now, set a detailed result for early failures, and use the full result when
available from the fencing API.
A standard return code is still returned to callers at this point.
---
lib/pacemaker/pcmk_fence.c | 31 ++++++++++++++++++-------------
1 file changed, 18 insertions(+), 13 deletions(-)
diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c
index 7d6acd0de6..125e1b268b 100644
--- a/lib/pacemaker/pcmk_fence.c
+++ b/lib/pacemaker/pcmk_fence.c
@@ -32,8 +32,8 @@ static struct {
unsigned int timeout;
unsigned int tolerance;
int delay;
- int rc;
-} async_fence_data;
+ pcmk__action_result_t result;
+} async_fence_data = { NULL, };
static int
handle_level(stonith_t *st, char *target, int fence_level,
@@ -76,14 +76,13 @@ handle_level(stonith_t *st, char *target, int fence_level,
static void
notify_callback(stonith_t * st, stonith_event_t * e)
{
- if (e->result != pcmk_ok) {
- return;
- }
+ if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei)
+ && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) {
- if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei) &&
- pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) {
-
- async_fence_data.rc = e->result;
+ pcmk__set_result(&async_fence_data.result,
+ stonith__event_exit_status(e),
+ stonith__event_execution_status(e),
+ stonith__event_exit_reason(e));
g_main_loop_quit(mainloop);
}
}
@@ -91,8 +90,9 @@ notify_callback(stonith_t * st, stonith_event_t * e)
static void
fence_callback(stonith_t * stonith, stonith_callback_data_t * data)
{
- async_fence_data.rc = data->rc;
-
+ pcmk__set_result(&async_fence_data.result, stonith__exit_status(data),
+ stonith__execution_status(data),
+ stonith__exit_reason(data));
g_main_loop_quit(mainloop);
}
@@ -106,6 +106,8 @@ async_fence_helper(gpointer user_data)
if (rc != pcmk_ok) {
fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc));
g_main_loop_quit(mainloop);
+ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR,
+ PCMK_EXEC_NOT_CONNECTED, NULL);
return TRUE;
}
@@ -121,6 +123,8 @@ async_fence_helper(gpointer user_data)
if (call_id < 0) {
g_main_loop_quit(mainloop);
+ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR,
+ PCMK_EXEC_ERROR, pcmk_strerror(call_id));
return TRUE;
}
@@ -146,7 +150,8 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action,
async_fence_data.timeout = timeout;
async_fence_data.tolerance = tolerance;
async_fence_data.delay = delay;
- async_fence_data.rc = pcmk_err_generic;
+ pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR, PCMK_EXEC_UNKNOWN,
+ NULL);
trig = mainloop_add_trigger(G_PRIORITY_HIGH, async_fence_helper, NULL);
mainloop_set_trigger(trig);
@@ -156,7 +161,7 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action,
free(async_fence_data.name);
- return pcmk_legacy2rc(async_fence_data.rc);
+ return stonith__result2rc(&async_fence_data.result);
}
#ifdef BUILD_PUBLIC_LIBPACEMAKER
--
2.27.0
From 008868fae5d1b0d6d8dc61f7acfb3856801ddd52 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 10 Dec 2021 15:36:10 -0600
Subject: [PATCH 02/11] Refactor: libpacemaker: add exit reason to high-level
fencing API
Nothing uses it as of this commit
---
include/pacemaker.h | 5 ++++-
include/pcmki/pcmki_fence.h | 5 ++++-
lib/pacemaker/pcmk_fence.c | 10 +++++++---
tools/stonith_admin.c | 6 +++---
4 files changed, 18 insertions(+), 8 deletions(-)
diff --git a/include/pacemaker.h b/include/pacemaker.h
index a8523c969e..0daa4c5945 100644
--- a/include/pacemaker.h
+++ b/include/pacemaker.h
@@ -189,12 +189,15 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types);
* again.
* \param[in] delay Apply a fencing delay. Value -1 means disable also any
* static/random fencing delays from pcmk_delay_base/max.
+ * \param[out] reason If not NULL, where to put descriptive failure reason
*
* \return Standard Pacemaker return code
+ * \note If \p reason is not NULL, the caller is responsible for freeing its
+ * returned value.
*/
int pcmk_fence_action(stonith_t *st, const char *target, const char *action,
const char *name, unsigned int timeout, unsigned int tolerance,
- int delay);
+ int delay, char **reason);
/*!
* \brief List the fencing operations that have occurred for a specific node.
diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h
index d4cef68f5c..c3da0361d7 100644
--- a/include/pcmki/pcmki_fence.h
+++ b/include/pcmki/pcmki_fence.h
@@ -28,12 +28,15 @@
* again.
* \param[in] delay Apply a fencing delay. Value -1 means disable also any
* static/random fencing delays from pcmk_delay_base/max
+ * \param[out] reason If not NULL, where to put descriptive failure reason
*
* \return Standard Pacemaker return code
+ * \note If \p reason is not NULL, the caller is responsible for freeing its
+ * returned value.
*/
int pcmk__fence_action(stonith_t *st, const char *target, const char *action,
const char *name, unsigned int timeout, unsigned int tolerance,
- int delay);
+ int delay, char **reason);
/*!
* \brief List the fencing operations that have occurred for a specific node.
diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c
index 125e1b268b..dbf084fb6b 100644
--- a/lib/pacemaker/pcmk_fence.c
+++ b/lib/pacemaker/pcmk_fence.c
@@ -139,7 +139,7 @@ async_fence_helper(gpointer user_data)
int
pcmk__fence_action(stonith_t *st, const char *target, const char *action,
const char *name, unsigned int timeout, unsigned int tolerance,
- int delay)
+ int delay, char **reason)
{
crm_trigger_t *trig;
@@ -161,6 +161,9 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action,
free(async_fence_data.name);
+ if ((reason != NULL) && (async_fence_data.result.exit_reason != NULL)) {
+ *reason = strdup(async_fence_data.result.exit_reason);
+ }
return stonith__result2rc(&async_fence_data.result);
}
@@ -168,9 +171,10 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action,
int
pcmk_fence_action(stonith_t *st, const char *target, const char *action,
const char *name, unsigned int timeout, unsigned int tolerance,
- int delay)
+ int delay, char **reason)
{
- return pcmk__fence_action(st, target, action, name, timeout, tolerance, delay);
+ return pcmk__fence_action(st, target, action, name, timeout, tolerance,
+ delay, reason);
}
#endif
diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c
index 2d48326e1b..fdc7c46d49 100644
--- a/tools/stonith_admin.c
+++ b/tools/stonith_admin.c
@@ -571,17 +571,17 @@ main(int argc, char **argv)
case 'B':
rc = pcmk__fence_action(st, target, "reboot", name, options.timeout*1000,
- options.tolerance*1000, options.delay);
+ options.tolerance*1000, options.delay, NULL);
break;
case 'F':
rc = pcmk__fence_action(st, target, "off", name, options.timeout*1000,
- options.tolerance*1000, options.delay);
+ options.tolerance*1000, options.delay, NULL);
break;
case 'U':
rc = pcmk__fence_action(st, target, "on", name, options.timeout*1000,
- options.tolerance*1000, options.delay);
+ options.tolerance*1000, options.delay, NULL);
break;
case 'h':
--
2.27.0
From 7570510f9985ba75ef73fb824f28109e135ace0a Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 10 Dec 2021 15:40:48 -0600
Subject: [PATCH 03/11] Refactor: libpacemaker: rename high-level fencing API
Rename pcmk_fence_action() to pcmk_request_fencing(), and its internal
equivalent pcmk__fence_action() to pcmk__request_fencing(). The change is
backward-compatible because pcmk_fence_action() has not been exposed publicly
yet.
"Fence action" can be easily confused with libcrmservice actions, liblrmd
actions, libstonithd actions, scheduler actions, and so forth.
Also, the new name makes it clearer that the caller is requesting that the
cluster perform fencing, and not directly performing fencing.
---
include/pacemaker.h | 20 ++++++++++----------
include/pcmki/pcmki_fence.h | 16 ++++++++--------
lib/pacemaker/pcmk_fence.c | 16 ++++++++--------
tools/stonith_admin.c | 18 ++++++++++++------
4 files changed, 38 insertions(+), 32 deletions(-)
diff --git a/include/pacemaker.h b/include/pacemaker.h
index 0daa4c5945..e581f975a9 100644
--- a/include/pacemaker.h
+++ b/include/pacemaker.h
@@ -177,27 +177,27 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types);
#ifdef BUILD_PUBLIC_LIBPACEMAKER
/*!
- * \brief Perform a STONITH action.
+ * \brief Ask the cluster to perform fencing
*
- * \param[in] st A connection to the STONITH API.
- * \param[in] target The node receiving the action.
- * \param[in] action The action to perform.
+ * \param[in] st A connection to the fencer API
+ * \param[in] target The node that should be fenced
+ * \param[in] action The fencing action (on, off, reboot) to perform
* \param[in] name Who requested the fence action?
- * \param[in] timeout How long to wait for the operation to complete (in ms).
+ * \param[in] timeout How long to wait for the operation to complete (in ms)
* \param[in] tolerance If a successful action for \p target happened within
* this many ms, return 0 without performing the action
- * again.
+ * again
* \param[in] delay Apply a fencing delay. Value -1 means disable also any
- * static/random fencing delays from pcmk_delay_base/max.
+ * static/random fencing delays from pcmk_delay_base/max
* \param[out] reason If not NULL, where to put descriptive failure reason
*
* \return Standard Pacemaker return code
* \note If \p reason is not NULL, the caller is responsible for freeing its
* returned value.
*/
-int pcmk_fence_action(stonith_t *st, const char *target, const char *action,
- const char *name, unsigned int timeout, unsigned int tolerance,
- int delay, char **reason);
+int pcmk_request_fencing(stonith_t *st, const char *target, const char *action,
+ const char *name, unsigned int timeout,
+ unsigned int tolerance, int delay, char **reason);
/*!
* \brief List the fencing operations that have occurred for a specific node.
diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h
index c3da0361d7..e3a7e27264 100644
--- a/include/pcmki/pcmki_fence.h
+++ b/include/pcmki/pcmki_fence.h
@@ -13,14 +13,14 @@
# include <crm/common/output_internal.h>
/*!
- * \brief Perform a STONITH action.
+ * \brief Ask the cluster to perform fencing
*
- * \note This is the internal version of pcmk_fence_action(). External users
+ * \note This is the internal version of pcmk_request_fencing(). External users
* of the pacemaker API should use that function instead.
*
- * \param[in] st A connection to the STONITH API.
- * \param[in] target The node receiving the action.
- * \param[in] action The action to perform.
+ * \param[in] st A connection to the fencer API
+ * \param[in] target The node that should be fenced
+ * \param[in] action The fencing action (on, off, reboot) to perform
* \param[in] name Who requested the fence action?
* \param[in] timeout How long to wait for the operation to complete (in ms).
* \param[in] tolerance If a successful action for \p target happened within
@@ -34,9 +34,9 @@
* \note If \p reason is not NULL, the caller is responsible for freeing its
* returned value.
*/
-int pcmk__fence_action(stonith_t *st, const char *target, const char *action,
- const char *name, unsigned int timeout, unsigned int tolerance,
- int delay, char **reason);
+int pcmk__request_fencing(stonith_t *st, const char *target, const char *action,
+ const char *name, unsigned int timeout,
+ unsigned int tolerance, int delay, char **reason);
/*!
* \brief List the fencing operations that have occurred for a specific node.
diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c
index dbf084fb6b..1b7feb54b2 100644
--- a/lib/pacemaker/pcmk_fence.c
+++ b/lib/pacemaker/pcmk_fence.c
@@ -137,9 +137,9 @@ async_fence_helper(gpointer user_data)
}
int
-pcmk__fence_action(stonith_t *st, const char *target, const char *action,
- const char *name, unsigned int timeout, unsigned int tolerance,
- int delay, char **reason)
+pcmk__request_fencing(stonith_t *st, const char *target, const char *action,
+ const char *name, unsigned int timeout,
+ unsigned int tolerance, int delay, char **reason)
{
crm_trigger_t *trig;
@@ -169,12 +169,12 @@ pcmk__fence_action(stonith_t *st, const char *target, const char *action,
#ifdef BUILD_PUBLIC_LIBPACEMAKER
int
-pcmk_fence_action(stonith_t *st, const char *target, const char *action,
- const char *name, unsigned int timeout, unsigned int tolerance,
- int delay, char **reason)
+pcmk_request_fencing(stonith_t *st, const char *target, const char *action,
+ const char *name, unsigned int timeout,
+ unsigned int tolerance, int delay, char **reason)
{
- return pcmk__fence_action(st, target, action, name, timeout, tolerance,
- delay, reason);
+ return pcmk__request_fencing(st, target, action, name, timeout, tolerance,
+ delay, reason);
}
#endif
diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c
index fdc7c46d49..56948b3875 100644
--- a/tools/stonith_admin.c
+++ b/tools/stonith_admin.c
@@ -570,18 +570,24 @@ main(int argc, char **argv)
break;
case 'B':
- rc = pcmk__fence_action(st, target, "reboot", name, options.timeout*1000,
- options.tolerance*1000, options.delay, NULL);
+ rc = pcmk__request_fencing(st, target, "reboot", name,
+ options.timeout * 1000,
+ options.tolerance * 1000,
+ options.delay, NULL);
break;
case 'F':
- rc = pcmk__fence_action(st, target, "off", name, options.timeout*1000,
- options.tolerance*1000, options.delay, NULL);
+ rc = pcmk__request_fencing(st, target, "off", name,
+ options.timeout * 1000,
+ options.tolerance * 1000,
+ options.delay, NULL);
break;
case 'U':
- rc = pcmk__fence_action(st, target, "on", name, options.timeout*1000,
- options.tolerance*1000, options.delay, NULL);
+ rc = pcmk__request_fencing(st, target, "on", name,
+ options.timeout * 1000,
+ options.tolerance * 1000,
+ options.delay, NULL);
break;
case 'h':
--
2.27.0
From 247eb303df934944c0b72b162bb661cee6e0ed8b Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 10 Dec 2021 15:52:37 -0600
Subject: [PATCH 04/11] Refactor: tools: drop unnecessary string duplication in
stonith_admin
---
tools/stonith_admin.c | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c
index 56948b3875..c11e302e76 100644
--- a/tools/stonith_admin.c
+++ b/tools/stonith_admin.c
@@ -360,8 +360,6 @@ main(int argc, char **argv)
pcmk__cli_init_logging("stonith_admin", args->verbosity);
- name = strdup(crm_system_name);
-
rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv);
if (rc != pcmk_rc_ok) {
exit_code = CRM_EX_ERROR;
@@ -496,7 +494,7 @@ main(int argc, char **argv)
if (st == NULL) {
rc = -ENOMEM;
} else if (!no_connect) {
- rc = st->cmds->connect(st, name, NULL);
+ rc = st->cmds->connect(st, crm_system_name, NULL);
}
if (rc < 0) {
out->err(out, "Could not connect to fencer: %s", pcmk_strerror(rc));
@@ -570,21 +568,21 @@ main(int argc, char **argv)
break;
case 'B':
- rc = pcmk__request_fencing(st, target, "reboot", name,
+ rc = pcmk__request_fencing(st, target, "reboot", crm_system_name,
options.timeout * 1000,
options.tolerance * 1000,
options.delay, NULL);
break;
case 'F':
- rc = pcmk__request_fencing(st, target, "off", name,
+ rc = pcmk__request_fencing(st, target, "off", crm_system_name,
options.timeout * 1000,
options.tolerance * 1000,
options.delay, NULL);
break;
case 'U':
- rc = pcmk__request_fencing(st, target, "on", name,
+ rc = pcmk__request_fencing(st, target, "on", crm_system_name,
options.timeout * 1000,
options.tolerance * 1000,
options.delay, NULL);
@@ -619,7 +617,6 @@ main(int argc, char **argv)
out->finish(out, exit_code, true, NULL);
pcmk__output_free(out);
}
- free(name);
stonith_key_value_freeall(options.params, 1, 1);
if (st != NULL) {
--
2.27.0
From a7888bf6868d8d9d9c77f65ae9983cf748bb0548 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 10 Dec 2021 15:56:34 -0600
Subject: [PATCH 05/11] Refactor: tools: functionize requesting fencing in
stonith_admin
... to reduce code duplication and improve readability
---
tools/stonith_admin.c | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)
diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c
index c11e302e76..f738a9c888 100644
--- a/tools/stonith_admin.c
+++ b/tools/stonith_admin.c
@@ -331,6 +331,18 @@ build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) {
return context;
}
+// \return Standard Pacemaker return code
+static int
+request_fencing(stonith_t *st, const char *target, const char *command)
+{
+ int rc = pcmk__request_fencing(st, target, command, crm_system_name,
+ options.timeout * 1000,
+ options.tolerance * 1000,
+ options.delay, NULL);
+
+ return rc;
+}
+
int
main(int argc, char **argv)
{
@@ -568,24 +580,15 @@ main(int argc, char **argv)
break;
case 'B':
- rc = pcmk__request_fencing(st, target, "reboot", crm_system_name,
- options.timeout * 1000,
- options.tolerance * 1000,
- options.delay, NULL);
+ rc = request_fencing(st, target, "reboot");
break;
case 'F':
- rc = pcmk__request_fencing(st, target, "off", crm_system_name,
- options.timeout * 1000,
- options.tolerance * 1000,
- options.delay, NULL);
+ rc = request_fencing(st, target, "off");
break;
case 'U':
- rc = pcmk__request_fencing(st, target, "on", crm_system_name,
- options.timeout * 1000,
- options.tolerance * 1000,
- options.delay, NULL);
+ rc = request_fencing(st, target, "on");
break;
case 'h':
--
2.27.0
From 2da32df780983ec1197e857eed5eeb5bf1101889 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 10 Dec 2021 16:05:19 -0600
Subject: [PATCH 06/11] Feature: tools: display failure reasons for
stonith_admin fencing commands
Previously, stonith_admin's --fence/--unfence/--reboot options did not output
any error message on failure. Now, they do, including the exit reason, if
available.
---
tools/stonith_admin.c | 30 +++++++++++++++++++++++++-----
1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c
index f738a9c888..5590faf11e 100644
--- a/tools/stonith_admin.c
+++ b/tools/stonith_admin.c
@@ -333,13 +333,33 @@ build_arg_context(pcmk__common_args_t *args, GOptionGroup **group) {
// \return Standard Pacemaker return code
static int
-request_fencing(stonith_t *st, const char *target, const char *command)
+request_fencing(stonith_t *st, const char *target, const char *command,
+ GError **error)
{
+ char *reason = NULL;
int rc = pcmk__request_fencing(st, target, command, crm_system_name,
options.timeout * 1000,
options.tolerance * 1000,
- options.delay, NULL);
+ options.delay, &reason);
+ if (rc != pcmk_rc_ok) {
+ const char *rc_str = pcmk_rc_str(rc);
+
+ // If reason is identical to return code string, don't display it twice
+ if (pcmk__str_eq(rc_str, reason, pcmk__str_none)) {
+ free(reason);
+ reason = NULL;
+ }
+
+ g_set_error(error, PCMK__RC_ERROR, rc,
+ "Couldn't %sfence %s: %s%s%s%s",
+ ((strcmp(command, "on") == 0)? "un" : ""),
+ target, pcmk_rc_str(rc),
+ ((reason == NULL)? "" : " ("),
+ ((reason == NULL)? "" : reason),
+ ((reason == NULL)? "" : ")"));
+ }
+ free(reason);
return rc;
}
@@ -580,15 +600,15 @@ main(int argc, char **argv)
break;
case 'B':
- rc = request_fencing(st, target, "reboot");
+ rc = request_fencing(st, target, "reboot", &error);
break;
case 'F':
- rc = request_fencing(st, target, "off");
+ rc = request_fencing(st, target, "off", &error);
break;
case 'U':
- rc = request_fencing(st, target, "on");
+ rc = request_fencing(st, target, "on", &error);
break;
case 'h':
--
2.27.0
From 2d99eba4c326d3b13dbbe446971ea5febd5d05be Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Fri, 10 Dec 2021 16:08:49 -0600
Subject: [PATCH 07/11] Feature: libpacemaker: return exit reason for fencer
connection failures
... instead of outputting to stderr directly, so that the caller (i.e.
stonith_admin) can output the error in the correct output format.
---
lib/pacemaker/pcmk_fence.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c
index 1b7feb54b2..d17b07cda2 100644
--- a/lib/pacemaker/pcmk_fence.c
+++ b/lib/pacemaker/pcmk_fence.c
@@ -104,10 +104,9 @@ async_fence_helper(gpointer user_data)
int rc = stonith_api_connect_retry(st, async_fence_data.name, 10);
if (rc != pcmk_ok) {
- fprintf(stderr, "Could not connect to fencer: %s\n", pcmk_strerror(rc));
g_main_loop_quit(mainloop);
pcmk__set_result(&async_fence_data.result, CRM_EX_ERROR,
- PCMK_EXEC_NOT_CONNECTED, NULL);
+ PCMK_EXEC_NOT_CONNECTED, pcmk_strerror(rc));
return TRUE;
}
--
2.27.0
From 4480ef0602f47450bdddfbde360a6a8327710927 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 17 Jan 2022 09:39:39 -0600
Subject: [PATCH 08/11] Low: libpacemaker: compare fence action names
case-sensitively
---
lib/pacemaker/pcmk_fence.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c
index d17b07cda2..2a8f50a555 100644
--- a/lib/pacemaker/pcmk_fence.c
+++ b/lib/pacemaker/pcmk_fence.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2009-2021 the Pacemaker project contributors
+ * Copyright 2009-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -77,7 +77,7 @@ static void
notify_callback(stonith_t * st, stonith_event_t * e)
{
if (pcmk__str_eq(async_fence_data.target, e->target, pcmk__str_casei)
- && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_casei)) {
+ && pcmk__str_eq(async_fence_data.action, e->action, pcmk__str_none)) {
pcmk__set_result(&async_fence_data.result,
stonith__event_exit_status(e),
@@ -549,7 +549,7 @@ pcmk__reduce_fence_history(stonith_history_t *history)
if ((hp->state == st_done) || (hp->state == st_failed)) {
/* action not in progress */
if (pcmk__str_eq(hp->target, np->target, pcmk__str_casei) &&
- pcmk__str_eq(hp->action, np->action, pcmk__str_casei) &&
+ pcmk__str_eq(hp->action, np->action, pcmk__str_none) &&
(hp->state == np->state) &&
((hp->state == st_done) ||
pcmk__str_eq(hp->delegate, np->delegate, pcmk__str_casei))) {
--
2.27.0
From fe4c65a3b9e715c2b535709f989f2369d3637b78 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 17 Jan 2022 09:45:24 -0600
Subject: [PATCH 09/11] Refactor: libpacemaker: avoid unnecessary string
duplication
... and don't leave any dynamic memory hanging around
---
lib/pacemaker/pcmk_fence.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/lib/pacemaker/pcmk_fence.c b/lib/pacemaker/pcmk_fence.c
index 2a8f50a555..260fa5ab8e 100644
--- a/lib/pacemaker/pcmk_fence.c
+++ b/lib/pacemaker/pcmk_fence.c
@@ -141,6 +141,7 @@ pcmk__request_fencing(stonith_t *st, const char *target, const char *action,
unsigned int tolerance, int delay, char **reason)
{
crm_trigger_t *trig;
+ int rc = pcmk_rc_ok;
async_fence_data.st = st;
async_fence_data.name = strdup(name);
@@ -160,10 +161,14 @@ pcmk__request_fencing(stonith_t *st, const char *target, const char *action,
free(async_fence_data.name);
- if ((reason != NULL) && (async_fence_data.result.exit_reason != NULL)) {
- *reason = strdup(async_fence_data.result.exit_reason);
+ if (reason != NULL) {
+ // Give the caller ownership of the exit reason
+ *reason = async_fence_data.result.exit_reason;
+ async_fence_data.result.exit_reason = NULL;
}
- return stonith__result2rc(&async_fence_data.result);
+ rc = stonith__result2rc(&async_fence_data.result);
+ pcmk__reset_result(&async_fence_data.result);
+ return rc;
}
#ifdef BUILD_PUBLIC_LIBPACEMAKER
--
2.27.0
From 7b7af07796f05a1adabdac655582be2e17106f81 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 17 Jan 2022 10:07:10 -0600
Subject: [PATCH 10/11] Doc: libpacemaker: improve pcmk__request_fencing()
doxygen block
---
include/pacemaker.h | 6 ++++--
include/pcmki/pcmki_fence.h | 15 +++++++++------
2 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/include/pacemaker.h b/include/pacemaker.h
index e581f975a9..266a844892 100644
--- a/include/pacemaker.h
+++ b/include/pacemaker.h
@@ -187,8 +187,10 @@ int pcmk_list_nodes(xmlNodePtr *xml, char *node_types);
* \param[in] tolerance If a successful action for \p target happened within
* this many ms, return 0 without performing the action
* again
- * \param[in] delay Apply a fencing delay. Value -1 means disable also any
- * static/random fencing delays from pcmk_delay_base/max
+ * \param[in] delay Apply this delay (in milliseconds) before initiating the
+ * fencing action (a value of -1 applies no delay and also
+ * disables any fencing delay from pcmk_delay_base and
+ * pcmk_delay_max)
* \param[out] reason If not NULL, where to put descriptive failure reason
*
* \return Standard Pacemaker return code
diff --git a/include/pcmki/pcmki_fence.h b/include/pcmki/pcmki_fence.h
index e3a7e27264..4a2fe3c481 100644
--- a/include/pcmki/pcmki_fence.h
+++ b/include/pcmki/pcmki_fence.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2019-2021 the Pacemaker project contributors
+ * Copyright 2019-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -22,17 +22,20 @@
* \param[in] target The node that should be fenced
* \param[in] action The fencing action (on, off, reboot) to perform
* \param[in] name Who requested the fence action?
- * \param[in] timeout How long to wait for the operation to complete (in ms).
+ * \param[in] timeout How long to wait for the operation to complete (in ms)
* \param[in] tolerance If a successful action for \p target happened within
- * this many ms, return 0 without performing the action
- * again.
- * \param[in] delay Apply a fencing delay. Value -1 means disable also any
- * static/random fencing delays from pcmk_delay_base/max
+ * this many milliseconds, return success without
+ * performing the action again
+ * \param[in] delay Apply this delay (in milliseconds) before initiating the
+ * fencing action (a value of -1 applies no delay and also
+ * disables any fencing delay from pcmk_delay_base and
+ * pcmk_delay_max)
* \param[out] reason If not NULL, where to put descriptive failure reason
*
* \return Standard Pacemaker return code
* \note If \p reason is not NULL, the caller is responsible for freeing its
* returned value.
+ * \todo delay is eventually used with g_timeout_add() and should be guint
*/
int pcmk__request_fencing(stonith_t *st, const char *target, const char *action,
const char *name, unsigned int timeout,
--
2.27.0
From 61fb7271712e1246eb6d9472dc1afc7cd10e0a79 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 17 Jan 2022 10:18:02 -0600
Subject: [PATCH 11/11] Fix: tools: get stonith_admin -T option working again
Regression introduced in 2.0.3 by 3910b6fec
This reverts commit 247eb303df934944c0b72b162bb661cee6e0ed8b
("Refactor: tools: drop unnecessary string duplication in stonith_admin")
and fixes a regression introduced when stonith_admin was converted to use
GOption.
The -T option is intended to override the client name passed to the fencer API,
but the client name was set to the default (crm_system_name) after option
processing had already been done, so any value for -T was overwritten by the
default, and its memory was leaked.
This commit sets the default only if -T was not used.
---
tools/stonith_admin.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c
index 5590faf11e..54774b6fee 100644
--- a/tools/stonith_admin.c
+++ b/tools/stonith_admin.c
@@ -337,10 +337,10 @@ request_fencing(stonith_t *st, const char *target, const char *command,
GError **error)
{
char *reason = NULL;
- int rc = pcmk__request_fencing(st, target, command, crm_system_name,
- options.timeout * 1000,
- options.tolerance * 1000,
- options.delay, &reason);
+ int rc = pcmk__request_fencing(st, target, command, name,
+ options.timeout * 1000,
+ options.tolerance * 1000,
+ options.delay, &reason);
if (rc != pcmk_rc_ok) {
const char *rc_str = pcmk_rc_str(rc);
@@ -392,6 +392,10 @@ main(int argc, char **argv)
pcmk__cli_init_logging("stonith_admin", args->verbosity);
+ if (name == NULL) {
+ name = strdup(crm_system_name);
+ }
+
rc = pcmk__output_new(&out, args->output_ty, args->output_dest, argv);
if (rc != pcmk_rc_ok) {
exit_code = CRM_EX_ERROR;
@@ -526,7 +530,7 @@ main(int argc, char **argv)
if (st == NULL) {
rc = -ENOMEM;
} else if (!no_connect) {
- rc = st->cmds->connect(st, crm_system_name, NULL);
+ rc = st->cmds->connect(st, name, NULL);
}
if (rc < 0) {
out->err(out, "Could not connect to fencer: %s", pcmk_strerror(rc));
@@ -640,6 +644,7 @@ main(int argc, char **argv)
out->finish(out, exit_code, true, NULL);
pcmk__output_free(out);
}
+ free(name);
stonith_key_value_freeall(options.params, 1, 1);
if (st != NULL) {
--
2.27.0

View File

@ -0,0 +1,796 @@
From 08c3420f2c857e7b27cd960f355d787af534da7d Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 18 Jan 2022 16:04:49 -0600
Subject: [PATCH 01/12] Log: libcrmcommon: improve description for "not
connected" status
PCMK_EXEC_NOT_CONNECTED was originally added to represent "No executor
connection", but it can also now mean no fencer connection, so change it to
"Internal communication failure" which is probably less mysterious to end users
anyway (especially since it should be accompanied by a more descriptive exit
reason).
---
include/crm/common/results.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/crm/common/results.h b/include/crm/common/results.h
index 873faf5c43..3d322a7ce6 100644
--- a/include/crm/common/results.h
+++ b/include/crm/common/results.h
@@ -349,7 +349,7 @@ pcmk_exec_status_str(enum pcmk_exec_status status)
case PCMK_EXEC_ERROR_HARD: return "Hard error";
case PCMK_EXEC_ERROR_FATAL: return "Fatal error";
case PCMK_EXEC_NOT_INSTALLED: return "Not installed";
- case PCMK_EXEC_NOT_CONNECTED: return "No executor connection";
+ case PCMK_EXEC_NOT_CONNECTED: return "Internal communication failure";
case PCMK_EXEC_INVALID: return "Cannot execute now";
case PCMK_EXEC_NO_FENCE_DEVICE: return "No fence device";
case PCMK_EXEC_NO_SECRETS: return "CIB secrets unavailable";
--
2.27.0
From 7c345cf8cf0cb054f5634206880df035bfef7311 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 15:12:36 -0600
Subject: [PATCH 02/12] Refactor: libcrmcommon: drop unnecessary system error
redefinitions
portability.h defines some system error codes that might not be present on
non-Linux systems.
This was a bad idea, since there's no way to ensure the defined values don't
conflict with existing system codes. However, we use a number of them, so it's
probably best to keep them, at least until we can make a backward compatibility
break.
However, we don't use EUNATCH, ENOSR, or ENOSTR, so we can delete those.
---
include/portability.h | 12 ------------
lib/common/results.c | 9 ++++++---
2 files changed, 6 insertions(+), 15 deletions(-)
diff --git a/include/portability.h b/include/portability.h
index 9a60c583a7..ee065a376d 100644
--- a/include/portability.h
+++ b/include/portability.h
@@ -131,10 +131,6 @@ typedef union
# define EREMOTEIO 193
# endif
-# ifndef EUNATCH
-# define EUNATCH 194
-# endif
-
# ifndef ENOKEY
# define ENOKEY 195
# endif
@@ -147,14 +143,6 @@ typedef union
# define ETIME 197
# endif
-# ifndef ENOSR
-# define ENOSR 198
-# endif
-
-# ifndef ENOSTR
-# define ENOSTR 199
-# endif
-
# ifndef EKEYREJECTED
# define EKEYREJECTED 200
# endif
diff --git a/lib/common/results.c b/lib/common/results.c
index 6d120694cd..96cd4e5659 100644
--- a/lib/common/results.c
+++ b/lib/common/results.c
@@ -118,9 +118,6 @@ pcmk_strerror(int rc)
case EREMOTEIO:
return "Remote I/O error";
/* coverity[dead_error_condition] False positive on non-Linux */
- case EUNATCH:
- return "Protocol driver not attached";
- /* coverity[dead_error_condition] False positive on non-Linux */
case ENOKEY:
return "Required key not available";
}
@@ -342,8 +339,12 @@ pcmk_rc_name(int rc)
case ENOMSG: return "ENOMSG";
case ENOPROTOOPT: return "ENOPROTOOPT";
case ENOSPC: return "ENOSPC";
+#ifdef ENOSR
case ENOSR: return "ENOSR";
+#endif
+#ifdef ENOSTR
case ENOSTR: return "ENOSTR";
+#endif
case ENOSYS: return "ENOSYS";
case ENOTBLK: return "ENOTBLK";
case ENOTCONN: return "ENOTCONN";
@@ -376,7 +377,9 @@ pcmk_rc_name(int rc)
case ETIME: return "ETIME";
case ETIMEDOUT: return "ETIMEDOUT";
case ETXTBSY: return "ETXTBSY";
+#ifdef EUNATCH
case EUNATCH: return "EUNATCH";
+#endif
case EUSERS: return "EUSERS";
/* case EWOULDBLOCK: return "EWOULDBLOCK"; */
case EXDEV: return "EXDEV";
--
2.27.0
From eac8d1ca51eac3f437e18584f7e013d976ecee2c Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 15:33:12 -0600
Subject: [PATCH 03/12] Log: libcrmcommon: improve handling of portability.h
error codes
portability.h defines some system error codes that might not be present on
non-Linux systems.
Define a constant for each one (for example, PCMK__ECOMM for ECOMM) when
the system doesn't have the value, so we can detect that when relevant.
Also, make sure pcmk_rc_name() and pcmk_rc_str() handle all of these values.
---
include/portability.h | 8 ++++++++
lib/common/results.c | 32 ++++++++++++++++++++++++++++++--
2 files changed, 38 insertions(+), 2 deletions(-)
diff --git a/include/portability.h b/include/portability.h
index ee065a376d..5d5fbf21cb 100644
--- a/include/portability.h
+++ b/include/portability.h
@@ -116,34 +116,42 @@ typedef union
# include <errno.h>
# ifndef ENOTUNIQ
+# define PCMK__ENOTUNIQ
# define ENOTUNIQ 190
# endif
# ifndef ECOMM
+# define PCMK__ECOMM
# define ECOMM 191
# endif
# ifndef ELIBACC
+# define PCMK__ELIBACC
# define ELIBACC 192
# endif
# ifndef EREMOTEIO
+# define PCMK__EREMOTEIO
# define EREMOTEIO 193
# endif
# ifndef ENOKEY
+# define PCMK__ENOKEY
# define ENOKEY 195
# endif
# ifndef ENODATA
+# define PCMK__ENODATA
# define ENODATA 196
# endif
# ifndef ETIME
+# define PCMK__ETIME
# define ETIME 197
# endif
# ifndef EKEYREJECTED
+# define PCMK__EKEYREJECTED
# define EKEYREJECTED 200
# endif
diff --git a/lib/common/results.c b/lib/common/results.c
index 96cd4e5659..bcf289d0d6 100644
--- a/lib/common/results.c
+++ b/lib/common/results.c
@@ -395,9 +395,9 @@ pcmk_rc_name(int rc)
#ifdef EISNAM // Not available on OS X, Illumos, Solaris
case EISNAM: return "EISNAM";
case EKEYEXPIRED: return "EKEYEXPIRED";
- case EKEYREJECTED: return "EKEYREJECTED";
case EKEYREVOKED: return "EKEYREVOKED";
#endif
+ case EKEYREJECTED: return "EKEYREJECTED";
case EL2HLT: return "EL2HLT";
case EL2NSYNC: return "EL2NSYNC";
case EL3HLT: return "EL3HLT";
@@ -443,7 +443,35 @@ pcmk_rc_str(int rc)
if (rc < 0) {
return "Unknown error";
}
- return strerror(rc);
+
+ // Handle values that could be defined by system or by portability.h
+ switch (rc) {
+#ifdef PCMK__ENOTUNIQ
+ case ENOTUNIQ: return "Name not unique on network";
+#endif
+#ifdef PCMK__ECOMM
+ case ECOMM: return "Communication error on send";
+#endif
+#ifdef PCMK__ELIBACC
+ case ELIBACC: return "Can not access a needed shared library";
+#endif
+#ifdef PCMK__EREMOTEIO
+ case EREMOTEIO: return "Remote I/O error";
+#endif
+#ifdef PCMK__ENOKEY
+ case ENOKEY: return "Required key not available";
+#endif
+#ifdef PCMK__ENODATA
+ case ENODATA: return "No data available";
+#endif
+#ifdef PCMK__ETIME
+ case ETIME: return "Timer expired";
+#endif
+#ifdef PCMK__EKEYREJECTED
+ case EKEYREJECTED: return "Key was rejected by service";
+#endif
+ default: return strerror(rc);
+ }
}
// This returns negative values for errors
--
2.27.0
From 32a38ac6374f85c43e7f4051f5e519822cc481e6 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 15:39:19 -0600
Subject: [PATCH 04/12] Log: libcrmcommon: redefine pcmk_strerror() in terms of
pcmk_rc_str()
... to reduce code duplication. This causes minor differences in the string for
a few values.
---
lib/common/results.c | 67 +-------------------------------------------
1 file changed, 1 insertion(+), 66 deletions(-)
diff --git a/lib/common/results.c b/lib/common/results.c
index bcf289d0d6..b2c6e8d553 100644
--- a/lib/common/results.c
+++ b/lib/common/results.c
@@ -57,72 +57,7 @@ pcmk_errorname(int rc)
const char *
pcmk_strerror(int rc)
{
- if (rc == 0) {
- return "OK";
- }
-
- rc = abs(rc);
-
- // Of course rc > 0 ... unless someone passed INT_MIN as rc
- if ((rc > 0) && (rc < PCMK_ERROR_OFFSET)) {
- return strerror(rc);
- }
-
- switch (rc) {
- case pcmk_err_generic:
- return "Generic Pacemaker error";
- case pcmk_err_no_quorum:
- return "Operation requires quorum";
- case pcmk_err_schema_validation:
- return "Update does not conform to the configured schema";
- case pcmk_err_transform_failed:
- return "Schema transform failed";
- case pcmk_err_old_data:
- return "Update was older than existing configuration";
- case pcmk_err_diff_failed:
- return "Application of an update diff failed";
- case pcmk_err_diff_resync:
- return "Application of an update diff failed, requesting a full refresh";
- case pcmk_err_cib_modified:
- return "The on-disk configuration was manually modified";
- case pcmk_err_cib_backup:
- return "Could not archive the previous configuration";
- case pcmk_err_cib_save:
- return "Could not save the new configuration to disk";
- case pcmk_err_cib_corrupt:
- return "Could not parse on-disk configuration";
- case pcmk_err_multiple:
- return "Resource active on multiple nodes";
- case pcmk_err_node_unknown:
- return "Node not found";
- case pcmk_err_already:
- return "Situation already as requested";
- case pcmk_err_bad_nvpair:
- return "Bad name/value pair given";
- case pcmk_err_schema_unchanged:
- return "Schema is already the latest available";
- case pcmk_err_unknown_format:
- return "Unknown output format";
-
- /* The following cases will only be hit on systems for which they are non-standard */
- /* coverity[dead_error_condition] False positive on non-Linux */
- case ENOTUNIQ:
- return "Name not unique on network";
- /* coverity[dead_error_condition] False positive on non-Linux */
- case ECOMM:
- return "Communication error on send";
- /* coverity[dead_error_condition] False positive on non-Linux */
- case ELIBACC:
- return "Can not access a needed shared library";
- /* coverity[dead_error_condition] False positive on non-Linux */
- case EREMOTEIO:
- return "Remote I/O error";
- /* coverity[dead_error_condition] False positive on non-Linux */
- case ENOKEY:
- return "Required key not available";
- }
- crm_err("Unknown error code: %d", rc);
- return "Unknown error";
+ return pcmk_rc_str(pcmk_legacy2rc(rc));
}
// Standard Pacemaker API return codes
--
2.27.0
From 7c331d7e2275ffebbfd5e2f6432a6137a66ee5db Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 15:41:24 -0600
Subject: [PATCH 05/12] Log: libcrmcommon: don't say "Unknown error"
... which is unhelpful and annoying to users
---
lib/common/results.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/common/results.c b/lib/common/results.c
index b2c6e8d553..5ffac76549 100644
--- a/lib/common/results.c
+++ b/lib/common/results.c
@@ -376,7 +376,7 @@ pcmk_rc_str(int rc)
return pcmk__rcs[pcmk_rc_error - rc].desc;
}
if (rc < 0) {
- return "Unknown error";
+ return "Error";
}
// Handle values that could be defined by system or by portability.h
@@ -768,7 +768,7 @@ bz2_strerror(int rc)
case BZ_OUTBUFF_FULL:
return "output data will not fit into the buffer provided";
}
- return "Unknown error";
+ return "Data compression error";
}
crm_exit_t
--
2.27.0
From 26883b4edda7d81bfcb79bd7b33bb3210beff110 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 16:01:39 -0600
Subject: [PATCH 06/12] Log: fencing: don't warn if cluster has no watchdog
device
---
lib/fencing/st_client.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index b1de912b2a..a0f3119f3b 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -187,7 +187,12 @@ stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node)
* we drop in here - so as not to make remote nodes
* panic on that answer
*/
- crm_warn("watchdog-fencing-query failed");
+ if (rc == -ENODEV) {
+ crm_notice("Cluster does not have watchdog fencing device");
+ } else {
+ crm_warn("Could not check for watchdog fencing device: %s",
+ pcmk_strerror(rc));
+ }
} else if (list[0] == '\0') {
rv = TRUE;
} else {
--
2.27.0
From 72b3c42232deaca64ffba9582598c59331203761 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Mon, 20 Dec 2021 16:22:49 -0600
Subject: [PATCH 07/12] Test: libcrmcommon: update pcmk_rc_str() unit test for
recent change
---
lib/common/tests/results/pcmk__results_test.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/common/tests/results/pcmk__results_test.c b/lib/common/tests/results/pcmk__results_test.c
index 57a520c501..e08d4b6261 100644
--- a/lib/common/tests/results/pcmk__results_test.c
+++ b/lib/common/tests/results/pcmk__results_test.c
@@ -30,7 +30,7 @@ static void
test_for_pcmk_rc_str(void **state) {
assert_string_equal(pcmk_rc_str(pcmk_rc_error-1), "Unknown output format");
assert_string_equal(pcmk_rc_str(pcmk_rc_ok), "OK");
- assert_string_equal(pcmk_rc_str(-1), "Unknown error");
+ assert_string_equal(pcmk_rc_str(-1), "Error");
}
static void
--
2.27.0
From c1ad3d6640f695321a83183c95fae2f105adc429 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 21 Dec 2021 10:20:38 -0600
Subject: [PATCH 08/12] Test: cts-lab: update expected patterns for recent
changes
---
cts/lab/CTStests.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cts/lab/CTStests.py b/cts/lab/CTStests.py
index 62c832eb45..f4be998cfb 100644
--- a/cts/lab/CTStests.py
+++ b/cts/lab/CTStests.py
@@ -3055,7 +3055,7 @@ class RemoteStonithd(RemoteDriver):
r"pacemaker-controld.*:\s+error.*: Operation remote-.*_monitor",
r"pacemaker-controld.*:\s+error.*: Result of monitor operation for remote-.*",
r"schedulerd.*:\s+Recover remote-.*\s*\(.*\)",
- r"error: Result of monitor operation for .* on remote-.*: No executor connection",
+ r"error: Result of monitor operation for .* on remote-.*: Internal communication failure",
]
ignore_pats.extend(RemoteDriver.errorstoignore(self))
--
2.27.0
From f272e2f526633c707e894b39c7c7bce3c14de898 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 21 Dec 2021 15:40:49 -0600
Subject: [PATCH 09/12] Log: controller,libpacemaker: make history XML creation
less chatty
Other messages with the same info will already be logged at higher severity
---
daemons/controld/controld_execd.c | 3 +--
daemons/controld/controld_te_actions.c | 7 ++-----
include/pcmki/pcmki_sched_utils.h | 3 +--
lib/pacemaker/pcmk_injections.c | 3 +--
lib/pacemaker/pcmk_sched_actions.c | 12 +++++-------
5 files changed, 10 insertions(+), 18 deletions(-)
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 15784e7687..52157fa5d4 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -693,9 +693,8 @@ build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_
caller_version = CRM_FEATURE_SET;
}
- crm_trace("Building %s operation update with originator version: %s", op->rsc_id, caller_version);
xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc,
- fsa_our_uname, src, LOG_DEBUG);
+ fsa_our_uname, src);
if (xml_op == NULL) {
return TRUE;
}
diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c
index 63b7c72359..b0bcb8b2e4 100644
--- a/daemons/controld/controld_te_actions.c
+++ b/daemons/controld/controld_te_actions.c
@@ -181,7 +181,6 @@ controld_record_action_timeout(crm_action_t *action)
lrmd_event_data_t *op = NULL;
xmlNode *state = NULL;
xmlNode *rsc = NULL;
- xmlNode *xml_op = NULL;
xmlNode *action_rsc = NULL;
int rc = pcmk_ok;
@@ -245,12 +244,10 @@ controld_record_action_timeout(crm_action_t *action)
op->user_data = pcmk__transition_key(transition_graph->id, action->id,
target_rc, te_uuid);
- xml_op = pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc,
- target, __func__, LOG_INFO);
+ pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target,
+ __func__);
lrmd_free_event(op);
- crm_log_xml_trace(xml_op, "Action timeout");
-
rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options);
fsa_register_cib_callback(rc, FALSE, NULL, cib_action_updated);
free_xml(state);
diff --git a/include/pcmki/pcmki_sched_utils.h b/include/pcmki/pcmki_sched_utils.h
index 68d60fc7db..144424a609 100644
--- a/include/pcmki/pcmki_sched_utils.h
+++ b/include/pcmki/pcmki_sched_utils.h
@@ -52,8 +52,7 @@ extern void process_utilization(pe_resource_t * rsc, pe_node_t ** prefer, pe_wor
xmlNode *pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *event,
const char *caller_version, int target_rc,
- const char *node, const char *origin,
- int level);
+ const char *node, const char *origin);
# define LOAD_STOPPED "load_stopped"
diff --git a/lib/pacemaker/pcmk_sched_transition.c b/lib/pacemaker/pcmk_sched_transition.c
index 678c3f5dd2..1aa90a5a0b 100644
--- a/lib/pacemaker/pcmk_sched_transition.c
+++ b/lib/pacemaker/pcmk_sched_transition.c
@@ -201,8 +201,7 @@ inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc)
inject_op(xmlNode * cib_resource, lrmd_event_data_t * op, int target_rc)
{
return pcmk__create_history_xml(cib_resource, op, CRM_FEATURE_SET,
- target_rc, NULL, crm_system_name,
- LOG_TRACE);
+ target_rc, NULL, crm_system_name);
}
static xmlNode *
diff --git a/lib/pacemaker/pcmk_sched_actions.c b/lib/pacemaker/pcmk_sched_actions.c
index f8200b0efc..4f63d3374d 100644
--- a/lib/pacemaker/pcmk_sched_utils.c
+++ b/lib/pacemaker/pcmk_sched_utils.c
@@ -892,14 +892,13 @@ add_op_digest_to_xml(lrmd_event_data_t *op, xmlNode *update)
* \param[in] target_rc Expected result of operation
* \param[in] node Name of node on which operation was performed
* \param[in] origin Arbitrary description of update source
- * \param[in] level A log message will be logged at this level
*
* \return Newly created XML node for history update
*/
xmlNode *
pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op,
const char *caller_version, int target_rc,
- const char *node, const char *origin, int level)
+ const char *node, const char *origin)
{
char *key = NULL;
char *magic = NULL;
@@ -912,11 +911,10 @@ pcmk__create_history_xml(xmlNode *parent, lrmd_event_data_t *op,
const char *task = NULL;
CRM_CHECK(op != NULL, return NULL);
- do_crm_log(level, "%s: Updating resource %s after %s op %s (interval=%u)",
- origin, op->rsc_id, op->op_type,
- pcmk_exec_status_str(op->op_status), op->interval_ms);
-
- crm_trace("DC version: %s", caller_version);
+ crm_trace("Creating history XML for %s-interval %s action for %s on %s "
+ "(DC version: %s, origin: %s)",
+ pcmk__readable_interval(op->interval_ms), op->op_type, op->rsc_id,
+ ((node == NULL)? "no node" : node), caller_version, origin);
task = op->op_type;
--
2.27.0
From 06b1da9e5345e0d1571042c11646fd7157961279 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Tue, 21 Dec 2021 17:09:44 -0600
Subject: [PATCH 10/12] Feature: controller: improve exit reason for internal
timeouts
Functionize the part of controld_record_action_timeout() that creates a fake
executor event, into a new function synthesize_timeout_event(), and have it set
a more detailed exit reason describing what timed out.
---
daemons/controld/controld_te_actions.c | 61 ++++++++++++++++++++------
1 file changed, 48 insertions(+), 13 deletions(-)
diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c
index b0bcb8b2e4..de2fbb82bf 100644
--- a/daemons/controld/controld_te_actions.c
+++ b/daemons/controld/controld_te_actions.c
@@ -175,6 +175,53 @@ te_crm_command(crm_graph_t * graph, crm_action_t * action)
return TRUE;
}
+/*!
+ * \internal
+ * \brief Synthesize an executor event for a resource action timeout
+ *
+ * \param[in] action Resource action that timed out
+ * \param[in] target_rc Expected result of action that timed out
+ *
+ * Synthesize an executor event for a resource action timeout. (If the executor
+ * gets a timeout while waiting for a resource action to complete, that will be
+ * reported via the usual callback. This timeout means we didn't hear from the
+ * executor itself or the controller that relayed the action to the executor.)
+ *
+ * \return Newly created executor event for result of \p action
+ * \note The caller is responsible for freeing the return value using
+ * lrmd_free_event().
+ */
+static lrmd_event_data_t *
+synthesize_timeout_event(crm_action_t *action, int target_rc)
+{
+ lrmd_event_data_t *op = NULL;
+ const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
+ const char *reason = NULL;
+ char *dynamic_reason = NULL;
+
+ if (pcmk__str_eq(target, get_local_node_name(), pcmk__str_casei)) {
+ reason = "Local executor did not return result in time";
+ } else {
+ const char *router_node = NULL;
+
+ router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
+ if (router_node == NULL) {
+ router_node = target;
+ }
+ dynamic_reason = crm_strdup_printf("Controller on %s did not return "
+ "result in time", router_node);
+ reason = dynamic_reason;
+ }
+
+ op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT,
+ PCMK_OCF_UNKNOWN_ERROR, reason);
+ op->call_id = -1;
+ op->user_data = pcmk__transition_key(transition_graph->id, action->id,
+ target_rc, te_uuid);
+ free(dynamic_reason);
+ return op;
+}
+
void
controld_record_action_timeout(crm_action_t *action)
{
@@ -231,19 +278,7 @@ controld_record_action_timeout(crm_action_t *action)
crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_CLASS);
crm_copy_xml_element(action_rsc, rsc, XML_AGENT_ATTR_PROVIDER);
- /* If the executor gets a timeout while waiting for the action to complete,
- * that will be reported via the usual callback. This timeout means that we
- * didn't hear from the executor or the controller that relayed the action
- * to the executor.
- */
- op = pcmk__event_from_graph_action(NULL, action, PCMK_EXEC_TIMEOUT,
- PCMK_OCF_UNKNOWN_ERROR,
- "Cluster communication timeout "
- "(no response from executor)");
- op->call_id = -1;
- op->user_data = pcmk__transition_key(transition_graph->id, action->id,
- target_rc, te_uuid);
-
+ op = synthesize_timeout_event(action, target_rc);
pcmk__create_history_xml(rsc, op, CRM_FEATURE_SET, target_rc, target,
__func__);
lrmd_free_event(op);
--
2.27.0
From be620d206faefab967d4c8567d6554d10c9e72ba Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 22 Dec 2021 16:35:06 -0600
Subject: [PATCH 11/12] Feature: fencing: improve exit reason for fencing
timeouts
Troubleshooting timeouts is one of the more difficult aspects of cluster
maintenance. We want to give as much of a hint as possible, but for fencing in
particular it is difficult because an operation might involve multiple retries
of multiple devices.
Barring another major project to track exactly which devices, retries, etc.,
were used in a given operation, these changes in wording are probably the best
we can do.
---
daemons/fenced/fenced_remote.c | 8 +++++---
lib/fencing/st_client.c | 2 +-
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index 1e237150c5..6eebb7381e 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2009-2021 the Pacemaker project contributors
+ * Copyright 2009-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -715,8 +715,10 @@ remote_op_timeout(gpointer userdata)
CRM_XS " id=%.8s",
op->action, op->target, op->client_name, op->id);
} else {
- finalize_timed_out_op(userdata, "Fencing could not be completed "
- "within overall timeout");
+ finalize_timed_out_op(userdata, "Fencing did not complete within a "
+ "total timeout based on the "
+ "configured timeout and retries for "
+ "any devices attempted");
}
return G_SOURCE_REMOVE;
}
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index a0f3119f3b..718739b321 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -906,7 +906,7 @@ invoke_registered_callbacks(stonith_t *stonith, xmlNode *msg, int call_id)
if (msg == NULL) {
// Fencer didn't reply in time
pcmk__set_result(&result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
- "Timeout waiting for reply from fencer");
+ "Fencer accepted request but did not reply in time");
CRM_LOG_ASSERT(call_id > 0);
} else {
--
2.27.0
From 0fe8ede2f8e838e335fe42846bdf147111ce9955 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 22 Dec 2021 17:09:09 -0600
Subject: [PATCH 12/12] Feature: libcrmservice: improve exit reason for
timeouts
The services library doesn't have enough information about an action to say
(for example) what configuration parameters might be relevant, but we can at
least distinguish what kind of agent timed out.
---
lib/services/services_linux.c | 12 +++++++++++-
lib/services/systemd.c | 2 +-
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c
index f15eee860e..d6aafcfe46 100644
--- a/lib/services/services_linux.c
+++ b/lib/services/services_linux.c
@@ -677,9 +677,19 @@ async_action_complete(mainloop_child_t *p, pid_t pid, int core, int signo,
parse_exit_reason_from_stderr(op);
} else if (mainloop_child_timeout(p)) {
+ const char *reason = NULL;
+
+ if (op->rsc != NULL) {
+ reason = "Resource agent did not complete in time";
+ } else if (pcmk__str_eq(op->standard, PCMK_RESOURCE_CLASS_STONITH,
+ pcmk__str_none)) {
+ reason = "Fence agent did not complete in time";
+ } else {
+ reason = "Process did not complete in time";
+ }
crm_info("%s[%d] timed out after %dms", op->id, op->pid, op->timeout);
services__set_result(op, services__generic_error(op), PCMK_EXEC_TIMEOUT,
- "Process did not exit within specified timeout");
+ reason);
} else if (op->cancel) {
/* If an in-flight recurring operation was killed because it was
diff --git a/lib/services/systemd.c b/lib/services/systemd.c
index 27a3b376db..d87b287424 100644
--- a/lib/services/systemd.c
+++ b/lib/services/systemd.c
@@ -995,7 +995,7 @@ systemd_timeout_callback(gpointer p)
crm_info("%s action for systemd unit %s named '%s' timed out",
op->action, op->agent, op->rsc);
services__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
- "Systemd action did not complete within specified timeout");
+ "Systemd unit action did not complete in time");
services__finalize_async_op(op);
return FALSE;
}
--
2.27.0

View File

@ -0,0 +1,29 @@
From e8bf0161b872267f1bb7143a9866fdc15ec218f2 Mon Sep 17 00:00:00 2001
From: Jan Friesse <jfriesse@redhat.com>
Date: Tue, 18 Jan 2022 16:35:24 +0100
Subject: [PATCH] Fix: corosync: Repeat corosync_cfg_trackstart
corosync_cfg_trackstart() can fail with CS_ERR_TRY_AGAIN, so (as is already
done for corosync_cfg_local_get() and similar calls) retry it using the
cs_repeat macro.
---
daemons/pacemakerd/pcmkd_corosync.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/daemons/pacemakerd/pcmkd_corosync.c b/daemons/pacemakerd/pcmkd_corosync.c
index 7990bc43c5..cd7a40321d 100644
--- a/daemons/pacemakerd/pcmkd_corosync.c
+++ b/daemons/pacemakerd/pcmkd_corosync.c
@@ -186,7 +186,8 @@ cluster_connect_cfg(void)
crm_debug("Corosync reports local node ID is %lu", (unsigned long) nodeid);
#ifdef HAVE_COROSYNC_CFG_TRACKSTART
- rc = corosync_cfg_trackstart(cfg_handle, 0);
+ retries = 0;
+ cs_repeat(retries, 30, rc = corosync_cfg_trackstart(cfg_handle, 0));
if (rc != CS_OK) {
crm_crit("Could not enable Corosync CFG shutdown tracker: %s " CRM_XS " rc=%d",
cs_strerror(rc), rc);
--
2.27.0

View File

@ -0,0 +1,41 @@
From e316840a7e1d2a72e3089ee194334244c959905a Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 19 Jan 2022 09:53:53 -0600
Subject: [PATCH] Fix: pacemakerd: tweak systemd unit respawn settings
If pacemaker exits immediately after starting, wait 1 second before trying to
respawn, since the default of 100ms is a bit aggressive for a Pacemaker
cluster.
Also, allow 5 attempts in 25 seconds before giving up.
---
daemons/pacemakerd/pacemaker.service.in | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/daemons/pacemakerd/pacemaker.service.in b/daemons/pacemakerd/pacemaker.service.in
index 0363a2259c..3fd53d9ffb 100644
--- a/daemons/pacemakerd/pacemaker.service.in
+++ b/daemons/pacemakerd/pacemaker.service.in
@@ -31,6 +31,9 @@ After=rsyslog.service
After=corosync.service
Requires=corosync.service
+# If Pacemaker respawns repeatedly, give up after this many tries in this time
+StartLimitBurst=5
+StartLimitIntervalSec=25s
[Install]
WantedBy=multi-user.target
@@ -57,6 +60,9 @@ TasksMax=infinity
# resource. Sending -KILL will just get the node fenced
SendSIGKILL=no
+# Systemd's default of respawning a failed service after 100ms is too aggressive
+RestartSec=1s
+
# If we ever hit the StartLimitInterval/StartLimitBurst limit, and the
# admin wants to stop the cluster while pacemakerd is not running, it
# might be a good idea to enable the ExecStopPost directive below.
--
2.27.0

View File

@ -0,0 +1,354 @@
From 9ee9fd6b98d8a5ff5eac57a14cbc0ce1009b10e4 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Thu, 18 Nov 2021 13:23:34 +0100
Subject: [PATCH 1/2] Feature: pacemakerd: keep tracking pacemakerd for
liveness
---
daemons/pacemakerd/pacemakerd.c | 2 +
daemons/pacemakerd/pacemakerd.h | 3 +-
daemons/pacemakerd/pcmkd_messages.c | 6 +-
daemons/pacemakerd/pcmkd_subdaemons.c | 139 +++++++++++++++++---------
4 files changed, 98 insertions(+), 52 deletions(-)
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
index 34d64c4053..062c2d5326 100644
--- a/daemons/pacemakerd/pacemakerd.c
+++ b/daemons/pacemakerd/pacemakerd.c
@@ -259,6 +259,8 @@ main(int argc, char **argv)
pcmk_ipc_api_t *old_instance = NULL;
qb_ipcs_service_t *ipcs = NULL;
+ subdaemon_check_progress = time(NULL);
+
crm_log_preinit(NULL, argc, argv);
mainloop_add_signal(SIGHUP, pcmk_ignore);
mainloop_add_signal(SIGQUIT, pcmk_sigquit);
diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h
index 7c541bbf9e..424dbbcc5d 100644
--- a/daemons/pacemakerd/pacemakerd.h
+++ b/daemons/pacemakerd/pacemakerd.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2010-2021 the Pacemaker project contributors
+ * Copyright 2010-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -21,6 +21,7 @@ extern unsigned int shutdown_complete_state_reported_to;
extern gboolean shutdown_complete_state_reported_client_closed;
extern crm_trigger_t *shutdown_trigger;
extern crm_trigger_t *startup_trigger;
+extern time_t subdaemon_check_progress;
gboolean mcp_read_config(void);
diff --git a/daemons/pacemakerd/pcmkd_messages.c b/daemons/pacemakerd/pcmkd_messages.c
index 0439986ecf..f2cddc353e 100644
--- a/daemons/pacemakerd/pcmkd_messages.c
+++ b/daemons/pacemakerd/pcmkd_messages.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2010-2021 the Pacemaker project contributors
+ * Copyright 2010-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -25,7 +25,6 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
const char *value = NULL;
xmlNode *ping = NULL;
xmlNode *reply = NULL;
- time_t pinged = time(NULL);
const char *from = crm_element_value(msg, F_CRM_SYS_FROM);
/* Pinged for status */
@@ -36,7 +35,8 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
value = crm_element_value(msg, F_CRM_SYS_TO);
crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value);
crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state);
- crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) pinged);
+ crm_xml_add_ll(ping, XML_ATTR_TSTAMP,
+ (long long) subdaemon_check_progress);
crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok");
reply = create_reply(msg, ping);
free_xml(ping);
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
index a54fcce1ba..c03903c99e 100644
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
@@ -32,14 +32,16 @@ typedef struct pcmk_child_s {
const char *command;
const char *endpoint; /* IPC server name */
bool needs_cluster;
+ int check_count;
/* Anything below here will be dynamically initialized */
bool needs_retry;
bool active_before_startup;
} pcmk_child_t;
-#define PCMK_PROCESS_CHECK_INTERVAL 5
-#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */
+#define PCMK_PROCESS_CHECK_INTERVAL 1
+#define PCMK_PROCESS_CHECK_RETRIES 5
+#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */
/* Index into the array below */
#define PCMK_CHILD_CONTROLD 5
@@ -82,6 +84,7 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };
crm_trigger_t *shutdown_trigger = NULL;
crm_trigger_t *startup_trigger = NULL;
+time_t subdaemon_check_progress = 0;
/* When contacted via pacemakerd-api by a client having sbd in
* the name we assume it is sbd-daemon which wants to know
@@ -103,7 +106,6 @@ gboolean running_with_sbd = FALSE; /* local copy */
GMainLoop *mainloop = NULL;
static gboolean fatal_error = FALSE;
-static bool global_keep_tracking = false;
static gboolean check_active_before_startup_processes(gpointer user_data);
static int child_liveness(pcmk_child_t *child);
@@ -127,44 +129,94 @@ pcmkd_cluster_connected(void)
static gboolean
check_active_before_startup_processes(gpointer user_data)
{
- gboolean keep_tracking = FALSE;
-
- for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
- if (!pcmk_children[i].active_before_startup) {
- /* we are already tracking it as a child process. */
- continue;
- } else {
- int rc = child_liveness(&pcmk_children[i]);
-
- switch (rc) {
- case pcmk_rc_ok:
- break;
- case pcmk_rc_ipc_unresponsive:
- case pcmk_rc_ipc_pid_only: // This case: it was previously OK
- if (pcmk_children[i].respawn) {
- crm_err("%s[%lld] terminated%s", pcmk_children[i].name,
- (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
- (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
- } else {
- /* orderly shutdown */
- crm_notice("%s[%lld] terminated%s", pcmk_children[i].name,
- (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
- (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
- }
- pcmk_process_exit(&(pcmk_children[i]));
- continue;
- default:
- crm_exit(CRM_EX_FATAL);
- break; /* static analysis/noreturn */
+ static int next_child = 0;
+ int rc = child_liveness(&pcmk_children[next_child]);
+
+ crm_trace("%s[%lld] checked as %d",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid),
+ rc);
+
+ switch (rc) {
+ case pcmk_rc_ok:
+ pcmk_children[next_child].check_count = 0;
+ next_child++;
+ subdaemon_check_progress = time(NULL);
+ break;
+ case pcmk_rc_ipc_pid_only: // This case: it was previously OK
+ pcmk_children[next_child].check_count++;
+ if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) {
+ crm_err("%s[%lld] is unresponsive to ipc after %d tries but "
+ "we found the pid so have it killed that we can restart",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid),
+ pcmk_children[next_child].check_count);
+ stop_child(&pcmk_children[next_child], SIGKILL);
+ if (pcmk_children[next_child].respawn) {
+ /* as long as the respawn-limit isn't reached
+ give it another round of check retries
+ */
+ pcmk_children[next_child].check_count = 0;
+ }
+ } else {
+ crm_notice("%s[%lld] is unresponsive to ipc after %d tries",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid),
+ pcmk_children[next_child].check_count);
+ if (pcmk_children[next_child].respawn) {
+ /* as long as the respawn-limit isn't reached
+ and we haven't run out of connect retries
+ we account this as progress we are willing
+ to tell to sbd
+ */
+ subdaemon_check_progress = time(NULL);
+ }
}
- }
- /* at least one of the processes found at startup
- * is still going, so keep this recurring timer around */
- keep_tracking = TRUE;
+ /* go to the next child and see if
+ we can make progress there
+ */
+ next_child++;
+ break;
+ case pcmk_rc_ipc_unresponsive:
+ if (pcmk_children[next_child].respawn) {
+ crm_err("%s[%lld] terminated",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid));
+ } else {
+ /* orderly shutdown */
+ crm_notice("%s[%lld] terminated",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid));
+ }
+ pcmk_process_exit(&(pcmk_children[next_child]));
+ if (!pcmk_children[next_child].respawn) {
+ /* if a subdaemon is down and we don't want it
+ to be restarted this is a success during
+ shutdown. if it isn't restarted anymore
+ due to MAX_RESPAWN it is
+ rather no success.
+ */
+ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
+ subdaemon_check_progress = time(NULL);
+ }
+ next_child++;
+ }
+ break;
+ default:
+ crm_exit(CRM_EX_FATAL);
+ break; /* static analysis/noreturn */
}
- global_keep_tracking = keep_tracking;
- return keep_tracking;
+ if (next_child >= PCMK__NELEM(pcmk_children)) {
+ next_child = 0;
+ }
+
+ return G_SOURCE_CONTINUE;
}
static gboolean
@@ -257,11 +309,6 @@ pcmk_process_exit(pcmk_child_t * child)
child->name, child->endpoint);
/* need to monitor how it evolves, and start new process if badly */
child->active_before_startup = true;
- if (!global_keep_tracking) {
- global_keep_tracking = true;
- g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
- check_active_before_startup_processes, NULL);
- }
} else {
if (child->needs_cluster && !pcmkd_cluster_connected()) {
@@ -648,7 +695,6 @@ child_liveness(pcmk_child_t *child)
int
find_and_track_existing_processes(void)
{
- bool tracking = false;
bool wait_in_progress;
int rc;
size_t i, rounds;
@@ -716,7 +762,6 @@ find_and_track_existing_processes(void)
pcmk_children[i].pid));
pcmk_children[i].respawn_count = -1; /* 0~keep watching */
pcmk_children[i].active_before_startup = true;
- tracking = true;
break;
case pcmk_rc_ipc_pid_only:
if (pcmk_children[i].respawn_count == WAIT_TRIES) {
@@ -751,10 +796,8 @@ find_and_track_existing_processes(void)
pcmk_children[i].respawn_count = 0; /* restore pristine state */
}
- if (tracking) {
- g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
+ g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
check_active_before_startup_processes, NULL);
- }
return pcmk_rc_ok;
}
--
2.27.0
From 4b60aa100669ff494dd3f1303ca9586dc52e95e4 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Thu, 9 Dec 2021 11:25:22 +0100
Subject: [PATCH 2/2] Fix: ipc_client: use libqb async API for connect
---
configure.ac | 3 +++
lib/common/ipc_client.c | 22 ++++++++++++++++++++++
2 files changed, 25 insertions(+)
diff --git a/configure.ac b/configure.ac
index f43fb724c7..c747fe1193 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1309,6 +1309,9 @@ PKG_CHECK_MODULES(libqb, libqb >= 0.17)
CPPFLAGS="$libqb_CFLAGS $CPPFLAGS"
LIBS="$libqb_LIBS $LIBS"
+dnl libqb 2.0.3 + ipc-connect-async-API (2022-01)
+AC_CHECK_FUNCS([qb_ipcc_connect_async])
+
dnl libqb 2.0.2+ (2020-10)
AC_CHECK_FUNCS(qb_ipcc_auth_get,
AC_DEFINE(HAVE_IPCC_AUTH_GET, 1,
diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c
index c5afdf3a3d..417b9ef175 100644
--- a/lib/common/ipc_client.c
+++ b/lib/common/ipc_client.c
@@ -1407,13 +1407,35 @@ pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid,
int32_t qb_rc;
pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0;
qb_ipcc_connection_t *c;
+#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
+ struct pollfd pollfd = { 0, };
+ int poll_rc;
+ c = qb_ipcc_connect_async(name, 0,
+ &(pollfd.fd));
+#else
c = qb_ipcc_connect(name, 0);
+#endif
if (c == NULL) {
crm_info("Could not connect to %s IPC: %s", name, strerror(errno));
rc = pcmk_rc_ipc_unresponsive;
goto bail;
}
+#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
+ pollfd.events = POLLIN;
+ do {
+ poll_rc = poll(&pollfd, 1, 2000);
+ } while ((poll_rc == -1) && (errno == EINTR));
+ if ((poll_rc <= 0) || (qb_ipcc_connect_continue(c) != 0)) {
+ crm_info("Could not connect to %s IPC: %s", name,
+ (poll_rc == 0)?"timeout":strerror(errno));
+ rc = pcmk_rc_ipc_unresponsive;
+ if (poll_rc > 0) {
+ c = NULL; // qb_ipcc_connect_continue cleaned up for us
+ }
+ goto bail;
+ }
+#endif
qb_rc = qb_ipcc_fd_get(c, &fd);
if (qb_rc != 0) {
--
2.27.0

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,82 @@
From 8034a203bbff0aa3b53f2946dc58e409bd7246c9 Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Thu, 20 Jan 2022 15:03:31 -0600
Subject: [PATCH] Fix: scheduler: avoid memory leak when displaying clones
Previously, pe__clone_default() unconditionally created a hash table for
stopped instances, but didn't free it in every code path.
Now, only create the table when we have something to put in it and might
actually use it, and ensure it always gets freed.
---
lib/pengine/clone.c | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/lib/pengine/clone.c b/lib/pengine/clone.c
index 742e2920b0..920a04c32c 100644
--- a/lib/pengine/clone.c
+++ b/lib/pengine/clone.c
@@ -761,7 +761,7 @@ pe__clone_default(pcmk__output_t *out, va_list args)
GList *only_node = va_arg(args, GList *);
GList *only_rsc = va_arg(args, GList *);
- GHashTable *stopped = pcmk__strkey_table(free, free);
+ GHashTable *stopped = NULL;
char *list_text = NULL;
size_t list_text_len = 0;
@@ -818,7 +818,11 @@ pe__clone_default(pcmk__output_t *out, va_list args)
} else if (partially_active == FALSE) {
// List stopped instances when requested (except orphans)
if (!pcmk_is_set(child_rsc->flags, pe_rsc_orphan)
+ && !pcmk_is_set(show_opts, pcmk_show_clone_detail)
&& pcmk_is_set(show_opts, pcmk_show_inactive_rscs)) {
+ if (stopped == NULL) {
+ stopped = pcmk__strkey_table(free, free);
+ }
g_hash_table_insert(stopped, strdup(child_rsc->id), strdup("Stopped"));
}
@@ -873,7 +877,6 @@ pe__clone_default(pcmk__output_t *out, va_list args)
}
if (pcmk_is_set(show_opts, pcmk_show_clone_detail)) {
- g_hash_table_destroy(stopped);
PCMK__OUTPUT_LIST_FOOTER(out, rc);
return pcmk_rc_ok;
}
@@ -948,8 +951,10 @@ pe__clone_default(pcmk__output_t *out, va_list args)
GList *list = g_hash_table_get_values(rsc->allowed_nodes);
/* Custom stopped table for non-unique clones */
- g_hash_table_destroy(stopped);
- stopped = pcmk__strkey_table(free, free);
+ if (stopped != NULL) {
+ g_hash_table_destroy(stopped);
+ stopped = NULL;
+ }
if (list == NULL) {
/* Clusters with symmetrical=false haven't calculated allowed_nodes yet
@@ -972,6 +977,9 @@ pe__clone_default(pcmk__output_t *out, va_list args)
state = "Stopped (disabled)";
}
+ if (stopped == NULL) {
+ stopped = pcmk__strkey_table(free, free);
+ }
if (probe_op != NULL) {
int rc;
@@ -987,7 +995,7 @@ pe__clone_default(pcmk__output_t *out, va_list args)
g_list_free(list);
}
- if (g_hash_table_size(stopped) > 0) {
+ if (stopped != NULL) {
GList *list = sorted_hash_table_values(stopped);
clone_header(out, &rc, rsc, clone_data);
--
2.27.0

View File

@ -0,0 +1,108 @@
From ac92690d8426ec4d1c8be1e0eb4b9289411afe75 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Mon, 24 Jan 2022 12:18:42 +0100
Subject: [PATCH] Fix: pacemakerd: have signal-handler take care of lost
processes
This fixes a regression from the introduction of periodic subdaemon checking,
in cases where the subdaemons are pacemakerd children. Previously, each process
was tracked either by periodic checking or by the signal handler.
---
daemons/pacemakerd/pcmkd_subdaemons.c | 38 ++++++++++++++++-----------
1 file changed, 22 insertions(+), 16 deletions(-)
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
index c03903c99e..84ecdc1ee8 100644
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
@@ -141,7 +141,6 @@ check_active_before_startup_processes(gpointer user_data)
switch (rc) {
case pcmk_rc_ok:
pcmk_children[next_child].check_count = 0;
- next_child++;
subdaemon_check_progress = time(NULL);
break;
case pcmk_rc_ipc_pid_only: // This case: it was previously OK
@@ -178,9 +177,27 @@ check_active_before_startup_processes(gpointer user_data)
/* go to the next child and see if
we can make progress there
*/
- next_child++;
break;
case pcmk_rc_ipc_unresponsive:
+ if (!pcmk_children[next_child].respawn) {
+ /* if a subdaemon is down and we don't want it
+ to be restarted this is a success during
+ shutdown. if it isn't restarted anymore
+ due to MAX_RESPAWN it is
+ rather no success.
+ */
+ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
+ subdaemon_check_progress = time(NULL);
+ }
+ }
+ if (!pcmk_children[next_child].active_before_startup) {
+ crm_trace("found %s[%lld] missing - signal-handler "
+ "will take care of it",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid));
+ break;
+ }
if (pcmk_children[next_child].respawn) {
crm_err("%s[%lld] terminated",
pcmk_children[next_child].name,
@@ -194,24 +211,13 @@ check_active_before_startup_processes(gpointer user_data)
pcmk_children[next_child].pid));
}
pcmk_process_exit(&(pcmk_children[next_child]));
- if (!pcmk_children[next_child].respawn) {
- /* if a subdaemon is down and we don't want it
- to be restarted this is a success during
- shutdown. if it isn't restarted anymore
- due to MAX_RESPAWN it is
- rather no success.
- */
- if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
- subdaemon_check_progress = time(NULL);
- }
- next_child++;
- }
break;
default:
crm_exit(CRM_EX_FATAL);
break; /* static analysis/noreturn */
}
+ next_child++;
if (next_child >= PCMK__NELEM(pcmk_children)) {
next_child = 0;
}
@@ -285,6 +291,7 @@ pcmk_process_exit(pcmk_child_t * child)
{
child->pid = 0;
child->active_before_startup = false;
+ child->check_count = 0;
child->respawn_count += 1;
if (child->respawn_count > MAX_RESPAWN) {
@@ -307,8 +314,6 @@ pcmk_process_exit(pcmk_child_t * child)
crm_warn("One-off suppressing strict respawning of a child process %s,"
" appears alright per %s IPC end-point",
child->name, child->endpoint);
- /* need to monitor how it evolves, and start new process if badly */
- child->active_before_startup = true;
} else {
if (child->needs_cluster && !pcmkd_cluster_connected()) {
@@ -422,6 +427,7 @@ start_child(pcmk_child_t * child)
const char *env_callgrind = getenv("PCMK_callgrind_enabled");
child->active_before_startup = false;
+ child->check_count = 0;
if (child->command == NULL) {
crm_info("Nothing to do for child \"%s\"", child->name);
--
2.27.0

View File

@ -0,0 +1,30 @@
From 16928cfc69136bc56b1574bee9966e0d5de73abd Mon Sep 17 00:00:00 2001
From: Ken Gaillot <kgaillot@redhat.com>
Date: Wed, 26 Jan 2022 09:15:43 -0600
Subject: [PATCH] Fix: controller: correctly match "node down" events
This fixes a regression introduced in 2.1.2 by 03ce7376e.
The symptom that led to this fix was that removing a remote node connection
resource would lead to the remote node getting fenced when the connection stop
was not recognized as an expected down event.
---
daemons/controld/controld_te_events.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c
index 36fd832ba0..1fd7129922 100644
--- a/daemons/controld/controld_te_events.c
+++ b/daemons/controld/controld_te_events.c
@@ -304,7 +304,7 @@ match_down_event(const char *target)
gIter2 = gIter2->next) {
match = (crm_action_t*)gIter2->data;
- if (pcmk_is_set(match->flags, pcmk__graph_action_confirmed)) {
+ if (pcmk_is_set(match->flags, pcmk__graph_action_executed)) {
xpath_ret = xpath_search(match->xml, xpath);
if (numXpathResults(xpath_ret) < 1) {
match = NULL;
--
2.27.0

View File

@ -36,7 +36,7 @@
## can be incremented to build packages reliably considered "newer" ## can be incremented to build packages reliably considered "newer"
## than previously built packages with the same pcmkversion) ## than previously built packages with the same pcmkversion)
%global pcmkversion 2.1.2 %global pcmkversion 2.1.2
%global specversion 2 %global specversion 4
## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build ## Upstream commit (full commit ID, abbreviated commit ID, or tag) to build
%global commit ada5c3b36e2adf1703d54d39f40a4b8628eca175 %global commit ada5c3b36e2adf1703d54d39f40a4b8628eca175
@ -250,6 +250,23 @@ Patch5: 005-fencing-reasons.patch
Patch6: 006-stateful-metadata.patch Patch6: 006-stateful-metadata.patch
Patch7: 007-memory-leak.patch Patch7: 007-memory-leak.patch
Patch8: 008-fencing-history.patch Patch8: 008-fencing-history.patch
Patch9: 009-fencing-reasons.patch
Patch10: 010-probe-failures.patch
Patch11: 011-fencing-reasons.patch
Patch12: 012-notify-crash.patch
Patch13: 013-probe-failures.patch
Patch14: 014-pcmk_delay_base.patch
Patch15: 015-fencing-reasons.patch
Patch16: 016-fencing-crash.patch
Patch17: 017-fencing-reasons.patch
Patch18: 018-failure-messages.patch
Patch19: 019-corosync-tracking.patch
Patch20: 020-systemd-unit.patch
Patch21: 021-daemon-tracking.patch
Patch22: 022-failure-messages.patch
Patch23: 023-memory-leak.patch
Patch24: 024-daemon-tracking.patch
Patch25: 025-regression.patch
Requires: resource-agents Requires: resource-agents
Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release} Requires: %{pkgname_pcmk_libs}%{?_isa} = %{version}-%{release}
@ -269,8 +286,9 @@ Requires: %{python_path}
BuildRequires: %{python_name}-devel BuildRequires: %{python_name}-devel
# Pacemaker requires a minimum libqb functionality # Pacemaker requires a minimum libqb functionality
Requires: libqb >= 0.17.0 # RHEL requires a higher version than upstream, for qb_ipcc_connect_async()
BuildRequires: libqb-devel >= 0.17.0 Requires: libqb >= 2.0.3-7
BuildRequires: libqb-devel >= 2.0.3-7
# Required basic build tools # Required basic build tools
BuildRequires: autoconf BuildRequires: autoconf
@ -855,6 +873,24 @@ exit 0
%license %{nagios_name}-%{nagios_hash}/COPYING %license %{nagios_name}-%{nagios_hash}/COPYING
%changelog %changelog
* Wed Jan 26 2022 Ken Gaillot <kgaillot@redhat.com> - 2.1.2-4
- Fix regression in down event detection that affects remote nodes
- Resolves: rhbz2039399
* Mon Jan 24 2022 Ken Gaillot <kgaillot@redhat.com> - 2.1.2-3
- Detect an unresponsive subdaemon
- Handle certain probe failures as stopped instead of failed
- Update pcmk_delay_base option meta-data
- Avoid crash when using clone notifications
- Retry Corosync shutdown tracking if first attempt fails
- Improve display of failed actions
- Resolves: rhbz1707851
- Resolves: rhbz2039982
- Resolves: rhbz2032032
- Resolves: rhbz2040443
- Resolves: rhbz2042367
- Resolves: rhbz2042546
* Thu Dec 16 2021 Ken Gaillot <kgaillot@redhat.com> - 2.1.2-2 * Thu Dec 16 2021 Ken Gaillot <kgaillot@redhat.com> - 2.1.2-2
- Correctly get metadata for systemd agent names that end in '@' - Correctly get metadata for systemd agent names that end in '@'
- Use correct OCF 1.1 syntax in ocf:pacemaker:Stateful meta-data - Use correct OCF 1.1 syntax in ocf:pacemaker:Stateful meta-data