87bc6c8acd
- Detect an unresponsive subdaemon - Handle certain probe failures as stopped instead of failed - Update pcmk_delay_base option meta-data - Avoid crash when using clone notifications - Retry Corosync shutdown tracking if first attempt fails - Improve display of failed actions - Resolves: rhbz1707851 - Resolves: rhbz2039982 - Resolves: rhbz2032032 - Resolves: rhbz2040443 - Resolves: rhbz2042367 - Resolves: rhbz2042546
355 lines
14 KiB
Diff
355 lines
14 KiB
Diff
From 9ee9fd6b98d8a5ff5eac57a14cbc0ce1009b10e4 Mon Sep 17 00:00:00 2001
|
|
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
|
Date: Thu, 18 Nov 2021 13:23:34 +0100
|
|
Subject: [PATCH 1/2] Feature: pacemakerd: keep tracking pacemakerd for
|
|
liveness
|
|
|
|
---
|
|
daemons/pacemakerd/pacemakerd.c | 2 +
|
|
daemons/pacemakerd/pacemakerd.h | 3 +-
|
|
daemons/pacemakerd/pcmkd_messages.c | 6 +-
|
|
daemons/pacemakerd/pcmkd_subdaemons.c | 139 +++++++++++++++++---------
|
|
4 files changed, 98 insertions(+), 52 deletions(-)
|
|
|
|
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
|
|
index 34d64c4053..062c2d5326 100644
|
|
--- a/daemons/pacemakerd/pacemakerd.c
|
|
+++ b/daemons/pacemakerd/pacemakerd.c
|
|
@@ -259,6 +259,8 @@ main(int argc, char **argv)
|
|
pcmk_ipc_api_t *old_instance = NULL;
|
|
qb_ipcs_service_t *ipcs = NULL;
|
|
|
|
+ subdaemon_check_progress = time(NULL);
|
|
+
|
|
crm_log_preinit(NULL, argc, argv);
|
|
mainloop_add_signal(SIGHUP, pcmk_ignore);
|
|
mainloop_add_signal(SIGQUIT, pcmk_sigquit);
|
|
diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h
|
|
index 7c541bbf9e..424dbbcc5d 100644
|
|
--- a/daemons/pacemakerd/pacemakerd.h
|
|
+++ b/daemons/pacemakerd/pacemakerd.h
|
|
@@ -1,5 +1,5 @@
|
|
/*
|
|
- * Copyright 2010-2021 the Pacemaker project contributors
|
|
+ * Copyright 2010-2022 the Pacemaker project contributors
|
|
*
|
|
* The version control history for this file may have further details.
|
|
*
|
|
@@ -21,6 +21,7 @@ extern unsigned int shutdown_complete_state_reported_to;
|
|
extern gboolean shutdown_complete_state_reported_client_closed;
|
|
extern crm_trigger_t *shutdown_trigger;
|
|
extern crm_trigger_t *startup_trigger;
|
|
+extern time_t subdaemon_check_progress;
|
|
|
|
gboolean mcp_read_config(void);
|
|
|
|
diff --git a/daemons/pacemakerd/pcmkd_messages.c b/daemons/pacemakerd/pcmkd_messages.c
|
|
index 0439986ecf..f2cddc353e 100644
|
|
--- a/daemons/pacemakerd/pcmkd_messages.c
|
|
+++ b/daemons/pacemakerd/pcmkd_messages.c
|
|
@@ -1,5 +1,5 @@
|
|
/*
|
|
- * Copyright 2010-2021 the Pacemaker project contributors
|
|
+ * Copyright 2010-2022 the Pacemaker project contributors
|
|
*
|
|
* The version control history for this file may have further details.
|
|
*
|
|
@@ -25,7 +25,6 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
|
|
const char *value = NULL;
|
|
xmlNode *ping = NULL;
|
|
xmlNode *reply = NULL;
|
|
- time_t pinged = time(NULL);
|
|
const char *from = crm_element_value(msg, F_CRM_SYS_FROM);
|
|
|
|
/* Pinged for status */
|
|
@@ -36,7 +35,8 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
|
|
value = crm_element_value(msg, F_CRM_SYS_TO);
|
|
crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value);
|
|
crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state);
|
|
- crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) pinged);
|
|
+ crm_xml_add_ll(ping, XML_ATTR_TSTAMP,
|
|
+ (long long) subdaemon_check_progress);
|
|
crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok");
|
|
reply = create_reply(msg, ping);
|
|
free_xml(ping);
|
|
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
|
|
index a54fcce1ba..c03903c99e 100644
|
|
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
|
|
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
|
|
@@ -32,14 +32,16 @@ typedef struct pcmk_child_s {
|
|
const char *command;
|
|
const char *endpoint; /* IPC server name */
|
|
bool needs_cluster;
|
|
+ int check_count;
|
|
|
|
/* Anything below here will be dynamically initialized */
|
|
bool needs_retry;
|
|
bool active_before_startup;
|
|
} pcmk_child_t;
|
|
|
|
-#define PCMK_PROCESS_CHECK_INTERVAL 5
|
|
-#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */
|
|
+#define PCMK_PROCESS_CHECK_INTERVAL 1
|
|
+#define PCMK_PROCESS_CHECK_RETRIES 5
|
|
+#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */
|
|
|
|
/* Index into the array below */
|
|
#define PCMK_CHILD_CONTROLD 5
|
|
@@ -82,6 +84,7 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };
|
|
|
|
crm_trigger_t *shutdown_trigger = NULL;
|
|
crm_trigger_t *startup_trigger = NULL;
|
|
+time_t subdaemon_check_progress = 0;
|
|
|
|
/* When contacted via pacemakerd-api by a client having sbd in
|
|
* the name we assume it is sbd-daemon which wants to know
|
|
@@ -103,7 +106,6 @@ gboolean running_with_sbd = FALSE; /* local copy */
|
|
GMainLoop *mainloop = NULL;
|
|
|
|
static gboolean fatal_error = FALSE;
|
|
-static bool global_keep_tracking = false;
|
|
|
|
static gboolean check_active_before_startup_processes(gpointer user_data);
|
|
static int child_liveness(pcmk_child_t *child);
|
|
@@ -127,44 +129,94 @@ pcmkd_cluster_connected(void)
|
|
static gboolean
|
|
check_active_before_startup_processes(gpointer user_data)
|
|
{
|
|
- gboolean keep_tracking = FALSE;
|
|
-
|
|
- for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
|
|
- if (!pcmk_children[i].active_before_startup) {
|
|
- /* we are already tracking it as a child process. */
|
|
- continue;
|
|
- } else {
|
|
- int rc = child_liveness(&pcmk_children[i]);
|
|
-
|
|
- switch (rc) {
|
|
- case pcmk_rc_ok:
|
|
- break;
|
|
- case pcmk_rc_ipc_unresponsive:
|
|
- case pcmk_rc_ipc_pid_only: // This case: it was previously OK
|
|
- if (pcmk_children[i].respawn) {
|
|
- crm_err("%s[%lld] terminated%s", pcmk_children[i].name,
|
|
- (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
|
|
- (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
|
|
- } else {
|
|
- /* orderly shutdown */
|
|
- crm_notice("%s[%lld] terminated%s", pcmk_children[i].name,
|
|
- (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
|
|
- (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
|
|
- }
|
|
- pcmk_process_exit(&(pcmk_children[i]));
|
|
- continue;
|
|
- default:
|
|
- crm_exit(CRM_EX_FATAL);
|
|
- break; /* static analysis/noreturn */
|
|
+ static int next_child = 0;
|
|
+ int rc = child_liveness(&pcmk_children[next_child]);
|
|
+
|
|
+ crm_trace("%s[%lld] checked as %d",
|
|
+ pcmk_children[next_child].name,
|
|
+ (long long) PCMK__SPECIAL_PID_AS_0(
|
|
+ pcmk_children[next_child].pid),
|
|
+ rc);
|
|
+
|
|
+ switch (rc) {
|
|
+ case pcmk_rc_ok:
|
|
+ pcmk_children[next_child].check_count = 0;
|
|
+ next_child++;
|
|
+ subdaemon_check_progress = time(NULL);
|
|
+ break;
|
|
+ case pcmk_rc_ipc_pid_only: // This case: it was previously OK
|
|
+ pcmk_children[next_child].check_count++;
|
|
+ if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) {
|
|
+ crm_err("%s[%lld] is unresponsive to ipc after %d tries but "
|
|
+ "we found the pid so have it killed that we can restart",
|
|
+ pcmk_children[next_child].name,
|
|
+ (long long) PCMK__SPECIAL_PID_AS_0(
|
|
+ pcmk_children[next_child].pid),
|
|
+ pcmk_children[next_child].check_count);
|
|
+ stop_child(&pcmk_children[next_child], SIGKILL);
|
|
+ if (pcmk_children[next_child].respawn) {
|
|
+ /* as long as the respawn-limit isn't reached
|
|
+ give it another round of check retries
|
|
+ */
|
|
+ pcmk_children[next_child].check_count = 0;
|
|
+ }
|
|
+ } else {
|
|
+ crm_notice("%s[%lld] is unresponsive to ipc after %d tries",
|
|
+ pcmk_children[next_child].name,
|
|
+ (long long) PCMK__SPECIAL_PID_AS_0(
|
|
+ pcmk_children[next_child].pid),
|
|
+ pcmk_children[next_child].check_count);
|
|
+ if (pcmk_children[next_child].respawn) {
|
|
+ /* as long as the respawn-limit isn't reached
|
|
+ and we haven't run out of connect retries
|
|
+ we account this as progress we are willing
|
|
+ to tell to sbd
|
|
+ */
|
|
+ subdaemon_check_progress = time(NULL);
|
|
+ }
|
|
}
|
|
- }
|
|
- /* at least one of the processes found at startup
|
|
- * is still going, so keep this recurring timer around */
|
|
- keep_tracking = TRUE;
|
|
+ /* go to the next child and see if
|
|
+ we can make progress there
|
|
+ */
|
|
+ next_child++;
|
|
+ break;
|
|
+ case pcmk_rc_ipc_unresponsive:
|
|
+ if (pcmk_children[next_child].respawn) {
|
|
+ crm_err("%s[%lld] terminated",
|
|
+ pcmk_children[next_child].name,
|
|
+ (long long) PCMK__SPECIAL_PID_AS_0(
|
|
+ pcmk_children[next_child].pid));
|
|
+ } else {
|
|
+ /* orderly shutdown */
|
|
+ crm_notice("%s[%lld] terminated",
|
|
+ pcmk_children[next_child].name,
|
|
+ (long long) PCMK__SPECIAL_PID_AS_0(
|
|
+ pcmk_children[next_child].pid));
|
|
+ }
|
|
+ pcmk_process_exit(&(pcmk_children[next_child]));
|
|
+ if (!pcmk_children[next_child].respawn) {
|
|
+ /* if a subdaemon is down and we don't want it
|
|
+ to be restarted this is a success during
|
|
+ shutdown. if it isn't restarted anymore
|
|
+ due to MAX_RESPAWN it is
|
|
+ rather no success.
|
|
+ */
|
|
+ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
|
|
+ subdaemon_check_progress = time(NULL);
|
|
+ }
|
|
+ next_child++;
|
|
+ }
|
|
+ break;
|
|
+ default:
|
|
+ crm_exit(CRM_EX_FATAL);
|
|
+ break; /* static analysis/noreturn */
|
|
}
|
|
|
|
- global_keep_tracking = keep_tracking;
|
|
- return keep_tracking;
|
|
+ if (next_child >= PCMK__NELEM(pcmk_children)) {
|
|
+ next_child = 0;
|
|
+ }
|
|
+
|
|
+ return G_SOURCE_CONTINUE;
|
|
}
|
|
|
|
static gboolean
|
|
@@ -257,11 +309,6 @@ pcmk_process_exit(pcmk_child_t * child)
|
|
child->name, child->endpoint);
|
|
/* need to monitor how it evolves, and start new process if badly */
|
|
child->active_before_startup = true;
|
|
- if (!global_keep_tracking) {
|
|
- global_keep_tracking = true;
|
|
- g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
|
|
- check_active_before_startup_processes, NULL);
|
|
- }
|
|
|
|
} else {
|
|
if (child->needs_cluster && !pcmkd_cluster_connected()) {
|
|
@@ -648,7 +695,6 @@ child_liveness(pcmk_child_t *child)
|
|
int
|
|
find_and_track_existing_processes(void)
|
|
{
|
|
- bool tracking = false;
|
|
bool wait_in_progress;
|
|
int rc;
|
|
size_t i, rounds;
|
|
@@ -716,7 +762,6 @@ find_and_track_existing_processes(void)
|
|
pcmk_children[i].pid));
|
|
pcmk_children[i].respawn_count = -1; /* 0~keep watching */
|
|
pcmk_children[i].active_before_startup = true;
|
|
- tracking = true;
|
|
break;
|
|
case pcmk_rc_ipc_pid_only:
|
|
if (pcmk_children[i].respawn_count == WAIT_TRIES) {
|
|
@@ -751,10 +796,8 @@ find_and_track_existing_processes(void)
|
|
pcmk_children[i].respawn_count = 0; /* restore pristine state */
|
|
}
|
|
|
|
- if (tracking) {
|
|
- g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
|
|
+ g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
|
|
check_active_before_startup_processes, NULL);
|
|
- }
|
|
return pcmk_rc_ok;
|
|
}
|
|
|
|
--
|
|
2.27.0
|
|
|
|
|
|
From 4b60aa100669ff494dd3f1303ca9586dc52e95e4 Mon Sep 17 00:00:00 2001
|
|
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
|
Date: Thu, 9 Dec 2021 11:25:22 +0100
|
|
Subject: [PATCH 2/2] Fix: ipc_client: use libqb async API for connect
|
|
|
|
---
|
|
configure.ac | 3 +++
|
|
lib/common/ipc_client.c | 22 ++++++++++++++++++++++
|
|
2 files changed, 25 insertions(+)
|
|
|
|
diff --git a/configure.ac b/configure.ac
|
|
index f43fb724c7..c747fe1193 100644
|
|
--- a/configure.ac
|
|
+++ b/configure.ac
|
|
@@ -1309,6 +1309,9 @@ PKG_CHECK_MODULES(libqb, libqb >= 0.17)
|
|
CPPFLAGS="$libqb_CFLAGS $CPPFLAGS"
|
|
LIBS="$libqb_LIBS $LIBS"
|
|
|
|
+dnl libqb libqb-2.0.3 + ipc-connect-async-API (2022-01)
|
|
+AC_CHECK_FUNCS([qb_ipcc_connect_async])
|
|
+
|
|
dnl libqb 2.0.2+ (2020-10)
|
|
AC_CHECK_FUNCS(qb_ipcc_auth_get,
|
|
AC_DEFINE(HAVE_IPCC_AUTH_GET, 1,
|
|
diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c
|
|
index c5afdf3a3d..417b9ef175 100644
|
|
--- a/lib/common/ipc_client.c
|
|
+++ b/lib/common/ipc_client.c
|
|
@@ -1407,13 +1407,35 @@ pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid,
|
|
int32_t qb_rc;
|
|
pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0;
|
|
qb_ipcc_connection_t *c;
|
|
+#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
|
|
+ struct pollfd pollfd = { 0, };
|
|
+ int poll_rc;
|
|
|
|
+ c = qb_ipcc_connect_async(name, 0,
|
|
+ &(pollfd.fd));
|
|
+#else
|
|
c = qb_ipcc_connect(name, 0);
|
|
+#endif
|
|
if (c == NULL) {
|
|
crm_info("Could not connect to %s IPC: %s", name, strerror(errno));
|
|
rc = pcmk_rc_ipc_unresponsive;
|
|
goto bail;
|
|
}
|
|
+#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
|
|
+ pollfd.events = POLLIN;
|
|
+ do {
|
|
+ poll_rc = poll(&pollfd, 1, 2000);
|
|
+ } while ((poll_rc == -1) && (errno == EINTR));
|
|
+ if ((poll_rc <= 0) || (qb_ipcc_connect_continue(c) != 0)) {
|
|
+ crm_info("Could not connect to %s IPC: %s", name,
|
|
+ (poll_rc == 0)?"timeout":strerror(errno));
|
|
+ rc = pcmk_rc_ipc_unresponsive;
|
|
+ if (poll_rc > 0) {
|
|
+ c = NULL; // qb_ipcc_connect_continue cleaned up for us
|
|
+ }
|
|
+ goto bail;
|
|
+ }
|
|
+#endif
|
|
|
|
qb_rc = qb_ipcc_fd_get(c, &fd);
|
|
if (qb_rc != 0) {
|
|
--
|
|
2.27.0
|
|
|