pacemaker/021-daemon-tracking.patch
Ken Gaillot 87bc6c8acd Backport selected patches from upstream main branch
- Detect an unresponsive subdaemon
- Handle certain probe failures as stopped instead of failed
- Update pcmk_delay_base option meta-data
- Avoid crash when using clone notifications
- Retry Corosync shutdown tracking if first attempt fails
- Improve display of failed actions
- Resolves: rhbz1707851
- Resolves: rhbz2039982
- Resolves: rhbz2032032
- Resolves: rhbz2040443
- Resolves: rhbz2042367
- Resolves: rhbz2042546
2022-01-24 10:24:48 -06:00

355 lines
14 KiB
Diff

From 9ee9fd6b98d8a5ff5eac57a14cbc0ce1009b10e4 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Thu, 18 Nov 2021 13:23:34 +0100
Subject: [PATCH 1/2] Feature: pacemakerd: keep tracking pacemakerd for
liveness
---
daemons/pacemakerd/pacemakerd.c | 2 +
daemons/pacemakerd/pacemakerd.h | 3 +-
daemons/pacemakerd/pcmkd_messages.c | 6 +-
daemons/pacemakerd/pcmkd_subdaemons.c | 139 +++++++++++++++++---------
4 files changed, 98 insertions(+), 52 deletions(-)
diff --git a/daemons/pacemakerd/pacemakerd.c b/daemons/pacemakerd/pacemakerd.c
index 34d64c4053..062c2d5326 100644
--- a/daemons/pacemakerd/pacemakerd.c
+++ b/daemons/pacemakerd/pacemakerd.c
@@ -259,6 +259,8 @@ main(int argc, char **argv)
pcmk_ipc_api_t *old_instance = NULL;
qb_ipcs_service_t *ipcs = NULL;
+ subdaemon_check_progress = time(NULL);
+
crm_log_preinit(NULL, argc, argv);
mainloop_add_signal(SIGHUP, pcmk_ignore);
mainloop_add_signal(SIGQUIT, pcmk_sigquit);
diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h
index 7c541bbf9e..424dbbcc5d 100644
--- a/daemons/pacemakerd/pacemakerd.h
+++ b/daemons/pacemakerd/pacemakerd.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2010-2021 the Pacemaker project contributors
+ * Copyright 2010-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -21,6 +21,7 @@ extern unsigned int shutdown_complete_state_reported_to;
extern gboolean shutdown_complete_state_reported_client_closed;
extern crm_trigger_t *shutdown_trigger;
extern crm_trigger_t *startup_trigger;
+extern time_t subdaemon_check_progress;
gboolean mcp_read_config(void);
diff --git a/daemons/pacemakerd/pcmkd_messages.c b/daemons/pacemakerd/pcmkd_messages.c
index 0439986ecf..f2cddc353e 100644
--- a/daemons/pacemakerd/pcmkd_messages.c
+++ b/daemons/pacemakerd/pcmkd_messages.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2010-2021 the Pacemaker project contributors
+ * Copyright 2010-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -25,7 +25,6 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
const char *value = NULL;
xmlNode *ping = NULL;
xmlNode *reply = NULL;
- time_t pinged = time(NULL);
const char *from = crm_element_value(msg, F_CRM_SYS_FROM);
/* Pinged for status */
@@ -36,7 +35,8 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
value = crm_element_value(msg, F_CRM_SYS_TO);
crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value);
crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state);
- crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) pinged);
+ crm_xml_add_ll(ping, XML_ATTR_TSTAMP,
+ (long long) subdaemon_check_progress);
crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok");
reply = create_reply(msg, ping);
free_xml(ping);
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
index a54fcce1ba..c03903c99e 100644
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
@@ -32,14 +32,16 @@ typedef struct pcmk_child_s {
const char *command;
const char *endpoint; /* IPC server name */
bool needs_cluster;
+ int check_count;
/* Anything below here will be dynamically initialized */
bool needs_retry;
bool active_before_startup;
} pcmk_child_t;
-#define PCMK_PROCESS_CHECK_INTERVAL 5
-#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */
+#define PCMK_PROCESS_CHECK_INTERVAL 1
+#define PCMK_PROCESS_CHECK_RETRIES 5
+#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */
/* Index into the array below */
#define PCMK_CHILD_CONTROLD 5
@@ -82,6 +84,7 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };
crm_trigger_t *shutdown_trigger = NULL;
crm_trigger_t *startup_trigger = NULL;
+time_t subdaemon_check_progress = 0;
/* When contacted via pacemakerd-api by a client having sbd in
* the name we assume it is sbd-daemon which wants to know
@@ -103,7 +106,6 @@ gboolean running_with_sbd = FALSE; /* local copy */
GMainLoop *mainloop = NULL;
static gboolean fatal_error = FALSE;
-static bool global_keep_tracking = false;
static gboolean check_active_before_startup_processes(gpointer user_data);
static int child_liveness(pcmk_child_t *child);
@@ -127,44 +129,94 @@ pcmkd_cluster_connected(void)
static gboolean
check_active_before_startup_processes(gpointer user_data)
{
- gboolean keep_tracking = FALSE;
-
- for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
- if (!pcmk_children[i].active_before_startup) {
- /* we are already tracking it as a child process. */
- continue;
- } else {
- int rc = child_liveness(&pcmk_children[i]);
-
- switch (rc) {
- case pcmk_rc_ok:
- break;
- case pcmk_rc_ipc_unresponsive:
- case pcmk_rc_ipc_pid_only: // This case: it was previously OK
- if (pcmk_children[i].respawn) {
- crm_err("%s[%lld] terminated%s", pcmk_children[i].name,
- (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
- (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
- } else {
- /* orderly shutdown */
- crm_notice("%s[%lld] terminated%s", pcmk_children[i].name,
- (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
- (rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
- }
- pcmk_process_exit(&(pcmk_children[i]));
- continue;
- default:
- crm_exit(CRM_EX_FATAL);
- break; /* static analysis/noreturn */
+ static int next_child = 0;
+ int rc = child_liveness(&pcmk_children[next_child]);
+
+ crm_trace("%s[%lld] checked as %d",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid),
+ rc);
+
+ switch (rc) {
+ case pcmk_rc_ok:
+ pcmk_children[next_child].check_count = 0;
+ next_child++;
+ subdaemon_check_progress = time(NULL);
+ break;
+ case pcmk_rc_ipc_pid_only: // This case: it was previously OK
+ pcmk_children[next_child].check_count++;
+ if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) {
+ crm_err("%s[%lld] is unresponsive to ipc after %d tries but "
+ "we found the pid so have it killed that we can restart",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid),
+ pcmk_children[next_child].check_count);
+ stop_child(&pcmk_children[next_child], SIGKILL);
+ if (pcmk_children[next_child].respawn) {
+ /* as long as the respawn-limit isn't reached
+ give it another round of check retries
+ */
+ pcmk_children[next_child].check_count = 0;
+ }
+ } else {
+ crm_notice("%s[%lld] is unresponsive to ipc after %d tries",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid),
+ pcmk_children[next_child].check_count);
+ if (pcmk_children[next_child].respawn) {
+ /* as long as the respawn-limit isn't reached
+ and we haven't run out of connect retries
+ we account this as progress we are willing
+ to tell to sbd
+ */
+ subdaemon_check_progress = time(NULL);
+ }
}
- }
- /* at least one of the processes found at startup
- * is still going, so keep this recurring timer around */
- keep_tracking = TRUE;
+ /* go to the next child and see if
+ we can make progress there
+ */
+ next_child++;
+ break;
+ case pcmk_rc_ipc_unresponsive:
+ if (pcmk_children[next_child].respawn) {
+ crm_err("%s[%lld] terminated",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid));
+ } else {
+ /* orderly shutdown */
+ crm_notice("%s[%lld] terminated",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid));
+ }
+ pcmk_process_exit(&(pcmk_children[next_child]));
+ if (!pcmk_children[next_child].respawn) {
+ /* If a subdaemon is down and we don't want it
+ to be restarted, that counts as success during
+ shutdown. If it is no longer being restarted
+ only because MAX_RESPAWN was exceeded,
+ that does not count as success.
+ */
+ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
+ subdaemon_check_progress = time(NULL);
+ }
+ next_child++;
+ }
+ break;
+ default:
+ crm_exit(CRM_EX_FATAL);
+ break; /* static analysis/noreturn */
}
- global_keep_tracking = keep_tracking;
- return keep_tracking;
+ if (next_child >= PCMK__NELEM(pcmk_children)) {
+ next_child = 0;
+ }
+
+ return G_SOURCE_CONTINUE;
}
static gboolean
@@ -257,11 +309,6 @@ pcmk_process_exit(pcmk_child_t * child)
child->name, child->endpoint);
/* need to monitor how it evolves, and start new process if badly */
child->active_before_startup = true;
- if (!global_keep_tracking) {
- global_keep_tracking = true;
- g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
- check_active_before_startup_processes, NULL);
- }
} else {
if (child->needs_cluster && !pcmkd_cluster_connected()) {
@@ -648,7 +695,6 @@ child_liveness(pcmk_child_t *child)
int
find_and_track_existing_processes(void)
{
- bool tracking = false;
bool wait_in_progress;
int rc;
size_t i, rounds;
@@ -716,7 +762,6 @@ find_and_track_existing_processes(void)
pcmk_children[i].pid));
pcmk_children[i].respawn_count = -1; /* 0~keep watching */
pcmk_children[i].active_before_startup = true;
- tracking = true;
break;
case pcmk_rc_ipc_pid_only:
if (pcmk_children[i].respawn_count == WAIT_TRIES) {
@@ -751,10 +796,8 @@ find_and_track_existing_processes(void)
pcmk_children[i].respawn_count = 0; /* restore pristine state */
}
- if (tracking) {
- g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
+ g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
check_active_before_startup_processes, NULL);
- }
return pcmk_rc_ok;
}
--
2.27.0
From 4b60aa100669ff494dd3f1303ca9586dc52e95e4 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Thu, 9 Dec 2021 11:25:22 +0100
Subject: [PATCH 2/2] Fix: ipc_client: use libqb async API for connect
---
configure.ac | 3 +++
lib/common/ipc_client.c | 22 ++++++++++++++++++++++
2 files changed, 25 insertions(+)
diff --git a/configure.ac b/configure.ac
index f43fb724c7..c747fe1193 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1309,6 +1309,9 @@ PKG_CHECK_MODULES(libqb, libqb >= 0.17)
CPPFLAGS="$libqb_CFLAGS $CPPFLAGS"
LIBS="$libqb_LIBS $LIBS"
+dnl libqb 2.0.3 + ipc-connect-async-API (2022-01)
+AC_CHECK_FUNCS([qb_ipcc_connect_async])
+
dnl libqb 2.0.2+ (2020-10)
AC_CHECK_FUNCS(qb_ipcc_auth_get,
AC_DEFINE(HAVE_IPCC_AUTH_GET, 1,
diff --git a/lib/common/ipc_client.c b/lib/common/ipc_client.c
index c5afdf3a3d..417b9ef175 100644
--- a/lib/common/ipc_client.c
+++ b/lib/common/ipc_client.c
@@ -1407,13 +1407,35 @@ pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid,
int32_t qb_rc;
pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0;
qb_ipcc_connection_t *c;
+#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
+ struct pollfd pollfd = { 0, };
+ int poll_rc;
+ c = qb_ipcc_connect_async(name, 0,
+ &(pollfd.fd));
+#else
c = qb_ipcc_connect(name, 0);
+#endif
if (c == NULL) {
crm_info("Could not connect to %s IPC: %s", name, strerror(errno));
rc = pcmk_rc_ipc_unresponsive;
goto bail;
}
+#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
+ pollfd.events = POLLIN;
+ do {
+ poll_rc = poll(&pollfd, 1, 2000);
+ } while ((poll_rc == -1) && (errno == EINTR));
+ if ((poll_rc <= 0) || (qb_ipcc_connect_continue(c) != 0)) {
+ crm_info("Could not connect to %s IPC: %s", name,
+ (poll_rc == 0)?"timeout":strerror(errno));
+ rc = pcmk_rc_ipc_unresponsive;
+ if (poll_rc > 0) {
+ c = NULL; // qb_ipcc_connect_continue cleaned up for us
+ }
+ goto bail;
+ }
+#endif
qb_rc = qb_ipcc_fd_get(c, &fd);
if (qb_rc != 0) {
--
2.27.0