pacemaker/024-daemon-tracking.patch
Ken Gaillot 87bc6c8acd Backport selected patches from upstream main branch
- Detect an unresponsive subdaemon
- Handle certain probe failures as stopped instead of failed
- Update pcmk_delay_base option meta-data
- Avoid crash when using clone notifications
- Retry Corosync shutdown tracking if first attempt fails
- Improve display of failed actions
- Resolves: rhbz1707851
- Resolves: rhbz2039982
- Resolves: rhbz2032032
- Resolves: rhbz2040443
- Resolves: rhbz2042367
- Resolves: rhbz2042546
2022-01-24 10:24:48 -06:00

109 lines
4.4 KiB
Diff

From ac92690d8426ec4d1c8be1e0eb4b9289411afe75 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Mon, 24 Jan 2022 12:18:42 +0100
Subject: [PATCH] Fix: pacemakerd: have signal-handler take care of lost
processes
This fixes a regression from the introduction of periodic subdaemon
checking, in cases where the subdaemons are pacemakerd children.
Previously, each process was handled either by periodic checking or
by the signal handler.
---
daemons/pacemakerd/pcmkd_subdaemons.c | 38 ++++++++++++++++-----------
1 file changed, 22 insertions(+), 16 deletions(-)
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
index c03903c99e..84ecdc1ee8 100644
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
@@ -141,7 +141,6 @@ check_active_before_startup_processes(gpointer user_data)
switch (rc) {
case pcmk_rc_ok:
pcmk_children[next_child].check_count = 0;
- next_child++;
subdaemon_check_progress = time(NULL);
break;
case pcmk_rc_ipc_pid_only: // This case: it was previously OK
@@ -178,9 +177,27 @@ check_active_before_startup_processes(gpointer user_data)
/* go to the next child and see if
we can make progress there
*/
- next_child++;
break;
case pcmk_rc_ipc_unresponsive:
+ if (!pcmk_children[next_child].respawn) {
+ /* if a subdaemon is down and we don't want it
+ to be restarted this is a success during
+ shutdown. if it isn't restarted anymore
+ due to MAX_RESPAWN it is
+ rather no success.
+ */
+ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
+ subdaemon_check_progress = time(NULL);
+ }
+ }
+ if (!pcmk_children[next_child].active_before_startup) {
+ crm_trace("found %s[%lld] missing - signal-handler "
+ "will take care of it",
+ pcmk_children[next_child].name,
+ (long long) PCMK__SPECIAL_PID_AS_0(
+ pcmk_children[next_child].pid));
+ break;
+ }
if (pcmk_children[next_child].respawn) {
crm_err("%s[%lld] terminated",
pcmk_children[next_child].name,
@@ -194,24 +211,13 @@ check_active_before_startup_processes(gpointer user_data)
pcmk_children[next_child].pid));
}
pcmk_process_exit(&(pcmk_children[next_child]));
- if (!pcmk_children[next_child].respawn) {
- /* if a subdaemon is down and we don't want it
- to be restarted this is a success during
- shutdown. if it isn't restarted anymore
- due to MAX_RESPAWN it is
- rather no success.
- */
- if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
- subdaemon_check_progress = time(NULL);
- }
- next_child++;
- }
break;
default:
crm_exit(CRM_EX_FATAL);
break; /* static analysis/noreturn */
}
+ next_child++;
if (next_child >= PCMK__NELEM(pcmk_children)) {
next_child = 0;
}
@@ -285,6 +291,7 @@ pcmk_process_exit(pcmk_child_t * child)
{
child->pid = 0;
child->active_before_startup = false;
+ child->check_count = 0;
child->respawn_count += 1;
if (child->respawn_count > MAX_RESPAWN) {
@@ -307,8 +314,6 @@ pcmk_process_exit(pcmk_child_t * child)
crm_warn("One-off suppressing strict respawning of a child process %s,"
" appears alright per %s IPC end-point",
child->name, child->endpoint);
- /* need to monitor how it evolves, and start new process if badly */
- child->active_before_startup = true;
} else {
if (child->needs_cluster && !pcmkd_cluster_connected()) {
@@ -422,6 +427,7 @@ start_child(pcmk_child_t * child)
const char *env_callgrind = getenv("PCMK_callgrind_enabled");
child->active_before_startup = false;
+ child->check_count = 0;
if (child->command == NULL) {
crm_info("Nothing to do for child \"%s\"", child->name);
--
2.27.0