109 lines
4.4 KiB
Diff
109 lines
4.4 KiB
Diff
From ac92690d8426ec4d1c8be1e0eb4b9289411afe75 Mon Sep 17 00:00:00 2001
|
|
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
|
Date: Mon, 24 Jan 2022 12:18:42 +0100
|
|
Subject: [PATCH] Fix: pacemakerd: have signal-handler take care of lost
|
|
processes
|
|
|
|
Regression from the introduction of periodic subdaemon checking in
|
|
cases where the subdaemons are pacemakerd children - previously each
|
|
process was handled by either periodic checking or a signal-handler.
|
|
---
|
|
daemons/pacemakerd/pcmkd_subdaemons.c | 38 ++++++++++++++++-----------
|
|
1 file changed, 22 insertions(+), 16 deletions(-)
|
|
|
|
diff --git a/daemons/pacemakerd/pcmkd_subdaemons.c b/daemons/pacemakerd/pcmkd_subdaemons.c
|
|
index c03903c99e..84ecdc1ee8 100644
|
|
--- a/daemons/pacemakerd/pcmkd_subdaemons.c
|
|
+++ b/daemons/pacemakerd/pcmkd_subdaemons.c
|
|
@@ -141,7 +141,6 @@ check_active_before_startup_processes(gpointer user_data)
|
|
switch (rc) {
|
|
case pcmk_rc_ok:
|
|
pcmk_children[next_child].check_count = 0;
|
|
- next_child++;
|
|
subdaemon_check_progress = time(NULL);
|
|
break;
|
|
case pcmk_rc_ipc_pid_only: // This case: it was previously OK
|
|
@@ -178,9 +177,27 @@ check_active_before_startup_processes(gpointer user_data)
|
|
/* go to the next child and see if
|
|
we can make progress there
|
|
*/
|
|
- next_child++;
|
|
break;
|
|
case pcmk_rc_ipc_unresponsive:
|
|
+ if (!pcmk_children[next_child].respawn) {
|
|
+ /* if a subdaemon is down and we don't want it
|
|
+ to be restarted this is a success during
|
|
+ shutdown. if it isn't restarted anymore
|
|
+ due to MAX_RESPAWN it is
|
|
+ rather no success.
|
|
+ */
|
|
+ if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
|
|
+ subdaemon_check_progress = time(NULL);
|
|
+ }
|
|
+ }
|
|
+ if (!pcmk_children[next_child].active_before_startup) {
|
|
+ crm_trace("found %s[%lld] missing - signal-handler "
|
|
+ "will take care of it",
|
|
+ pcmk_children[next_child].name,
|
|
+ (long long) PCMK__SPECIAL_PID_AS_0(
|
|
+ pcmk_children[next_child].pid));
|
|
+ break;
|
|
+ }
|
|
if (pcmk_children[next_child].respawn) {
|
|
crm_err("%s[%lld] terminated",
|
|
pcmk_children[next_child].name,
|
|
@@ -194,24 +211,13 @@ check_active_before_startup_processes(gpointer user_data)
|
|
pcmk_children[next_child].pid));
|
|
}
|
|
pcmk_process_exit(&(pcmk_children[next_child]));
|
|
- if (!pcmk_children[next_child].respawn) {
|
|
- /* if a subdaemon is down and we don't want it
|
|
- to be restarted this is a success during
|
|
- shutdown. if it isn't restarted anymore
|
|
- due to MAX_RESPAWN it is
|
|
- rather no success.
|
|
- */
|
|
- if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
|
|
- subdaemon_check_progress = time(NULL);
|
|
- }
|
|
- next_child++;
|
|
- }
|
|
break;
|
|
default:
|
|
crm_exit(CRM_EX_FATAL);
|
|
break; /* static analysis/noreturn */
|
|
}
|
|
|
|
+ next_child++;
|
|
if (next_child >= PCMK__NELEM(pcmk_children)) {
|
|
next_child = 0;
|
|
}
|
|
@@ -285,6 +291,7 @@ pcmk_process_exit(pcmk_child_t * child)
|
|
{
|
|
child->pid = 0;
|
|
child->active_before_startup = false;
|
|
+ child->check_count = 0;
|
|
|
|
child->respawn_count += 1;
|
|
if (child->respawn_count > MAX_RESPAWN) {
|
|
@@ -307,8 +314,6 @@ pcmk_process_exit(pcmk_child_t * child)
|
|
crm_warn("One-off suppressing strict respawning of a child process %s,"
|
|
" appears alright per %s IPC end-point",
|
|
child->name, child->endpoint);
|
|
- /* need to monitor how it evolves, and start new process if badly */
|
|
- child->active_before_startup = true;
|
|
|
|
} else {
|
|
if (child->needs_cluster && !pcmkd_cluster_connected()) {
|
|
@@ -422,6 +427,7 @@ start_child(pcmk_child_t * child)
|
|
const char *env_callgrind = getenv("PCMK_callgrind_enabled");
|
|
|
|
child->active_before_startup = false;
|
|
+ child->check_count = 0;
|
|
|
|
if (child->command == NULL) {
|
|
crm_info("Nothing to do for child \"%s\"", child->name);
|
|
--
|
|
2.27.0
|
|
|