63 lines
2.5 KiB
Diff
63 lines
2.5 KiB
Diff
From 21482202567979b8a17cc750b095272b3270ee76 Mon Sep 17 00:00:00 2001
|
|
From: Quentin Armitage <quentin@armitage.org.uk>
|
|
Date: Wed, 13 Nov 2019 10:37:38 +0000
|
|
Subject: [PATCH] Fix intermittent "child lost" messages
|
|
|
|
Issue #1364 identified that occasionally a "child lost" message could
|
|
be logged. Although keepalived continued working as expected, the
|
|
"child lost" message indicated that something wasn't working properly.
|
|
|
|
If a vrrp track script had a timeout in the script that was the
|
|
same as the script timeout configured in keepalived, when the system
|
|
was heavily loaded it was possible for the timeout to occur, followed
|
|
by the termination before the timeout thread was run, in which case
|
|
the termination would be lost because the child thread was no longer
|
|
on the child_pid queue, but on the ready queue.
|
|
|
|
This commit leaves a thread on the child_pid queue after a timeout, and
|
|
only removes it when the timeout thread is run. That means that if the
|
|
termination is received before the timeout thread is run, the thread
|
|
(now on the ready queue) can be updated to be a termination rather than
|
|
a timeout.
|
|
|
|
Signed-off-by: Quentin Armitage <quentin@armitage.org.uk>
|
|
---
|
|
lib/scheduler.c | 14 ++++++++++++++
|
|
1 file changed, 14 insertions(+)
|
|
|
|
diff --git a/lib/scheduler.c b/lib/scheduler.c
|
|
index 0a1c334c..f6d9bad1 100644
|
|
--- a/lib/scheduler.c
|
|
+++ b/lib/scheduler.c
|
|
@@ -1708,6 +1708,14 @@ process_threads(thread_master_t *m)
|
|
* We only want timer and signal fd, and don't want inotify, vrrp socket,
|
|
* snmp_read, bfd_receiver, bfd pipe in vrrp/check, dbus pipe or netlink fds. */
|
|
thread = thread_trim_head(thread_list);
|
|
+
|
|
+ if (thread && thread->type == THREAD_CHILD_TIMEOUT) {
|
|
+ /* We remove the thread from the child_pid queue here so that
|
|
+ * if the termination arrives before we processed the timeout
|
|
+ * we can still handle the termination. */
|
|
+ rb_erase(&thread->rb_data, &master->child_pid);
|
|
+ }
|
|
+
|
|
if (!shutting_down ||
|
|
(thread->type == THREAD_READY_FD &&
|
|
(thread->u.fd == m->timer_fd || thread->u.fd == m->signal_fd)) ||
|
|
@@ -1773,6 +1781,12 @@ process_child_termination(pid_t pid, int status)
|
|
|
|
thread_add_terminate_event(m);
|
|
}
|
|
+ else if (thread->type == THREAD_CHILD_TIMEOUT) {
|
|
+ /* The child had been timed out, but we have not processed the timeout
|
|
+ * and it is still on the thread->ready queue. Since we have now got
|
|
+ * the termination, just handle the termination instead. */
|
|
+ thread->type = THREAD_CHILD_TERMINATED;
|
|
+ }
|
|
else
|
|
thread_move_ready(m, &m->child, thread, THREAD_CHILD_TERMINATED);
|
|
}
|
|
--
|
|
2.26.2
|
|
|