From 26c81cdfa6fdda4aab69e0184839be0fb74ef73d Mon Sep 17 00:00:00 2001 From: Alexey Tikhonov Date: Mon, 10 Jul 2023 18:16:53 +0200 Subject: [PATCH] Resolves: rhbz#2218858 - [sssd] SSSD enters failed state after heavy load in the system --- ...m_watchdog-and-disarm_watchdog-calls.patch | 106 ++++++++++++++++++ ...-watchdog-for-sbus_connect_init_send.patch | 53 +++++++++ sssd.spec | 8 +- 3 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch create mode 100644 0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch diff --git a/0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch b/0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch new file mode 100644 index 0000000..6a77149 --- /dev/null +++ b/0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch @@ -0,0 +1,106 @@ +From 2cd5a6a2c8fd1826177d6bb51e7d4f4ad368bcfb Mon Sep 17 00:00:00 2001 +From: Sumit Bose +Date: Fri, 9 Jun 2023 12:31:39 +0200 +Subject: [PATCH 1/2] watchdog: add arm_watchdog() and disarm_watchdog() calls +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Those two new calls can be used if there are requests stuck by e.g. +waiting on replies where there is no other way to handle the timeout and +get the system back into a stable state. They should be only used as a +last resort. 
+ +Resolves: https://github.com/SSSD/sssd/issues/6803 + +Reviewed-by: Alexey Tikhonov +Reviewed-by: Pavel Březina +(cherry picked from commit 75f2b35ad3b9256de905d05c5108400d35688554) +--- + src/util/util.h | 12 ++++++++++++ + src/util/util_watchdog.c | 28 ++++++++++++++++++++++++++-- + 2 files changed, 38 insertions(+), 2 deletions(-) + +diff --git a/src/util/util.h b/src/util/util.h +index a8356e0cd..9dbcf3301 100644 +--- a/src/util/util.h ++++ b/src/util/util.h +@@ -756,6 +756,18 @@ int setup_watchdog(struct tevent_context *ev, int interval); + void teardown_watchdog(void); + int get_watchdog_ticks(void); + ++/* The arm_watchdog() and disarm_watchdog() calls will disable and re-enable ++ * the watchdog reset, respectively. This means that after arm_watchdog() is ++ * called the watchdog will not be reset anymore and it will kill the ++ * process if disarm_watchdog() wasn't called before. ++ * Those calls should only be used when there is no other way to handle ++ * a waiting request and recover into a stable state. ++ * Those calls cannot be nested, i.e. after calling arm_watchdog() it should ++ * not be called a second time in a different request because then ++ * disarm_watchdog() will disable the watchdog coverage for both. 
*/ ++void arm_watchdog(void); ++void disarm_watchdog(void); ++ + /* from files.c */ + int sss_remove_tree(const char *root); + int sss_remove_subtree(const char *root); +diff --git a/src/util/util_watchdog.c b/src/util/util_watchdog.c +index b1534e499..abafd94b9 100644 +--- a/src/util/util_watchdog.c ++++ b/src/util/util_watchdog.c +@@ -40,6 +40,7 @@ struct watchdog_ctx { + time_t timestamp; + struct tevent_fd *tfd; + int pipefd[2]; ++ bool armed; /* if 'true' ticks counter will not be reset */ + } watchdog_ctx; + + static void watchdog_detect_timeshift(void) +@@ -89,8 +90,13 @@ static void watchdog_event_handler(struct tevent_context *ev, + struct timeval current_time, + void *private_data) + { +- /* first thing reset the watchdog ticks */ +- watchdog_reset(); ++ if (!watchdog_ctx.armed) { ++ /* first thing reset the watchdog ticks */ ++ watchdog_reset(); ++ } else { ++ DEBUG(SSSDBG_IMPORTANT_INFO, ++ "Watchdog armed, process might be terminated soon.\n"); ++ } + + /* then set a new watchodg event */ + watchdog_ctx.te = tevent_add_timer(ev, ev, +@@ -197,6 +203,7 @@ int setup_watchdog(struct tevent_context *ev, int interval) + watchdog_ctx.ev = ev; + watchdog_ctx.input_interval = interval; + watchdog_ctx.timestamp = time(NULL); ++ watchdog_ctx.armed = false; + + ret = pipe(watchdog_ctx.pipefd); + if (ret == -1) { +@@ -264,3 +271,20 @@ int get_watchdog_ticks(void) + { + return __sync_add_and_fetch(&watchdog_ctx.ticks, 0); + } ++ ++void arm_watchdog(void) ++{ ++ if (watchdog_ctx.armed) { ++ DEBUG(SSSDBG_CRIT_FAILURE, ++ "arm_watchdog() is called although the watchdog is already armed. 
" ++ "This indicates a programming error and should be avoided because " ++ "it will most probably not work as expected.\n"); ++ } ++ ++ watchdog_ctx.armed = true; ++} ++ ++void disarm_watchdog(void) ++{ ++ watchdog_ctx.armed = false; ++} +-- +2.38.1 + diff --git a/0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch b/0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch new file mode 100644 index 0000000..99e7c04 --- /dev/null +++ b/0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch @@ -0,0 +1,53 @@ +From 55564defec8fdbb4d9df6b0124a8b18b31743230 Mon Sep 17 00:00:00 2001 +From: Sumit Bose +Date: Fri, 9 Jun 2023 13:01:47 +0200 +Subject: [PATCH 2/2] sbus: arm watchdog for sbus_connect_init_send() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +There seem to be conditions where the reply in the +sbus_call_DBus_Hello_send() request gets lost and the backend cannot +properly initialize its sbus/DBus server. Since the backend cannot be +connected by the frontends in this state the best way to recover would +be a restart. Since the event-loop is active in this state, e.g. waiting +for the reply, the watchdog will not consider the process as hung and +will not restart the process. + +To make the watchdog handle this case arm_watchdog() and +disarm_watchdog() are called before and after the request, respectively. 
+ +Resolves: https://github.com/SSSD/sssd/issues/6803 + +Reviewed-by: Alexey Tikhonov +Reviewed-by: Pavel Březina +(cherry picked from commit cca9361d92501e0be34d264d370fe897a0c970af) +--- + src/sbus/connection/sbus_connection_connect.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/src/sbus/connection/sbus_connection_connect.c b/src/sbus/connection/sbus_connection_connect.c +index 45a0fa491..edc090e15 100644 +--- a/src/sbus/connection/sbus_connection_connect.c ++++ b/src/sbus/connection/sbus_connection_connect.c +@@ -67,6 +67,8 @@ sbus_connect_init_send(TALLOC_CTX *mem_ctx, + + tevent_req_set_callback(subreq, sbus_connect_init_hello_done, req); + ++ arm_watchdog(); ++ + return req; + } + +@@ -111,6 +113,8 @@ static void sbus_connect_init_done(struct tevent_req *subreq) + uint32_t res; + errno_t ret; + ++ disarm_watchdog(); ++ + req = tevent_req_callback_data(subreq, struct tevent_req); + + ret = sbus_call_DBus_RequestName_recv(subreq, &res); +-- +2.38.1 + diff --git a/sssd.spec b/sssd.spec index 4b61753..372efda 100644 --- a/sssd.spec +++ b/sssd.spec @@ -27,14 +27,15 @@ Name: sssd Version: 2.9.1 -Release: 1%{?dist} +Release: 2%{?dist} Summary: System Security Services Daemon License: GPLv3+ URL: https://github.com/SSSD/sssd/ Source0: https://github.com/SSSD/sssd/releases/download/%{version}/sssd-%{version}.tar.gz ### Patches ### -#Patch0001: +Patch0001: 0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch +Patch0002: 0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch ### Dependencies ### @@ -1061,6 +1062,9 @@ fi %systemd_postun_with_restart sssd.service %changelog +* Mon Jul 10 2023 Alexey Tikhonov - 2.9.1-2 +- Resolves: rhbz#2218858 - [sssd] SSSD enters failed state after heavy load in the system + * Fri Jun 23 2023 Alexey Tikhonov - 2.9.1-1 - Resolves: rhbz#2167837 - Rebase SSSD for RHEL 9.3 - Resolves: rhbz#2196816 - [RHEL9] [sssd] User lookup on IPA client fails with 's2n get_fqlist request failed'