Resolves: rhbz#2218858 - [sssd] SSSD enters failed state after heavy load in the system
This commit is contained in:
parent
efb42d7981
commit
26c81cdfa6
106
0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch
Normal file
106
0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch
Normal file
@ -0,0 +1,106 @@
|
||||
From 2cd5a6a2c8fd1826177d6bb51e7d4f4ad368bcfb Mon Sep 17 00:00:00 2001
|
||||
From: Sumit Bose <sbose@redhat.com>
|
||||
Date: Fri, 9 Jun 2023 12:31:39 +0200
|
||||
Subject: [PATCH 1/2] watchdog: add arm_watchdog() and disarm_watchdog() calls
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Those two new calls can be used if there are requests stuck by e.g.
|
||||
waiting on replies where there is no other way to handle the timeout and
|
||||
get the system back into a stable state. They should be only used as a
|
||||
last resort.
|
||||
|
||||
Resolves: https://github.com/SSSD/sssd/issues/6803
|
||||
|
||||
Reviewed-by: Alexey Tikhonov <atikhono@redhat.com>
|
||||
Reviewed-by: Pavel Březina <pbrezina@redhat.com>
|
||||
(cherry picked from commit 75f2b35ad3b9256de905d05c5108400d35688554)
|
||||
---
|
||||
src/util/util.h | 12 ++++++++++++
|
||||
src/util/util_watchdog.c | 28 ++++++++++++++++++++++++++--
|
||||
2 files changed, 38 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/util/util.h b/src/util/util.h
|
||||
index a8356e0cd..9dbcf3301 100644
|
||||
--- a/src/util/util.h
|
||||
+++ b/src/util/util.h
|
||||
@@ -756,6 +756,18 @@ int setup_watchdog(struct tevent_context *ev, int interval);
|
||||
void teardown_watchdog(void);
|
||||
int get_watchdog_ticks(void);
|
||||
|
||||
+/* The arm_watchdog() and disarm_watchdog() calls will disable and re-enable
|
||||
+ * the watchdog reset, respectively. This means that after arm_watchdog() is
|
||||
+ * called the watchdog will not be reset anymore and it will kill the
|
||||
+ * process if disarm_watchdog() wasn't called before.
|
||||
+ * Those calls should only be used when there is no other way to handle
|
||||
+ * a waiting request and recover into a stable state.
|
||||
+ * Those calls cannot be nested, i.e. after calling arm_watchdog() it should
|
||||
+ * not be called a second time in a different request because then
|
||||
+ * disarm_watchdog() will disable the watchdog coverage for both. */
|
||||
+void arm_watchdog(void);
|
||||
+void disarm_watchdog(void);
|
||||
+
|
||||
/* from files.c */
|
||||
int sss_remove_tree(const char *root);
|
||||
int sss_remove_subtree(const char *root);
|
||||
diff --git a/src/util/util_watchdog.c b/src/util/util_watchdog.c
|
||||
index b1534e499..abafd94b9 100644
|
||||
--- a/src/util/util_watchdog.c
|
||||
+++ b/src/util/util_watchdog.c
|
||||
@@ -40,6 +40,7 @@ struct watchdog_ctx {
|
||||
time_t timestamp;
|
||||
struct tevent_fd *tfd;
|
||||
int pipefd[2];
|
||||
+ bool armed; /* if 'true' ticks counter will not be reset */
|
||||
} watchdog_ctx;
|
||||
|
||||
static void watchdog_detect_timeshift(void)
|
||||
@@ -89,8 +90,13 @@ static void watchdog_event_handler(struct tevent_context *ev,
|
||||
struct timeval current_time,
|
||||
void *private_data)
|
||||
{
|
||||
- /* first thing reset the watchdog ticks */
|
||||
- watchdog_reset();
|
||||
+ if (!watchdog_ctx.armed) {
|
||||
+ /* first thing reset the watchdog ticks */
|
||||
+ watchdog_reset();
|
||||
+ } else {
|
||||
+ DEBUG(SSSDBG_IMPORTANT_INFO,
|
||||
+ "Watchdog armed, process might be terminated soon.\n");
|
||||
+ }
|
||||
|
||||
/* then set a new watchodg event */
|
||||
watchdog_ctx.te = tevent_add_timer(ev, ev,
|
||||
@@ -197,6 +203,7 @@ int setup_watchdog(struct tevent_context *ev, int interval)
|
||||
watchdog_ctx.ev = ev;
|
||||
watchdog_ctx.input_interval = interval;
|
||||
watchdog_ctx.timestamp = time(NULL);
|
||||
+ watchdog_ctx.armed = false;
|
||||
|
||||
ret = pipe(watchdog_ctx.pipefd);
|
||||
if (ret == -1) {
|
||||
@@ -264,3 +271,20 @@ int get_watchdog_ticks(void)
|
||||
{
|
||||
return __sync_add_and_fetch(&watchdog_ctx.ticks, 0);
|
||||
}
|
||||
+
|
||||
+void arm_watchdog(void)
|
||||
+{
|
||||
+ if (watchdog_ctx.armed) {
|
||||
+ DEBUG(SSSDBG_CRIT_FAILURE,
|
||||
+ "arm_watchdog() is called although the watchdog is already armed. "
|
||||
+ "This indicates a programming error and should be avoided because "
|
||||
+ "it will most probably not work as expected.\n");
|
||||
+ }
|
||||
+
|
||||
+ watchdog_ctx.armed = true;
|
||||
+}
|
||||
+
|
||||
+void disarm_watchdog(void)
|
||||
+{
|
||||
+ watchdog_ctx.armed = false;
|
||||
+}
|
||||
--
|
||||
2.38.1
|
||||
|
53
0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch
Normal file
53
0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch
Normal file
@ -0,0 +1,53 @@
|
||||
From 55564defec8fdbb4d9df6b0124a8b18b31743230 Mon Sep 17 00:00:00 2001
|
||||
From: Sumit Bose <sbose@redhat.com>
|
||||
Date: Fri, 9 Jun 2023 13:01:47 +0200
|
||||
Subject: [PATCH 2/2] sbus: arm watchdog for sbus_connect_init_send()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
There seem to be conditions where the reply in the
|
||||
sbus_call_DBus_Hello_send() request gets lost and the backend cannot
|
||||
properly initialize its sbus/DBus server. Since the backend cannot be
|
||||
connected by the frontends in this state the best way to recover would
|
||||
be a restart. Since the event-loop is active in this state, e.g. waiting
|
||||
for the reply, the watchdog will not consider the process as hung and
|
||||
will not restart the process.
|
||||
|
||||
To make the watchdog handle this case arm_watchdog() and
|
||||
disarm_watchdog() are called before and after the request, respectively.
|
||||
|
||||
Resolves: https://github.com/SSSD/sssd/issues/6803
|
||||
|
||||
Reviewed-by: Alexey Tikhonov <atikhono@redhat.com>
|
||||
Reviewed-by: Pavel Březina <pbrezina@redhat.com>
|
||||
(cherry picked from commit cca9361d92501e0be34d264d370fe897a0c970af)
|
||||
---
|
||||
src/sbus/connection/sbus_connection_connect.c | 4 ++++
|
||||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/src/sbus/connection/sbus_connection_connect.c b/src/sbus/connection/sbus_connection_connect.c
|
||||
index 45a0fa491..edc090e15 100644
|
||||
--- a/src/sbus/connection/sbus_connection_connect.c
|
||||
+++ b/src/sbus/connection/sbus_connection_connect.c
|
||||
@@ -67,6 +67,8 @@ sbus_connect_init_send(TALLOC_CTX *mem_ctx,
|
||||
|
||||
tevent_req_set_callback(subreq, sbus_connect_init_hello_done, req);
|
||||
|
||||
+ arm_watchdog();
|
||||
+
|
||||
return req;
|
||||
}
|
||||
|
||||
@@ -111,6 +113,8 @@ static void sbus_connect_init_done(struct tevent_req *subreq)
|
||||
uint32_t res;
|
||||
errno_t ret;
|
||||
|
||||
+ disarm_watchdog();
|
||||
+
|
||||
req = tevent_req_callback_data(subreq, struct tevent_req);
|
||||
|
||||
ret = sbus_call_DBus_RequestName_recv(subreq, &res);
|
||||
--
|
||||
2.38.1
|
||||
|
@ -27,14 +27,15 @@
|
||||
|
||||
Name: sssd
|
||||
Version: 2.9.1
|
||||
Release: 1%{?dist}
|
||||
Release: 2%{?dist}
|
||||
Summary: System Security Services Daemon
|
||||
License: GPLv3+
|
||||
URL: https://github.com/SSSD/sssd/
|
||||
Source0: https://github.com/SSSD/sssd/releases/download/%{version}/sssd-%{version}.tar.gz
|
||||
|
||||
### Patches ###
|
||||
#Patch0001:
|
||||
Patch0001: 0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch
|
||||
Patch0002: 0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch
|
||||
|
||||
### Dependencies ###
|
||||
|
||||
@ -1061,6 +1062,9 @@ fi
|
||||
%systemd_postun_with_restart sssd.service
|
||||
|
||||
%changelog
|
||||
* Mon Jul 10 2023 Alexey Tikhonov <atikhono@redhat.com> - 2.9.1-2
|
||||
- Resolves: rhbz#2218858 - [sssd] SSSD enters failed state after heavy load in the system
|
||||
|
||||
* Fri Jun 23 2023 Alexey Tikhonov <atikhono@redhat.com> - 2.9.1-1
|
||||
- Resolves: rhbz#2167837 - Rebase SSSD for RHEL 9.3
|
||||
- Resolves: rhbz#2196816 - [RHEL9] [sssd] User lookup on IPA client fails with 's2n get_fqlist request failed'
|
||||
|
Loading…
Reference in New Issue
Block a user