Resolves: rhbz#2218858 - [sssd] SSSD enters failed state after heavy load in the system
This commit is contained in:
parent
efb42d7981
commit
26c81cdfa6
106
0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch
Normal file
106
0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
From 2cd5a6a2c8fd1826177d6bb51e7d4f4ad368bcfb Mon Sep 17 00:00:00 2001
|
||||||
|
From: Sumit Bose <sbose@redhat.com>
|
||||||
|
Date: Fri, 9 Jun 2023 12:31:39 +0200
|
||||||
|
Subject: [PATCH 1/2] watchdog: add arm_watchdog() and disarm_watchdog() calls
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
Those two new calls can be used if there are requests stuck by e.g.
|
||||||
|
waiting on replies where there is no other way to handle the timeout and
|
||||||
|
get the system back into a stable state. They should be only used as a
|
||||||
|
last resort.
|
||||||
|
|
||||||
|
Resolves: https://github.com/SSSD/sssd/issues/6803
|
||||||
|
|
||||||
|
Reviewed-by: Alexey Tikhonov <atikhono@redhat.com>
|
||||||
|
Reviewed-by: Pavel Březina <pbrezina@redhat.com>
|
||||||
|
(cherry picked from commit 75f2b35ad3b9256de905d05c5108400d35688554)
|
||||||
|
---
|
||||||
|
src/util/util.h | 12 ++++++++++++
|
||||||
|
src/util/util_watchdog.c | 28 ++++++++++++++++++++++++++--
|
||||||
|
2 files changed, 38 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/util/util.h b/src/util/util.h
|
||||||
|
index a8356e0cd..9dbcf3301 100644
|
||||||
|
--- a/src/util/util.h
|
||||||
|
+++ b/src/util/util.h
|
||||||
|
@@ -756,6 +756,18 @@ int setup_watchdog(struct tevent_context *ev, int interval);
|
||||||
|
void teardown_watchdog(void);
|
||||||
|
int get_watchdog_ticks(void);
|
||||||
|
|
||||||
|
+/* The arm_watchdog() and disarm_watchdog() calls will disable and re-enable
|
||||||
|
+ * the watchdog reset, respectively. This means that after arm_watchdog() is
|
||||||
|
+ * called the watchdog will not be reset anymore and it will kill the
|
||||||
|
+ * process if disarm_watchdog() wasn't called before.
|
||||||
|
+ * Those calls should only be used when there is no other way to handle
|
||||||
|
+ * a waiting request and recover into a stable state.
|
||||||
|
+ * Those calls cannot be nested, i.e. after calling arm_watchdog() it should
|
||||||
|
+ * not be called a second time in a different request because then
|
||||||
|
+ * disarm_watchdog() will disable the watchdog coverage for both. */
|
||||||
|
+void arm_watchdog(void);
|
||||||
|
+void disarm_watchdog(void);
|
||||||
|
+
|
||||||
|
/* from files.c */
|
||||||
|
int sss_remove_tree(const char *root);
|
||||||
|
int sss_remove_subtree(const char *root);
|
||||||
|
diff --git a/src/util/util_watchdog.c b/src/util/util_watchdog.c
|
||||||
|
index b1534e499..abafd94b9 100644
|
||||||
|
--- a/src/util/util_watchdog.c
|
||||||
|
+++ b/src/util/util_watchdog.c
|
||||||
|
@@ -40,6 +40,7 @@ struct watchdog_ctx {
|
||||||
|
time_t timestamp;
|
||||||
|
struct tevent_fd *tfd;
|
||||||
|
int pipefd[2];
|
||||||
|
+ bool armed; /* if 'true' ticks counter will not be reset */
|
||||||
|
} watchdog_ctx;
|
||||||
|
|
||||||
|
static void watchdog_detect_timeshift(void)
|
||||||
|
@@ -89,8 +90,13 @@ static void watchdog_event_handler(struct tevent_context *ev,
|
||||||
|
struct timeval current_time,
|
||||||
|
void *private_data)
|
||||||
|
{
|
||||||
|
- /* first thing reset the watchdog ticks */
|
||||||
|
- watchdog_reset();
|
||||||
|
+ if (!watchdog_ctx.armed) {
|
||||||
|
+ /* first thing reset the watchdog ticks */
|
||||||
|
+ watchdog_reset();
|
||||||
|
+ } else {
|
||||||
|
+ DEBUG(SSSDBG_IMPORTANT_INFO,
|
||||||
|
+ "Watchdog armed, process might be terminated soon.\n");
|
||||||
|
+ }
|
||||||
|
|
||||||
|
/* then set a new watchodg event */
|
||||||
|
watchdog_ctx.te = tevent_add_timer(ev, ev,
|
||||||
|
@@ -197,6 +203,7 @@ int setup_watchdog(struct tevent_context *ev, int interval)
|
||||||
|
watchdog_ctx.ev = ev;
|
||||||
|
watchdog_ctx.input_interval = interval;
|
||||||
|
watchdog_ctx.timestamp = time(NULL);
|
||||||
|
+ watchdog_ctx.armed = false;
|
||||||
|
|
||||||
|
ret = pipe(watchdog_ctx.pipefd);
|
||||||
|
if (ret == -1) {
|
||||||
|
@@ -264,3 +271,20 @@ int get_watchdog_ticks(void)
|
||||||
|
{
|
||||||
|
return __sync_add_and_fetch(&watchdog_ctx.ticks, 0);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+void arm_watchdog(void)
|
||||||
|
+{
|
||||||
|
+ if (watchdog_ctx.armed) {
|
||||||
|
+ DEBUG(SSSDBG_CRIT_FAILURE,
|
||||||
|
+ "arm_watchdog() is called although the watchdog is already armed. "
|
||||||
|
+ "This indicates a programming error and should be avoided because "
|
||||||
|
+ "it will most probably not work as expected.\n");
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ watchdog_ctx.armed = true;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void disarm_watchdog(void)
|
||||||
|
+{
|
||||||
|
+ watchdog_ctx.armed = false;
|
||||||
|
+}
|
||||||
|
--
|
||||||
|
2.38.1
|
||||||
|
|
53
0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch
Normal file
53
0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
From 55564defec8fdbb4d9df6b0124a8b18b31743230 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Sumit Bose <sbose@redhat.com>
|
||||||
|
Date: Fri, 9 Jun 2023 13:01:47 +0200
|
||||||
|
Subject: [PATCH 2/2] sbus: arm watchdog for sbus_connect_init_send()
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
There seem to be conditions where the reply in the
|
||||||
|
sbus_call_DBus_Hello_send() request gets lost and the backend cannot
|
||||||
|
properly initialize its sbus/DBus server. Since the backend cannot be
|
||||||
|
connected by the frontends in this state the best way to recover would
|
||||||
|
be a restart. Since the event-loop is active in this state, e.g. waiting
|
||||||
|
for the reply, the watchdog will not consider the process as hung and
|
||||||
|
will not restart the process.
|
||||||
|
|
||||||
|
To make the watchdog handle this case arm_watchdog() and
|
||||||
|
disarm_watchdog() are called before and after the request, respectively.
|
||||||
|
|
||||||
|
Resolves: https://github.com/SSSD/sssd/issues/6803
|
||||||
|
|
||||||
|
Reviewed-by: Alexey Tikhonov <atikhono@redhat.com>
|
||||||
|
Reviewed-by: Pavel Březina <pbrezina@redhat.com>
|
||||||
|
(cherry picked from commit cca9361d92501e0be34d264d370fe897a0c970af)
|
||||||
|
---
|
||||||
|
src/sbus/connection/sbus_connection_connect.c | 4 ++++
|
||||||
|
1 file changed, 4 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/src/sbus/connection/sbus_connection_connect.c b/src/sbus/connection/sbus_connection_connect.c
|
||||||
|
index 45a0fa491..edc090e15 100644
|
||||||
|
--- a/src/sbus/connection/sbus_connection_connect.c
|
||||||
|
+++ b/src/sbus/connection/sbus_connection_connect.c
|
||||||
|
@@ -67,6 +67,8 @@ sbus_connect_init_send(TALLOC_CTX *mem_ctx,
|
||||||
|
|
||||||
|
tevent_req_set_callback(subreq, sbus_connect_init_hello_done, req);
|
||||||
|
|
||||||
|
+ arm_watchdog();
|
||||||
|
+
|
||||||
|
return req;
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -111,6 +113,8 @@ static void sbus_connect_init_done(struct tevent_req *subreq)
|
||||||
|
uint32_t res;
|
||||||
|
errno_t ret;
|
||||||
|
|
||||||
|
+ disarm_watchdog();
|
||||||
|
+
|
||||||
|
req = tevent_req_callback_data(subreq, struct tevent_req);
|
||||||
|
|
||||||
|
ret = sbus_call_DBus_RequestName_recv(subreq, &res);
|
||||||
|
--
|
||||||
|
2.38.1
|
||||||
|
|
@ -27,14 +27,15 @@
|
|||||||
|
|
||||||
Name: sssd
|
Name: sssd
|
||||||
Version: 2.9.1
|
Version: 2.9.1
|
||||||
Release: 1%{?dist}
|
Release: 2%{?dist}
|
||||||
Summary: System Security Services Daemon
|
Summary: System Security Services Daemon
|
||||||
License: GPLv3+
|
License: GPLv3+
|
||||||
URL: https://github.com/SSSD/sssd/
|
URL: https://github.com/SSSD/sssd/
|
||||||
Source0: https://github.com/SSSD/sssd/releases/download/%{version}/sssd-%{version}.tar.gz
|
Source0: https://github.com/SSSD/sssd/releases/download/%{version}/sssd-%{version}.tar.gz
|
||||||
|
|
||||||
### Patches ###
|
### Patches ###
|
||||||
#Patch0001:
|
Patch0001: 0001-watchdog-add-arm_watchdog-and-disarm_watchdog-calls.patch
|
||||||
|
Patch0002: 0002-sbus-arm-watchdog-for-sbus_connect_init_send.patch
|
||||||
|
|
||||||
### Dependencies ###
|
### Dependencies ###
|
||||||
|
|
||||||
@ -1061,6 +1062,9 @@ fi
|
|||||||
%systemd_postun_with_restart sssd.service
|
%systemd_postun_with_restart sssd.service
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Mon Jul 10 2023 Alexey Tikhonov <atikhono@redhat.com> - 2.9.1-2
|
||||||
|
- Resolves: rhbz#2218858 - [sssd] SSSD enters failed state after heavy load in the system
|
||||||
|
|
||||||
* Fri Jun 23 2023 Alexey Tikhonov <atikhono@redhat.com> - 2.9.1-1
|
* Fri Jun 23 2023 Alexey Tikhonov <atikhono@redhat.com> - 2.9.1-1
|
||||||
- Resolves: rhbz#2167837 - Rebase SSSD for RHEL 9.3
|
- Resolves: rhbz#2167837 - Rebase SSSD for RHEL 9.3
|
||||||
- Resolves: rhbz#2196816 - [RHEL9] [sssd] User lookup on IPA client fails with 's2n get_fqlist request failed'
|
- Resolves: rhbz#2196816 - [RHEL9] [sssd] User lookup on IPA client fails with 's2n get_fqlist request failed'
|
||||||
|
Loading…
Reference in New Issue
Block a user