diff --git a/0007-failover-add-failover_primary_timeout-option.patch b/0007-failover-add-failover_primary_timeout-option.patch new file mode 100644 index 0000000..c24ca1c --- /dev/null +++ b/0007-failover-add-failover_primary_timeout-option.patch @@ -0,0 +1,306 @@ +From 14f32f681a25aac185d72bc6d22a9e3b59dd265a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Pavel=20B=C5=99ezina?= +Date: Tue, 30 Apr 2024 12:28:53 +0200 +Subject: [PATCH] failover: add failover_primary_timeout option + +This was previously hardcoded to 31 seconds (hardcoded retry_timeout + +1). This may be too short a period under some circumstances. + +When we retry primary server we drop connection to the backup server and +if the primary server is not yet available (and there are many +unavailable primary servers) we may go through a long timeout cycle +every half minute. + +This patch makes the value configurable. + +:config: Added `failover_primary_timeout` configuration option. This + can be used to configure how often SSSD tries to reconnect to a + primary server after a successful connection to a backup server. + This was previously hardcoded to 31 seconds which is kept as + the default value. 
+ +Resolves: https://github.com/SSSD/sssd/issues/7375 + +Reviewed-by: Alexey Tikhonov +Reviewed-by: Iker Pedrosa +(cherry picked from commit e9738e36937e78f80bb2772c48cffbddf39bd5fe) +--- + src/config/SSSDConfig/sssdoptions.py | 2 + + src/config/SSSDConfigTest.py | 2 + + src/config/cfg_rules.ini | 1 + + src/config/etc/sssd.api.conf | 1 + + src/man/sssd.conf.5.xml | 19 ++++++++ + src/providers/data_provider.h | 1 + + src/providers/data_provider_fo.c | 14 +++++- + src/providers/fail_over.c | 10 +++++ + src/providers/fail_over.h | 3 ++ + src/tests/system/tests/test_failover.py | 59 +++++++++++++++++++++++++ + 10 files changed, 110 insertions(+), 2 deletions(-) + create mode 100644 src/tests/system/tests/test_failover.py + +diff --git a/src/config/SSSDConfig/sssdoptions.py b/src/config/SSSDConfig/sssdoptions.py +index 0d75e6d82..95b39aa59 100644 +--- a/src/config/SSSDConfig/sssdoptions.py ++++ b/src/config/SSSDConfig/sssdoptions.py +@@ -186,6 +186,8 @@ class SSSDOptions(object): + 'dns_resolver_op_timeout': _('How long should keep trying to resolve single DNS query (seconds)'), + 'dns_resolver_timeout': _('How long to wait for replies from DNS when resolving servers (seconds)'), + 'dns_discovery_domain': _('The domain part of service discovery DNS query'), ++ 'failover_primary_timeout': _('How often SSSD tries to reconnect to the primary server after a successful ' ++ 'connection to the backup server.'), + 'override_gid': _('Override GID value from the identity provider with this value'), + 'case_sensitive': _('Treat usernames as case sensitive'), + 'entry_cache_user_timeout': _('Entry cache timeout length (seconds)'), +diff --git a/src/config/SSSDConfigTest.py b/src/config/SSSDConfigTest.py +index b160be2b1..f333c35eb 100755 +--- a/src/config/SSSDConfigTest.py ++++ b/src/config/SSSDConfigTest.py +@@ -579,6 +579,7 @@ class SSSDConfigTestSSSDDomain(unittest.TestCase): + 'dns_resolver_op_timeout', + 'dns_resolver_timeout', + 'dns_discovery_domain', ++ 
'failover_primary_timeout', + 'dyndns_update', + 'dyndns_ttl', + 'dyndns_iface', +@@ -939,6 +940,7 @@ class SSSDConfigTestSSSDDomain(unittest.TestCase): + 'dns_resolver_op_timeout', + 'dns_resolver_timeout', + 'dns_discovery_domain', ++ 'failover_primary_timeout', + 'dyndns_update', + 'dyndns_ttl', + 'dyndns_iface', +diff --git a/src/config/cfg_rules.ini b/src/config/cfg_rules.ini +index 92e87fb18..4c2ea0b87 100644 +--- a/src/config/cfg_rules.ini ++++ b/src/config/cfg_rules.ini +@@ -405,6 +405,7 @@ option = dns_resolver_op_timeout + option = dns_resolver_timeout + option = dns_resolver_use_search_list + option = dns_discovery_domain ++option = failover_primary_timeout + option = override_gid + option = case_sensitive + option = override_homedir +diff --git a/src/config/etc/sssd.api.conf b/src/config/etc/sssd.api.conf +index 5ae6aab19..31787c23c 100644 +--- a/src/config/etc/sssd.api.conf ++++ b/src/config/etc/sssd.api.conf +@@ -172,6 +172,7 @@ dns_resolver_server_timeout = int, None, false + dns_resolver_op_timeout = int, None, false + dns_resolver_timeout = int, None, false + dns_discovery_domain = str, None, false ++failover_primary_timeout = int, None, false + override_gid = int, None, false + case_sensitive = str, None, false + override_homedir = str, None, false +diff --git a/src/man/sssd.conf.5.xml b/src/man/sssd.conf.5.xml +index 339f21e25..fbb82e357 100644 +--- a/src/man/sssd.conf.5.xml ++++ b/src/man/sssd.conf.5.xml +@@ -3773,6 +3773,25 @@ pam_gssapi_indicators_map = sudo:pkinit, sudo-i:pkinit + + + ++ ++ failover_primary_timeout (integer) ++ ++ ++ When no primary server is currently available, ++ SSSD fails over to a backup server. This option ++ defines the amount of time (in seconds) to ++ wait before SSSD tries to reconnect to a primary ++ server again. ++ ++ ++ Note: The minimum value is 31. 
++ ++ ++ Default: 31 ++ ++ ++ ++ + + override_gid (integer) + +diff --git a/src/providers/data_provider.h b/src/providers/data_provider.h +index 36a82b84d..def35e491 100644 +--- a/src/providers/data_provider.h ++++ b/src/providers/data_provider.h +@@ -267,6 +267,7 @@ enum dp_res_opts { + DP_RES_OPT_RESOLVER_SERVER_TIMEOUT, + DP_RES_OPT_RESOLVER_USE_SEARCH_LIST, + DP_RES_OPT_DNS_DOMAIN, ++ DP_RES_OPT_FAILOVER_PRIMARY_TIMEOUT, + + DP_RES_OPTS /* attrs counter */ + }; +diff --git a/src/providers/data_provider_fo.c b/src/providers/data_provider_fo.c +index b0aed54e9..c23f92e35 100644 +--- a/src/providers/data_provider_fo.c ++++ b/src/providers/data_provider_fo.c +@@ -48,10 +48,20 @@ static int be_fo_get_options(struct be_ctx *ctx, + DP_RES_OPT_RESOLVER_TIMEOUT); + opts->use_search_list = dp_opt_get_bool(ctx->be_res->opts, + DP_RES_OPT_RESOLVER_USE_SEARCH_LIST); ++ opts->primary_timeout = dp_opt_get_int(ctx->be_res->opts, ++ DP_RES_OPT_FAILOVER_PRIMARY_TIMEOUT); ++ + opts->retry_timeout = 30; + opts->srv_retry_neg_timeout = 15; + opts->family_order = ctx->be_res->family_order; + ++ if (opts->primary_timeout <= opts->retry_timeout) { ++ opts->primary_timeout = opts->retry_timeout + 1; ++ DEBUG(SSSDBG_CONF_SETTINGS, ++ "Warning: failover_primary_timeout is too low, using %lu " ++ "seconds instead\n", opts->primary_timeout); ++ } ++ + return EOK; + } + +@@ -551,7 +561,7 @@ static void be_resolve_server_done(struct tevent_req *subreq) + struct tevent_req); + struct be_resolve_server_state *state = tevent_req_data(req, + struct be_resolve_server_state); +- time_t timeout = fo_get_service_retry_timeout(state->svc->fo_service) + 1; ++ time_t timeout = fo_get_primary_retry_timeout(state->svc->fo_service); + int ret; + + ret = be_resolve_server_process(subreq, state, &new_subreq); +@@ -564,7 +574,6 @@ static void be_resolve_server_done(struct tevent_req *subreq) + } + + if (!fo_is_server_primary(state->srv)) { +- /* FIXME: make the timeout configurable */ + ret = 
be_primary_server_timeout_activate(state->ctx, state->ev, + state->ctx, state->svc, + timeout); +@@ -871,6 +880,7 @@ static struct dp_option dp_res_default_opts[] = { + { "dns_resolver_server_timeout", DP_OPT_NUMBER, { .number = 1000 }, NULL_NUMBER }, + { "dns_resolver_use_search_list", DP_OPT_BOOL, BOOL_TRUE, BOOL_TRUE }, + { "dns_discovery_domain", DP_OPT_STRING, NULL_STRING, NULL_STRING }, ++ { "failover_primary_timeout", DP_OPT_NUMBER, { .number = 31 }, NULL_NUMBER }, + DP_OPTION_TERMINATOR + }; + +diff --git a/src/providers/fail_over.c b/src/providers/fail_over.c +index 7cb642448..7f94407c5 100644 +--- a/src/providers/fail_over.c ++++ b/src/providers/fail_over.c +@@ -158,6 +158,7 @@ fo_context_init(TALLOC_CTX *mem_ctx, struct fo_options *opts) + + ctx->opts->srv_retry_neg_timeout = opts->srv_retry_neg_timeout; + ctx->opts->retry_timeout = opts->retry_timeout; ++ ctx->opts->primary_timeout = opts->primary_timeout; + ctx->opts->family_order = opts->family_order; + ctx->opts->service_resolv_timeout = opts->service_resolv_timeout; + ctx->opts->use_search_list = opts->use_search_list; +@@ -1740,6 +1741,15 @@ time_t fo_get_service_retry_timeout(struct fo_service *svc) + return svc->ctx->opts->retry_timeout; + } + ++time_t fo_get_primary_retry_timeout(struct fo_service *svc) ++{ ++ if (svc == NULL || svc->ctx == NULL || svc->ctx->opts == NULL) { ++ return 0; ++ } ++ ++ return svc->ctx->opts->primary_timeout; ++} ++ + bool fo_get_use_search_list(struct fo_server *server) + { + if ( +diff --git a/src/providers/fail_over.h b/src/providers/fail_over.h +index 36021ad6f..924a09970 100644 +--- a/src/providers/fail_over.h ++++ b/src/providers/fail_over.h +@@ -83,6 +83,7 @@ struct fo_server; + struct fo_options { + time_t srv_retry_neg_timeout; + time_t retry_timeout; ++ time_t primary_timeout; + int service_resolv_timeout; + bool use_search_list; + enum restrict_family family_order; +@@ -211,6 +212,8 @@ int fo_is_srv_lookup(struct fo_server *s); + + time_t 
fo_get_service_retry_timeout(struct fo_service *svc); + ++time_t fo_get_primary_retry_timeout(struct fo_service *svc); ++ + bool fo_get_use_search_list(struct fo_server *server); + + void fo_reset_services(struct fo_ctx *fo_ctx); +diff --git a/src/tests/system/tests/test_failover.py b/src/tests/system/tests/test_failover.py +new file mode 100644 +index 000000000..565cec9bc +--- /dev/null ++++ b/src/tests/system/tests/test_failover.py +@@ -0,0 +1,59 @@ ++""" ++SSSD Failover tests. ++ ++:requirement: Failover ++""" ++ ++from __future__ import annotations ++ ++import pytest ++from sssd_test_framework.roles.client import Client ++from sssd_test_framework.roles.ldap import LDAP ++from sssd_test_framework.topology import KnownTopology ++ ++ ++@pytest.mark.parametrize("value, expected", [(None, 31), (15, 31), (60, 60)]) ++@pytest.mark.importance("low") ++@pytest.mark.ticket(gh=7375, jira="RHEL-17659") ++@pytest.mark.topology(KnownTopology.LDAP) ++def test_failover__retry_primary(client: Client, ldap: LDAP, value: int | None, expected: int): ++ """ ++ :title: Primary server reactivation timeout is respected ++ :setup: ++ 1. Create LDAP user "user-1" ++ 2. Set failover_primary_timeout to @value ++ 3. Set ldap_uri to invalid, not working server ++ 4. Set ldap_backup_uri to working server ++ 5. Start SSSD ++ :steps: ++ 1. Lookup user-1 ++ 2. Check that SSSD is connected to backup server ++ 3. Find "Primary server reactivation timeout set to @expected seconds" in domain logs ++ :expectedresults: ++ 1. SSSD failover to backup server ++ 2. SSSD is indeed connected to the backup server ++ 3. 
String is found ++ :customerscenario: True ++ """ ++ ldap.user("user-1").add() ++ ++ if value is not None: ++ client.sssd.domain["failover_primary_timeout"] = str(value) ++ ++ client.sssd.enable_responder("ifp") ++ client.sssd.domain["ldap_uri"] = "ldap://ldap.invalid" ++ client.sssd.domain["ldap_backup_uri"] = f"ldap://{ldap.host.hostname}" ++ client.sssd.start() ++ ++ # Lookup user to make sure SSSD did correctly failover to backup server ++ result = client.tools.id("user-1") ++ assert result is not None ++ ++ # Check that SSSD is indeed connected to backup server ++ assert client.sssd.default_domain is not None ++ status = client.sssctl.domain_status(client.sssd.default_domain, active=True) ++ assert ldap.host.hostname in status.stdout ++ ++ # Check that primary server reactivation timeout was correctly created ++ log = client.fs.read(client.sssd.logs.domain()) ++ assert f"Primary server reactivation timeout set to {expected} seconds" in log +-- +2.46.0 + diff --git a/sssd.spec b/sssd.spec index 3963821..9b0624e 100644 --- a/sssd.spec +++ b/sssd.spec @@ -19,7 +19,7 @@ Name: sssd Version: 2.9.4 -Release: 4%{?dist} +Release: 5%{?dist} Group: Applications/System Summary: System Security Services Daemon License: GPLv3+ @@ -33,6 +33,7 @@ Patch0003: 0003-sdap-add-naming_context-as-new-member-of-struct-sdap.patch Patch0004: 0004-pam-fix-SC-auth-with-multiple-certs-and-missing-logi.patch Patch0005: 0005-ad-gpo-use-hash-to-store-intermediate-results.patch Patch0006: 0006-ad-refresh-root-domain-when-read-directly.patch +Patch0007: 0007-failover-add-failover_primary_timeout-option.patch ### Downstream Patches ### @@ -1217,6 +1218,9 @@ fi %systemd_postun_with_restart sssd.service %changelog +* Mon Sep 09 2024 Anuar Beisembayev - 2.9.4-5 +- Resolves: RHEL-39085 - [RfE] SSSD Failover Enhancements + * Fri May 17 2024 Arun Bansal - 2.9.4-4 - Resolves: RHEL-33957 - ad: refresh root domain when read directly