- Bump version to 2.3.6-5
- Issue 5909 - Multi listener hang with 20k connections (#5917)
- Issue 5924 - ASAN server build crash when looping
- Issue 5984 - Crash when paged result search are abandoned (#5985)
parent 12f6bddcc1
commit c014b9beeb
| @@ -0,0 +1,47 @@ | ||||
| From b2194c95f9d02965b2ad3d0dcf333e74881347d0 Mon Sep 17 00:00:00 2001 | ||||
| From: James Chapman <jachapma@redhat.com> | ||||
| Date: Thu, 31 Aug 2023 14:05:31 +0000 | ||||
| Subject: [PATCH] Issue 5909 - Multi listener hang with 20k connections (#5917) | ||||
| 
 | ||||
| Bug Description: A fix for the connection sub-table to freelist mapping results | ||||
| in an uninitialised head of the sub-table linked list. | ||||
| 
 | ||||
| Fix Description: During connection table creation, initialise all elements but | ||||
| skip the list head during the mapping phase. | ||||
| 
 | ||||
| Fixes: https://github.com/389ds/389-ds-base/issues/5909 | ||||
| 
 | ||||
| Reviewed by: @progier389 @tbordaz  (Thank you) | ||||
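To make the fix concrete, here is a minimal, hypothetical sketch of the mapping rule described above, using simplified stand-in types rather than the real slapd structures: every slot of every sub-table is initialised, but slot 0 is kept out of the freelist because it serves as the head of that sub-table's linked list.

#include <stddef.h>

#define LIST_NUM  4   /* hypothetical number of connection sub-tables */
#define LIST_SIZE 8   /* hypothetical number of slots per sub-table   */

typedef struct conn { struct conn *c_next, *c_prev; } conn_t;

static conn_t  table[LIST_NUM][LIST_SIZE];
static conn_t *freelist[LIST_NUM * LIST_SIZE];

static void map_freelist(void)
{
    size_t free_idx = 0;

    for (size_t list = 0; list < LIST_NUM; list++) {
        for (size_t i = 0; i < LIST_SIZE; i++) {
            /* Every element is initialised... */
            table[list][i].c_next = NULL;
            table[list][i].c_prev = NULL;
            /* ...but slot 0 is the head of the sub-table's linked list,
             * so it must never be handed out as a free connection slot. */
            if (i != 0) {
                freelist[free_idx++] = &table[list][i];
            }
        }
    }
}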
| ---
 | ||||
|  ldap/servers/slapd/conntable.c | 10 ++++++---- | ||||
|  1 file changed, 6 insertions(+), 4 deletions(-) | ||||
| 
 | ||||
| diff --git a/ldap/servers/slapd/conntable.c b/ldap/servers/slapd/conntable.c
 | ||||
| index b1c66cf421..11e997432b 100644
 | ||||
| --- a/ldap/servers/slapd/conntable.c
 | ||||
| +++ b/ldap/servers/slapd/conntable.c
 | ||||
| @@ -172,9 +172,9 @@ connection_table_new(int table_size)
 | ||||
|              /* all connections start out invalid */ | ||||
|              ct->fd[ct_list][i].fd = SLAPD_INVALID_SOCKET; | ||||
|   | ||||
| -            /* The connection table has a double linked list running through it.
 | ||||
| +            /* The connection sub-tables have a double linked list running through them.
 | ||||
|              * This is used to find out which connections should be looked at | ||||
| -            * in the poll loop.  Slot 0 in the table is always the head of
 | ||||
| +            * in the poll loop.  Slot 0 in each sub-table is always the head of
 | ||||
|              * the linked list.  Each slot has a c_next and c_prev which are | ||||
|              * pointers back into the array of connection slots. */ | ||||
|              ct->c[ct_list][i].c_next = NULL; | ||||
| @@ -196,8 +196,10 @@ connection_table_new(int table_size)
 | ||||
|              /* Ready to rock, mark as such. */ | ||||
|              ct->c[ct_list][i].c_state = CONN_STATE_INIT; | ||||
|   | ||||
| -            /* Map multiple ct lists to a single freelist. */
 | ||||
| -            ct->c_freelist[free_idx++] = &(ct->c[ct_list][i]);
 | ||||
| +            /* Map multiple ct lists to a single freelist, but skip slot 0 of each list. */
 | ||||
| +            if (i != 0) {
 | ||||
| +                ct->c_freelist[free_idx++] = &(ct->c[ct_list][i]);
 | ||||
| +            }
 | ||||
|          } | ||||
|      } | ||||
|   | ||||
| @@ -0,0 +1,626 @@ | ||||
| From 26716f0edefe4690d9fd9143d07156e01321020c Mon Sep 17 00:00:00 2001 | ||||
| From: progier389 <progier@redhat.com> | ||||
| Date: Thu, 28 Sep 2023 12:15:25 +0200 | ||||
| Subject: [PATCH] issue 5924 - ASAN server build crash when looping | ||||
|  opening/closing connections (#5926) | ||||
| 
 | ||||
| * issue 5924 - ASAN server build crash when looping opening/closing connections | ||||
| Issue: Got a crash due to: | ||||
| 1. Failure to get a connection slot because the connection freelist is mishandled. | ||||
| 2. Confusion between the listening and accepted file descriptors led to | ||||
| closing the listening descriptor while handling the error. | ||||
| 
 | ||||
| Solution: | ||||
| Clearly rename the file descriptor variables | ||||
| Close the accepted file descriptor in the error handler | ||||
| Rewrite the freelist management so that the first connection chosen is the last one released. | ||||
| (The code is simpler, this fixes the end-of-list issue, and it avoids spreading the open connections over too much memory.) | ||||
| 
 | ||||
| Issue: #5924 | ||||
| 
 | ||||
| Reviewed by: @Firstyear, @vashirov, @droideck (Thanks !) | ||||
| 
 | ||||
| (cherry picked from commit 02d333251419ff3c4d0384595e9fe7ded5bcd8fc) | ||||
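As a rough illustration of the reworked allocation scheme (a sketch with simplified stand-in types and a plain pthread mutex, not the slapd code itself): the freelist becomes a NULL-terminated stack, allocation pops the most recently released connection, and release pushes it back, so a full table is detected simply by finding NULL at conn_next.

#include <pthread.h>
#include <stddef.h>

typedef struct conn { int c_sd; } conn_t;

typedef struct {
    conn_t **freelist;       /* size + 1 entries, NULL terminated   */
    size_t   conn_next;      /* index of the next allocatable entry */
    size_t   size;
    pthread_mutex_t lock;
} conn_table_t;

/* Pop: hand out the most recently released connection, or NULL if full. */
static conn_t *ct_get(conn_table_t *ct)
{
    pthread_mutex_lock(&ct->lock);
    conn_t *c = ct->freelist[ct->conn_next];
    if (c != NULL) {
        ct->freelist[ct->conn_next++] = NULL;
    }
    pthread_mutex_unlock(&ct->lock);
    return c;
}

/* Push: return a connection, making it the first candidate for reuse. */
static void ct_release(conn_table_t *ct, conn_t *c)
{
    pthread_mutex_lock(&ct->lock);
    /* Only called for connections obtained via ct_get(), so conn_next > 0. */
    ct->freelist[--ct->conn_next] = c;
    pthread_mutex_unlock(&ct->lock);
}

Reusing the last released slot keeps the working set of Connection structures small instead of cycling through the whole table, which is the memory-footprint benefit the commit message mentions.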
| ---
 | ||||
|  dirsrvtests/tests/suites/basic/basic_test.py | 287 +++++++++++++++++++ | ||||
|  ldap/servers/slapd/conntable.c               | 100 +++---- | ||||
|  ldap/servers/slapd/daemon.c                  |  35 ++- | ||||
|  ldap/servers/slapd/fe.h                      |   1 - | ||||
|  4 files changed, 351 insertions(+), 72 deletions(-) | ||||
| 
 | ||||
| diff --git a/dirsrvtests/tests/suites/basic/basic_test.py b/dirsrvtests/tests/suites/basic/basic_test.py
 | ||||
| index fac0f7371e..f4525e1848 100644
 | ||||
| --- a/dirsrvtests/tests/suites/basic/basic_test.py
 | ||||
| +++ b/dirsrvtests/tests/suites/basic/basic_test.py
 | ||||
| @@ -28,7 +28,15 @@
 | ||||
|  from lib389.backend import Backends | ||||
|  from lib389.idm.domain import Domain | ||||
|  from lib389.nss_ssl import NssSsl | ||||
| +from lib389._constants import *
 | ||||
| +from lib389 import DirSrv
 | ||||
| +from lib389.instance.setup import SetupDs
 | ||||
| +from lib389.instance.options import General2Base, Slapd2Base
 | ||||
|  import os | ||||
| +import random
 | ||||
| +import ldap
 | ||||
| +import time
 | ||||
| +import subprocess
 | ||||
|   | ||||
|   | ||||
|  pytestmark = pytest.mark.tier0 | ||||
| @@ -36,6 +44,7 @@
 | ||||
|  default_paths = Paths() | ||||
|   | ||||
|  log = logging.getLogger(__name__) | ||||
| +DEBUGGING = os.getenv("DEBUGGING", default=False)
 | ||||
|   | ||||
|  # Globals | ||||
|  USER1_DN = 'uid=user1,' + DEFAULT_SUFFIX | ||||
| @@ -51,6 +60,190 @@
 | ||||
|                           'vendorName', | ||||
|                           'vendorVersion') | ||||
|   | ||||
| +# This MAX_FDS value left about 22 connections available with bdb
 | ||||
| +# (should have more connections with lmdb)
 | ||||
| +MAX_FDS = 150
 | ||||
| +
 | ||||
| +
 | ||||
| +
 | ||||
| +default_paths = Paths()
 | ||||
| +
 | ||||
| +
 | ||||
| +
 | ||||
| +log = logging.getLogger(__name__)
 | ||||
| +DEBUGGING = os.getenv("DEBUGGING", default=False)
 | ||||
| +
 | ||||
| +
 | ||||
| +class CustomSetup():
 | ||||
| +    DEFAULT_GENERAL = { 'config_version': 2,
 | ||||
| +                        'full_machine_name': 'localhost.localdomain',
 | ||||
| +                        'strict_host_checking': False,
 | ||||
| +                        # Not setting 'systemd' because it is not used.
 | ||||
| +                        # (that is the global default.inf setting that matters)
 | ||||
| +                      }
 | ||||
| +    DEFAULT_SLAPD = { 'root_password': PW_DM,
 | ||||
| +                      'defaults': INSTALL_LATEST_CONFIG,
 | ||||
| +                    }
 | ||||
| +    DEFAULT_BACKENDS = [ { 
 | ||||
| +                            'cn': 'userroot',
 | ||||
| +                            'nsslapd-suffix': DEFAULT_SUFFIX,
 | ||||
| +                            'sample_entries': 'yes',
 | ||||
| +                            BACKEND_SAMPLE_ENTRIES: INSTALL_LATEST_CONFIG,
 | ||||
| +                       }, ]
 | ||||
| +
 | ||||
| +    WRAPPER_FORMAT = '''#!/bin/sh
 | ||||
| +{wrapper_options}
 | ||||
| +exec {nsslapd} -D {cfgdir} -i {pidfile} 
 | ||||
| +'''
 | ||||
| +
 | ||||
| +
 | ||||
| +    class CustomDirSrv(DirSrv):
 | ||||
| +        def __init__(self, verbose=False, external_log=log):
 | ||||
| +            super().__init__(verbose=verbose, external_log=external_log)
 | ||||
| +            self.wrapper = None       # placeholder for the wrapper file name
 | ||||
| +            
 | ||||
| +        def _reset_systemd(self):
 | ||||
| +            self.systemd_override = False
 | ||||
| +    
 | ||||
| +        def status(self):
 | ||||
| +            self._reset_systemd()
 | ||||
| +            return super().status()
 | ||||
| +    
 | ||||
| +        def start(self, timeout=120, *args):
 | ||||
| +            if self.status():
 | ||||
| +                return
 | ||||
| +            tmp_env = os.environ
 | ||||
| +            # Unset PYTHONPATH to avoid mixing old CLI tools and new lib389
 | ||||
| +            if "PYTHONPATH" in tmp_env:
 | ||||
| +                del tmp_env["PYTHONPATH"]
 | ||||
| +            try:
 | ||||
| +                subprocess.check_call([
 | ||||
| +                    '/usr/bin/sh',
 | ||||
| +                    self.wrapper
 | ||||
| +                ], env=tmp_env, stderr=subprocess.STDOUT)
 | ||||
| +            except subprocess.CalledProcessError as e:
 | ||||
| +                log.fatal("%s failed!  Error (%s) %s" % (self.wrapper, e.returncode, e.output))
 | ||||
| +                raise e from None
 | ||||
| +            for count in range(timeout):
 | ||||
| +                if self.status():
 | ||||
| +                    return
 | ||||
| +                time.sleep(1)
 | ||||
| +            raise TimeoutException('Failed to start ns-slapd')
 | ||||
| +    
 | ||||
| +        def stop(self, timeout=120):
 | ||||
| +            self._reset_systemd()
 | ||||
| +            super().stop(timeout=timeout)
 | ||||
| +    
 | ||||
| +
 | ||||
| +    def _search_be(belist, beinfo):
 | ||||
| +        for be in belist:
 | ||||
| +            if be['cn'] == beinfo['cn']:
 | ||||
| +                return be
 | ||||
| +        return None
 | ||||
| +
 | ||||
| +    def __init__(self, serverid, general=None, slapd=None, backends=None, log=log):
 | ||||
| +        verbose = log.level > logging.DEBUG
 | ||||
| +        self.log = log
 | ||||
| +        self.serverid = serverid
 | ||||
| +        self.verbose = verbose
 | ||||
| +        self.wrapper = f'/tmp/ds_{serverid}_wrapper.sh'
 | ||||
| +        if serverid.startswith('slapd-'):
 | ||||
| +            self.instname = serverid
 | ||||
| +        else:
 | ||||
| +            self.instname = 'slapd-'+serverid
 | ||||
| +        self.ldapi = None
 | ||||
| +        self.pid_file = None
 | ||||
| +        self.inst = None
 | ||||
| +
 | ||||
| +        # Lets prepare the options
 | ||||
| +        general_options = General2Base(log)
 | ||||
| +        for d in (CustomSetup.DEFAULT_GENERAL, general):
 | ||||
| +            if d:
 | ||||
| +                for key,val in d.items():
 | ||||
| +                    general_options.set(key,val)
 | ||||
| +        log.debug('[general]: %s' % general_options._options)
 | ||||
| +        self.general = general_options
 | ||||
| +        
 | ||||
| +        slapd_options = Slapd2Base(self.log)
 | ||||
| +        slapd_options.set('instance_name', serverid)
 | ||||
| +        for d in (CustomSetup.DEFAULT_SLAPD, slapd):
 | ||||
| +            if d:
 | ||||
| +                for key,val in d.items():
 | ||||
| +                    slapd_options.set(key,val)
 | ||||
| +        log.debug('[slapd]: %s' % slapd_options._options)
 | ||||
| +        self.slapd = slapd_options
 | ||||
| +
 | ||||
| +        backend_options = []
 | ||||
| +        for backend_list in (CustomSetup.DEFAULT_BACKENDS, backends):
 | ||||
| +            if not backend_list:
 | ||||
| +                continue
 | ||||
| +            for backend in backend_list:
 | ||||
| +                target_be = CustomSetup._search_be(backend_options, backend)
 | ||||
| +                if not target_be:
 | ||||
| +                    target_be = {}
 | ||||
| +                    backend_options.append(target_be)
 | ||||
| +                for key,val in backend.items():
 | ||||
| +                    target_be[key] = val
 | ||||
| +        log.debug('[backends]: %s' % backend_options)
 | ||||
| +        self.backends = backend_options
 | ||||
| +
 | ||||
| +    def _to_dirsrv_args(self):
 | ||||
| +        args = {}
 | ||||
| +        slapd = self.slapd.collect()
 | ||||
| +        general = self.general.collect()
 | ||||
| +        args["SER_HOST"] = general['full_machine_name']
 | ||||
| +        args["SER_PORT"] = slapd['port']
 | ||||
| +        args["SER_SECURE_PORT"] = slapd['secure_port']
 | ||||
| +        args["SER_SERVERID_PROP"] = self.serverid
 | ||||
| +        return args
 | ||||
| +	
 | ||||
| +    def create_instance(self):
 | ||||
| +        sds = SetupDs(verbose=self.verbose, dryrun=False, log=self.log)
 | ||||
| +        self.general.verify()
 | ||||
| +        general = self.general.collect()
 | ||||
| +        self.slapd.verify()
 | ||||
| +        slapd = self.slapd.collect()
 | ||||
| +        sds.create_from_args(general, slapd, self.backends, None)
 | ||||
| +        self.ldapi = get_ldapurl_from_serverid(self.serverid)[0]
 | ||||
| +        args = self._to_dirsrv_args()
 | ||||
| +        log.debug('DirSrv.allocate args = %s' % str(args))
 | ||||
| +        log.debug('ldapi = %s' % str(self.ldapi))
 | ||||
| +        root_dn = slapd['root_dn']
 | ||||
| +        root_password = slapd['root_password']
 | ||||
| +        inst = DirSrv(verbose=self.verbose, external_log=self.log)
 | ||||
| +        inst.local_simple_allocate(self.serverid, ldapuri=self.ldapi, binddn=root_dn, password=root_password)
 | ||||
| +        self.pid_file = inst.pid_file()
 | ||||
| +        # inst.setup_ldapi()
 | ||||
| +        log.debug('DirSrv = %s' % str(inst.__dict__))
 | ||||
| +        inst.open()
 | ||||
| +        inst.stop()
 | ||||
| +        inst = CustomSetup.CustomDirSrv(verbose=self.verbose, external_log=self.log)
 | ||||
| +        inst.local_simple_allocate(self.serverid, ldapuri=self.ldapi, binddn=root_dn, password=root_password)
 | ||||
| +        self.inst = inst
 | ||||
| +        return inst
 | ||||
| +
 | ||||
| +    def create_wrapper(self, maxfds=None):
 | ||||
| +        self.inst.wrapper = self.wrapper
 | ||||
| +        slapd = self.slapd.collect()
 | ||||
| +        sbin_dir = slapd['sbin_dir']
 | ||||
| +        config_dir = slapd['config_dir']
 | ||||
| +        fmtvalues = {
 | ||||
| +            'nsslapd': f'{sbin_dir}/ns-slapd',
 | ||||
| +            'cfgdir': config_dir.format(instance_name=self.instname),
 | ||||
| +            'pidfile': self.pid_file,
 | ||||
| +            'wrapper_options': ''
 | ||||
| +        }
 | ||||
| +        if maxfds:
 | ||||
| +            fmtvalues['wrapper_options']=f'ulimit -n {maxfds}\nulimit -H -n {maxfds}'
 | ||||
| +        with open(self.wrapper, 'w') as f:
 | ||||
| +            f.write(CustomSetup.WRAPPER_FORMAT.format(**fmtvalues))
 | ||||
| +
 | ||||
| +    def cleanup(self):
 | ||||
| +        self.inst.stop()
 | ||||
| +        self.inst.delete()
 | ||||
| +        if os.path.exists(self.wrapper):
 | ||||
| +            os.remove(self.wrapper)
 | ||||
| +
 | ||||
|   | ||||
|  @pytest.fixture(scope="function") | ||||
|  def _reset_attr(request, topology_st): | ||||
| @@ -2222,6 +2415,100 @@ def test_dscreate_with_different_rdn(dscreate_test_rdn_value):
 | ||||
|              assert True | ||||
|   | ||||
|   | ||||
| +@pytest.fixture(scope="module")
 | ||||
| +def dscreate_custom_instance(request):
 | ||||
| +    topo = CustomSetup('custom')
 | ||||
| +
 | ||||
| +    def fin():
 | ||||
| +        topo.cleanup()
 | ||||
| +
 | ||||
| +    request.addfinalizer(fin)
 | ||||
| +    topo.create_instance()
 | ||||
| +    # Return CustomSetup object associated with 
 | ||||
| +    #  a stopped instance named "custom"
 | ||||
| +    return topo
 | ||||
| +
 | ||||
| +    obj.create_wrapper(maxfds=150)
 | ||||
| +    log.info("Starting wrapper")
 | ||||
| +    inst.start()
 | ||||
| +    log.info("Server is started.")
 | ||||
| +    log.info("Open connection")
 | ||||
| +    inst.open()
 | ||||
| +
 | ||||
| +
 | ||||
| +@pytest.fixture(scope="module", params=set(range(1,5)))
 | ||||
| +def dscreate_with_numlistener(request, dscreate_custom_instance):
 | ||||
| +    numlisteners = request.param
 | ||||
| +    dscreate_custom_instance.create_wrapper(maxfds=MAX_FDS)
 | ||||
| +    inst = dscreate_custom_instance.inst
 | ||||
| +    inst.stop()
 | ||||
| +    dse_ldif = DSEldif(inst)
 | ||||
| +    dse_ldif.replace('cn=config', 'nsslapd-numlisteners', str(numlisteners))
 | ||||
| +    inst.start()
 | ||||
| +    inst.open()
 | ||||
| +    return inst
 | ||||
| +
 | ||||
| +
 | ||||
| +@pytest.mark.skipif(ds_is_older('2.2.0.0'),
 | ||||
| +                    reason="This test is only required with multiple listener support.")
 | ||||
| +def test_conn_limits(dscreate_with_numlistener):
 | ||||
| +    """Check the connections limits for various number of listeners.
 | ||||
| +
 | ||||
| +    :id: 7be2eb5c-4d8f-11ee-ae3d-482ae39447e5
 | ||||
| +    :parametrized: yes
 | ||||
| +    :setup: Setup an instance then set nsslapd-numlisteners and maximum file descriptors
 | ||||
| +    :steps:
 | ||||
| +        1. Loops on:
 | ||||
| +             Open new connection and perform search until timeout expires
 | ||||
| +        2. Close one of the previously open connections
 | ||||
| +        3. Loops MAX_FDS times on:
 | ||||
| +              - opening a new connection
 | ||||
| +              - perform a search
 | ||||
| +              - close the connection
 | ||||
| +        4. Close all open connections
 | ||||
| +        5. Remove the instance
 | ||||
| +    :expectedresults:
 | ||||
| +        1. Should get a timeout (because the server has no more available connections)
 | ||||
| +        2. Should succeed
 | ||||
| +        3. Should succeed (otherwise issue #5924 has probably been hit)
 | ||||
| +        4. Should succeed
 | ||||
| +        5. Should succeed
 | ||||
| +    """
 | ||||
| +    inst = dscreate_with_numlistener
 | ||||
| +
 | ||||
| +    conns = []
 | ||||
| +    timeout_occured = False
 | ||||
| +    for i in range(MAX_FDS):
 | ||||
| +        try:
 | ||||
| +            ldc = ldap.initialize(f'ldap://localhost:{inst.port}')
 | ||||
| +            ldc.set_option(ldap.OPT_TIMEOUT, 5)
 | ||||
| +            ldc.search_s(DEFAULT_SUFFIX, ldap.SCOPE_SUBTREE, "(uid=demo)")
 | ||||
| +            conns.append(ldc)
 | ||||
| +        except ldap.TIMEOUT:
 | ||||
| +            timeout_occured = True
 | ||||
| +            break
 | ||||
| +    # Should not be able to open MAX_FDS connections (some file descriptors are
 | ||||
| +    #  reserved, for example for the listening socket)
 | ||||
| +    assert timeout_occured
 | ||||
| +
 | ||||
| +    conn = random.choice(conns)
 | ||||
| +    conn.unbind()
 | ||||
| +    conns.remove(conn)
 | ||||
| +
 | ||||
| +    # Should loop enough times to trigger issue #5924 if it is not fixed.
 | ||||
| +    for i in range(MAX_FDS):
 | ||||
| +        ldc = ldap.initialize(f'ldap://localhost:{inst.port}')
 | ||||
| +        # Set a timeout long enough so that the test fails if server is unresponsive
 | ||||
| +        ldc.set_option(ldap.OPT_TIMEOUT, 60)
 | ||||
| +        ldc.search_s(DEFAULT_SUFFIX, ldap.SCOPE_SUBTREE, "(uid=demo)")
 | ||||
| +        ldc.unbind()
 | ||||
| +
 | ||||
| +    # Close all open connections
 | ||||
| +    for c in conns:
 | ||||
| +        c.unbind()
 | ||||
| +
 | ||||
| +    # Step 5 is done in the teardown phase by the dscreate_custom_instance finalizer
 | ||||
| +
 | ||||
|  if __name__ == '__main__': | ||||
|      # Run isolated | ||||
|      # -s for DEBUG mode | ||||
| diff --git a/ldap/servers/slapd/conntable.c b/ldap/servers/slapd/conntable.c
 | ||||
| index 11e997432b..1ca60e0e5f 100644
 | ||||
| --- a/ldap/servers/slapd/conntable.c
 | ||||
| +++ b/ldap/servers/slapd/conntable.c
 | ||||
| @@ -48,69 +48,59 @@
 | ||||
|   * under the connection table lock. This should move the allocation algorithm from O(n) worst case | ||||
|   * to O(1) worst case as we always recieve and empty slot *or* ct full. It also reduces lock/atomic | ||||
|   * contention on the CPU to improve things. | ||||
| + * Lastly a change was done: Instead of having a sliding window that tends to pick a never-used
 | ||||
| + * slot for each new connection, it now reuses the last freed one. That has several benefits:
 | ||||
| + *  - Fix a bug because the last freelist slots may not be allocated.
 | ||||
| + *  - Avoid growing the memory footprint when there is not much load.
 | ||||
| + *  - Simplify the code (as only a single offset is needed).
 | ||||
|   * | ||||
| - * The freelist is a ringbuffer of pointers to the connection table. On a small scale it looks like:
 | ||||
| + * The freelist is a stack of pointers to the connection table.
 | ||||
| + * It is NULL terminated. On a small scale it looks like:
 | ||||
|   * | ||||
|   *  |--------------------------------------------| | ||||
| - *  | slot 1 | slot 2 | slot 3 | slot 4 | slot 5 |
 | ||||
| - *  | _ ptr  | _ ptr  | _ ptr  | _ ptr  | _ ptr  |
 | ||||
| + *  | slot 0 | slot 1 | slot 2 | slot 3 | slot 4 |
 | ||||
| + *  | _ ptr  | _ ptr  | _ ptr  | _ ptr  | NULL   |
 | ||||
|   *  |--------------------------------------------| | ||||
| - *     ^  ^- conn_next
 | ||||
| - *     |
 | ||||
| - *     \-- conn_free
 | ||||
| + *        ^- conn_next
 | ||||
|   * | ||||
| - * As we allocate, we shift conn_next through the list, yielding the ptr that was stored (and
 | ||||
| - * setting it to NULL as we proceed)
 | ||||
| + * To allocate, we pop one of the stored connection ptr out of the stack (yield the ptr, set
 | ||||
| + * its slot to NULL then increase conn_next)
 | ||||
|   * | ||||
|   *  |--------------------------------------------| | ||||
| - *  | slot 1 | slot 2 | slot 3 | slot 4 | slot 5 |
 | ||||
| - *  | _NULL  | _NULL  | _ ptr  | _ ptr  | _ ptr  |
 | ||||
| + *  | slot 0 | slot 1 | slot 2 | slot 3 | slot 4 |
 | ||||
| + *  | _NULL  | _NULL  | _ ptr  | _ ptr  | NULL   |
 | ||||
|   *  |--------------------------------------------| | ||||
| - *     ^                  ^- conn_next
 | ||||
| - *     |
 | ||||
| - *     \-- conn_free
 | ||||
| + *                        ^- conn_next
 | ||||
|   * | ||||
| - * When a connection is "freed" we return it to conn_free, which is then also slid up.
 | ||||
| + * When a connection is "freed" we push it back in the stack after decreasing conn_next
 | ||||
|   * | ||||
|   *  |--------------------------------------------| | ||||
| - *  | slot 1 | slot 2 | slot 3 | slot 4 | slot 5 |
 | ||||
| - *  | _ ptr  | _NULL  | _ ptr  | _ ptr  | _ ptr  |
 | ||||
| + *  | slot 0 | slot 1 | slot 2 | slot 3 | slot 4 |
 | ||||
| + *  | _NULL  | _ ptr  | _ ptr  | _ ptr  | NULL   |
 | ||||
|   *  |--------------------------------------------| | ||||
| - *              ^         ^- conn_next
 | ||||
| - *              |
 | ||||
| - *              \-- conn_free
 | ||||
| + *              ^- conn_next
 | ||||
|   * | ||||
| - * If all connections are exhausted, conn_next will == conn_next, as conn_next must have proceeded
 | ||||
| - * to the end of the ring, and then wrapped back allocating all previous until we meet with conn_free.
 | ||||
| + * If all connections are exhausted, freelist[conn_next] is NULL
 | ||||
|   * | ||||
|   *  |--------------------------------------------| | ||||
| - *  | slot 1 | slot 2 | slot 3 | slot 4 | slot 5 |
 | ||||
| + *  | slot 0 | slot 1 | slot 2 | slot 3 | slot 4 |
 | ||||
|   *  | _NULL  | _NULL  | _NULL  | _NULL  | _NULL  | | ||||
|   *  |--------------------------------------------| | ||||
| - *              ^ ^- conn_next
 | ||||
| - *              |
 | ||||
| - *              \-- conn_free
 | ||||
| + *                                          ^- conn_next
 | ||||
|   * | ||||
|   * This means allocations of conns will keep failing until a connection is returned. | ||||
|   * | ||||
|   *  |--------------------------------------------| | ||||
| - *  | slot 1 | slot 2 | slot 3 | slot 4 | slot 5 |
 | ||||
| - *  | _NULL  | _ ptr  | _NULL  | _NULL  | _NULL  |
 | ||||
| + *  | slot 0 | slot 1 | slot 2 | slot 3 | slot 4 |
 | ||||
| + *  | _NULL  | _NULL  | _NULL  | _ ptr  | NULL   |
 | ||||
|   *  |--------------------------------------------| | ||||
| - *             ^- conn_next ^
 | ||||
| - *                          |
 | ||||
| - *                          \-- conn_free
 | ||||
| + *                                 ^- conn_next
 | ||||
|   * | ||||
|   * And now conn_next can begin to allocate again. | ||||
|   * | ||||
|   * | ||||
|   *  -- invariants | ||||
| - * * when conn_free is slid back to meet conn_next, there can be no situation where another
 | ||||
| - *   connection is returned, as none must allocated  -if they were allocated, conn_free would have
 | ||||
| - *   moved_along.
 | ||||
| - * * the ring buffer must be as large as conntable.
 | ||||
| - * * We do not check conn_next == conn_free (that's the starting state), but we check if the
 | ||||
| - *   slot at conn_next is NULL, which must imply that conn_free has nothing to return.
 | ||||
| + * * the stack must be as large as conntable.
 | ||||
|   * * connection_table_move_connection_out_of_active_list is the only function able to return a | ||||
|   *   connection to the freelist, as it is the function that is called when the event system has | ||||
|   *   determined all IO's are complete, or unable to complete. This function is what prepares the | ||||
| @@ -136,11 +126,9 @@ connection_table_new(int table_size)
 | ||||
|      ct->c = (Connection **)slapi_ch_calloc(1, ct->size * sizeof(Connection *)); | ||||
|      ct->fd = (struct POLL_STRUCT **)slapi_ch_calloc(1, ct->list_num * sizeof(struct POLL_STRUCT*)); | ||||
|      ct->table_mutex = PR_NewLock(); | ||||
| -    /* Allocate the freelist */
 | ||||
| -    ct->c_freelist = (Connection **)slapi_ch_calloc(1, ct->size * sizeof(Connection *));
 | ||||
| -    /* NEVER use slot 0, this is a list pointer */
 | ||||
| -    ct->conn_next_offset = 1;
 | ||||
| -    ct->conn_free_offset = 1;
 | ||||
| +    /* Allocate the freelist (a slot for each connection plus another slot for the final NULL pointer) */
 | ||||
| +    ct->c_freelist = (Connection **)slapi_ch_calloc(1, (ct->size+1) * sizeof(Connection *));
 | ||||
| +    ct->conn_next_offset = 0;
 | ||||
|   | ||||
|      slapi_log_err(SLAPI_LOG_INFO, "connection_table_new", "Number of connection sub-tables %d, each containing %d slots.\n", | ||||
|          ct->list_num, ct->list_size); | ||||
| @@ -273,22 +261,22 @@ connection_table_get_connection(Connection_Table *ct, int sd)
 | ||||
|  { | ||||
|      PR_Lock(ct->table_mutex); | ||||
|   | ||||
| -    PR_ASSERT(ct->conn_next_offset != 0);
 | ||||
|      Connection *c = ct->c_freelist[ct->conn_next_offset]; | ||||
|      if (c != NULL) { | ||||
|          /* We allocated it, so now NULL the slot and move forward. */ | ||||
| -        ct->c_freelist[ct->conn_next_offset] = NULL;
 | ||||
| -        /* Handle overflow. */
 | ||||
| -        ct->conn_next_offset = (ct->conn_next_offset + 1) % ct->size;
 | ||||
| -        if (ct->conn_next_offset == 0) {
 | ||||
| -            /* Never use slot 0 */
 | ||||
| -            ct->conn_next_offset += 1;
 | ||||
| -        }
 | ||||
| +        PR_ASSERT(ct->conn_next_offset>=0 && ct->conn_next_offset<ct->size);
 | ||||
| +        ct->c_freelist[ct->conn_next_offset++] = NULL;
 | ||||
|          PR_Unlock(ct->table_mutex); | ||||
|      } else { | ||||
|          /* couldn't find a Connection, table must be full */ | ||||
| -        slapi_log_err(SLAPI_LOG_CONNS, "connection_table_get_connection", "Max open connections reached\n");
 | ||||
|          PR_Unlock(ct->table_mutex); | ||||
| +        static time_t last_err_msg_time = 0;
 | ||||
| +        time_t curtime = slapi_current_utc_time();
 | ||||
| +        /* Log the message only once per second */
 | ||||
| +        if (curtime != last_err_msg_time) {
 | ||||
| +            slapi_log_err(SLAPI_LOG_ERR, "connection_table_get_connection", "Max open connections reached\n");
 | ||||
| +            last_err_msg_time = curtime;
 | ||||
| +        }
 | ||||
|          return NULL; | ||||
|      } | ||||
|   | ||||
| @@ -461,14 +449,10 @@ connection_table_move_connection_out_of_active_list(Connection_Table *ct, Connec
 | ||||
|   | ||||
|      /* Finally, place the connection back into the freelist for use */ | ||||
|      PR_ASSERT(c->c_refcnt == 0); | ||||
| -    PR_ASSERT(ct->conn_free_offset != 0);
 | ||||
| -    PR_ASSERT(ct->c_freelist[ct->conn_free_offset] == NULL);
 | ||||
| -    ct->c_freelist[ct->conn_free_offset] = c;
 | ||||
| -    ct->conn_free_offset = (ct->conn_free_offset + 1) % ct->size;
 | ||||
| -    if (ct->conn_free_offset == 0) {
 | ||||
| -        /* Never use slot 0 */
 | ||||
| -        ct->conn_free_offset += 1;
 | ||||
| -    }
 | ||||
| +    PR_ASSERT(ct->conn_next_offset != 0);
 | ||||
| +    ct->conn_next_offset--;
 | ||||
| +    PR_ASSERT(ct->c_freelist[ct->conn_next_offset] == NULL);
 | ||||
| +    ct->c_freelist[ct->conn_next_offset] = c;
 | ||||
|   | ||||
|      PR_Unlock(ct->table_mutex); | ||||
|   | ||||
| diff --git a/ldap/servers/slapd/daemon.c b/ldap/servers/slapd/daemon.c
 | ||||
| index e8b979acaf..10aabed6d3 100644
 | ||||
| --- a/ldap/servers/slapd/daemon.c
 | ||||
| +++ b/ldap/servers/slapd/daemon.c
 | ||||
| @@ -135,19 +135,25 @@ get_pid_file(void)
 | ||||
|  } | ||||
|   | ||||
|  static int | ||||
| -accept_and_configure(int s __attribute__((unused)), PRFileDesc *pr_acceptfd, PRNetAddr *pr_netaddr, int addrlen __attribute__((unused)), int secure, int local, PRFileDesc **pr_clonefd)
 | ||||
| +accept_and_configure(int s __attribute__((unused)), PRFileDesc *listenfd, PRNetAddr *pr_netaddr, int addrlen __attribute__((unused)), int secure, int local, PRFileDesc **pr_accepted_fd)
 | ||||
|  { | ||||
|      int ns = 0; | ||||
|      PRIntervalTime pr_timeout = PR_MillisecondsToInterval(slapd_accept_wakeup_timer); | ||||
|   | ||||
| -    (*pr_clonefd) = PR_Accept(pr_acceptfd, pr_netaddr, pr_timeout);
 | ||||
| -    if (!(*pr_clonefd)) {
 | ||||
| +    (*pr_accepted_fd) = PR_Accept(listenfd, pr_netaddr, pr_timeout);
 | ||||
| +    if (!(*pr_accepted_fd)) {
 | ||||
|          PRErrorCode prerr = PR_GetError(); | ||||
| -        slapi_log_err(SLAPI_LOG_ERR, "accept_and_configure", "PR_Accept() failed, " SLAPI_COMPONENT_NAME_NSPR " error %d (%s)\n",
 | ||||
| -                      prerr, slapd_pr_strerror(prerr));
 | ||||
| +        static time_t last_err_msg_time = 0;
 | ||||
| +        time_t curtime = slapi_current_utc_time();
 | ||||
| +        /* Log the message only once per second */
 | ||||
| +        if (curtime != last_err_msg_time) {
 | ||||
| +            slapi_log_err(SLAPI_LOG_ERR, "accept_and_configure", "PR_Accept() failed, " SLAPI_COMPONENT_NAME_NSPR " error %d (%s)\n",
 | ||||
| +                          prerr, slapd_pr_strerror(prerr));
 | ||||
| +            last_err_msg_time = curtime;
 | ||||
| +        }
 | ||||
|          return (SLAPD_INVALID_SOCKET); | ||||
|      } | ||||
| -    ns = configure_pr_socket(pr_clonefd, secure, local);
 | ||||
| +    ns = configure_pr_socket(pr_accepted_fd, secure, local);
 | ||||
|   | ||||
|      return ns; | ||||
|  } | ||||
| @@ -155,7 +161,7 @@ accept_and_configure(int s __attribute__((unused)), PRFileDesc *pr_acceptfd, PRN
 | ||||
|  /* | ||||
|   * This is the shiny new re-born daemon function, without all the hair | ||||
|   */ | ||||
| -static int handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *pr_acceptfd, int secure, int local, Connection **newconn);
 | ||||
| +static int handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *listenfd, int secure, int local, Connection **newconn);
 | ||||
|  static void handle_pr_read_ready(Connection_Table *ct, int list_id, PRIntn num_poll); | ||||
|  static int clear_signal(struct POLL_STRUCT *fds, int list_id); | ||||
|  static void unfurl_banners(Connection_Table *ct, daemon_ports_t *ports, PRFileDesc **n_tcps, PRFileDesc **s_tcps, PRFileDesc **i_unix); | ||||
| @@ -831,6 +837,7 @@ accept_thread(void *vports)
 | ||||
|              } | ||||
|              /* Need a sleep delay here. */ | ||||
|              PR_Sleep(pr_timeout); | ||||
| +            last_accept_new_connections = accept_new_connections;
 | ||||
|              continue; | ||||
|          } else { | ||||
|              /* Log that we are now listening again */ | ||||
| @@ -1846,28 +1853,30 @@ handle_closed_connection(Connection *conn)
 | ||||
|   * this function returns the connection table list the new connection is in | ||||
|   */ | ||||
|  static int | ||||
| -handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *pr_acceptfd, int secure, int local, Connection **newconn)
 | ||||
| +handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *listenfd, int secure, int local, Connection **newconn)
 | ||||
|  { | ||||
|      int ns = 0; | ||||
|      Connection *conn = NULL; | ||||
|      /*    struct sockaddr_in    from;*/ | ||||
|      PRNetAddr from = {{0}}; | ||||
| -    PRFileDesc *pr_clonefd = NULL;
 | ||||
| +    PRFileDesc *pr_accepted_fd = NULL;
 | ||||
|      slapdFrontendConfig_t *fecfg = getFrontendConfig(); | ||||
|      ber_len_t maxbersize; | ||||
|   | ||||
|      if (newconn) { | ||||
|          *newconn = NULL; | ||||
|      } | ||||
| -    if ((ns = accept_and_configure(tcps, pr_acceptfd, &from,
 | ||||
| -                                   sizeof(from), secure, local, &pr_clonefd)) == SLAPD_INVALID_SOCKET) {
 | ||||
| +    if ((ns = accept_and_configure(tcps, listenfd, &from,
 | ||||
| +                                   sizeof(from), secure, local, &pr_accepted_fd)) == SLAPD_INVALID_SOCKET) {
 | ||||
|          return -1; | ||||
|      } | ||||
|   | ||||
|      /* get a new Connection from the Connection Table */ | ||||
|      conn = connection_table_get_connection(ct, ns); | ||||
|      if (conn == NULL) { | ||||
| -        PR_Close(pr_acceptfd);
 | ||||
| +        if (pr_accepted_fd) {
 | ||||
| +            PR_Close(pr_accepted_fd);
 | ||||
| +        }
 | ||||
|          return -1; | ||||
|      } | ||||
|      pthread_mutex_lock(&(conn->c_mutex)); | ||||
| @@ -1879,7 +1888,7 @@ handle_new_connection(Connection_Table *ct, int tcps, PRFileDesc *pr_acceptfd, i
 | ||||
|      conn->c_idletimeout = fecfg->idletimeout; | ||||
|      conn->c_idletimeout_handle = idletimeout_reslimit_handle; | ||||
|      conn->c_sd = ns; | ||||
| -    conn->c_prfd = pr_clonefd;
 | ||||
| +    conn->c_prfd = pr_accepted_fd;
 | ||||
|      conn->c_flags &= ~CONN_FLAG_CLOSING; | ||||
|   | ||||
|      /* Set per connection static config */ | ||||
| diff --git a/ldap/servers/slapd/fe.h b/ldap/servers/slapd/fe.h
 | ||||
| index 64c6645e7b..95dfeeb89b 100644
 | ||||
| --- a/ldap/servers/slapd/fe.h
 | ||||
| +++ b/ldap/servers/slapd/fe.h
 | ||||
| @@ -88,7 +88,6 @@ struct connection_table
 | ||||
|      /* An array of free connections awaiting allocation. */; | ||||
|      Connection **c_freelist; | ||||
|      size_t conn_next_offset; | ||||
| -    size_t conn_free_offset;
 | ||||
|      struct POLL_STRUCT **fd; | ||||
|      PRLock *table_mutex; | ||||
|  }; | ||||
| @@ -0,0 +1,385 @@ | ||||
| From dc091325814d9ac74d238c7e14cac8b9112b3271 Mon Sep 17 00:00:00 2001 | ||||
| From: progier389 <progier@redhat.com> | ||||
| Date: Fri, 17 Nov 2023 14:41:51 +0100 | ||||
| Subject: [PATCH] Issue 5984 - Crash when paged result search are abandoned | ||||
|  (#5985) | ||||
| 
 | ||||
| * Issue 5984 - Crash when paged result search are abandoned | ||||
| 
 | ||||
| Problem: | ||||
|   Fix #4551 has changed the lock that protects the paged result data | ||||
|   within a connection. But the abandon operation attempts to free | ||||
|   the paged search result while holding the connection lock. | ||||
|   This leads to a race condition and a double free, causing heap | ||||
|   corruption and a SIGSEGV. | ||||
| 
 | ||||
|   Solution: | ||||
|    - Get a copy of the operation data that needs to be logged. | ||||
|    - Unlock the connection mutex (to avoid deadlock risk) | ||||
|    - Free the paged result while holding the paged result lock. | ||||
| 
 | ||||
| Issue: 5984 | ||||
| 
 | ||||
| Reviewed by: @tbordaz (Thanks!) | ||||
| 
 | ||||
| (cherry picked from commit 06bd0862956672eb76276cab5c1dd906fe5a7eec) | ||||
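A minimal sketch of the ordering the fix adopts in do_abandon (hypothetical simplified types, not the actual slapd Connection/Operation structures): copy the fields the access log needs while c_mutex is held, release c_mutex, and only then free the paged result under the paged-result lock, so the two locks are never held together.

#include <pthread.h>
#include <time.h>

/* Hypothetical, trimmed-down stand-ins for the slapd structures. */
typedef struct op {
    struct timespec hr_time_end;
    int nentries;
    int opid;
} op_t;

typedef struct conn {
    pthread_mutex_t c_mutex;          /* protects the operation list     */
    pthread_mutex_t pageresult_lock;  /* protects the paged-result state */
    op_t *c_op;                       /* the operation being abandoned   */
} conn_t;

/* Free the paged-result entry for msgid while holding only the
 * paged-result lock (stand-in for pagedresults_free_one_msgid()). */
static void free_paged_result(conn_t *conn, int msgid)
{
    pthread_mutex_lock(&conn->pageresult_lock);
    /* ... release the paged-result entry matching msgid ... */
    (void)msgid;
    pthread_mutex_unlock(&conn->pageresult_lock);
}

void abandon_sketch(conn_t *conn, int msgid)
{
    struct { struct timespec hr_time_end; int nentries; int opid; } o_copy;

    /* 1. Copy what the access log needs while c_mutex is held;
     *    the operation may be freed as soon as c_mutex is dropped. */
    pthread_mutex_lock(&conn->c_mutex);
    o_copy.hr_time_end = conn->c_op->hr_time_end;
    o_copy.nentries    = conn->c_op->nentries;
    o_copy.opid        = conn->c_op->opid;
    pthread_mutex_unlock(&conn->c_mutex);

    /* 2. Only after c_mutex is released, take the paged-result lock,
     *    so the two locks are never held at the same time here. */
    free_paged_result(conn, msgid);

    /* 3. Log using o_copy, never by dereferencing the operation again. */
}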
| ---
 | ||||
|  .../paged_results/paged_results_test.py       | 107 ++++++++++++++++-- | ||||
|  ldap/servers/slapd/abandon.c                  |  23 ++-- | ||||
|  ldap/servers/slapd/opshared.c                 |   4 +- | ||||
|  ldap/servers/slapd/pagedresults.c             |   8 +- | ||||
|  ldap/servers/slapd/proto-slap.h               |   2 +- | ||||
|  src/lib389/lib389/__init__.py                 |  27 ++++- | ||||
|  6 files changed, 150 insertions(+), 21 deletions(-) | ||||
| 
 | ||||
| diff --git a/dirsrvtests/tests/suites/paged_results/paged_results_test.py b/dirsrvtests/tests/suites/paged_results/paged_results_test.py
 | ||||
| index d490c4af20..cdafa834ab 100644
 | ||||
| --- a/dirsrvtests/tests/suites/paged_results/paged_results_test.py
 | ||||
| +++ b/dirsrvtests/tests/suites/paged_results/paged_results_test.py
 | ||||
| @@ -7,7 +7,8 @@
 | ||||
|  # --- END COPYRIGHT BLOCK --- | ||||
|  # | ||||
|  import socket | ||||
| -from random import sample
 | ||||
| +from random import sample, randrange
 | ||||
| +
 | ||||
|  import pytest | ||||
|  from ldap.controls import SimplePagedResultsControl, GetEffectiveRightsControl | ||||
|  from lib389.tasks import * | ||||
| @@ -16,6 +17,10 @@
 | ||||
|  from lib389._constants import DN_LDBM, DN_DM, DEFAULT_SUFFIX | ||||
|  from lib389._controls import SSSRequestControl | ||||
|  from lib389.idm.user import UserAccount, UserAccounts | ||||
| +from lib389.cli_base import FakeArgs
 | ||||
| +from lib389.config import LDBMConfig
 | ||||
| +from lib389.dbgen import dbgen_users
 | ||||
| +
 | ||||
|  from lib389.idm.organization import Organization | ||||
|  from lib389.idm.organizationalunit import OrganizationalUnit | ||||
|  from lib389.backend import Backends | ||||
| @@ -42,11 +47,56 @@
 | ||||
|  NEW_BACKEND_2 = 'child_base' | ||||
|   | ||||
|  OLD_HOSTNAME = socket.gethostname() | ||||
| -socket.sethostname('localhost')
 | ||||
| +if os.getuid() == 0:
 | ||||
| +    socket.sethostname('localhost')
 | ||||
|  HOSTNAME = socket.gethostname() | ||||
|  IP_ADDRESS = socket.gethostbyname(HOSTNAME) | ||||
|  OLD_IP_ADDRESS = socket.gethostbyname(OLD_HOSTNAME) | ||||
|   | ||||
| +
 | ||||
| +@pytest.fixture(scope="module")
 | ||||
| +def create_40k_users(topology_st, request):
 | ||||
| +    inst = topology_st.standalone
 | ||||
| +
 | ||||
| +    # Prepare return value
 | ||||
| +    retval = FakeArgs()
 | ||||
| +    retval.inst = inst
 | ||||
| +    retval.bename = '40k'
 | ||||
| +    retval.suffix = f'o={retval.bename}'
 | ||||
| +    retval.ldif_file = f'{inst.get_ldif_dir()}/{retval.bename}.ldif'
 | ||||
| +
 | ||||
| +    # Create new backend
 | ||||
| +    bes = Backends(inst)
 | ||||
| +    be_1 = bes.create(properties={
 | ||||
| +        'cn': retval.bename,
 | ||||
| +        'nsslapd-suffix': retval.suffix,
 | ||||
| +    })
 | ||||
| +
 | ||||
| +    # Set paged search lookthrough limit
 | ||||
| +    ldbmconfig = LDBMConfig(inst)
 | ||||
| +    ldbmconfig.replace('nsslapd-pagedlookthroughlimit', b'100000')
 | ||||
| +
 | ||||
| +    # Create ldif and import it.
 | ||||
| +    dbgen_users(inst, 40000, retval.ldif_file, retval.suffix)
 | ||||
| +    # tasks = Tasks(inst)
 | ||||
| +    # args = {TASK_WAIT: True}
 | ||||
| +    # tasks.importLDIF(retval.suffix, None, retval.ldif_file, args)
 | ||||
| +    inst.stop()
 | ||||
| +    assert inst.ldif2db(retval.bename, None, None, None, retval.ldif_file, None)
 | ||||
| +    inst.start()
 | ||||
| +
 | ||||
| +    # And set an aci allowing anonymous read
 | ||||
| +    log.info('Adding ACI to allow our test user to search')
 | ||||
| +    ACI_TARGET = '(targetattr != "userPassword || aci")'
 | ||||
| +    ACI_ALLOW = '(version 3.0; acl "Enable anonymous access";allow (read, search, compare)'
 | ||||
| +    ACI_SUBJECT = '(userdn = "ldap:///anyone");)'
 | ||||
| +    ACI_BODY = ACI_TARGET + ACI_ALLOW + ACI_SUBJECT
 | ||||
| +    o_1 = Organization(inst, retval.suffix)
 | ||||
| +    o_1.set('aci', ACI_BODY)
 | ||||
| +
 | ||||
| +    return retval
 | ||||
| +
 | ||||
| +
 | ||||
|  @pytest.fixture(scope="module") | ||||
|  def create_user(topology_st, request): | ||||
|      """User for binding operation""" | ||||
| @@ -71,8 +121,10 @@ def create_user(topology_st, request):
 | ||||
|   | ||||
|      def fin(): | ||||
|          log.info('Deleting user simplepaged_test') | ||||
| -        user.delete()
 | ||||
| -        socket.sethostname(OLD_HOSTNAME)
 | ||||
| +        if not DEBUGGING:
 | ||||
| +            user.delete()
 | ||||
| +        if os.getuid() == 0:
 | ||||
| +            socket.sethostname(OLD_HOSTNAME)
 | ||||
|   | ||||
|      request.addfinalizer(fin) | ||||
|   | ||||
| @@ -175,7 +227,7 @@ def change_conf_attr(topology_st, suffix, attr_name, attr_value):
 | ||||
|      return attr_value_bck | ||||
|   | ||||
|   | ||||
| -def paged_search(conn, suffix, controls, search_flt, searchreq_attrlist):
 | ||||
| +def paged_search(conn, suffix, controls, search_flt, searchreq_attrlist, abandon_rate=0):
 | ||||
|      """Search at the DEFAULT_SUFFIX with ldap.SCOPE_SUBTREE | ||||
|      using Simple Paged Control(should the first item in the | ||||
|      list controls. | ||||
| @@ -195,9 +247,16 @@ def paged_search(conn, suffix, controls, search_flt, searchreq_attrlist):
 | ||||
|                                                      req_pr_ctrl.size, | ||||
|                                                      str(controls))) | ||||
|      msgid = conn.search_ext(suffix, ldap.SCOPE_SUBTREE, search_flt, searchreq_attrlist, serverctrls=controls) | ||||
| +    log.info('Getting page %d' % (pages,))
 | ||||
|      while True: | ||||
| -        log.info('Getting page %d' % (pages,))
 | ||||
| -        rtype, rdata, rmsgid, rctrls = conn.result3(msgid)
 | ||||
| +        try:
 | ||||
| +            rtype, rdata, rmsgid, rctrls = conn.result3(msgid, timeout=0.001)
 | ||||
| +        except ldap.TIMEOUT:
 | ||||
| +            if pages > 0 and abandon_rate>0 and randrange(100)<abandon_rate:
 | ||||
| +                conn.abandon(msgid)
 | ||||
| +                log.info('Paged result search is abandoned.')
 | ||||
| +                return all_results
 | ||||
| +            continue
 | ||||
|          log.debug('Data: {}'.format(rdata)) | ||||
|          all_results.extend(rdata) | ||||
|          pages += 1 | ||||
| @@ -217,6 +276,7 @@ def paged_search(conn, suffix, controls, search_flt, searchreq_attrlist):
 | ||||
|                  break  # No more pages available | ||||
|          else: | ||||
|              break | ||||
| +        log.info('Getting page %d' % (pages,))
 | ||||
|   | ||||
|      assert not pctrls[0].cookie | ||||
|      return all_results | ||||
| @@ -1191,6 +1251,39 @@ def test_maxsimplepaged_per_conn_failure(topology_st, create_user, conf_attr_val
 | ||||
|          del_users(users_list) | ||||
|          change_conf_attr(topology_st, DN_CONFIG, 'nsslapd-maxsimplepaged-per-conn', max_per_con_bck) | ||||
|   | ||||
| +
 | ||||
| +def test_search_stress_abandon(create_40k_users, create_user):
 | ||||
| +    """Verify that search with a simple paged results control
 | ||||
| +    returns all entries it should without errors.
 | ||||
| +
 | ||||
| +    :id: e154b24a-83d6-11ee-90d1-482ae39447e5
 | ||||
| +    :customerscenario: True
 | ||||
| +    :feature: Simple paged results
 | ||||
| +    :setup: Standalone instance, test user for binding,
 | ||||
| +            40K  users in a second backend
 | ||||
| +    :steps:
 | ||||
| +        1. Bind as test user
 | ||||
| +        2. Loops a number of times doing:
 | ||||
| +                 - search through added users with a simple paged control
 | ||||
| +                 - randomly abandoning the search after a few ms.
 | ||||
| +    :expectedresults:
 | ||||
| +        1. Bind should be successful
 | ||||
| +        2. The loop should complete successfully.
 | ||||
| +    """
 | ||||
| +
 | ||||
| +    abandon_rate = 10
 | ||||
| +    page_size = 500
 | ||||
| +    nbloops = 1000
 | ||||
| +    search_flt = r'(uid=*)'
 | ||||
| +    searchreq_attrlist = ['dn', 'sn']
 | ||||
| +    log.info('Set user bind %s ' % create_user)
 | ||||
| +    conn = create_user.bind(TEST_USER_PWD)
 | ||||
| +    for idx in range(nbloops):
 | ||||
| +        req_ctrl = SimplePagedResultsControl(True, size=page_size, cookie='')
 | ||||
| +        # If issue #5984 is not fixed, the server crashes and the paged search fails with an ldap.SERVER_DOWN exception
 | ||||
| +        paged_search(conn, create_40k_users.suffix, [req_ctrl], search_flt, searchreq_attrlist, abandon_rate=abandon_rate)
 | ||||
| +
 | ||||
| +
 | ||||
|  if __name__ == '__main__': | ||||
|      # Run isolated | ||||
|      # -s for DEBUG mode | ||||
| diff --git a/ldap/servers/slapd/abandon.c b/ldap/servers/slapd/abandon.c
 | ||||
| index 26a2e7bf83..964d28836f 100644
 | ||||
| --- a/ldap/servers/slapd/abandon.c
 | ||||
| +++ b/ldap/servers/slapd/abandon.c
 | ||||
| @@ -38,6 +38,12 @@ do_abandon(Slapi_PBlock *pb)
 | ||||
|      Connection *pb_conn = NULL; | ||||
|      Operation *pb_op = NULL; | ||||
|      Operation *o; | ||||
| +    /* Keep a copy of some data because o may vanish once conn is unlocked */
 | ||||
| +    struct {
 | ||||
| +        struct timespec hr_time_end;
 | ||||
| +        int nentries;
 | ||||
| +        int opid;
 | ||||
| +    } o_copy; 
 | ||||
|   | ||||
|      slapi_pblock_get(pb, SLAPI_OPERATION, &pb_op); | ||||
|      slapi_pblock_get(pb, SLAPI_CONNECTION, &pb_conn); | ||||
| @@ -90,8 +96,12 @@ do_abandon(Slapi_PBlock *pb)
 | ||||
|   | ||||
|      pthread_mutex_lock(&(pb_conn->c_mutex)); | ||||
|      for (o = pb_conn->c_ops; o != NULL; o = o->o_next) { | ||||
| -        if (o->o_msgid == id && o != pb_op)
 | ||||
| +        if (o->o_msgid == id && o != pb_op) {
 | ||||
| +            slapi_operation_time_elapsed(o, &o_copy.hr_time_end);
 | ||||
| +            o_copy.nentries = o->o_results.r.r_search.nentries;
 | ||||
| +            o_copy.opid = o->o_opid;
 | ||||
|              break; | ||||
| +        }
 | ||||
|      } | ||||
|   | ||||
|      if (o != NULL) { | ||||
| @@ -130,7 +140,8 @@ do_abandon(Slapi_PBlock *pb)
 | ||||
|          slapi_log_err(SLAPI_LOG_TRACE, "do_abandon", "op not found\n"); | ||||
|      } | ||||
|   | ||||
| -    if (0 == pagedresults_free_one_msgid_nolock(pb_conn, id)) {
 | ||||
| +    pthread_mutex_unlock(&(pb_conn->c_mutex));
 | ||||
| +    if (0 == pagedresults_free_one_msgid(pb_conn, id, pageresult_lock_get_addr(pb_conn))) {
 | ||||
|          slapi_log_access(LDAP_DEBUG_STATS, "conn=%" PRIu64 | ||||
|                                             " op=%d ABANDON targetop=Simple Paged Results msgid=%d\n", | ||||
|                           pb_conn->c_connid, pb_op->o_opid, id); | ||||
| @@ -143,15 +154,11 @@ do_abandon(Slapi_PBlock *pb)
 | ||||
|                                             " targetop=SUPPRESSED-BY-PLUGIN msgid=%d\n", | ||||
|                           pb_conn->c_connid, pb_op->o_opid, id); | ||||
|      } else { | ||||
| -        struct timespec o_hr_time_end;
 | ||||
| -        slapi_operation_time_elapsed(o, &o_hr_time_end);
 | ||||
|          slapi_log_access(LDAP_DEBUG_STATS, "conn=%" PRIu64 " op=%d ABANDON" | ||||
|                                             " targetop=%d msgid=%d nentries=%d etime=%" PRId64 ".%010" PRId64 "\n", | ||||
| -                         pb_conn->c_connid, pb_op->o_opid, o->o_opid, id,
 | ||||
| -                         o->o_results.r.r_search.nentries, (int64_t)o_hr_time_end.tv_sec, (int64_t)o_hr_time_end.tv_nsec);
 | ||||
| +                         pb_conn->c_connid, pb_op->o_opid, o_copy.opid, id,
 | ||||
| +                         o_copy.nentries, (int64_t)o_copy.hr_time_end.tv_sec, (int64_t)o_copy.hr_time_end.tv_nsec);
 | ||||
|      } | ||||
| -
 | ||||
| -    pthread_mutex_unlock(&(pb_conn->c_mutex));
 | ||||
|      /* | ||||
|       * Wake up the persistent searches, so they | ||||
|       * can notice if they've been abandoned. | ||||
| diff --git a/ldap/servers/slapd/opshared.c b/ldap/servers/slapd/opshared.c
 | ||||
| index a842d4249f..f77043afa9 100644
 | ||||
| --- a/ldap/servers/slapd/opshared.c
 | ||||
| +++ b/ldap/servers/slapd/opshared.c
 | ||||
| @@ -921,9 +921,7 @@ op_shared_search(Slapi_PBlock *pb, int send_result)
 | ||||
|                      next_be = NULL; /* to break the loop */ | ||||
|                      if (operation->o_status & SLAPI_OP_STATUS_ABANDONED) { | ||||
|                          /* It turned out this search was abandoned. */ | ||||
| -                        pthread_mutex_lock(pagedresults_mutex);
 | ||||
| -                        pagedresults_free_one_msgid_nolock(pb_conn, operation->o_msgid);
 | ||||
| -                        pthread_mutex_unlock(pagedresults_mutex);
 | ||||
| +                        pagedresults_free_one_msgid(pb_conn, operation->o_msgid, pagedresults_mutex);
 | ||||
|                          /* paged-results-request was abandoned; making an empty cookie. */ | ||||
|                          pagedresults_set_response_control(pb, 0, estimate, -1, pr_idx); | ||||
|                          send_ldap_result(pb, 0, NULL, "Simple Paged Results Search abandoned", 0, NULL); | ||||
| diff --git a/ldap/servers/slapd/pagedresults.c b/ldap/servers/slapd/pagedresults.c
 | ||||
| index fc15f6bec9..9959c927e0 100644
 | ||||
| --- a/ldap/servers/slapd/pagedresults.c
 | ||||
| +++ b/ldap/servers/slapd/pagedresults.c
 | ||||
| @@ -34,6 +34,10 @@ pageresult_lock_cleanup()
 | ||||
|      slapi_ch_free((void**)&lock_hash); | ||||
|  } | ||||
|   | ||||
| +/* Beware of the lock order with c_mutex:
 | ||||
| + * c_mutex is sometimes locked while holding pageresult_lock
 | ||||
| + * ==> Do not lock pageresult_lock while holding c_mutex
 | ||||
| + */
 | ||||
|  pthread_mutex_t * | ||||
|  pageresult_lock_get_addr(Connection *conn) | ||||
|  { | ||||
| @@ -350,7 +354,7 @@ pagedresults_free_one(Connection *conn, Operation *op, int index)
 | ||||
|   * Used for abandoning - pageresult_lock_get_addr(conn) is already locked in do_abandone. | ||||
|   */ | ||||
|  int | ||||
| -pagedresults_free_one_msgid_nolock(Connection *conn, ber_int_t msgid)
 | ||||
| +pagedresults_free_one_msgid(Connection *conn, ber_int_t msgid, pthread_mutex_t *mutex)
 | ||||
|  { | ||||
|      int rc = -1; | ||||
|      int i; | ||||
| @@ -361,6 +365,7 @@ pagedresults_free_one_msgid_nolock(Connection *conn, ber_int_t msgid)
 | ||||
|          } else { | ||||
|              slapi_log_err(SLAPI_LOG_TRACE, | ||||
|                            "pagedresults_free_one_msgid_nolock", "=> msgid=%d\n", msgid); | ||||
| +            pthread_mutex_lock(mutex);
 | ||||
|              for (i = 0; i < conn->c_pagedresults.prl_maxlen; i++) { | ||||
|                  if (conn->c_pagedresults.prl_list[i].pr_msgid == msgid) { | ||||
|                      PagedResults *prp = conn->c_pagedresults.prl_list + i; | ||||
| @@ -375,6 +380,7 @@ pagedresults_free_one_msgid_nolock(Connection *conn, ber_int_t msgid)
 | ||||
|                      break; | ||||
|                  } | ||||
|              } | ||||
| +            pthread_mutex_unlock(mutex);
 | ||||
|              slapi_log_err(SLAPI_LOG_TRACE, | ||||
|                            "pagedresults_free_one_msgid_nolock", "<= %d\n", rc); | ||||
|          } | ||||
| diff --git a/ldap/servers/slapd/proto-slap.h b/ldap/servers/slapd/proto-slap.h
 | ||||
| index c7389fe2ec..e8adbc254b 100644
 | ||||
| --- a/ldap/servers/slapd/proto-slap.h
 | ||||
| +++ b/ldap/servers/slapd/proto-slap.h
 | ||||
| @@ -1614,7 +1614,7 @@ int pagedresults_is_timedout_nolock(Connection *conn);
 | ||||
|  int pagedresults_reset_timedout_nolock(Connection *conn); | ||||
|  int pagedresults_in_use_nolock(Connection *conn); | ||||
|  int pagedresults_free_one(Connection *conn, Operation *op, int index); | ||||
| -int pagedresults_free_one_msgid_nolock(Connection *conn, ber_int_t msgid);
 | ||||
| +int pagedresults_free_one_msgid(Connection *conn, ber_int_t msgid, pthread_mutex_t *mutex);
 | ||||
|  int op_is_pagedresults(Operation *op); | ||||
|  int pagedresults_cleanup_all(Connection *conn, int needlock); | ||||
|  void op_set_pagedresults(Operation *op); | ||||
| diff --git a/src/lib389/lib389/__init__.py b/src/lib389/lib389/__init__.py
 | ||||
| index 7590ec4424..6a941dbe75 100644
 | ||||
| --- a/src/lib389/lib389/__init__.py
 | ||||
| +++ b/src/lib389/lib389/__init__.py
 | ||||
| @@ -1048,6 +1048,24 @@ def close(self):
 | ||||
|   | ||||
|          self.state = DIRSRV_STATE_OFFLINE | ||||
|   | ||||
| +    def dump_errorlog(self):
 | ||||
| +        '''
 | ||||
| +            Its logs all errors messages within the error log that occured 
 | ||||
| +            after the last startup.
 | ||||
| +        '''
 | ||||
| +        if os.path.isfile(self.errlog):
 | ||||
| +            lines = []
 | ||||
| +            with open(self.errlog, 'r') as file:
 | ||||
| +                for line in file:
 | ||||
| +                    if "starting up" in line:
 | ||||
| +                        lines = []
 | ||||
| +                    for key in ( 'DEBUG', 'INFO', 'NOTICE', 'WARN' ):
 | ||||
| +                        if key in line:
 | ||||
| +                            lines.append(line)
 | ||||
| +                            break
 | ||||
| +            for line in lines:
 | ||||
| +                self.log.error(line)
 | ||||
| +
 | ||||
|      def start(self, timeout=120, post_open=True): | ||||
|          ''' | ||||
|              It starts an instance and rebind it. Its final state after rebind | ||||
| @@ -1071,7 +1089,13 @@ def start(self, timeout=120, post_open=True):
 | ||||
|          if self.with_systemd(): | ||||
|              self.log.debug("systemd status -> True") | ||||
|              # Do systemd things here ... | ||||
| -            subprocess.check_output(["systemctl", "start", "dirsrv@%s" % self.serverid], stderr=subprocess.STDOUT)
 | ||||
| +            try:
 | ||||
| +                subprocess.check_output(["systemctl", "start", "dirsrv@%s" % self.serverid], stderr=subprocess.STDOUT)
 | ||||
| +            except subprocess.CalledProcessError as e:
 | ||||
| +                self.dump_errorlog()
 | ||||
| +                self.log.error('Failed to start dirsrv@%s: "%s"' % (self.serverid, e.output.decode()))
 | ||||
| +                self.log.error(e)
 | ||||
| +                raise ValueError('Failed to start DS')
 | ||||
|          else: | ||||
|              self.log.debug("systemd status -> False") | ||||
|              # Start the process. | ||||
| @@ -1095,6 +1119,7 @@ def start(self, timeout=120, post_open=True):
 | ||||
|                  self.log.debug("DEBUG: starting with %s" % cmd) | ||||
|                  output = subprocess.check_output(*cmd, env=env, stderr=subprocess.STDOUT) | ||||
|              except subprocess.CalledProcessError as e: | ||||
| +                self.dump_errorlog()
 | ||||
|                  self.log.error('Failed to start ns-slapd: "%s"' % e.output.decode()) | ||||
|                  self.log.error(e) | ||||
|                  raise ValueError('Failed to start DS') | ||||
| @@ -0,0 +1,76 @@ | ||||
| From 4883a69dcf2764146bbe54c5432752b79f8d1a1e Mon Sep 17 00:00:00 2001 | ||||
| From: Pierre Rogier <progier@redhat.com> | ||||
| Date: Mon, 20 Nov 2023 16:17:39 +0100 | ||||
| Subject: [PATCH] Issue 5984 - Crash when paged result search are abandoned - | ||||
|  fix2 | ||||
| 
 | ||||
| ---
 | ||||
|  ldap/servers/slapd/abandon.c  |  2 +- | ||||
|  src/lib389/lib389/__init__.py | 27 +-------------------------- | ||||
|  2 files changed, 2 insertions(+), 27 deletions(-) | ||||
| 
 | ||||
| diff --git a/ldap/servers/slapd/abandon.c b/ldap/servers/slapd/abandon.c
 | ||||
| index 964d28836f..2dd1ee3205 100644
 | ||||
| --- a/ldap/servers/slapd/abandon.c
 | ||||
| +++ b/ldap/servers/slapd/abandon.c
 | ||||
| @@ -43,7 +43,7 @@ do_abandon(Slapi_PBlock *pb)
 | ||||
|          struct timespec hr_time_end; | ||||
|          int nentries; | ||||
|          int opid; | ||||
| -    } o_copy; 
 | ||||
| +    } o_copy;
 | ||||
|   | ||||
|      slapi_pblock_get(pb, SLAPI_OPERATION, &pb_op); | ||||
|      slapi_pblock_get(pb, SLAPI_CONNECTION, &pb_conn); | ||||
| diff --git a/src/lib389/lib389/__init__.py b/src/lib389/lib389/__init__.py
 | ||||
| index 6a941dbe75..7590ec4424 100644
 | ||||
| --- a/src/lib389/lib389/__init__.py
 | ||||
| +++ b/src/lib389/lib389/__init__.py
 | ||||
| @@ -1048,24 +1048,6 @@ class DirSrv(SimpleLDAPObject, object):
 | ||||
|   | ||||
|          self.state = DIRSRV_STATE_OFFLINE | ||||
|   | ||||
| -    def dump_errorlog(self):
 | ||||
| -        '''
 | ||||
| -            Its logs all errors messages within the error log that occured 
 | ||||
| -            after the last startup.
 | ||||
| -        '''
 | ||||
| -        if os.path.isfile(self.errlog):
 | ||||
| -            lines = []
 | ||||
| -            with open(self.errlog, 'r') as file:
 | ||||
| -                for line in file:
 | ||||
| -                    if "starting up" in line:
 | ||||
| -                        lines = []
 | ||||
| -                    for key in ( 'DEBUG', 'INFO', 'NOTICE', 'WARN' ):
 | ||||
| -                        if key in line:
 | ||||
| -                            lines.append(line)
 | ||||
| -                            break
 | ||||
| -            for line in lines:
 | ||||
| -                self.log.error(line)
 | ||||
| -
 | ||||
|      def start(self, timeout=120, post_open=True): | ||||
|          ''' | ||||
|              It starts an instance and rebind it. Its final state after rebind | ||||
| @@ -1089,13 +1071,7 @@ class DirSrv(SimpleLDAPObject, object):
 | ||||
|          if self.with_systemd(): | ||||
|              self.log.debug("systemd status -> True") | ||||
|              # Do systemd things here ... | ||||
| -            try:
 | ||||
| -                subprocess.check_output(["systemctl", "start", "dirsrv@%s" % self.serverid], stderr=subprocess.STDOUT)
 | ||||
| -            except subprocess.CalledProcessError as e:
 | ||||
| -                self.dump_errorlog()
 | ||||
| -                self.log.error('Failed to start dirsrv@%s: "%s"' % (self.serverid, e.output.decode()))
 | ||||
| -                self.log.error(e)
 | ||||
| -                raise ValueError('Failed to start DS')
 | ||||
| +            subprocess.check_output(["systemctl", "start", "dirsrv@%s" % self.serverid], stderr=subprocess.STDOUT)
 | ||||
|          else: | ||||
|              self.log.debug("systemd status -> False") | ||||
|              # Start the process. | ||||
| @@ -1119,7 +1095,6 @@ class DirSrv(SimpleLDAPObject, object):
 | ||||
|                  self.log.debug("DEBUG: starting with %s" % cmd) | ||||
|                  output = subprocess.check_output(*cmd, env=env, stderr=subprocess.STDOUT) | ||||
|              except subprocess.CalledProcessError as e: | ||||
| -                self.dump_errorlog()
 | ||||
|                  self.log.error('Failed to start ns-slapd: "%s"' % e.output.decode()) | ||||
|                  self.log.error(e) | ||||
|                  raise ValueError('Failed to start DS') | ||||
| @@ -47,7 +47,7 @@ ExcludeArch: i686 | ||||
| Summary:          389 Directory Server (base) | ||||
| Name:             389-ds-base | ||||
| Version:          2.3.6 | ||||
| Release:          3%{?dist} | ||||
| Release:          5%{?dist}.alma.1 | ||||
| License:          GPLv3+ and MIT and ASL 2.0 | ||||
| URL:              https://www.port389.org | ||||
| Conflicts:        selinux-policy-base < 3.9.8 | ||||
| @@ -295,7 +295,17 @@ Source3:          https://github.com/jemalloc/%{jemalloc_name}/releases/download | ||||
| %endif | ||||
| Source4:          389-ds-base.sysusers | ||||
| 
 | ||||
| Patch1:		  0001-Issue-5848-Fix-condition-and-add-a-CI-test-5916.patch | ||||
| Patch1:           0001-Issue-5848-Fix-condition-and-add-a-CI-test-5916.patch | ||||
| 
 | ||||
| # Patches were taken from: | ||||
| # https://github.com/389ds/389-ds-base/commit/b2194c95f9d02965b2ad3d0dcf333e74881347d0 | ||||
| Patch2:           0002-Issue-5909-Multi-listener-hang-with-20k-connections.patch | ||||
| # https://github.com/389ds/389-ds-base/commit/26716f0edefe4690d9fd9143d07156e01321020c | ||||
| Patch3:           0003-Issue-5924-ASAN-server-build-crash-when-looping.patch | ||||
| # https://github.com/389ds/389-ds-base/commit/dc091325814d9ac74d238c7e14cac8b9112b3271 | ||||
| Patch4:           0004-Issue-5984-Crash-when-paged-result-search-are-abandoned.patch | ||||
| # https://github.com/389ds/389-ds-base/pull/5987/commits/4883a69dcf2764146bbe54c5432752b79f8d1a1e | ||||
| Patch5:           0005-Issue-5984-Crash-when-paged-result-search-are-abandoned-fix2.patch | ||||
| 
 | ||||
| %description | ||||
| 389 Directory Server is an LDAPv3 compliant server.  The base package includes | ||||
| @@ -737,6 +747,13 @@ exit 0 | ||||
| %endif | ||||
| 
 | ||||
| %changelog | ||||
| * Thu Jan 25 2024 Eduard Abdullin <eabdullin@almalinux.org> - 2.3.6-5.alma.1 | ||||
| - Bump version to 2.3.6-5 | ||||
| - Issue 5909 - Multi listener hang with 20k connections (#5917) | ||||
| - issue 5924 - ASAN server build crash when looping | ||||
| - Issue 5984 - Crash when paged result search are abandoned (#5985) | ||||
| 
 | ||||
| * Thu Sep 7 2023 Simon Pichugin <spichugi@redhat.com> - 2.3.6-3 | ||||
| - Bump version to 2.3.6-3 | ||||
| - Resolves: rhbz#2236163 - Regression: replication can't be enabled for consumer or hub role | ||||
|  | ||||