diff --git a/glibc-RHEL-2122.patch b/glibc-RHEL-2122.patch new file mode 100644 index 0000000..69a294f --- /dev/null +++ b/glibc-RHEL-2122.patch @@ -0,0 +1,312 @@ +From d2123d68275acc0f061e73d5f86ca504e0d5a344 Mon Sep 17 00:00:00 2001 +From: Szabolcs Nagy +Date: Tue, 16 Feb 2021 12:55:13 +0000 +Subject: elf: Fix slow tls access after dlopen [BZ #19924] + +In short: __tls_get_addr checks the global generation counter and if +the current dtv is older then _dl_update_slotinfo updates dtv up to the +generation of the accessed module. So if the global generation is newer +than generation of the module then __tls_get_addr keeps hitting the +slow dtv update path. The dtv update path includes a number of checks +to see if any update is needed and this already causes measurable tls +access slow down after dlopen. + +It may be possible to detect up-to-date dtv faster. But if there are +many modules loaded (> TLS_SLOTINFO_SURPLUS) then this requires at +least walking the slotinfo list. + +This patch tries to update the dtv to the global generation instead, so +after a dlopen the tls access slow path is only hit once. The modules +with larger generation than the accessed one were not necessarily +synchronized before, so additional synchronization is needed. + +This patch uses acquire/release synchronization when accessing the +generation counter. + +Note: in the x86_64 version of dl-tls.c the generation is only loaded +once, since relaxed mo is not faster than acquire mo load. + +I have not benchmarked this. Tested by Adhemerval Zanella on aarch64, +powerpc, sparc, x86 who reported that it fixes the performance issue +of bug 19924. + +Reviewed-by: Adhemerval Zanella + +[rebased to c8s by DJ] + +diff -rup a/elf/dl-close.c b/elf/dl-close.c +--- a/elf/dl-close.c 2023-10-13 16:24:27.068217519 -0400 ++++ b/elf/dl-close.c 2023-10-13 16:28:59.936019397 -0400 +@@ -739,7 +739,7 @@ _dl_close_worker (struct link_map *map, + if (__glibc_unlikely (newgen == 0)) + _dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n"); + /* Can be read concurrently. */ +- atomic_store_relaxed (&GL(dl_tls_generation), newgen); ++ atomic_store_release (&GL(dl_tls_generation), newgen); + + if (tls_free_end == GL(dl_tls_static_used)) + GL(dl_tls_static_used) = tls_free_start; +diff -rup a/elf/dl-open.c b/elf/dl-open.c +--- a/elf/dl-open.c 2023-10-13 16:24:26.930212160 -0400 ++++ b/elf/dl-open.c 2023-10-13 16:28:59.936019397 -0400 +@@ -403,7 +403,7 @@ update_tls_slotinfo (struct link_map *ne + _dl_fatal_printf (N_("\ + TLS generation counter wrapped! Please report this.")); + /* Can be read concurrently. */ +- atomic_store_relaxed (&GL(dl_tls_generation), newgen); ++ atomic_store_release (&GL(dl_tls_generation), newgen); + + /* We need a second pass for static tls data, because + _dl_update_slotinfo must not be run while calls to +@@ -420,8 +420,8 @@ TLS generation counter wrapped! Please + now, but we can delay updating the DTV. */ + imap->l_need_tls_init = 0; + #ifdef SHARED +- /* Update the slot information data for at least the +- generation of the DSO we are allocating data for. */ ++ /* Update the slot information data for the current ++ generation. */ + + /* FIXME: This can terminate the process on memory + allocation failure. It is not possible to raise +@@ -429,7 +429,7 @@ TLS generation counter wrapped! Please + _dl_update_slotinfo would have to be split into two + operations, similar to resize_scopes and update_scopes + above. This is related to bug 16134. */ +- _dl_update_slotinfo (imap->l_tls_modid); ++ _dl_update_slotinfo (imap->l_tls_modid, newgen); + #endif + + GL(dl_init_static_tls) (imap); +diff -rup a/elf/dl-reloc.c b/elf/dl-reloc.c +--- a/elf/dl-reloc.c 2023-10-13 16:24:26.390191189 -0400 ++++ b/elf/dl-reloc.c 2023-10-13 16:28:59.937019438 -0400 +@@ -111,11 +111,11 @@ _dl_try_allocate_static_tls (struct link + if (map->l_real->l_relocated) + { + #ifdef SHARED ++ /* Update the DTV of the current thread. Note: GL(dl_load_tls_lock) ++ is held here so normal load of the generation counter is valid. */ + if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation), + 0)) +- /* Update the slot information data for at least the generation of +- the DSO we are allocating data for. */ +- (void) _dl_update_slotinfo (map->l_tls_modid); ++ (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation)); + #endif + + GL(dl_init_static_tls) (map); +diff -rup a/elf/dl-tls.c b/elf/dl-tls.c +--- a/elf/dl-tls.c 2023-10-13 16:24:26.564197946 -0400 ++++ b/elf/dl-tls.c 2023-10-13 16:28:59.937019438 -0400 +@@ -716,57 +716,57 @@ allocate_and_init (struct link_map *map) + + + struct link_map * +-_dl_update_slotinfo (unsigned long int req_modid) ++_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) + { + struct link_map *the_map = NULL; + dtv_t *dtv = THREAD_DTV (); + +- /* The global dl_tls_dtv_slotinfo array contains for each module +- index the generation counter current when the entry was created. ++ /* CONCURRENCY NOTES: ++ ++ The global dl_tls_dtv_slotinfo_list array contains for each module ++ index the generation counter current when that entry was updated. + This array never shrinks so that all module indices which were +- valid at some time can be used to access it. Before the first +- use of a new module index in this function the array was extended +- appropriately. Access also does not have to be guarded against +- modifications of the array. It is assumed that pointer-size +- values can be read atomically even in SMP environments. It is +- possible that other threads at the same time dynamically load +- code and therefore add to the slotinfo list. This is a problem +- since we must not pick up any information about incomplete work. +- The solution to this is to ignore all dtv slots which were +- created after the one we are currently interested. We know that +- dynamic loading for this module is completed and this is the last +- load operation we know finished. */ +- unsigned long int idx = req_modid; ++ valid at some time can be used to access it. Concurrent loading ++ and unloading of modules can update slotinfo entries or extend ++ the array. The updates happen under the GL(dl_load_tls_lock) and ++ finish with the release store of the generation counter to ++ GL(dl_tls_generation) which is synchronized with the load of ++ new_gen in the caller. So updates up to new_gen are synchronized ++ but updates for later generations may not be. ++ ++ Here we update the thread dtv from old_gen (== dtv[0].counter) to ++ new_gen generation. For this, each dtv[i] entry is either set to ++ an unallocated state (set), or left unmodified (nop). Where (set) ++ may resize the dtv first if modid i >= dtv[-1].counter. The rules ++ for the decision between (set) and (nop) are ++ ++ (1) If slotinfo entry i is concurrently updated then either (set) ++ or (nop) is valid: TLS access cannot use dtv[i] unless it is ++ synchronized with a generation > new_gen. ++ ++ Otherwise, if the generation of slotinfo entry i is gen and the ++ loaded module for this entry is map then ++ ++ (2) If gen <= old_gen then do (nop). ++ ++ (3) If old_gen < gen <= new_gen then ++ (3.1) if map != 0 then (set) ++ (3.2) if map == 0 then either (set) or (nop). ++ ++ Note that (1) cannot be reliably detected, but since both actions ++ are valid it does not have to be. Only (2) and (3.1) cases need ++ to be distinguished for which relaxed mo access of gen and map is ++ enough: their value is synchronized when it matters. ++ ++ Note that a relaxed mo load may give an out-of-thin-air value since ++ it is used in decisions that can affect concurrent stores. But this ++ should only happen if the OOTA value causes UB that justifies the ++ concurrent store of the value. This is not expected to be an issue ++ in practice. */ + struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); + +- while (idx >= listp->len) ++ if (dtv[0].counter < new_gen) + { +- idx -= listp->len; +- listp = listp->next; +- } +- +- if (dtv[0].counter < listp->slotinfo[idx].gen) +- { +- /* CONCURRENCY NOTES: +- +- Here the dtv needs to be updated to new_gen generation count. +- +- This code may be called during TLS access when GL(dl_load_tls_lock) +- is not held. In that case the user code has to synchronize with +- dlopen and dlclose calls of relevant modules. A module m is +- relevant if the generation of m <= new_gen and dlclose of m is +- synchronized: a memory access here happens after the dlopen and +- before the dlclose of relevant modules. The dtv entries for +- relevant modules need to be updated, other entries can be +- arbitrary. +- +- This e.g. means that the first part of the slotinfo list can be +- accessed race free, but the tail may be concurrently extended. +- Similarly relevant slotinfo entries can be read race free, but +- other entries are racy. However updating a non-relevant dtv +- entry does not affect correctness. For a relevant module m, +- max_modid >= modid of m. */ +- size_t new_gen = listp->slotinfo[idx].gen; + size_t total = 0; + size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx)); + assert (max_modid >= req_modid); +@@ -779,31 +779,33 @@ _dl_update_slotinfo (unsigned long int r + { + size_t modid = total + cnt; + +- /* Later entries are not relevant. */ ++ /* Case (1) for all later modids. */ + if (modid > max_modid) + break; + + size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen); + ++ /* Case (1). */ + if (gen > new_gen) +- /* Not relevant. */ + continue; + +- /* If the entry is older than the current dtv layout we +- know we don't have to handle it. */ ++ /* Case (2) or (1). */ + if (gen <= dtv[0].counter) + continue; + ++ /* Case (3) or (1). */ ++ + /* If there is no map this means the entry is empty. */ + struct link_map *map + = atomic_load_relaxed (&listp->slotinfo[cnt].map); + /* Check whether the current dtv array is large enough. */ + if (dtv[-1].counter < modid) + { ++ /* Case (3.2) or (1). */ + if (map == NULL) + continue; + +- /* Resize the dtv. */ ++ /* Resizing the dtv aborts on failure: bug 16134. */ + dtv = _dl_resize_dtv (dtv, max_modid); + + assert (modid <= dtv[-1].counter); +@@ -814,7 +816,7 @@ _dl_update_slotinfo (unsigned long int r + } + + /* If there is currently memory allocate for this +- dtv entry free it. */ ++ dtv entry free it. Note: this is not AS-safe. */ + /* XXX Ideally we will at some point create a memory + pool. */ + free (dtv[modid].pointer.to_free); +@@ -909,9 +911,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t + + static struct link_map * + __attribute_noinline__ +-update_get_addr (GET_ADDR_ARGS) ++update_get_addr (GET_ADDR_ARGS, size_t gen) + { +- struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE); ++ struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen); + dtv_t *dtv = THREAD_DTV (); + + void *p = dtv[GET_ADDR_MODULE].pointer.val; +@@ -941,12 +943,17 @@ __tls_get_addr (GET_ADDR_ARGS) + dtv_t *dtv = THREAD_DTV (); + + /* Update is needed if dtv[0].counter < the generation of the accessed +- module. The global generation counter is used here as it is easier +- to check. Synchronization for the relaxed MO access is guaranteed +- by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. */ ++ module, but the global generation counter is easier to check (which ++ must be synchronized up to the generation of the accessed module by ++ user code doing the TLS access so relaxed mo read is enough). */ + size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); + if (__glibc_unlikely (dtv[0].counter != gen)) +- return update_get_addr (GET_ADDR_PARAM); ++ { ++ /* Update DTV up to the global generation, see CONCURRENCY NOTES ++ in _dl_update_slotinfo. */ ++ gen = atomic_load_acquire (&GL(dl_tls_generation)); ++ return update_get_addr (GET_ADDR_PARAM, gen); ++ } + + void *p = dtv[GET_ADDR_MODULE].pointer.val; + +diff -rup a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h +--- a/sysdeps/generic/ldsodefs.h 2023-10-13 16:24:27.136220160 -0400 ++++ b/sysdeps/generic/ldsodefs.h 2023-10-13 16:28:59.937019438 -0400 +@@ -1231,7 +1231,8 @@ extern void _dl_add_to_slotinfo (struct + + /* Update slot information data for at least the generation of the + module with the given index. */ +-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid) ++extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid, ++ size_t gen) + attribute_hidden; + + /* Look up the module's TLS block as for __tls_get_addr, +diff -rup a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c +--- a/sysdeps/x86_64/dl-tls.c 2023-10-13 16:24:24.948135189 -0400 ++++ b/sysdeps/x86_64/dl-tls.c 2023-10-13 16:28:59.938019479 -0400 +@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS) + { + dtv_t *dtv = THREAD_DTV (); + +- size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); ++ size_t gen = atomic_load_acquire (&GL(dl_tls_generation)); + if (__glibc_unlikely (dtv[0].counter != gen)) +- return update_get_addr (GET_ADDR_PARAM); ++ return update_get_addr (GET_ADDR_PARAM, gen); + + return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL); + } diff --git a/glibc.spec b/glibc.spec index aab1b15..3c44c47 100644 --- a/glibc.spec +++ b/glibc.spec @@ -1,6 +1,6 @@ %define glibcsrcdir glibc-2.28 %define glibcversion 2.28 -%define glibcrelease 237%{?dist} +%define glibcrelease 238%{?dist} # Pre-release tarballs are pulled in from git using a command that is # effectively: # @@ -1054,6 +1054,7 @@ Patch866: glibc-RHEL-2435-2.patch Patch867: glibc-RHEL-2423.patch Patch868: glibc-RHEL-3036.patch Patch869: glibc-RHEL-3757.patch +Patch870: glibc-RHEL-2122.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2884,6 +2885,9 @@ fi %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog +* Mon Oct 16 2023 DJ Delorie - 2.28-238 +- Fix slow tls access after dlopen (RHEL-2122) + * Mon Oct 16 2023 Arjun Shankar - 2.28-237 - Enable running a single test from the testsuite (RHEL-3757)