283 lines
10 KiB
Diff
commit 5397764c8c8435324294b885ad42ed9a73a68d1f
Author: Florian Weimer <fweimer@redhat.com>
Date:   Mon Jun 24 16:00:43 2024 +0200

    elf: Support recursive use of dynamic TLS in interposed malloc

    It turns out that quite a few applications use bundled mallocs that
    have been built to use global-dynamic TLS (instead of the recommended
    initial-exec TLS).  The previous workaround from
    commit afe42e935b3ee97bac9a7064157587777259c60e ("elf: Avoid some
    free (NULL) calls in _dl_update_slotinfo") does not fix all
    encountered cases unfortunately.

    This change avoids the TLS generation update for recursive use
    of TLS from a malloc that was called during a TLS update.  This
    is possible because an interposed malloc has a fixed module ID and
    TLS slot.  (It cannot be unloaded.)  If an initially-loaded module ID
    is encountered in __tls_get_addr and the dynamic linker is already
    in the middle of a TLS update, use the outdated DTV, thus avoiding
    another call into malloc.  It's still necessary to update the
    DTV to the most recent generation, to get out of the slow path,
    which is why the check for recursion is needed.

    The bookkeeping is done using a global counter instead of per-thread
    flag because TLS access in the dynamic linker is tricky.

    All this will go away once the dynamic linker stops using malloc
    for TLS, likely as part of a change that pre-allocates all TLS
    during pthread_create/dlopen.

    Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow
    tls access after dlopen [BZ #19924]").

diff --git a/elf/dl-tls.c b/elf/dl-tls.c
|
||
|
index 231171b72c21828f..720921b4d3b9df17 100644
|
||
|
--- a/elf/dl-tls.c
|
||
|
+++ b/elf/dl-tls.c
|
||
|
@@ -75,6 +75,31 @@
|
||
|
/* Default for dl_tls_static_optional. */
|
||
|
#define OPTIONAL_TLS 512
|
||
|
|
||
|
+/* Used to count the number of threads currently executing dynamic TLS
|
||
|
+ updates. Used to avoid recursive malloc calls in __tls_get_addr
|
||
|
+ for an interposed malloc that uses global-dynamic TLS (which is not
|
||
|
+ recommended); see _dl_tls_allocate_active checks. This could be a
|
||
|
+ per-thread flag, but would need TLS access in the dynamic linker. */
|
||
|
+unsigned int _dl_tls_threads_in_update;
|
||
|
+
|
||
|
+static inline void
|
||
|
+_dl_tls_allocate_begin (void)
|
||
|
+{
|
||
|
+ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, 1);
|
||
|
+}
|
||
|
+
|
||
|
+static inline void
|
||
|
+_dl_tls_allocate_end (void)
|
||
|
+{
|
||
|
+ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, -1);
|
||
|
+}
|
||
|
+
|
||
|
+static inline bool
|
||
|
+_dl_tls_allocate_active (void)
|
||
|
+{
|
||
|
+ return atomic_load_relaxed (&_dl_tls_threads_in_update) > 0;
|
||
|
+}
|
||
|
+
|
||
|
/* Compute the static TLS surplus based on the namespace count and the
|
||
|
TLS space that can be used for optimizations. */
|
||
|
static inline int
|
||
|
@@ -431,12 +456,18 @@ _dl_allocate_tls_storage (void)
|
||
|
size += TLS_PRE_TCB_SIZE;
|
||
|
#endif
|
||
|
|
||
|
- /* Perform the allocation. Reserve space for the required alignment
|
||
|
- and the pointer to the original allocation. */
|
||
|
+ /* Reserve space for the required alignment and the pointer to the
|
||
|
+ original allocation. */
|
||
|
size_t alignment = GLRO (dl_tls_static_align);
|
||
|
+
|
||
|
+ /* Perform the allocation. */
|
||
|
+ _dl_tls_allocate_begin ();
|
||
|
void *allocated = malloc (size + alignment + sizeof (void *));
|
||
|
if (__glibc_unlikely (allocated == NULL))
|
||
|
- return NULL;
|
||
|
+ {
|
||
|
+ _dl_tls_allocate_end ();
|
||
|
+ return NULL;
|
||
|
+ }
|
||
|
|
||
|
/* Perform alignment and allocate the DTV. */
|
||
|
#if TLS_TCB_AT_TP
|
||
|
@@ -472,6 +503,8 @@ _dl_allocate_tls_storage (void)
|
||
|
result = allocate_dtv (result);
|
||
|
if (result == NULL)
|
||
|
free (allocated);
|
||
|
+
|
||
|
+ _dl_tls_allocate_end ();
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
@@ -489,6 +522,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
|
||
|
size_t newsize = max_modid + DTV_SURPLUS;
|
||
|
size_t oldsize = dtv[-1].counter;
|
||
|
|
||
|
+ _dl_tls_allocate_begin ();
|
||
|
if (dtv == GL(dl_initial_dtv))
|
||
|
{
|
||
|
/* This is the initial dtv that was either statically allocated in
|
||
|
@@ -508,6 +542,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
|
||
|
if (newp == NULL)
|
||
|
oom ();
|
||
|
}
|
||
|
+ _dl_tls_allocate_end ();
|
||
|
|
||
|
newp[0].counter = newsize;
|
||
|
|
||
|
@@ -682,7 +717,9 @@ allocate_dtv_entry (size_t alignment, size_t size)
|
||
|
if (powerof2 (alignment) && alignment <= _Alignof (max_align_t))
|
||
|
{
|
||
|
/* The alignment is supported by malloc. */
|
||
|
+ _dl_tls_allocate_begin ();
|
||
|
void *ptr = malloc (size);
|
||
|
+ _dl_tls_allocate_end ();
|
||
|
return (struct dtv_pointer) { ptr, ptr };
|
||
|
}
|
||
|
|
||
|
@@ -694,7 +731,10 @@ allocate_dtv_entry (size_t alignment, size_t size)
|
||
|
|
||
|
/* Perform the allocation. This is the pointer we need to free
|
||
|
later. */
|
||
|
+ _dl_tls_allocate_begin ();
|
||
|
void *start = malloc (alloc_size);
|
||
|
+ _dl_tls_allocate_end ();
|
||
|
+
|
||
|
if (start == NULL)
|
||
|
return (struct dtv_pointer) {};
|
||
|
|
||
|
@@ -832,7 +872,11 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
|
||
|
free implementation. Checking here papers over at
|
||
|
least some dynamic TLS usage by interposed mallocs. */
|
||
|
if (dtv[modid].pointer.to_free != NULL)
|
||
|
- free (dtv[modid].pointer.to_free);
|
||
|
+ {
|
||
|
+ _dl_tls_allocate_begin ();
|
||
|
+ free (dtv[modid].pointer.to_free);
|
||
|
+ _dl_tls_allocate_end ();
|
||
|
+ }
|
||
|
dtv[modid].pointer.val = TLS_DTV_UNALLOCATED;
|
||
|
dtv[modid].pointer.to_free = NULL;
|
||
|
|
||
|
@@ -959,13 +1003,25 @@ __tls_get_addr (GET_ADDR_ARGS)
|
||
|
module, but the global generation counter is easier to check (which
|
||
|
must be synchronized up to the generation of the accessed module by
|
||
|
user code doing the TLS access so relaxed mo read is enough). */
|
||
|
- size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
|
||
|
+ size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
|
||
|
if (__glibc_unlikely (dtv[0].counter != gen))
|
||
|
{
|
||
|
- /* Update DTV up to the global generation, see CONCURRENCY NOTES
|
||
|
- in _dl_update_slotinfo. */
|
||
|
- gen = atomic_load_acquire (&GL(dl_tls_generation));
|
||
|
- return update_get_addr (GET_ADDR_PARAM, gen);
|
||
|
+ if (_dl_tls_allocate_active ()
|
||
|
+ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit)
|
||
|
+ /* This is a reentrant __tls_get_addr call, but we can
|
||
|
+ satisfy it because it's an initially-loaded module ID.
|
||
|
+ These TLS slotinfo slots do not change, so the
|
||
|
+ out-of-date generation counter does not matter. However,
|
||
|
+ if not in a TLS update, still update_get_addr below, to
|
||
|
+ get off the slow path eventually. */
|
||
|
+ ;
|
||
|
+ else
|
||
|
+ {
|
||
|
+ /* Update DTV up to the global generation, see CONCURRENCY NOTES
|
||
|
+ in _dl_update_slotinfo. */
|
||
|
+ gen = atomic_load_acquire (&GL(dl_tls_generation));
|
||
|
+ return update_get_addr (GET_ADDR_PARAM, gen);
|
||
|
+ }
|
||
|
}
|
||
|
|
||
|
void *p = dtv[GET_ADDR_MODULE].pointer.val;
|
||
|
@@ -975,7 +1031,7 @@ __tls_get_addr (GET_ADDR_ARGS)
|
||
|
|
||
|
return (char *) p + GET_ADDR_OFFSET;
|
||
|
}
|
||
|
-#endif
|
||
|
+#endif /* SHARED */
|
||
|
|
||
|
|
||
|
/* Look up the module's TLS block as for __tls_get_addr,
|
||
|
@@ -1024,6 +1080,25 @@ _dl_tls_get_addr_soft (struct link_map *l)
|
||
|
return data;
|
||
|
}
|
||
|
|
||
|
+size_t _dl_tls_initial_modid_limit;
|
||
|
+
|
||
|
+void
|
||
|
+_dl_tls_initial_modid_limit_setup (void)
|
||
|
+{
|
||
|
+ struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
|
||
|
+ size_t idx;
|
||
|
+ for (idx = 0; idx < listp->len; ++idx)
|
||
|
+ {
|
||
|
+ struct link_map *l = listp->slotinfo[idx].map;
|
||
|
+ if (l == NULL
|
||
|
+ /* The object can be unloaded, so its modid can be
|
||
|
+ reassociated. */
|
||
|
+ || !(l->l_type == lt_executable || l->l_type == lt_library))
|
||
|
+ break;
|
||
|
+ }
|
||
|
+ _dl_tls_initial_modid_limit = idx;
|
||
|
+}
|
||
|
+
|
||
|
|
||
|
void
|
||
|
_dl_add_to_slotinfo (struct link_map *l, bool do_add)
|
||
|
@@ -1056,9 +1131,11 @@ _dl_add_to_slotinfo (struct link_map *l, bool do_add)
|
||
|
the first slot. */
|
||
|
assert (idx == 0);
|
||
|
|
||
|
+ _dl_tls_allocate_begin ();
|
||
|
listp = (struct dtv_slotinfo_list *)
|
||
|
malloc (sizeof (struct dtv_slotinfo_list)
|
||
|
+ TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo));
|
||
|
+ _dl_tls_allocate_end ();
|
||
|
if (listp == NULL)
|
||
|
{
|
||
|
/* We ran out of memory while resizing the dtv slotinfo list. */
|
||
|
diff --git a/elf/rtld.c b/elf/rtld.c
|
||
|
index d973c385b312ea16..15a01f3b175ac065 100644
|
||
|
--- a/elf/rtld.c
|
||
|
+++ b/elf/rtld.c
|
||
|
@@ -801,6 +801,8 @@ init_tls (size_t naudit)
|
||
|
_dl_fatal_printf ("\
|
||
|
cannot allocate TLS data structures for initial thread\n");
|
||
|
|
||
|
+ _dl_tls_initial_modid_limit_setup ();
|
||
|
+
|
||
|
/* Store for detection of the special case by __tls_get_addr
|
||
|
so it knows not to pass this dtv to the normal realloc. */
|
||
|
GL(dl_initial_dtv) = GET_DTV (tcbp);
|
||
|
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
|
||
|
index 7964e133e4930e88..8bc42ba147ad4c47 100644
|
||
|
--- a/sysdeps/generic/ldsodefs.h
|
||
|
+++ b/sysdeps/generic/ldsodefs.h
|
||
|
@@ -1308,6 +1308,20 @@ extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
|
||
|
size_t gen)
|
||
|
attribute_hidden;
|
||
|
|
||
|
+/* The last TLS module ID that is initially loaded, plus 1. TLS
|
||
|
+ addresses for modules with IDs lower than that can be obtained from
|
||
|
+ the DTV even if its generation is outdated. */
|
||
|
+extern size_t _dl_tls_initial_modid_limit attribute_hidden attribute_relro;
|
||
|
+
|
||
|
+/* Compute _dl_tls_initial_modid_limit. To be called after initial
|
||
|
+ relocation. */
|
||
|
+void _dl_tls_initial_modid_limit_setup (void) attribute_hidden;
|
||
|
+
|
||
|
+/* Number of threads currently in a TLS update. This is used to
|
||
|
+ detect reentrant __tls_get_addr calls without a per-thread
|
||
|
+ flag. */
|
||
|
+extern unsigned int _dl_tls_threads_in_update attribute_hidden;
|
||
|
+
|
||
|
/* Look up the module's TLS block as for __tls_get_addr,
|
||
|
but never touch anything. Return null if it's not allocated yet. */
|
||
|
extern void *_dl_tls_get_addr_soft (struct link_map *l) attribute_hidden;
|
||
|
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
|
||
|
index 4ded8dd6b94edc81..7abfb18413b1d06c 100644
|
||
|
--- a/sysdeps/x86_64/dl-tls.c
|
||
|
+++ b/sysdeps/x86_64/dl-tls.c
|
||
|
@@ -41,7 +41,10 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
|
||
|
dtv_t *dtv = THREAD_DTV ();
|
||
|
|
||
|
size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
|
||
|
- if (__glibc_unlikely (dtv[0].counter != gen))
|
||
|
+ if (__glibc_unlikely (dtv[0].counter != gen)
|
||
|
+ /* See comment in __tls_get_addr in elf/dl-tls.c. */
|
||
|
+ && !(_dl_tls_allocate_active ()
|
||
|
+ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit))
|
||
|
return update_get_addr (GET_ADDR_PARAM, gen);
|
||
|
|
||
|
return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
|