diff --git a/glibc-upstream-2.39-10.patch b/glibc-upstream-2.39-10.patch
new file mode 100644
index 0000000..858f28e
--- /dev/null
+++ b/glibc-upstream-2.39-10.patch
@@ -0,0 +1,16 @@
+commit 983f34a1252de3ca6f2305c211d86530ea42010e
+Author: caiyinyu <caiyinyu@loongson.cn>
+Date:   Mon Mar 11 16:07:48 2024 +0800
+
+    LoongArch: Correct {__ieee754, _}_scalb -> {__ieee754, _}_scalbf
+
+diff --git a/sysdeps/loongarch/fpu/e_scalbf.c b/sysdeps/loongarch/fpu/e_scalbf.c
+index 9f054852362e2d76..7c0395fbb5afbc58 100644
+--- a/sysdeps/loongarch/fpu/e_scalbf.c
++++ b/sysdeps/loongarch/fpu/e_scalbf.c
+@@ -57,4 +57,4 @@ __ieee754_scalbf (float x, float fn)
+ 
+   return x;
+ }
+-libm_alias_finite (__ieee754_scalb, __scalb)
++libm_alias_finite (__ieee754_scalbf, __scalbf)
diff --git a/glibc-upstream-2.39-11.patch b/glibc-upstream-2.39-11.patch
new file mode 100644
index 0000000..d816c3c
--- /dev/null
+++ b/glibc-upstream-2.39-11.patch
@@ -0,0 +1,80 @@
+commit 7fc8242bf87828c935ac5df5cafb9dc7ab635fd9
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Fri Feb 16 07:17:10 2024 -0800
+
+    x86-64: Save APX registers in ld.so trampoline
+    
+    Add APX registers to STATE_SAVE_MASK so that APX registers are saved in
+    ld.so trampoline.  This fixes BZ #31371.
+    
+    Also update STATE_SAVE_OFFSET and STATE_SAVE_MASK for i386 which will
+    be used by i386 _dl_tlsdesc_dynamic.
+    Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
+    
+    (cherry picked from commit dfb05f8e704edac70db38c4c8ee700769d91a413)
+
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index 85d0a8c943cbb218..837fd28734914a1c 100644
+--- a/sysdeps/x86/sysdep.h
++++ b/sysdeps/x86/sysdep.h
+@@ -21,14 +21,54 @@
+ 
+ #include <sysdeps/generic/sysdep.h>
+ 
++/* The extended state feature IDs in the state component bitmap.  */
++#define X86_XSTATE_X87_ID	0
++#define X86_XSTATE_SSE_ID	1
++#define X86_XSTATE_AVX_ID	2
++#define X86_XSTATE_BNDREGS_ID	3
++#define X86_XSTATE_BNDCFG_ID	4
++#define X86_XSTATE_K_ID		5
++#define X86_XSTATE_ZMM_H_ID	6
++#define X86_XSTATE_ZMM_ID	7
++#define X86_XSTATE_PKRU_ID	9
++#define X86_XSTATE_TILECFG_ID	17
++#define X86_XSTATE_TILEDATA_ID	18
++#define X86_XSTATE_APX_F_ID	19
++
++#ifdef __x86_64__
+ /* Offset for fxsave/xsave area used by _dl_runtime_resolve.  Also need
+    space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX.  It must be
+-   aligned to 16 bytes for fxsave and 64 bytes for xsave.  */
+-#define STATE_SAVE_OFFSET (8 * 7 + 8)
+-
+-/* Save SSE, AVX, AVX512, mask and bound registers.  */
+-#define STATE_SAVE_MASK \
+-  ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
++   aligned to 16 bytes for fxsave and 64 bytes for xsave.
++
++   NB: Is is non-zero because of the 128-byte red-zone.  Some registers
++   are saved on stack without adjusting stack pointer first.  When we
++   update stack pointer to allocate more space, we need to take the
++   red-zone into account.  */
++# define STATE_SAVE_OFFSET (8 * 7 + 8)
++
++/* Save SSE, AVX, AVX512, mask, bound and APX registers.  Bound and APX
++   registers are mutually exclusive.  */
++# define STATE_SAVE_MASK		\
++  ((1 << X86_XSTATE_SSE_ID)		\
++   | (1 << X86_XSTATE_AVX_ID)		\
++   | (1 << X86_XSTATE_BNDREGS_ID)	\
++   | (1 << X86_XSTATE_K_ID)		\
++   | (1 << X86_XSTATE_ZMM_H_ID) 	\
++   | (1 << X86_XSTATE_ZMM_ID)		\
++   | (1 << X86_XSTATE_APX_F_ID))
++#else
++/* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic.  Since i386
++   doesn't have red-zone, use 0 here.  */
++# define STATE_SAVE_OFFSET 0
++
++/* Save SSE, AVX, AXV512, mask and bound registers.   */
++# define STATE_SAVE_MASK		\
++  ((1 << X86_XSTATE_SSE_ID)		\
++   | (1 << X86_XSTATE_AVX_ID)		\
++   | (1 << X86_XSTATE_BNDREGS_ID)	\
++   | (1 << X86_XSTATE_K_ID)		\
++   | (1 << X86_XSTATE_ZMM_H_ID))
++#endif
+ 
+ /* Constants for bits in __x86_string_control:  */
+ 
diff --git a/glibc-upstream-2.39-12.patch b/glibc-upstream-2.39-12.patch
new file mode 100644
index 0000000..74e0b72
--- /dev/null
+++ b/glibc-upstream-2.39-12.patch
@@ -0,0 +1,1453 @@
+commit a364304718725a31ab141936322855c76c73e35e
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Mon Feb 26 06:37:03 2024 -0800
+
+    x86: Update _dl_tlsdesc_dynamic to preserve caller-saved registers
+    
+    Compiler generates the following instruction sequence for GNU2 dynamic
+    TLS access:
+    
+            leaq    tls_var@TLSDESC(%rip), %rax
+            call    *tls_var@TLSCALL(%rax)
+    
+    or
+    
+            leal    tls_var@TLSDESC(%ebx), %eax
+            call    *tls_var@TLSCALL(%eax)
+    
+    CALL instruction is transparent to compiler which assumes all registers,
+    except for EFLAGS and RAX/EAX, are unchanged after CALL.  When
+    _dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow
+    path.  __tls_get_addr is a normal function which doesn't preserve any
+    caller-saved registers.  _dl_tlsdesc_dynamic saved and restored integer
+    caller-saved registers, but didn't preserve any other caller-saved
+    registers.  Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE,
+    XSAVE and XSAVEC to save and restore all caller-saved registers.  This
+    fixes BZ #31372.
+    
+    Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic)
+    to optimize elf_machine_runtime_setup.
+    Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
+    
+    (cherry picked from commit 0aac205a814a8511e98d02b91a8dc908f1c53cde)
+
+diff --git a/elf/Makefile b/elf/Makefile
+index 5d78b659ce813ff9..c5c37a9147e69d83 100644
+--- a/elf/Makefile
++++ b/elf/Makefile
+@@ -424,6 +424,7 @@ tests += \
+   tst-glibc-hwcaps-prepend \
+   tst-global1 \
+   tst-global2 \
++  tst-gnu2-tls2 \
+   tst-initfinilazyfail \
+   tst-initorder \
+   tst-initorder2 \
+@@ -846,6 +847,9 @@ modules-names += \
+   tst-filterobj-flt \
+   tst-finilazyfailmod \
+   tst-globalmod2 \
++  tst-gnu2-tls2mod0 \
++  tst-gnu2-tls2mod1 \
++  tst-gnu2-tls2mod2 \
+   tst-initlazyfailmod \
+   tst-initorder2a \
+   tst-initorder2b \
+@@ -3044,8 +3048,22 @@ $(objpfx)tst-tlsgap.out: \
+   $(objpfx)tst-tlsgap-mod0.so \
+   $(objpfx)tst-tlsgap-mod1.so \
+   $(objpfx)tst-tlsgap-mod2.so
++
++$(objpfx)tst-gnu2-tls2: $(shared-thread-library)
++$(objpfx)tst-gnu2-tls2.out: \
++  $(objpfx)tst-gnu2-tls2mod0.so \
++  $(objpfx)tst-gnu2-tls2mod1.so \
++  $(objpfx)tst-gnu2-tls2mod2.so
++
+ ifeq (yes,$(have-mtls-dialect-gnu2))
++# This test fails if dl_tlsdesc_dynamic doesn't preserve all caller-saved
++# registers.  See https://sourceware.org/bugzilla/show_bug.cgi?id=31372
++test-xfail-tst-gnu2-tls2 = yes
++
+ CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
+ CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
+ CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
++CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
++CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
++CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
+ endif
+diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
+new file mode 100644
+index 0000000000000000..7ac04d7f3312033e
+--- /dev/null
++++ b/elf/tst-gnu2-tls2.c
+@@ -0,0 +1,122 @@
++/* Test TLSDESC relocation.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <dlfcn.h>
++#include <pthread.h>
++#include <support/xdlfcn.h>
++#include <support/xthread.h>
++#include <support/check.h>
++#include <support/test-driver.h>
++#include "tst-gnu2-tls2.h"
++
++#ifndef IS_SUPPORTED
++# define IS_SUPPORTED() true
++#endif
++
++/* An architecture can define it to clobber caller-saved registers in
++   malloc below to verify that the implicit TLSDESC call won't change
++   caller-saved registers.  */
++#ifndef PREPARE_MALLOC
++# define PREPARE_MALLOC()
++#endif
++
++extern void * __libc_malloc (size_t);
++
++size_t malloc_counter = 0;
++
++void *
++malloc (size_t n)
++{
++  PREPARE_MALLOC ();
++  malloc_counter++;
++  return __libc_malloc (n);
++}
++
++static void *mod[3];
++#ifndef MOD
++# define MOD(i) "tst-gnu2-tls2mod" #i ".so"
++#endif
++static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
++#undef MOD
++
++static void
++open_mod (int i)
++{
++  mod[i] = xdlopen (modname[i], RTLD_LAZY);
++  printf ("open %s\n", modname[i]);
++}
++
++static void
++close_mod (int i)
++{
++  xdlclose (mod[i]);
++  mod[i] = NULL;
++  printf ("close %s\n", modname[i]);
++}
++
++static void
++access_mod (int i, const char *sym)
++{
++  struct tls var = { -1, -1, -1, -1 };
++  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
++  /* Check that our malloc is called.  */
++  malloc_counter = 0;
++  struct tls *p = f (&var);
++  TEST_VERIFY (malloc_counter != 0);
++  printf ("access %s: %s() = %p\n", modname[i], sym, p);
++  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
++  ++(p->a);
++}
++
++static void *
++start (void *arg)
++{
++  /* The DTV generation is at the last dlopen of mod0 and the
++     entry for mod1 is NULL.  */
++
++  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
++
++  /* Force the slow path in GNU2 TLS descriptor call.  */
++  access_mod (1, "apply_tls");
++
++  return arg;
++}
++
++static int
++do_test (void)
++{
++  if (!IS_SUPPORTED ())
++    return EXIT_UNSUPPORTED;
++
++  open_mod (0);
++  open_mod (1);
++  open_mod (2);
++  close_mod (0);
++  close_mod (1); /* Create modid gap at mod1.  */
++  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
++
++  /* Create a thread where DTV of mod1 is NULL.  */
++  pthread_t t = xpthread_create (NULL, start, NULL);
++  xpthread_join (t);
++  return 0;
++}
++
++#include <support/test-driver.c>
+diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
+new file mode 100644
+index 0000000000000000..77964a57a352e6a4
+--- /dev/null
++++ b/elf/tst-gnu2-tls2.h
+@@ -0,0 +1,36 @@
++/* Test TLSDESC relocation.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <stdint.h>
++
++struct tls
++{
++  int64_t a, b, c, d;
++};
++
++extern struct tls *apply_tls (struct tls *);
++
++/* An architecture can define them to verify that clobber caller-saved
++   registers aren't changed by the implicit TLSDESC call.  */
++#ifndef BEFORE_TLSDESC_CALL
++# define BEFORE_TLSDESC_CALL()
++#endif
++
++#ifndef AFTER_TLSDESC_CALL
++# define AFTER_TLSDESC_CALL()
++#endif
+diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
+new file mode 100644
+index 0000000000000000..45556a0e173922cc
+--- /dev/null
++++ b/elf/tst-gnu2-tls2mod0.c
+@@ -0,0 +1,31 @@
++/* DSO used by tst-gnu2-tls2.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include "tst-gnu2-tls2.h"
++
++__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
++
++struct tls *
++apply_tls (struct tls *p)
++{
++  BEFORE_TLSDESC_CALL ();
++  tls_var0 = *p;
++  struct tls *ret = &tls_var0;
++  AFTER_TLSDESC_CALL ();
++  return ret;
++}
+diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
+new file mode 100644
+index 0000000000000000..e10b9dbc0a7573c7
+--- /dev/null
++++ b/elf/tst-gnu2-tls2mod1.c
+@@ -0,0 +1,31 @@
++/* DSO used by tst-gnu2-tls2.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include "tst-gnu2-tls2.h"
++
++__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
++
++struct tls *
++apply_tls (struct tls *p)
++{
++  BEFORE_TLSDESC_CALL ();
++  tls_var1[1] = *p;
++  struct tls *ret = &tls_var1[1];
++  AFTER_TLSDESC_CALL ();
++  return ret;
++}
+diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
+new file mode 100644
+index 0000000000000000..141af51e55b8bf34
+--- /dev/null
++++ b/elf/tst-gnu2-tls2mod2.c
+@@ -0,0 +1,31 @@
++/* DSO used by tst-gnu2-tls2.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include "tst-gnu2-tls2.h"
++
++__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
++
++struct tls *
++apply_tls (struct tls *p)
++{
++  BEFORE_TLSDESC_CALL ();
++  tls_var2 = *p;
++  struct tls *ret = &tls_var2;
++  AFTER_TLSDESC_CALL ();
++  return ret;
++}
+diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
+index fc1ef96587e1992e..50d74fe6e9aaa7bb 100644
+--- a/sysdeps/i386/dl-machine.h
++++ b/sysdeps/i386/dl-machine.h
+@@ -347,7 +347,7 @@ and creates an unsatisfiable circular dependency.\n",
+ 		  {
+ 		    td->arg = _dl_make_tlsdesc_dynamic
+ 		      (sym_map, sym->st_value + (ElfW(Word))td->arg);
+-		    td->entry = _dl_tlsdesc_dynamic;
++		    td->entry = GLRO(dl_x86_tlsdesc_dynamic);
+ 		  }
+ 		else
+ #  endif
+diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h
+new file mode 100644
+index 0000000000000000..36270285775016ff
+--- /dev/null
++++ b/sysdeps/i386/dl-tlsdesc-dynamic.h
+@@ -0,0 +1,190 @@
++/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
++   Copyright (C) 2004-2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#undef REGISTER_SAVE_AREA
++
++#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0
++# error STATE_SAVE_ALIGNMENT must be multiple of 16
++#endif
++
++#if DL_RUNTIME_RESOLVE_REALIGN_STACK
++# ifdef USE_FNSAVE
++#  error USE_FNSAVE shouldn't be defined
++# endif
++# ifdef USE_FXSAVE
++/* Use fxsave to save all registers.  */
++#  define REGISTER_SAVE_AREA	512
++# endif
++#else
++# ifdef USE_FNSAVE
++/* Use fnsave to save x87 FPU stack registers.  */
++#  define REGISTER_SAVE_AREA	108
++# else
++#  ifndef USE_FXSAVE
++#   error USE_FXSAVE must be defined
++#  endif
++/* Use fxsave to save all registers.  Add 12 bytes to align the stack
++   to 16 bytes.  */
++#  define REGISTER_SAVE_AREA	(512 + 12)
++# endif
++#endif
++
++	.hidden _dl_tlsdesc_dynamic
++	.global	_dl_tlsdesc_dynamic
++	.type	_dl_tlsdesc_dynamic,@function
++
++     /* This function is used for symbols that need dynamic TLS.
++
++	%eax points to the TLS descriptor, such that 0(%eax) points to
++	_dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
++	tlsdesc_dynamic_arg object.  It must return in %eax the offset
++	between the thread pointer and the object denoted by the
++	argument, without clobbering any registers.
++
++	The assembly code that follows is a rendition of the following
++	C code, hand-optimized a little bit.
++
++ptrdiff_t
++__attribute__ ((__regparm__ (1)))
++_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
++{
++  struct tlsdesc_dynamic_arg *td = tdp->arg;
++  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
++  if (__builtin_expect (td->gen_count <= dtv[0].counter
++			&& (dtv[td->tlsinfo.ti_module].pointer.val
++			    != TLS_DTV_UNALLOCATED),
++			1))
++    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
++      - __thread_pointer;
++
++  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
++}
++*/
++	cfi_startproc
++	.align 16
++_dl_tlsdesc_dynamic:
++	/* Like all TLS resolvers, preserve call-clobbered registers.
++	   We need two scratch regs anyway.  */
++	subl	$32, %esp
++	cfi_adjust_cfa_offset (32)
++	movl	%ecx, 20(%esp)
++	movl	%edx, 24(%esp)
++	movl	TLSDESC_ARG(%eax), %eax
++	movl	%gs:DTV_OFFSET, %edx
++	movl	TLSDESC_GEN_COUNT(%eax), %ecx
++	cmpl	(%edx), %ecx
++	ja	2f
++	movl	TLSDESC_MODID(%eax), %ecx
++	movl	(%edx,%ecx,8), %edx
++	cmpl	$-1, %edx
++	je	2f
++	movl	TLSDESC_MODOFF(%eax), %eax
++	addl	%edx, %eax
++1:
++	movl	20(%esp), %ecx
++	subl	%gs:0, %eax
++	movl	24(%esp), %edx
++	addl	$32, %esp
++	cfi_adjust_cfa_offset (-32)
++	ret
++	.p2align 4,,7
++2:
++	cfi_adjust_cfa_offset (32)
++#if DL_RUNTIME_RESOLVE_REALIGN_STACK
++	movl	%ebx, -28(%esp)
++	movl	%esp, %ebx
++	cfi_def_cfa_register(%ebx)
++	and	$-STATE_SAVE_ALIGNMENT, %esp
++#endif
++#ifdef REGISTER_SAVE_AREA
++	subl	$REGISTER_SAVE_AREA, %esp
++# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
++	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
++# endif
++#else
++# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
++#  error DL_RUNTIME_RESOLVE_REALIGN_STACK must be true
++# endif
++	/* Allocate stack space of the required size to save the state.  */
++	LOAD_PIC_REG (cx)
++	subl	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
++#endif
++#ifdef USE_FNSAVE
++	fnsave	(%esp)
++#elif defined USE_FXSAVE
++	fxsave	(%esp)
++#else
++	/* Save the argument for ___tls_get_addr in EAX.  */
++	movl	%eax, %ecx
++	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
++	xorl	%edx, %edx
++	/* Clear the XSAVE Header.  */
++# ifdef USE_XSAVE
++	movl	%edx, (512)(%esp)
++	movl	%edx, (512 + 4 * 1)(%esp)
++	movl	%edx, (512 + 4 * 2)(%esp)
++	movl	%edx, (512 + 4 * 3)(%esp)
++# endif
++	movl	%edx, (512 + 4 * 4)(%esp)
++	movl	%edx, (512 + 4 * 5)(%esp)
++	movl	%edx, (512 + 4 * 6)(%esp)
++	movl	%edx, (512 + 4 * 7)(%esp)
++	movl	%edx, (512 + 4 * 8)(%esp)
++	movl	%edx, (512 + 4 * 9)(%esp)
++	movl	%edx, (512 + 4 * 10)(%esp)
++	movl	%edx, (512 + 4 * 11)(%esp)
++	movl	%edx, (512 + 4 * 12)(%esp)
++	movl	%edx, (512 + 4 * 13)(%esp)
++	movl	%edx, (512 + 4 * 14)(%esp)
++	movl	%edx, (512 + 4 * 15)(%esp)
++# ifdef USE_XSAVE
++	xsave	(%esp)
++# else
++	xsavec	(%esp)
++# endif
++	/* Restore the argument for ___tls_get_addr in EAX.  */
++	movl	%ecx, %eax
++#endif
++	call	HIDDEN_JUMPTARGET (___tls_get_addr)
++	/* Get register content back.  */
++#ifdef USE_FNSAVE
++	frstor	(%esp)
++#elif defined USE_FXSAVE
++	fxrstor	(%esp)
++#else
++	/* Save and retore ___tls_get_addr return value stored in EAX.  */
++	movl	%eax, %ecx
++	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
++	xorl	%edx, %edx
++	xrstor	(%esp)
++	movl	%ecx, %eax
++#endif
++#if DL_RUNTIME_RESOLVE_REALIGN_STACK
++	mov	%ebx, %esp
++	cfi_def_cfa_register(%esp)
++	movl	-28(%esp), %ebx
++	cfi_restore(%ebx)
++#else
++	addl	$REGISTER_SAVE_AREA, %esp
++	cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
++#endif
++	jmp	1b
++	cfi_endproc
++	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
++
++#undef STATE_SAVE_ALIGNMENT
+diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S
+index 90d93caa0cdb7442..f002feee56e43f71 100644
+--- a/sysdeps/i386/dl-tlsdesc.S
++++ b/sysdeps/i386/dl-tlsdesc.S
+@@ -18,8 +18,27 @@
+ 
+ #include <sysdep.h>
+ #include <tls.h>
++#include <cpu-features-offsets.h>
++#include <features-offsets.h>
+ #include "tlsdesc.h"
+ 
++#ifndef DL_STACK_ALIGNMENT
++/* Due to GCC bug:
++
++   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
++
++   __tls_get_addr may be called with 4-byte stack alignment.  Although
++   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
++   that stack will be always aligned at 16 bytes.  */
++# define DL_STACK_ALIGNMENT 4
++#endif
++
++/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align
++   stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr.  */
++#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
++  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
++   || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT)
++
+ 	.text
+ 
+      /* This function is used to compute the TP offset for symbols in
+@@ -65,69 +84,35 @@ _dl_tlsdesc_undefweak:
+ 	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
+ 
+ #ifdef SHARED
+-	.hidden _dl_tlsdesc_dynamic
+-	.global	_dl_tlsdesc_dynamic
+-	.type	_dl_tlsdesc_dynamic,@function
+-
+-     /* This function is used for symbols that need dynamic TLS.
+-
+-	%eax points to the TLS descriptor, such that 0(%eax) points to
+-	_dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
+-	tlsdesc_dynamic_arg object.  It must return in %eax the offset
+-	between the thread pointer and the object denoted by the
+-	argument, without clobbering any registers.
+-
+-	The assembly code that follows is a rendition of the following
+-	C code, hand-optimized a little bit.
+-
+-ptrdiff_t
+-__attribute__ ((__regparm__ (1)))
+-_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
+-{
+-  struct tlsdesc_dynamic_arg *td = tdp->arg;
+-  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+-  if (__builtin_expect (td->gen_count <= dtv[0].counter
+-			&& (dtv[td->tlsinfo.ti_module].pointer.val
+-			    != TLS_DTV_UNALLOCATED),
+-			1))
+-    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+-      - __thread_pointer;
+-
+-  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+-}
+-*/
+-	cfi_startproc
+-	.align 16
+-_dl_tlsdesc_dynamic:
+-	/* Like all TLS resolvers, preserve call-clobbered registers.
+-	   We need two scratch regs anyway.  */
+-	subl	$28, %esp
+-	cfi_adjust_cfa_offset (28)
+-	movl	%ecx, 20(%esp)
+-	movl	%edx, 24(%esp)
+-	movl	TLSDESC_ARG(%eax), %eax
+-	movl	%gs:DTV_OFFSET, %edx
+-	movl	TLSDESC_GEN_COUNT(%eax), %ecx
+-	cmpl	(%edx), %ecx
+-	ja	.Lslow
+-	movl	TLSDESC_MODID(%eax), %ecx
+-	movl	(%edx,%ecx,8), %edx
+-	cmpl	$-1, %edx
+-	je	.Lslow
+-	movl	TLSDESC_MODOFF(%eax), %eax
+-	addl	%edx, %eax
+-.Lret:
+-	movl	20(%esp), %ecx
+-	subl	%gs:0, %eax
+-	movl	24(%esp), %edx
+-	addl	$28, %esp
+-	cfi_adjust_cfa_offset (-28)
+-	ret
+-	.p2align 4,,7
+-.Lslow:
+-	cfi_adjust_cfa_offset (28)
+-	call	HIDDEN_JUMPTARGET (___tls_get_addr)
+-	jmp	.Lret
+-	cfi_endproc
+-	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
++# define USE_FNSAVE
++# define MINIMUM_ALIGNMENT	4
++# define STATE_SAVE_ALIGNMENT	4
++# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fnsave
++# include "dl-tlsdesc-dynamic.h"
++# undef _dl_tlsdesc_dynamic
++# undef MINIMUM_ALIGNMENT
++# undef USE_FNSAVE
++
++# define MINIMUM_ALIGNMENT	16
++
++# define USE_FXSAVE
++# define STATE_SAVE_ALIGNMENT	16
++# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fxsave
++# include "dl-tlsdesc-dynamic.h"
++# undef _dl_tlsdesc_dynamic
++# undef USE_FXSAVE
++
++# define USE_XSAVE
++# define STATE_SAVE_ALIGNMENT	64
++# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsave
++# include "dl-tlsdesc-dynamic.h"
++# undef _dl_tlsdesc_dynamic
++# undef USE_XSAVE
++
++# define USE_XSAVEC
++# define STATE_SAVE_ALIGNMENT	64
++# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsavec
++# include "dl-tlsdesc-dynamic.h"
++# undef _dl_tlsdesc_dynamic
++# undef USE_XSAVEC
+ #endif /* SHARED */
+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+index 4d50b327b55ffd65..992aabe43ec60abf 100644
+--- a/sysdeps/x86/Makefile
++++ b/sysdeps/x86/Makefile
+@@ -1,5 +1,5 @@
+ ifeq ($(subdir),csu)
+-gen-as-const-headers += cpu-features-offsets.sym
++gen-as-const-headers += cpu-features-offsets.sym features-offsets.sym
+ endif
+ 
+ ifeq ($(subdir),elf)
+@@ -86,6 +86,11 @@ endif
+ tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512F
+ tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
+ tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
++
++CFLAGS-tst-gnu2-tls2.c += -msse
++CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
++CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
++CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
+ endif
+ 
+ ifeq ($(subdir),math)
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 25e6622a79bb969f..835113b42f924b83 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -27,8 +27,13 @@
+ extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
+   attribute_hidden;
+ 
+-#if defined SHARED && defined __x86_64__
+-# include <dl-plt-rewrite.h>
++#if defined SHARED
++extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
++extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
++extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
++
++# ifdef __x86_64__
++#  include <dl-plt-rewrite.h>
+ 
+ static void
+ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
+@@ -47,6 +52,15 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
+ 		 : plt_rewrite_jmp);
+     }
+ }
++# else
++extern void _dl_tlsdesc_dynamic_fnsave (void) attribute_hidden;
++# endif
++#endif
++
++#ifdef __x86_64__
++extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
++extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
++extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
+ #endif
+ 
+ #ifdef __LP64__
+@@ -1130,6 +1144,44 @@ no_cpuid:
+ 	       TUNABLE_CALLBACK (set_x86_shstk));
+ #endif
+ 
++  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
++	{
++#ifdef __x86_64__
++	  GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
++#endif
++#ifdef SHARED
++	  GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
++#endif
++	}
++      else
++	{
++#ifdef __x86_64__
++	  GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
++#endif
++#ifdef SHARED
++	  GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
++#endif
++	}
++    }
++  else
++    {
++#ifdef __x86_64__
++      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
++# ifdef SHARED
++      GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
++# endif
++#else
++# ifdef SHARED
++      if (CPU_FEATURE_USABLE_P (cpu_features, FXSR))
++	GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
++      else
++	GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fnsave;
++# endif
++#endif
++    }
++
+ #ifdef SHARED
+ # ifdef __x86_64__
+   TUNABLE_GET (plt_rewrite, tunable_val_t *,
+diff --git a/sysdeps/x86/dl-procinfo.c b/sysdeps/x86/dl-procinfo.c
+index ee957b4d70050b1c..5920d4b32034501d 100644
+--- a/sysdeps/x86/dl-procinfo.c
++++ b/sysdeps/x86/dl-procinfo.c
+@@ -86,3 +86,19 @@ PROCINFO_CLASS const char _dl_x86_platforms[4][9]
+ #else
+ ,
+ #endif
++
++#if defined SHARED && !IS_IN (ldconfig)
++# if !defined PROCINFO_DECL
++  ._dl_x86_tlsdesc_dynamic
++# else
++PROCINFO_CLASS void * _dl_x86_tlsdesc_dynamic
++# endif
++# ifndef PROCINFO_DECL
++= NULL
++# endif
++# ifdef PROCINFO_DECL
++;
++# else
++,
++# endif
++#endif
+diff --git a/sysdeps/x86_64/features-offsets.sym b/sysdeps/x86/features-offsets.sym
+similarity index 89%
+rename from sysdeps/x86_64/features-offsets.sym
+rename to sysdeps/x86/features-offsets.sym
+index 9e4be3393a60fd92..77e990c7053a38bf 100644
+--- a/sysdeps/x86_64/features-offsets.sym
++++ b/sysdeps/x86/features-offsets.sym
+@@ -3,4 +3,6 @@
+ #include <ldsodefs.h>
+ 
+ RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET offsetof (struct rtld_global_ro, _dl_x86_cpu_features)
++#ifdef __x86_64__
+ RTLD_GLOBAL_DL_X86_FEATURE_1_OFFSET offsetof (struct rtld_global, _dl_x86_feature_1)
++#endif
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index 837fd28734914a1c..485cad9c0283b334 100644
+--- a/sysdeps/x86/sysdep.h
++++ b/sysdeps/x86/sysdep.h
+@@ -70,6 +70,12 @@
+    | (1 << X86_XSTATE_ZMM_H_ID))
+ #endif
+ 
++/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
++   Compiler assumes that all registers, including x87 FPU stack registers,
++   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
++#define TLSDESC_CALL_STATE_SAVE_MASK	\
++  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
++
+ /* Constants for bits in __x86_string_control:  */
+ 
+ /* Avoid short distance REP MOVSB.  */
+diff --git a/sysdeps/x86/tst-gnu2-tls2.c b/sysdeps/x86/tst-gnu2-tls2.c
+new file mode 100644
+index 0000000000000000..de900a423bb70321
+--- /dev/null
++++ b/sysdeps/x86/tst-gnu2-tls2.c
+@@ -0,0 +1,20 @@
++#ifndef __x86_64__
++#include <sys/platform/x86.h>
++
++#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
++#endif
++
++/* Clear XMM0...XMM7  */
++#define PREPARE_MALLOC()				\
++{							\
++  asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" );	\
++  asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" );	\
++  asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" );	\
++  asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" );	\
++  asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" );	\
++  asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" );	\
++  asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" );	\
++  asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" );	\
++}
++
++#include <elf/tst-gnu2-tls2.c>
+diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
+index 90f4ecfd262cfb87..e8babc9a4edbf90b 100644
+--- a/sysdeps/x86_64/Makefile
++++ b/sysdeps/x86_64/Makefile
+@@ -10,7 +10,7 @@ LDFLAGS-rtld += -Wl,-z,nomark-plt
+ endif
+ 
+ ifeq ($(subdir),csu)
+-gen-as-const-headers += features-offsets.sym link-defines.sym
++gen-as-const-headers += link-defines.sym
+ endif
+ 
+ ifeq ($(subdir),gmon)
+diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
+index 6d605d0d3293bcd6..ff5d45f7cb7cd81d 100644
+--- a/sysdeps/x86_64/dl-machine.h
++++ b/sysdeps/x86_64/dl-machine.h
+@@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
+ 			   int lazy, int profile)
+ {
+   Elf64_Addr *got;
+-  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
+-  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
+-  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
+   extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
+   extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
+   extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
+@@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
+       /* Identify this shared object.  */
+       *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
+ 
+-      const struct cpu_features* cpu_features = __get_cpu_features ();
+-
+ #ifdef SHARED
+       /* The got[2] entry contains the address of a function which gets
+ 	 called to get the address of a so far unresolved function and
+@@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
+ 	 end in this function.  */
+       if (__glibc_unlikely (profile))
+ 	{
++	  const struct cpu_features* cpu_features = __get_cpu_features ();
+ 	  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
+ 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
+ 	  else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
+@@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
+ 	  /* This function will get called to fix up the GOT entry
+ 	     indicated by the offset on the stack, and then jump to
+ 	     the resolved address.  */
+-	  if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
+-	      || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
+-	    *(ElfW(Addr) *) (got + 2)
+-	      = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
+-		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
+-		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
+-	  else
+-	    *(ElfW(Addr) *) (got + 2)
+-	      = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
++	  *(ElfW(Addr) *) (got + 2)
++	    = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
+ 	}
+     }
+ 
+@@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
+ 		  {
+ 		    td->arg = _dl_make_tlsdesc_dynamic
+ 		      (sym_map, sym->st_value + reloc->r_addend);
+-		    td->entry = _dl_tlsdesc_dynamic;
++		    td->entry = GLRO(dl_x86_tlsdesc_dynamic);
+ 		  }
+ 		else
+ #  endif
+diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
+index 4d1d790fbb2f2992..06637a8154d8648f 100644
+--- a/sysdeps/x86_64/dl-procinfo.c
++++ b/sysdeps/x86_64/dl-procinfo.c
+@@ -41,5 +41,21 @@
+ 
+ #include <sysdeps/x86/dl-procinfo.c>
+ 
++#if !IS_IN (ldconfig)
++# if !defined PROCINFO_DECL && defined SHARED
++  ._dl_x86_64_runtime_resolve
++# else
++PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
++# endif
++# ifndef PROCINFO_DECL
++= NULL
++# endif
++# if !defined SHARED || defined PROCINFO_DECL
++;
++# else
++,
++# endif
++#endif
++
+ #undef PROCINFO_DECL
+ #undef PROCINFO_CLASS
+diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
+new file mode 100644
+index 0000000000000000..0c2e8d5320d0bd26
+--- /dev/null
++++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
+@@ -0,0 +1,166 @@
++/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
++   Copyright (C) 2004-2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#ifndef SECTION
++# define SECTION(p)	p
++#endif
++
++#undef REGISTER_SAVE_AREA
++#undef LOCAL_STORAGE_AREA
++#undef BASE
++
++#include "dl-trampoline-state.h"
++
++	.section SECTION(.text),"ax",@progbits
++
++	.hidden _dl_tlsdesc_dynamic
++	.global	_dl_tlsdesc_dynamic
++	.type	_dl_tlsdesc_dynamic,@function
++
++     /* %rax points to the TLS descriptor, such that 0(%rax) points to
++	_dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
++	tlsdesc_dynamic_arg object.  It must return in %rax the offset
++	between the thread pointer and the object denoted by the
++	argument, without clobbering any registers.
++
++	The assembly code that follows is a rendition of the following
++	C code, hand-optimized a little bit.
++
++ptrdiff_t
++_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
++{
++  struct tlsdesc_dynamic_arg *td = tdp->arg;
++  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
++  if (__builtin_expect (td->gen_count <= dtv[0].counter
++			&& (dtv[td->tlsinfo.ti_module].pointer.val
++			    != TLS_DTV_UNALLOCATED),
++			1))
++    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
++      - __thread_pointer;
++
++  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
++}
++*/
++	cfi_startproc
++	.align 16
++_dl_tlsdesc_dynamic:
++	_CET_ENDBR
++	/* Preserve call-clobbered registers that we modify.
++	   We need two scratch regs anyway.  */
++	movq	%rsi, -16(%rsp)
++	mov	%fs:DTV_OFFSET, %RSI_LP
++	movq	%rdi, -8(%rsp)
++	movq	TLSDESC_ARG(%rax), %rdi
++	movq	(%rsi), %rax
++	cmpq	%rax, TLSDESC_GEN_COUNT(%rdi)
++	ja	2f
++	movq	TLSDESC_MODID(%rdi), %rax
++	salq	$4, %rax
++	movq	(%rax,%rsi), %rax
++	cmpq	$-1, %rax
++	je	2f
++	addq	TLSDESC_MODOFF(%rdi), %rax
++1:
++	movq	-16(%rsp), %rsi
++	sub	%fs:0, %RAX_LP
++	movq	-8(%rsp), %rdi
++	ret
++2:
++#if DL_RUNTIME_RESOLVE_REALIGN_STACK
++	movq	%rbx, -24(%rsp)
++	mov	%RSP_LP, %RBX_LP
++	cfi_def_cfa_register(%rbx)
++	and	$-STATE_SAVE_ALIGNMENT, %RSP_LP
++#endif
++#ifdef REGISTER_SAVE_AREA
++# if DL_RUNTIME_RESOLVE_REALIGN_STACK
++	/* STATE_SAVE_OFFSET has space for 8 integer registers.  But we
++	   need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
++	   RBX above.  */
++	sub	$(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
++# else
++	sub	$REGISTER_SAVE_AREA, %RSP_LP
++	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
++# endif
++#else
++	/* Allocate stack space of the required size to save the state.  */
++	sub	_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
++#endif
++	/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
++	   r10 and r11.  */
++	movq	%rcx, REGISTER_SAVE_RCX(%rsp)
++	movq	%rdx, REGISTER_SAVE_RDX(%rsp)
++	movq	%r8, REGISTER_SAVE_R8(%rsp)
++	movq	%r9, REGISTER_SAVE_R9(%rsp)
++	movq	%r10, REGISTER_SAVE_R10(%rsp)
++	movq	%r11, REGISTER_SAVE_R11(%rsp)
++#ifdef USE_FXSAVE
++	fxsave	STATE_SAVE_OFFSET(%rsp)
++#else
++	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
++	xorl	%edx, %edx
++	/* Clear the XSAVE Header.  */
++# ifdef USE_XSAVE
++	movq	%rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
++	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
++# endif
++	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
++	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
++	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
++	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
++	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
++	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
++# ifdef USE_XSAVE
++	xsave	STATE_SAVE_OFFSET(%rsp)
++# else
++	xsavec	STATE_SAVE_OFFSET(%rsp)
++# endif
++#endif
++	/* %rdi already points to the tlsinfo data structure.  */
++	call	HIDDEN_JUMPTARGET (__tls_get_addr)
++	# Get register content back.
++#ifdef USE_FXSAVE
++	fxrstor	STATE_SAVE_OFFSET(%rsp)
++#else
++	/* Save and retore __tls_get_addr return value stored in RAX.  */
++	mov	%RAX_LP, %RCX_LP
++	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
++	xorl	%edx, %edx
++	xrstor	STATE_SAVE_OFFSET(%rsp)
++	mov	%RCX_LP, %RAX_LP
++#endif
++	movq	REGISTER_SAVE_R11(%rsp), %r11
++	movq	REGISTER_SAVE_R10(%rsp), %r10
++	movq	REGISTER_SAVE_R9(%rsp), %r9
++	movq	REGISTER_SAVE_R8(%rsp), %r8
++	movq	REGISTER_SAVE_RDX(%rsp), %rdx
++	movq	REGISTER_SAVE_RCX(%rsp), %rcx
++#if DL_RUNTIME_RESOLVE_REALIGN_STACK
++	mov	%RBX_LP, %RSP_LP
++	cfi_def_cfa_register(%rsp)
++	movq	-24(%rsp), %rbx
++	cfi_restore(%rbx)
++#else
++	add	$REGISTER_SAVE_AREA, %RSP_LP
++	cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
++#endif
++	jmp	1b
++	cfi_endproc
++	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
++
++#undef STATE_SAVE_ALIGNMENT
+diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
+index f748af2ece8de09a..ea69f5223a77e0c0 100644
+--- a/sysdeps/x86_64/dl-tlsdesc.S
++++ b/sysdeps/x86_64/dl-tlsdesc.S
+@@ -18,7 +18,19 @@
+ 
+ #include <sysdep.h>
+ #include <tls.h>
++#include <cpu-features-offsets.h>
++#include <features-offsets.h>
+ #include "tlsdesc.h"
++#include "dl-trampoline-save.h"
++
++/* Area on stack to save and restore registers used for parameter
++   passing when calling _dl_tlsdesc_dynamic.  */
++#define REGISTER_SAVE_RCX	0
++#define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
++#define REGISTER_SAVE_R8	(REGISTER_SAVE_RDX + 8)
++#define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
++#define REGISTER_SAVE_R10	(REGISTER_SAVE_R9 + 8)
++#define REGISTER_SAVE_R11	(REGISTER_SAVE_R10 + 8)
+ 
+ 	.text
+ 
+@@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak:
+ 	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
+ 
+ #ifdef SHARED
+-	.hidden _dl_tlsdesc_dynamic
+-	.global	_dl_tlsdesc_dynamic
+-	.type	_dl_tlsdesc_dynamic,@function
+-
+-     /* %rax points to the TLS descriptor, such that 0(%rax) points to
+-	_dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
+-	tlsdesc_dynamic_arg object.  It must return in %rax the offset
+-	between the thread pointer and the object denoted by the
+-	argument, without clobbering any registers.
+-
+-	The assembly code that follows is a rendition of the following
+-	C code, hand-optimized a little bit.
+-
+-ptrdiff_t
+-_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
+-{
+-  struct tlsdesc_dynamic_arg *td = tdp->arg;
+-  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+-  if (__builtin_expect (td->gen_count <= dtv[0].counter
+-			&& (dtv[td->tlsinfo.ti_module].pointer.val
+-			    != TLS_DTV_UNALLOCATED),
+-			1))
+-    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+-      - __thread_pointer;
+-
+-  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
+-}
+-*/
+-	cfi_startproc
+-	.align 16
+-_dl_tlsdesc_dynamic:
+-	_CET_ENDBR
+-	/* Preserve call-clobbered registers that we modify.
+-	   We need two scratch regs anyway.  */
+-	movq	%rsi, -16(%rsp)
+-	mov	%fs:DTV_OFFSET, %RSI_LP
+-	movq	%rdi, -8(%rsp)
+-	movq	TLSDESC_ARG(%rax), %rdi
+-	movq	(%rsi), %rax
+-	cmpq	%rax, TLSDESC_GEN_COUNT(%rdi)
+-	ja	.Lslow
+-	movq	TLSDESC_MODID(%rdi), %rax
+-	salq	$4, %rax
+-	movq	(%rax,%rsi), %rax
+-	cmpq	$-1, %rax
+-	je	.Lslow
+-	addq	TLSDESC_MODOFF(%rdi), %rax
+-.Lret:
+-	movq	-16(%rsp), %rsi
+-	sub	%fs:0, %RAX_LP
+-	movq	-8(%rsp), %rdi
+-	ret
+-.Lslow:
+-	/* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
+-	   r10 and r11.  Also, align the stack, that's off by 8 bytes.	*/
+-	subq	$72, %rsp
+-	cfi_adjust_cfa_offset (72)
+-	movq	%rdx, 8(%rsp)
+-	movq	%rcx, 16(%rsp)
+-	movq	%r8, 24(%rsp)
+-	movq	%r9, 32(%rsp)
+-	movq	%r10, 40(%rsp)
+-	movq	%r11, 48(%rsp)
+-	/* %rdi already points to the tlsinfo data structure.  */
+-	call	HIDDEN_JUMPTARGET (__tls_get_addr)
+-	movq	8(%rsp), %rdx
+-	movq	16(%rsp), %rcx
+-	movq	24(%rsp), %r8
+-	movq	32(%rsp), %r9
+-	movq	40(%rsp), %r10
+-	movq	48(%rsp), %r11
+-	addq	$72, %rsp
+-	cfi_adjust_cfa_offset (-72)
+-	jmp	.Lret
+-	cfi_endproc
+-	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
++# define USE_FXSAVE
++# define STATE_SAVE_ALIGNMENT	16
++# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fxsave
++# include "dl-tlsdesc-dynamic.h"
++# undef _dl_tlsdesc_dynamic
++# undef USE_FXSAVE
++
++# define USE_XSAVE
++# define STATE_SAVE_ALIGNMENT	64
++# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsave
++# include "dl-tlsdesc-dynamic.h"
++# undef _dl_tlsdesc_dynamic
++# undef USE_XSAVE
++
++# define USE_XSAVEC
++# define STATE_SAVE_ALIGNMENT	64
++# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsavec
++# include "dl-tlsdesc-dynamic.h"
++# undef _dl_tlsdesc_dynamic
++# undef USE_XSAVEC
+ #endif /* SHARED */
+diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
+new file mode 100644
+index 0000000000000000..84eac4a8ac13ad86
+--- /dev/null
++++ b/sysdeps/x86_64/dl-trampoline-save.h
+@@ -0,0 +1,34 @@
++/* x86-64 PLT trampoline register save macros.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#ifndef DL_STACK_ALIGNMENT
++/* Due to GCC bug:
++
++   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
++
++   __tls_get_addr may be called with 8-byte stack alignment.  Although
++   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
++   that stack will be always aligned at 16 bytes.  */
++# define DL_STACK_ALIGNMENT 8
++#endif
++
++/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
++   stack to 16 bytes before calling _dl_fixup.  */
++#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
++  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
++   || 16 > DL_STACK_ALIGNMENT)
+diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
+new file mode 100644
+index 0000000000000000..575f120797860583
+--- /dev/null
++++ b/sysdeps/x86_64/dl-trampoline-state.h
+@@ -0,0 +1,51 @@
++/* x86-64 PLT dl-trampoline state macros.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if (STATE_SAVE_ALIGNMENT % 16) != 0
++# error STATE_SAVE_ALIGNMENT must be multiple of 16
++#endif
++
++#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
++# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
++#endif
++
++#if DL_RUNTIME_RESOLVE_REALIGN_STACK
++/* Local stack area before jumping to function address: RBX.  */
++# define LOCAL_STORAGE_AREA	8
++# define BASE			rbx
++# ifdef USE_FXSAVE
++/* Use fxsave to save XMM registers.  */
++#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
++#  if (REGISTER_SAVE_AREA % 16) != 0
++#   error REGISTER_SAVE_AREA must be multiple of 16
++#  endif
++# endif
++#else
++# ifndef USE_FXSAVE
++#  error USE_FXSAVE must be defined
++# endif
++/* Use fxsave to save XMM registers.  */
++# define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
++/* Local stack area before jumping to function address:  All saved
++   registers.  */
++# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
++# define BASE			rsp
++# if (REGISTER_SAVE_AREA % 16) != 8
++#  error REGISTER_SAVE_AREA must be odd multiple of 8
++# endif
++#endif
+diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
+index b2e7e0f69b709ffd..87c5137837f01a63 100644
+--- a/sysdeps/x86_64/dl-trampoline.S
++++ b/sysdeps/x86_64/dl-trampoline.S
+@@ -22,25 +22,7 @@
+ #include <features-offsets.h>
+ #include <link-defines.h>
+ #include <isa-level.h>
+-
+-#ifndef DL_STACK_ALIGNMENT
+-/* Due to GCC bug:
+-
+-   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+-
+-   __tls_get_addr may be called with 8-byte stack alignment.  Although
+-   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+-   that stack will be always aligned at 16 bytes.  We use unaligned
+-   16-byte move to load and store SSE registers, which has no penalty
+-   on modern processors if stack is 16-byte aligned.  */
+-# define DL_STACK_ALIGNMENT 8
+-#endif
+-
+-/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
+-   stack to 16 bytes before calling _dl_fixup.  */
+-#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
+-  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
+-   || 16 > DL_STACK_ALIGNMENT)
++#include "dl-trampoline-save.h"
+ 
+ /* Area on stack to save and restore registers used for parameter
+    passing when calling _dl_fixup.  */
+diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
+index f55c6ea040b9d319..d9ccfb40d46d3312 100644
+--- a/sysdeps/x86_64/dl-trampoline.h
++++ b/sysdeps/x86_64/dl-trampoline.h
+@@ -27,39 +27,7 @@
+ # undef LOCAL_STORAGE_AREA
+ # undef BASE
+ 
+-# if (STATE_SAVE_ALIGNMENT % 16) != 0
+-#  error STATE_SAVE_ALIGNMENT must be multiple of 16
+-# endif
+-
+-# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
+-#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
+-# endif
+-
+-# if DL_RUNTIME_RESOLVE_REALIGN_STACK
+-/* Local stack area before jumping to function address: RBX.  */
+-#  define LOCAL_STORAGE_AREA	8
+-#  define BASE			rbx
+-#  ifdef USE_FXSAVE
+-/* Use fxsave to save XMM registers.  */
+-#   define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
+-#   if (REGISTER_SAVE_AREA % 16) != 0
+-#    error REGISTER_SAVE_AREA must be multiple of 16
+-#   endif
+-#  endif
+-# else
+-#  ifndef USE_FXSAVE
+-#   error USE_FXSAVE must be defined
+-#  endif
+-/* Use fxsave to save XMM registers.  */
+-#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
+-/* Local stack area before jumping to function address:  All saved
+-   registers.  */
+-#  define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
+-#  define BASE			rsp
+-#  if (REGISTER_SAVE_AREA % 16) != 8
+-#   error REGISTER_SAVE_AREA must be odd multiple of 8
+-#  endif
+-# endif
++# include "dl-trampoline-state.h"
+ 
+ 	.globl _dl_runtime_resolve
+ 	.hidden _dl_runtime_resolve
diff --git a/glibc-upstream-2.39-13.patch b/glibc-upstream-2.39-13.patch
new file mode 100644
index 0000000..0a52189
--- /dev/null
+++ b/glibc-upstream-2.39-13.patch
@@ -0,0 +1,496 @@
+commit 853e915fdd6ae6c5f1a7a68d2594ec8dbfef1286
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Wed Feb 28 12:08:03 2024 -0800
+
+    x86-64: Update _dl_tlsdesc_dynamic to preserve AMX registers
+    
+    _dl_tlsdesc_dynamic should also preserve AMX registers which are
+    caller-saved.  Add X86_XSTATE_TILECFG_ID and X86_XSTATE_TILEDATA_ID
+    to x86-64 TLSDESC_CALL_STATE_SAVE_MASK.  Compute the AMX state size
+    and save it in xsave_state_full_size which is only used by
+    _dl_tlsdesc_dynamic_xsave and _dl_tlsdesc_dynamic_xsavec.  This fixes
+    the AMX part of BZ #31372.  Tested on AMX processor.
+    
+    AMX test is enabled only for compilers with the fix for
+    
+    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114098
+    
+    GCC 14 and GCC 11/12/13 branches have the bug fix.
+    Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
+    
+    (cherry picked from commit 9b7091415af47082664717210ac49d51551456ab)
+
+diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile b/sysdeps/unix/sysv/linux/x86_64/Makefile
+index 4223feb95f6fe2f5..9a1e7aa6461725af 100644
+--- a/sysdeps/unix/sysv/linux/x86_64/Makefile
++++ b/sysdeps/unix/sysv/linux/x86_64/Makefile
+@@ -63,6 +63,33 @@ $(objpfx)libx86-64-isa-level%.os: $(..)/sysdeps/unix/sysv/linux/x86_64/x86-64-is
+ $(objpfx)libx86-64-isa-level.so: $(objpfx)libx86-64-isa-level-1.so
+ 	cp $< $@
+ endif
++
++ifeq (yes,$(have-mamx-tile))
++tests += \
++  tst-gnu2-tls2-amx \
++# tests
++
++modules-names += \
++  tst-gnu2-tls2-amx-mod0 \
++  tst-gnu2-tls2-amx-mod1 \
++  tst-gnu2-tls2-amx-mod2 \
++# modules-names
++
++$(objpfx)tst-gnu2-tls2-amx: $(shared-thread-library)
++$(objpfx)tst-gnu2-tls2-amx.out: \
++  $(objpfx)tst-gnu2-tls2-amx-mod0.so \
++  $(objpfx)tst-gnu2-tls2-amx-mod1.so \
++  $(objpfx)tst-gnu2-tls2-amx-mod2.so
++$(objpfx)tst-gnu2-tls2-amx-mod0.so: $(libsupport)
++$(objpfx)tst-gnu2-tls2-amx-mod1.so: $(libsupport)
++$(objpfx)tst-gnu2-tls2-amx-mod2.so: $(libsupport)
++
++CFLAGS-tst-gnu2-tls2-amx.c += -mamx-tile
++CFLAGS-tst-gnu2-tls2-amx-mod0.c += -mamx-tile -mtls-dialect=gnu2
++CFLAGS-tst-gnu2-tls2-amx-mod1.c += -mamx-tile -mtls-dialect=gnu2
++CFLAGS-tst-gnu2-tls2-amx-mod2.c += -mamx-tile -mtls-dialect=gnu2
++endif
++
+ endif # $(subdir) == elf
+ 
+ ifneq ($(enable-cet),no)
+diff --git a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
+index 2f511321ad3b3ac1..ef4631bf4b2fd9aa 100644
+--- a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
++++ b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h
+@@ -20,3 +20,8 @@
+ # define ARCH_SHSTK_SHSTK		0x1
+ # define ARCH_SHSTK_WRSS		0x2
+ #endif
++
++#ifndef ARCH_GET_XCOMP_PERM
++# define ARCH_GET_XCOMP_PERM		0x1022
++# define ARCH_REQ_XCOMP_PERM		0x1023
++#endif
+diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
+new file mode 100644
+index 0000000000000000..2e0c7b91b7caf3ab
+--- /dev/null
++++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c
+@@ -0,0 +1,2 @@
++#include "tst-gnu2-tls2-amx.h"
++#include <tst-gnu2-tls2mod0.c>
+diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
+new file mode 100644
+index 0000000000000000..b8a8ccf1c119d443
+--- /dev/null
++++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c
+@@ -0,0 +1,2 @@
++#include "tst-gnu2-tls2-amx.h"
++#include <tst-gnu2-tls2mod1.c>
+diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
+new file mode 100644
+index 0000000000000000..cdf4a8f3635b327c
+--- /dev/null
++++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c
+@@ -0,0 +1,2 @@
++#include "tst-gnu2-tls2-amx.h"
++#include <tst-gnu2-tls2mod2.c>
+diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
+new file mode 100644
+index 0000000000000000..ae4dd82556c9b2ef
+--- /dev/null
++++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c
+@@ -0,0 +1,83 @@
++/* Test TLSDESC relocation with AMX.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#include <stdbool.h>
++#include <asm/prctl.h>
++#include <support/check.h>
++#include "tst-gnu2-tls2-amx.h"
++
++extern int arch_prctl (int, ...);
++
++#define X86_XSTATE_TILECFG_ID	17
++#define X86_XSTATE_TILEDATA_ID	18
++
++/* Initialize tile config.  */
++__attribute__ ((noinline, noclone))
++static void
++init_tile_config (__tilecfg *tileinfo)
++{
++  int i;
++  tileinfo->palette_id = 1;
++  tileinfo->start_row = 0;
++
++  tileinfo->colsb[0] = MAX_ROWS;
++  tileinfo->rows[0] = MAX_ROWS;
++
++  for (i = 1; i < 4; ++i)
++  {
++    tileinfo->colsb[i] = MAX_COLS;
++    tileinfo->rows[i] = MAX_ROWS;
++  }
++
++  _tile_loadconfig (tileinfo);
++}
++
++static bool
++enable_amx (void)
++{
++  uint64_t bitmask;
++  if (arch_prctl (ARCH_GET_XCOMP_PERM, &bitmask) != 0)
++    return false;
++
++  if ((bitmask & (1 << X86_XSTATE_TILECFG_ID)) == 0)
++    return false;
++
++  if (arch_prctl (ARCH_REQ_XCOMP_PERM, X86_XSTATE_TILEDATA_ID) != 0)
++    return false;
++
++  /* Load tile configuration.  */
++  __tilecfg tile_data = { 0 };
++  init_tile_config (&tile_data);
++
++  return true;
++}
++
++/* An architecture can define it to clobber caller-saved registers in
++   malloc below to verify that the implicit TLSDESC call won't change
++   caller-saved registers.  */
++static void
++clear_tile_register (void)
++{
++  _tile_zero (2);
++}
++
++#define MOD(i) "tst-gnu2-tls2-amx-mod" #i ".so"
++#define IS_SUPPORTED()	enable_amx ()
++#define PREPARE_MALLOC() clear_tile_register ()
++
++#include <elf/tst-gnu2-tls2.c>
+diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
+new file mode 100644
+index 0000000000000000..1845a3caba43a0f1
+--- /dev/null
++++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h
+@@ -0,0 +1,63 @@
++/* Test TLSDESC relocation with AMX.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#include <stdint.h>
++#include <string.h>
++#include <x86intrin.h>
++#include <support/check.h>
++
++#define MAX_ROWS 16
++#define MAX_COLS 64
++#define MAX 1024
++#define STRIDE 64
++
++typedef struct __tile_config
++{
++  uint8_t palette_id;
++  uint8_t start_row;
++  uint8_t reserved_0[14];
++  uint16_t colsb[16];
++  uint8_t rows[16];
++} __tilecfg __attribute__ ((aligned (64)));
++
++/* Initialize int8_t buffer */
++static inline void
++init_buffer (int8_t *buf, int8_t value)
++{
++  int rows, colsb, i, j;
++  rows  = MAX_ROWS;
++  colsb = MAX_COLS;
++
++  for (i = 0; i < rows; i++)
++    for (j = 0; j < colsb; j++)
++      buf[i * colsb + j] = value;
++}
++
++#define BEFORE_TLSDESC_CALL()					\
++  int8_t src[MAX];						\
++  int8_t res[MAX];						\
++  /* Initialize src with data  */				\
++  init_buffer (src, 2);						\
++  /* Load tile rows from memory.  */				\
++  _tile_loadd (2, src, STRIDE);
++
++#define AFTER_TLSDESC_CALL()					\
++  /* Store the tile data to memory.  */				\
++  _tile_stored (2, res, STRIDE);				\
++  _tile_release ();						\
++  TEST_VERIFY_EXIT (memcmp (src, res, sizeof (res)) == 0);
+diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym
+index 6a8fd298137b7f23..21fc88d6510840e6 100644
+--- a/sysdeps/x86/cpu-features-offsets.sym
++++ b/sysdeps/x86/cpu-features-offsets.sym
+@@ -3,3 +3,4 @@
+ #include <ldsodefs.h>
+ 
+ XSAVE_STATE_SIZE_OFFSET	offsetof (struct cpu_features, xsave_state_size)
++XSAVE_STATE_FULL_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_full_size)
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 835113b42f924b83..d71e8d3d2e0e49f9 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -307,6 +307,8 @@ update_active (struct cpu_features *cpu_features)
+ 	  __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
+ 	  if (ebx != 0)
+ 	    {
++	      /* NB: On AMX capable processors, ebx always includes AMX
++		 states.  */
+ 	      unsigned int xsave_state_full_size
+ 		= ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
+ 
+@@ -320,6 +322,11 @@ update_active (struct cpu_features *cpu_features)
+ 		{
+ 		  unsigned int xstate_comp_offsets[32];
+ 		  unsigned int xstate_comp_sizes[32];
++#ifdef __x86_64__
++		  unsigned int xstate_amx_comp_offsets[32];
++		  unsigned int xstate_amx_comp_sizes[32];
++		  unsigned int amx_ecx;
++#endif
+ 		  unsigned int i;
+ 
+ 		  xstate_comp_offsets[0] = 0;
+@@ -327,16 +334,39 @@ update_active (struct cpu_features *cpu_features)
+ 		  xstate_comp_offsets[2] = 576;
+ 		  xstate_comp_sizes[0] = 160;
+ 		  xstate_comp_sizes[1] = 256;
++#ifdef __x86_64__
++		  xstate_amx_comp_offsets[0] = 0;
++		  xstate_amx_comp_offsets[1] = 160;
++		  xstate_amx_comp_offsets[2] = 576;
++		  xstate_amx_comp_sizes[0] = 160;
++		  xstate_amx_comp_sizes[1] = 256;
++#endif
+ 
+ 		  for (i = 2; i < 32; i++)
+ 		    {
+-		      if ((STATE_SAVE_MASK & (1 << i)) != 0)
++		      if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0)
+ 			{
+ 			  __cpuid_count (0xd, i, eax, ebx, ecx, edx);
+-			  xstate_comp_sizes[i] = eax;
++#ifdef __x86_64__
++			  /* Include this in xsave_state_full_size.  */
++			  amx_ecx = ecx;
++			  xstate_amx_comp_sizes[i] = eax;
++			  if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0)
++			    {
++			      /* Exclude this from xsave_state_size.  */
++			      ecx = 0;
++			      xstate_comp_sizes[i] = 0;
++			    }
++			  else
++#endif
++			    xstate_comp_sizes[i] = eax;
+ 			}
+ 		      else
+ 			{
++#ifdef __x86_64__
++			  amx_ecx = 0;
++			  xstate_amx_comp_sizes[i] = 0;
++#endif
+ 			  ecx = 0;
+ 			  xstate_comp_sizes[i] = 0;
+ 			}
+@@ -349,6 +379,15 @@ update_active (struct cpu_features *cpu_features)
+ 			  if ((ecx & (1 << 1)) != 0)
+ 			    xstate_comp_offsets[i]
+ 			      = ALIGN_UP (xstate_comp_offsets[i], 64);
++#ifdef __x86_64__
++			  xstate_amx_comp_offsets[i]
++			    = (xstate_amx_comp_offsets[i - 1]
++			       + xstate_amx_comp_sizes[i - 1]);
++			  if ((amx_ecx & (1 << 1)) != 0)
++			    xstate_amx_comp_offsets[i]
++			      = ALIGN_UP (xstate_amx_comp_offsets[i],
++					  64);
++#endif
+ 			}
+ 		    }
+ 
+@@ -357,6 +396,18 @@ update_active (struct cpu_features *cpu_features)
+ 		    = xstate_comp_offsets[31] + xstate_comp_sizes[31];
+ 		  if (size)
+ 		    {
++#ifdef __x86_64__
++		      unsigned int amx_size
++			= (xstate_amx_comp_offsets[31]
++			   + xstate_amx_comp_sizes[31]);
++		      amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,
++					   64);
++		      /* Set xsave_state_full_size to the compact AMX
++			 state size for XSAVEC.  NB: xsave_state_full_size
++			 is only used in _dl_tlsdesc_dynamic_xsave and
++			 _dl_tlsdesc_dynamic_xsavec.  */
++		      cpu_features->xsave_state_full_size = amx_size;
++#endif
+ 		      cpu_features->xsave_state_size
+ 			= ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
+ 		      CPU_FEATURE_SET (cpu_features, XSAVEC);
+diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
+index b9bf3115b616f05f..cd7bd27cf35959fd 100644
+--- a/sysdeps/x86/include/cpu-features.h
++++ b/sysdeps/x86/include/cpu-features.h
+@@ -934,6 +934,8 @@ struct cpu_features
+   /* The full state size for XSAVE when XSAVEC is disabled by
+ 
+      GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
++
++     and the AMX state size when XSAVEC is available.
+    */
+   unsigned int xsave_state_full_size;
+   /* Data cache size for use in memory and string routines, typically
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index 485cad9c0283b334..db8e576e91767db5 100644
+--- a/sysdeps/x86/sysdep.h
++++ b/sysdeps/x86/sysdep.h
+@@ -56,6 +56,14 @@
+    | (1 << X86_XSTATE_ZMM_H_ID) 	\
+    | (1 << X86_XSTATE_ZMM_ID)		\
+    | (1 << X86_XSTATE_APX_F_ID))
++
++/* AMX state mask.  */
++# define AMX_STATE_SAVE_MASK		\
++  ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID))
++
++/* States to be included in xsave_state_full_size.  */
++# define FULL_STATE_SAVE_MASK		\
++  (STATE_SAVE_MASK | AMX_STATE_SAVE_MASK)
+ #else
+ /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic.  Since i386
+    doesn't have red-zone, use 0 here.  */
+@@ -68,13 +76,17 @@
+    | (1 << X86_XSTATE_BNDREGS_ID)	\
+    | (1 << X86_XSTATE_K_ID)		\
+    | (1 << X86_XSTATE_ZMM_H_ID))
++
++/* States to be included in xsave_state_size.  */
++# define FULL_STATE_SAVE_MASK		STATE_SAVE_MASK
+ #endif
+ 
+ /* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
+-   Compiler assumes that all registers, including x87 FPU stack registers,
+-   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
++   Compiler assumes that all registers, including AMX and x87 FPU
++   stack registers, are unchanged after CALL, except for EFLAGS and
++   RAX/EAX.  */
+ #define TLSDESC_CALL_STATE_SAVE_MASK	\
+-  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
++  (FULL_STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
+ 
+ /* Constants for bits in __x86_string_control:  */
+ 
+diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
+index 418cc4a9b862f7e0..04a534fa126a7bf7 100755
+--- a/sysdeps/x86_64/configure
++++ b/sysdeps/x86_64/configure
+@@ -134,6 +134,34 @@ fi
+ config_vars="$config_vars
+ enable-cet = $enable_cet"
+ 
++# Check if -mamx-tile works properly.
++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mamx-tile works properly" >&5
++printf %s "checking whether -mamx-tile works properly... " >&6; }
++if test ${libc_cv_x86_have_amx_tile+y}
++then :
++  printf %s "(cached) " >&6
++else $as_nop
++  cat > conftest.c <<EOF
++#include <x86intrin.h>
++EOF
++	       libc_cv_x86_have_amx_tile=no
++	       if { ac_try='${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i'
++  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
++  (eval $ac_try) 2>&5
++  ac_status=$?
++  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
++  test $ac_status = 0; }; }; then
++		 if grep -q __builtin_ia32_ldtilecfg conftest.i; then
++		   libc_cv_x86_have_amx_tile=yes
++	         fi
++	       fi
++	       rm -rf conftest*
++fi
++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x86_have_amx_tile" >&5
++printf "%s\n" "$libc_cv_x86_have_amx_tile" >&6; }
++config_vars="$config_vars
++have-mamx-tile = $libc_cv_x86_have_amx_tile"
++
+ test -n "$critic_missing" && as_fn_error $? "
+ *** $critic_missing" "$LINENO" 5
+ 
+diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
+index d1f803c02ee67fc5..c714c47351e70390 100644
+--- a/sysdeps/x86_64/configure.ac
++++ b/sysdeps/x86_64/configure.ac
+@@ -61,5 +61,20 @@ elif test $enable_cet = permissive; then
+ fi
+ LIBC_CONFIG_VAR([enable-cet], [$enable_cet])
+ 
++# Check if -mamx-tile works properly.
++AC_CACHE_CHECK(whether -mamx-tile works properly,
++	       libc_cv_x86_have_amx_tile, [dnl
++cat > conftest.c <<EOF
++#include <x86intrin.h>
++EOF
++	       libc_cv_x86_have_amx_tile=no
++	       if AC_TRY_COMMAND(${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i); then
++		 if grep -q __builtin_ia32_ldtilecfg conftest.i; then
++		   libc_cv_x86_have_amx_tile=yes
++	         fi
++	       fi
++	       rm -rf conftest*])
++LIBC_CONFIG_VAR([have-mamx-tile], [$libc_cv_x86_have_amx_tile])
++
+ test -n "$critic_missing" && AC_MSG_ERROR([
+ *** $critic_missing])
+diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
+index 0c2e8d5320d0bd26..9f02cfc3eb297ed2 100644
+--- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h
++++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
+@@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic:
+ # endif
+ #else
+ 	/* Allocate stack space of the required size to save the state.  */
+-	sub	_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
++	sub	_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP
+ #endif
+ 	/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
+ 	   r10 and r11.  */
diff --git a/glibc-upstream-2.39-14.patch b/glibc-upstream-2.39-14.patch
new file mode 100644
index 0000000..a49f788
--- /dev/null
+++ b/glibc-upstream-2.39-14.patch
@@ -0,0 +1,250 @@
+commit 354cabcb2634abe16da7a2ba5e648aac1204b58e
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Mon Mar 18 06:40:16 2024 -0700
+
+    x86-64: Allocate state buffer space for RDI, RSI and RBX
+    
+    _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack.
+    After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11.  Define
+    TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX
+    to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to
+    STATE_SAVE_OFFSET(%rsp).
+    
+       +==================+<- stack frame start aligned at 8 or 16 bytes
+       |                  |<- RDI saved in the red zone
+       |                  |<- RSI saved in the red zone
+       |                  |<- RBX saved in the red zone
+       |                  |<- paddings for stack realignment of 64 bytes
+       |------------------|<- xsave buffer end aligned at 64 bytes
+       |                  |<-
+       |                  |<-
+       |                  |<-
+       |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
+       |                  |<- 8-byte padding for 64-byte alignment
+       |                  |<- 8-byte padding for 64-byte alignment
+       |                  |<- R11
+       |                  |<- R10
+       |                  |<- R9
+       |                  |<- R8
+       |                  |<- RDX
+       |                  |<- RCX
+       +==================+<- RSP aligned at 64 bytes
+    
+    Define TLSDESC_CALL_REGISTER_SAVE_AREA, the total register save area size
+    for all integer registers by adding 24 to STATE_SAVE_OFFSET since RDI, RSI
+    and RBX are saved onto stack without adjusting stack pointer first, using
+    the red-zone.  This fixes BZ #31501.
+    Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
+    
+    (cherry picked from commit 717ebfa85c8240d32d0d19d86a484c31c55c9617)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index d71e8d3d2e0e49f9..6fe1b728c607f39e 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -310,7 +310,7 @@ update_active (struct cpu_features *cpu_features)
+ 	      /* NB: On AMX capable processors, ebx always includes AMX
+ 		 states.  */
+ 	      unsigned int xsave_state_full_size
+-		= ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
++		= ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64);
+ 
+ 	      cpu_features->xsave_state_size
+ 		= xsave_state_full_size;
+@@ -400,8 +400,10 @@ update_active (struct cpu_features *cpu_features)
+ 		      unsigned int amx_size
+ 			= (xstate_amx_comp_offsets[31]
+ 			   + xstate_amx_comp_sizes[31]);
+-		      amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,
+-					   64);
++		      amx_size
++			= ALIGN_UP ((amx_size
++				     + TLSDESC_CALL_REGISTER_SAVE_AREA),
++				    64);
+ 		      /* Set xsave_state_full_size to the compact AMX
+ 			 state size for XSAVEC.  NB: xsave_state_full_size
+ 			 is only used in _dl_tlsdesc_dynamic_xsave and
+@@ -409,7 +411,8 @@ update_active (struct cpu_features *cpu_features)
+ 		      cpu_features->xsave_state_full_size = amx_size;
+ #endif
+ 		      cpu_features->xsave_state_size
+-			= ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
++			= ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
++				    64);
+ 		      CPU_FEATURE_SET (cpu_features, XSAVEC);
+ 		    }
+ 		}
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index db8e576e91767db5..7359149e17ccf341 100644
+--- a/sysdeps/x86/sysdep.h
++++ b/sysdeps/x86/sysdep.h
+@@ -38,14 +38,59 @@
+ #ifdef __x86_64__
+ /* Offset for fxsave/xsave area used by _dl_runtime_resolve.  Also need
+    space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX.  It must be
+-   aligned to 16 bytes for fxsave and 64 bytes for xsave.
+-
+-   NB: Is is non-zero because of the 128-byte red-zone.  Some registers
+-   are saved on stack without adjusting stack pointer first.  When we
+-   update stack pointer to allocate more space, we need to take the
+-   red-zone into account.  */
++   aligned to 16 bytes for fxsave and 64 bytes for xsave.  It is non-zero
++   because MOV, instead of PUSH, is used to save registers onto stack.
++
++   +==================+<- stack frame start aligned at 8 or 16 bytes
++   |                  |<- paddings for stack realignment of 64 bytes
++   |------------------|<- xsave buffer end aligned at 64 bytes
++   |                  |<-
++   |                  |<-
++   |                  |<-
++   |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
++   |                  |<- 8-byte padding for 64-byte alignment
++   |                  |<- R9
++   |                  |<- R8
++   |                  |<- RDI
++   |                  |<- RSI
++   |                  |<- RDX
++   |                  |<- RCX
++   |                  |<- RAX
++   +==================+<- RSP aligned at 64 bytes
++
++ */
+ # define STATE_SAVE_OFFSET (8 * 7 + 8)
+ 
++/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning
++   stack.  After realigning stack, it saves RCX, RDX, R8, R9, R10 and
++   R11.  Allocate space for RDI, RSI and RBX to avoid clobbering saved
++   RDI, RSI and RBX values on stack by xsave.
++
++   +==================+<- stack frame start aligned at 8 or 16 bytes
++   |                  |<- RDI saved in the red zone
++   |                  |<- RSI saved in the red zone
++   |                  |<- RBX saved in the red zone
++   |                  |<- paddings for stack realignment of 64 bytes
++   |------------------|<- xsave buffer end aligned at 64 bytes
++   |                  |<-
++   |                  |<-
++   |                  |<-
++   |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
++   |                  |<- 8-byte padding for 64-byte alignment
++   |                  |<- 8-byte padding for 64-byte alignment
++   |                  |<- R11
++   |                  |<- R10
++   |                  |<- R9
++   |                  |<- R8
++   |                  |<- RDX
++   |                  |<- RCX
++   +==================+<- RSP aligned at 64 bytes
++
++   Define the total register save area size for all integer registers by
++   adding 24 to STATE_SAVE_OFFSET since RDI, RSI and RBX are saved onto
++   stack without adjusting stack pointer first, using the red-zone.  */
++# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24)
++
+ /* Save SSE, AVX, AVX512, mask, bound and APX registers.  Bound and APX
+    registers are mutually exclusive.  */
+ # define STATE_SAVE_MASK		\
+@@ -66,8 +111,9 @@
+   (STATE_SAVE_MASK | AMX_STATE_SAVE_MASK)
+ #else
+ /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic.  Since i386
+-   doesn't have red-zone, use 0 here.  */
++   uses PUSH to save registers onto stack, use 0 here.  */
+ # define STATE_SAVE_OFFSET 0
++# define TLSDESC_CALL_REGISTER_SAVE_AREA 0
+ 
+ /* Save SSE, AVX, AXV512, mask and bound registers.   */
+ # define STATE_SAVE_MASK		\
+diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S b/sysdeps/x86_64/tst-gnu2-tls2mod1.S
+new file mode 100644
+index 0000000000000000..1d636669ba255724
+--- /dev/null
++++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S
+@@ -0,0 +1,87 @@
++/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++/* On AVX512 machines, OFFSET == 40 caused _dl_tlsdesc_dynamic_xsavec
++   to clobber %rdi, %rsi and %rbx.  On Intel AVX CPUs, the state size
++   is 960 bytes and this test didn't fail.  It may be due to the unused
++   last 128 bytes.  On AMD AVX CPUs, the state size is 832 bytes and
++   this test might fail without the fix.  */
++#ifndef OFFSET
++# define OFFSET 40
++#endif
++
++	.text
++	.p2align 4
++	.globl	apply_tls
++	.type	apply_tls, @function
++apply_tls:
++	cfi_startproc
++	_CET_ENDBR
++	pushq	%rbp
++	cfi_def_cfa_offset (16)
++	cfi_offset (6, -16)
++	movdqu	(%RDI_LP), %xmm0
++	lea	tls_var1@TLSDESC(%rip), %RAX_LP
++	mov	%RSP_LP, %RBP_LP
++	cfi_def_cfa_register (6)
++	/* Align stack to 64 bytes.  */
++	and	$-64, %RSP_LP
++	sub	$OFFSET, %RSP_LP
++	pushq	%rbx
++	/* Set %ebx to 0xbadbeef.  */
++	movl	$0xbadbeef, %ebx
++	movl	$0xbadbeef, %esi
++	movq	%rdi, saved_rdi(%rip)
++	movq	%rsi, saved_rsi(%rip)
++	call	*tls_var1@TLSCALL(%RAX_LP)
++	/* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx.  */
++	cmpq	saved_rdi(%rip), %rdi
++	jne	L(hlt)
++	cmpq	saved_rsi(%rip), %rsi
++	jne	L(hlt)
++	cmpl	$0xbadbeef, %ebx
++	jne	L(hlt)
++	add	%fs:0, %RAX_LP
++	movups	%xmm0, 32(%RAX_LP)
++	movdqu	16(%RDI_LP), %xmm1
++	mov	%RAX_LP, %RBX_LP
++	movups	%xmm1, 48(%RAX_LP)
++	lea	32(%RBX_LP), %RAX_LP
++	pop	%rbx
++	leave
++	cfi_def_cfa (7, 8)
++	ret
++L(hlt):
++	hlt
++	cfi_endproc
++	.size	apply_tls, .-apply_tls
++	.hidden	tls_var1
++	.globl	tls_var1
++	.section	.tbss,"awT",@nobits
++	.align 16
++	.type	tls_var1, @object
++	.size	tls_var1, 3200
++tls_var1:
++	.zero	3200
++	.local	saved_rdi
++	.comm	saved_rdi,8,8
++	.local	saved_rsi
++	.comm	saved_rsi,8,8
++	.section	.note.GNU-stack,"",@progbits
diff --git a/glibc-upstream-2.39-15.patch b/glibc-upstream-2.39-15.patch
new file mode 100644
index 0000000..037a4d3
--- /dev/null
+++ b/glibc-upstream-2.39-15.patch
@@ -0,0 +1,38 @@
+commit 15aebdbada54098787715448c94701f17033fc92
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date:   Tue Mar 12 13:21:18 2024 -0300
+
+    Ignore undefined symbols for -mtls-dialect=gnu2
+    
+    So it does not fail for arm config that defaults to -mtp=soft (which
+    issues a call to __aeabi_read_tp).
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    
+    (cherry picked from commit 968b0ca9440040a2b31248a572891f0e55c1ab10)
+
+diff --git a/configure b/configure
+index 59ff1e415dda4fbf..117b48a421792eda 100755
+--- a/configure
++++ b/configure
+@@ -7020,7 +7020,7 @@ void foo (void)
+ }
+ EOF
+ if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=gnu2 -nostdlib -nostartfiles
+-		   conftest.c -o conftest 1>&5'
++		   -shared conftest.c -o conftest 1>&5'
+   { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+   (eval $ac_try) 2>&5
+   ac_status=$?
+diff --git a/configure.ac b/configure.ac
+index 65799e56852a5356..19b88a47a52508a1 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -1297,7 +1297,7 @@ void foo (void)
+ }
+ EOF
+ if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=gnu2 -nostdlib -nostartfiles
+-		   conftest.c -o conftest 1>&AS_MESSAGE_LOG_FD])
++		   -shared conftest.c -o conftest 1>&AS_MESSAGE_LOG_FD])
+ then
+   libc_cv_mtls_dialect_gnu2=yes
+ else
diff --git a/glibc-upstream-2.39-16.patch b/glibc-upstream-2.39-16.patch
new file mode 100644
index 0000000..e5fa640
--- /dev/null
+++ b/glibc-upstream-2.39-16.patch
@@ -0,0 +1,446 @@
+commit a8ba52bde58c69f2b31da62ad2311f119adf6cb9
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date:   Tue Mar 12 13:21:19 2024 -0300
+
+    arm: Update _dl_tlsdesc_dynamic to preserve caller-saved registers (BZ 31372)
+    
+    ARM _dl_tlsdesc_dynamic slow path has two issues:
+    
+      * The ip/r12 is defined by AAPCS as a scratch register, and gcc is
+        used to save the stack pointer before on some function calls.  So it
+        should also be saved/restored as well.  It fixes the tst-gnu2-tls2.
+    
+      * None of the possible VFP registers are saved/restored.  ARM has the
+        additional complexity to have different VFP bank sizes (depending of
+        VFP support by the chip).
+    
+    The tst-gnu2-tls2 test is extended to check for VFP registers, although
+    only for hardfp builds.  Different than setcontext, _dl_tlsdesc_dynamic
+    does not have  HWCAP_ARM_IWMMXT (I don't have a way to properly test
+    it and it is almost a decade since newer hardware was released).
+    
+    With this patch there is no need to mark tst-gnu2-tls2 as XFAIL.
+    
+    Checked on arm-linux-gnueabihf.
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    
+    (cherry picked from commit 64c7e344289ed085517c2227d8e3b06388242c13)
+
+diff --git a/config.h.in b/config.h.in
+index 44a34072a47aa008..4d33c63a841d3d6d 100644
+--- a/config.h.in
++++ b/config.h.in
+@@ -141,6 +141,9 @@
+ /* LOONGARCH floating-point ABI for ld.so.  */
+ #undef LOONGARCH_ABI_FRLEN
+ 
++/* Define whether ARM used hard-float and support VFPvX-D32.  */
++#undef HAVE_ARM_PCS_VFP_D32
++
+ /* Linux specific: minimum supported kernel version.  */
+ #undef	__LINUX_KERNEL_VERSION
+ 
+diff --git a/elf/Makefile b/elf/Makefile
+index c5c37a9147e69d83..030db4d207d3491e 100644
+--- a/elf/Makefile
++++ b/elf/Makefile
+@@ -3056,10 +3056,6 @@ $(objpfx)tst-gnu2-tls2.out: \
+   $(objpfx)tst-gnu2-tls2mod2.so
+ 
+ ifeq (yes,$(have-mtls-dialect-gnu2))
+-# This test fails if dl_tlsdesc_dynamic doesn't preserve all caller-saved
+-# registers.  See https://sourceware.org/bugzilla/show_bug.cgi?id=31372
+-test-xfail-tst-gnu2-tls2 = yes
+-
+ CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
+ CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
+ CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
+diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
+index 77964a57a352e6a4..1ade8151e200af68 100644
+--- a/elf/tst-gnu2-tls2.h
++++ b/elf/tst-gnu2-tls2.h
+@@ -27,6 +27,10 @@ extern struct tls *apply_tls (struct tls *);
+ 
+ /* An architecture can define them to verify that clobber caller-saved
+    registers aren't changed by the implicit TLSDESC call.  */
++#ifndef INIT_TLSDESC_CALL
++# define INIT_TLSDESC_CALL()
++#endif
++
+ #ifndef BEFORE_TLSDESC_CALL
+ # define BEFORE_TLSDESC_CALL()
+ #endif
+diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
+index 45556a0e173922cc..3fe3c142777abe04 100644
+--- a/elf/tst-gnu2-tls2mod0.c
++++ b/elf/tst-gnu2-tls2mod0.c
+@@ -16,13 +16,14 @@
+    License along with the GNU C Library; if not, see
+    <https://www.gnu.org/licenses/>.  */
+ 
+-#include "tst-gnu2-tls2.h"
++#include <tst-gnu2-tls2.h>
+ 
+ __thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
+ 
+ struct tls *
+ apply_tls (struct tls *p)
+ {
++  INIT_TLSDESC_CALL ();
+   BEFORE_TLSDESC_CALL ();
+   tls_var0 = *p;
+   struct tls *ret = &tls_var0;
+diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
+index e10b9dbc0a7573c7..e2105384689e2d2e 100644
+--- a/elf/tst-gnu2-tls2mod1.c
++++ b/elf/tst-gnu2-tls2mod1.c
+@@ -16,13 +16,14 @@
+    License along with the GNU C Library; if not, see
+    <https://www.gnu.org/licenses/>.  */
+ 
+-#include "tst-gnu2-tls2.h"
++#include <tst-gnu2-tls2.h>
+ 
+ __thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
+ 
+ struct tls *
+ apply_tls (struct tls *p)
+ {
++  INIT_TLSDESC_CALL ();
+   BEFORE_TLSDESC_CALL ();
+   tls_var1[1] = *p;
+   struct tls *ret = &tls_var1[1];
+diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
+index 141af51e55b8bf34..6d3031dc5fbc1041 100644
+--- a/elf/tst-gnu2-tls2mod2.c
++++ b/elf/tst-gnu2-tls2mod2.c
+@@ -16,13 +16,14 @@
+    License along with the GNU C Library; if not, see
+    <https://www.gnu.org/licenses/>.  */
+ 
+-#include "tst-gnu2-tls2.h"
++#include <tst-gnu2-tls2.h>
+ 
+ __thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
+ 
+ struct tls *
+ apply_tls (struct tls *p)
+ {
++  INIT_TLSDESC_CALL ();
+   BEFORE_TLSDESC_CALL ();
+   tls_var2 = *p;
+   struct tls *ret = &tls_var2;
+diff --git a/sysdeps/arm/configure b/sysdeps/arm/configure
+index 35e2918922300956..4ef4d46cbd5384e9 100644
+--- a/sysdeps/arm/configure
++++ b/sysdeps/arm/configure
+@@ -187,6 +187,38 @@ else
+ default-abi = soft"
+ fi
+ 
++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether VFP supports 32 registers" >&5
++printf %s "checking whether VFP supports 32 registers... " >&6; }
++if test ${libc_cv_arm_pcs_vfp_d32+y}
++then :
++  printf %s "(cached) " >&6
++else $as_nop
++
++cat confdefs.h - <<_ACEOF >conftest.$ac_ext
++/* end confdefs.h.  */
++
++void foo (void)
++{
++  asm volatile ("vldr d16,=17" : : : "d16");
++}
++
++_ACEOF
++if ac_fn_c_try_compile "$LINENO"
++then :
++  libc_cv_arm_pcs_vfp_d32=yes
++else $as_nop
++  libc_cv_arm_pcs_vfp_d32=no
++fi
++rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
++fi
++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_arm_pcs_vfp_d32" >&5
++printf "%s\n" "$libc_cv_arm_pcs_vfp_d32" >&6; }
++if test "$libc_cv_arm_pcs_vfp_d32" = yes ;
++then
++  printf "%s\n" "#define HAVE_ARM_PCS_VFP_D32 1" >>confdefs.h
++
++fi
++
+ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether PC-relative relocs in movw/movt work properly" >&5
+ printf %s "checking whether PC-relative relocs in movw/movt work properly... " >&6; }
+ if test ${libc_cv_arm_pcrel_movw+y}
+diff --git a/sysdeps/arm/configure.ac b/sysdeps/arm/configure.ac
+index 5172e30bbe79f995..cd00ddc9d9ade5d7 100644
+--- a/sysdeps/arm/configure.ac
++++ b/sysdeps/arm/configure.ac
+@@ -21,6 +21,21 @@ else
+   LIBC_CONFIG_VAR([default-abi], [soft])
+ fi
+ 
++AC_CACHE_CHECK([whether VFP supports 32 registers],
++		libc_cv_arm_pcs_vfp_d32, [
++AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
++void foo (void)
++{
++  asm volatile ("vldr d16,=17" : : : "d16");
++}
++]])],
++                [libc_cv_arm_pcs_vfp_d32=yes],
++                [libc_cv_arm_pcs_vfp_d32=no])])
++if test "$libc_cv_arm_pcs_vfp_d32" = yes ;
++then
++  AC_DEFINE(HAVE_ARM_PCS_VFP_D32)
++fi
++
+ AC_CACHE_CHECK([whether PC-relative relocs in movw/movt work properly],
+ 	       libc_cv_arm_pcrel_movw, [
+ cat > conftest.s <<\EOF
+diff --git a/sysdeps/arm/dl-tlsdesc.S b/sysdeps/arm/dl-tlsdesc.S
+index 764c56e70f046b03..ada106521da8971a 100644
+--- a/sysdeps/arm/dl-tlsdesc.S
++++ b/sysdeps/arm/dl-tlsdesc.S
+@@ -19,6 +19,7 @@
+ #include <sysdep.h>
+ #include <arm-features.h>
+ #include <tls.h>
++#include <rtld-global-offsets.h>
+ #include "tlsdesc.h"
+ 
+ 	.text
+@@ -83,14 +84,20 @@ _dl_tlsdesc_dynamic(struct tlsdesc *tdp)
+ 	.align 2
+ _dl_tlsdesc_dynamic:
+ 	/* Our calling convention is to clobber r0, r1 and the processor
+-	   flags.  All others that are modified must be saved */
+-	eabi_save ({r2,r3,r4,lr})
+-	push	{r2,r3,r4,lr}
+-	cfi_adjust_cfa_offset (16)
++	   flags.  All others that are modified must be saved.  r5 is
++	   used as the hwcap value to avoid reload after __tls_get_addr
++	   call.  If required we will save the vector register on the slow
++	   path.  */
++	eabi_save ({r2,r3,r4,r5,ip,lr})
++	push	{r2,r3,r4,r5,ip,lr}
++	cfi_adjust_cfa_offset (24)
+ 	cfi_rel_offset (r2,0)
+ 	cfi_rel_offset (r3,4)
+ 	cfi_rel_offset (r4,8)
+-	cfi_rel_offset (lr,12)
++	cfi_rel_offset (r5,12)
++	cfi_rel_offset (ip,16)
++	cfi_rel_offset (lr,20)
++
+ 	ldr	r1, [r0] /* td */
+ 	GET_TLS (lr)
+ 	mov	r4, r0 /* r4 = tp */
+@@ -113,22 +120,69 @@ _dl_tlsdesc_dynamic:
+ 	rsbne	r0, r4, r3
+ 	bne	2f
+ 1:	mov	r0, r1
++
++	/* Load the hwcap to check for vector support.  */
++	ldr     r2, 3f
++	ldr     r1, .Lrtld_global_ro
++0:	add     r2, pc, r2
++	ldr     r2, [r2, r1]
++	ldr     r5, [r2, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET]
++
++#ifdef __SOFTFP__
++	tst     r5, #HWCAP_ARM_VFP
++	beq     .Lno_vfp
++#endif
++
++	/* Store the VFP registers.  Don't use VFP instructions directly
++	   because this code is used in non-VFP multilibs.  */
++#define VFP_STACK_REQ (32*8 + 8)
++	sub	sp, sp, VFP_STACK_REQ
++	cfi_adjust_cfa_offset (VFP_STACK_REQ)
++	mov	r3, sp
++	.inst	0xeca30b20	/* vstmia r3!, {d0-d15} */
++	tst	r5, #HWCAP_ARM_VFPD32
++	beq	4f
++	.inst	0xece30b20	/* vstmia r3!, {d16-d31}  */
++	/* Store the floating-point status register.  */
++4:	.inst	0xeef12a10	/* vmrs	r2, fpscr */
++	str	r2, [r3]
++.Lno_vfp:
+ 	bl	__tls_get_addr
+ 	rsb	r0, r4, r0
++#ifdef __SOFTFP__
++	tst     r5, #HWCAP_ARM_VFP
++	beq     2f
++#endif
++	mov	r3, sp
++	.inst	0xecb30b20	/* vldmia r3!, {d0-d15}  */
++	tst	r5, #HWCAP_ARM_VFPD32
++	beq	5f
++	.inst	0xecf30b20	/* vldmia r3!, {d16-d31}  */
++	ldr	r4, [r3]
++5:	.inst	0xeee14a10	/* vmsr	fpscr, r4  */
++	add	sp, sp, VFP_STACK_REQ
++	cfi_adjust_cfa_offset (-VFP_STACK_REQ)
++
+ 2:
+ #if ((defined (__ARM_ARCH_4T__) && defined (__THUMB_INTERWORK__)) \
+      || defined (ARM_ALWAYS_BX))
+-	pop	{r2,r3,r4, lr}
+-	cfi_adjust_cfa_offset (-16)
++	pop	{r2,r3,r4,r5,ip, lr}
++	cfi_adjust_cfa_offset (-20)
+ 	cfi_restore (lr)
++	cfi_restore (ip)
++	cfi_restore (r5)
+ 	cfi_restore (r4)
+ 	cfi_restore (r3)
+ 	cfi_restore (r2)
+ 	bx	lr
+ #else
+-	pop	{r2,r3,r4, pc}
++	pop	{r2,r3,r4,r5,ip, pc}
+ #endif
+ 	eabi_fnend
+ 	cfi_endproc
+ 	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
++
++3:      .long   _GLOBAL_OFFSET_TABLE_ - 0b - PC_OFS
++.Lrtld_global_ro:
++	.long   C_SYMBOL_NAME(_rtld_global_ro)(GOT)
+ #endif /* SHARED */
+diff --git a/sysdeps/arm/tst-gnu2-tls2.h b/sysdeps/arm/tst-gnu2-tls2.h
+new file mode 100644
+index 0000000000000000..e413ac21fb9ed9bf
+--- /dev/null
++++ b/sysdeps/arm/tst-gnu2-tls2.h
+@@ -0,0 +1,128 @@
++/* Test TLSDESC relocation.  ARM version.
++   Copyright (C) 2024 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <config.h>
++#include <sys/auxv.h>
++#include <string.h>
++#include <stdlib.h>
++#include <endian.h>
++
++#ifndef __SOFTFP__
++
++# ifdef HAVE_ARM_PCS_VFP_D32
++#  define SAVE_VFP_D32					\
++      asm volatile ("vldr d16,=17" : : : "d16");	\
++      asm volatile ("vldr d17,=18" : : : "d17");	\
++      asm volatile ("vldr d18,=19" : : : "d18");	\
++      asm volatile ("vldr d19,=20" : : : "d19");	\
++      asm volatile ("vldr d20,=21" : : : "d20");	\
++      asm volatile ("vldr d21,=22" : : : "d21");	\
++      asm volatile ("vldr d22,=23" : : : "d22");	\
++      asm volatile ("vldr d23,=24" : : : "d23");	\
++      asm volatile ("vldr d24,=25" : : : "d24");	\
++      asm volatile ("vldr d25,=26" : : : "d25");	\
++      asm volatile ("vldr d26,=27" : : : "d26");	\
++      asm volatile ("vldr d27,=28" : : : "d27");	\
++      asm volatile ("vldr d28,=29" : : : "d28");	\
++      asm volatile ("vldr d29,=30" : : : "d29");	\
++      asm volatile ("vldr d30,=31" : : : "d30");	\
++      asm volatile ("vldr d31,=32" : : : "d31");
++# else
++#  define SAVE_VFP_D32
++# endif
++
++# define INIT_TLSDESC_CALL()				\
++  unsigned long hwcap = getauxval (AT_HWCAP)
++
++/* Set each vector register to a value from 1 to 32 before the TLS access,
++   dump to memory after TLS access, and compare with the expected values.  */
++
++# define BEFORE_TLSDESC_CALL()				\
++  if (hwcap & HWCAP_ARM_VFP)				\
++    {							\
++      asm volatile ("vldr  d0,=1" : : : "d0");		\
++      asm volatile ("vldr  d1,=2" : : : "d1");		\
++      asm volatile ("vldr  d2,=3" : : : "d1");		\
++      asm volatile ("vldr  d3,=4" : : : "d3");		\
++      asm volatile ("vldr  d4,=5" : : : "d4");		\
++      asm volatile ("vldr  d5,=6" : : : "d5");		\
++      asm volatile ("vldr  d6,=7" : : : "d6");		\
++      asm volatile ("vldr  d7,=8" : : : "d7");		\
++      asm volatile ("vldr  d8,=9" : : : "d8");		\
++      asm volatile ("vldr  d9,=10" : : : "d9");		\
++      asm volatile ("vldr d10,=11" : : : "d10");	\
++      asm volatile ("vldr d11,=12" : : : "d11");	\
++      asm volatile ("vldr d12,=13" : : : "d12");	\
++      asm volatile ("vldr d13,=14" : : : "d13");	\
++      asm volatile ("vldr d14,=15" : : : "d14");	\
++      asm volatile ("vldr d15,=16" : : : "d15");	\
++    }							\
++  if (hwcap & HWCAP_ARM_VFPD32)				\
++    {							\
++      SAVE_VFP_D32					\
++    }
++
++# define VFP_STACK_REQ (16*8)
++# if __BYTE_ORDER == __BIG_ENDIAN
++#  define DISP 7
++# else
++#  define DISP 0
++# endif
++
++# ifdef HAVE_ARM_PCS_VFP_D32
++#  define CHECK_VFP_D32							\
++      char vfp[VFP_STACK_REQ];						\
++      asm volatile ("vstmia %0, {d16-d31}\n"				\
++		    :							\
++		    : "r" (vfp)						\
++		    : "memory");					\
++									\
++      char expected[VFP_STACK_REQ] = { 0 };				\
++      for (int i = 0; i < 16; ++i)					\
++	expected[i * 8 + DISP] = i + 17;				\
++									\
++      if (memcmp (vfp, expected, VFP_STACK_REQ) != 0)			\
++        abort ();
++# else
++#  define CHECK_VFP_D32
++# endif
++
++# define AFTER_TLSDESC_CALL()						\
++  if (hwcap & HWCAP_ARM_VFP)						\
++    {									\
++      char vfp[VFP_STACK_REQ];						\
++      asm volatile ("vstmia %0, {d0-d15}\n"				\
++		    :							\
++		    : "r" (vfp)						\
++		    : "memory");					\
++									\
++      char expected[VFP_STACK_REQ] = { 0 };				\
++      for (int i = 0; i < 16; ++i)					\
++	expected[i * 8 + DISP] = i + 1;					\
++									\
++      if (memcmp (vfp, expected, VFP_STACK_REQ) != 0)			\
++        abort ();							\
++    }									\
++  if (hwcap & HWCAP_ARM_VFPD32)						\
++    {									\
++      CHECK_VFP_D32							\
++    }
++
++#endif /* __SOFTFP__ */
++
++#include_next <tst-gnu2-tls2.h>
diff --git a/glibc-upstream-2.39-17.patch b/glibc-upstream-2.39-17.patch
new file mode 100644
index 0000000..050355c
--- /dev/null
+++ b/glibc-upstream-2.39-17.patch
@@ -0,0 +1,221 @@
+commit aded2fc004e7ee85cf0b45b1382552d41e555a23
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date:   Tue Mar 12 13:21:20 2024 -0300
+
+    elf: Enable TLS descriptor tests on aarch64
+    
+    The aarch64 uses 'trad' for traditional tls and 'desc' for tls
+    descriptors, but unlike other targets it defaults to 'desc'.  The
+    gnutls2 configure check does not set aarch64 as an ABI that uses
+    TLS descriptors, which then disable somes stests.
+    
+    Also rename the internal machinery fron gnu2 to tls descriptors.
+    
+    Checked on aarch64-linux-gnu.
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    
+    (cherry picked from commit 3d53d18fc71c5d9ef4773b8bce04d54b80181926)
+
+diff --git a/configure b/configure
+index 117b48a421792eda..432e40a59295cffd 100755
+--- a/configure
++++ b/configure
+@@ -653,7 +653,7 @@ LIBGD
+ libc_cv_cc_loop_to_function
+ libc_cv_cc_submachine
+ libc_cv_cc_nofma
+-libc_cv_mtls_dialect_gnu2
++libc_cv_mtls_descriptor
+ libc_cv_has_glob_dat
+ libc_cv_fpie
+ libc_cv_z_execstack
+@@ -4760,6 +4760,9 @@ libc_config_ok=no
+ # whether to use such directories.
+ with_fp_cond=1
+ 
++# A preconfigure script may define another name to TLS descriptor variant
++mtls_descriptor=gnu2
++
+ if frags=`ls -d $srcdir/sysdeps/*/preconfigure 2> /dev/null`
+ then
+   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for sysdeps preconfigure fragments" >&5
+@@ -7006,9 +7009,9 @@ fi
+ printf "%s\n" "$libc_cv_has_glob_dat" >&6; }
+ 
+ 
+-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for -mtls-dialect=gnu2" >&5
+-printf %s "checking for -mtls-dialect=gnu2... " >&6; }
+-if test ${libc_cv_mtls_dialect_gnu2+y}
++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for tls descriptor support" >&5
++printf %s "checking for tls descriptor support... " >&6; }
++if test ${libc_cv_mtls_descriptor+y}
+ then :
+   printf %s "(cached) " >&6
+ else $as_nop
+@@ -7019,7 +7022,7 @@ void foo (void)
+   i = 10;
+ }
+ EOF
+-if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=gnu2 -nostdlib -nostartfiles
++if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=$mtls_descriptor -nostdlib -nostartfiles
+ 		   -shared conftest.c -o conftest 1>&5'
+   { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+   (eval $ac_try) 2>&5
+@@ -7027,17 +7030,17 @@ if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=gnu2 -nostdlib -nost
+   printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+   test $ac_status = 0; }; }
+ then
+-  libc_cv_mtls_dialect_gnu2=yes
++  libc_cv_mtls_descriptor=$mtls_descriptor
+ else
+-  libc_cv_mtls_dialect_gnu2=no
++  libc_cv_mtls_descriptor=no
+ fi
+ rm -f conftest*
+ fi
+-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_mtls_dialect_gnu2" >&5
+-printf "%s\n" "$libc_cv_mtls_dialect_gnu2" >&6; }
++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_mtls_descriptor" >&5
++printf "%s\n" "$libc_cv_mtls_descriptor" >&6; }
+ 
+ config_vars="$config_vars
+-have-mtls-dialect-gnu2 = $libc_cv_mtls_dialect_gnu2"
++have-mtls-descriptor = $libc_cv_mtls_descriptor"
+ 
+ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if -Wno-ignored-attributes is required for aliases" >&5
+ printf %s "checking if -Wno-ignored-attributes is required for aliases... " >&6; }
+diff --git a/configure.ac b/configure.ac
+index 19b88a47a52508a1..bdc385d03c3dc7f5 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -442,6 +442,9 @@ libc_config_ok=no
+ # whether to use such directories.
+ with_fp_cond=1
+ 
++# A preconfigure script may define another name to TLS descriptor variant
++mtls_descriptor=gnu2
++
+ dnl Let sysdeps/*/preconfigure act here.
+ LIBC_PRECONFIGURE([$srcdir], [for sysdeps])
+ 
+@@ -1287,7 +1290,7 @@ fi
+ rm -f conftest*])
+ AC_SUBST(libc_cv_has_glob_dat)
+ 
+-AC_CACHE_CHECK([for -mtls-dialect=gnu2], libc_cv_mtls_dialect_gnu2,
++AC_CACHE_CHECK([for tls descriptor support], libc_cv_mtls_descriptor,
+ [dnl
+ cat > conftest.c <<EOF
+ __thread int i;
+@@ -1296,16 +1299,16 @@ void foo (void)
+   i = 10;
+ }
+ EOF
+-if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=gnu2 -nostdlib -nostartfiles
++if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=$mtls_descriptor -nostdlib -nostartfiles
+ 		   -shared conftest.c -o conftest 1>&AS_MESSAGE_LOG_FD])
+ then
+-  libc_cv_mtls_dialect_gnu2=yes
++  libc_cv_mtls_descriptor=$mtls_descriptor
+ else
+-  libc_cv_mtls_dialect_gnu2=no
++  libc_cv_mtls_descriptor=no
+ fi
+ rm -f conftest*])
+-AC_SUBST(libc_cv_mtls_dialect_gnu2)
+-LIBC_CONFIG_VAR([have-mtls-dialect-gnu2], [$libc_cv_mtls_dialect_gnu2])
++AC_SUBST(libc_cv_mtls_descriptor)
++LIBC_CONFIG_VAR([have-mtls-descriptor], [$libc_cv_mtls_descriptor])
+ 
+ dnl clang emits an warning for a double alias redirection, to warn the
+ dnl original symbol is sed even when weak definition overrides it.
+diff --git a/elf/Makefile b/elf/Makefile
+index 030db4d207d3491e..69aa423c4b90127d 100644
+--- a/elf/Makefile
++++ b/elf/Makefile
+@@ -999,13 +999,13 @@ modules-names-tests = $(filter-out ifuncmod% tst-tlsmod%,\
+ # For +depfiles in Makerules.
+ extra-test-objs += tst-auditmod17.os
+ 
+-ifeq (yes,$(have-mtls-dialect-gnu2))
++ifneq (no,$(have-mtls-descriptor))
+ tests += tst-gnu2-tls1
+ modules-names += tst-gnu2-tls1mod
+ $(objpfx)tst-gnu2-tls1: $(objpfx)tst-gnu2-tls1mod.so
+ tst-gnu2-tls1mod.so-no-z-defs = yes
+-CFLAGS-tst-gnu2-tls1mod.c += -mtls-dialect=gnu2
+-endif # $(have-mtls-dialect-gnu2)
++CFLAGS-tst-gnu2-tls1mod.c += -mtls-dialect=$(have-mtls-descriptor)
++endif # $(have-mtls-descriptor)
+ 
+ ifeq (yes,$(have-protected-data))
+ modules-names += tst-protected1moda tst-protected1modb
+@@ -2972,11 +2972,11 @@ $(objpfx)tst-tls-allocation-failure-static-patched.out: \
+ $(objpfx)tst-audit-tlsdesc: $(objpfx)tst-audit-tlsdesc-mod1.so \
+ 			    $(objpfx)tst-audit-tlsdesc-mod2.so \
+ 			    $(shared-thread-library)
+-ifeq (yes,$(have-mtls-dialect-gnu2))
++ifneq (no,$(have-mtls-descriptor))
+ # The test is valid for all TLS types, but we want to exercise GNU2
+ # TLS if possible.
+-CFLAGS-tst-audit-tlsdesc-mod1.c += -mtls-dialect=gnu2
+-CFLAGS-tst-audit-tlsdesc-mod2.c += -mtls-dialect=gnu2
++CFLAGS-tst-audit-tlsdesc-mod1.c += -mtls-dialect=$(have-mtls-descriptor)
++CFLAGS-tst-audit-tlsdesc-mod2.c += -mtls-dialect=$(have-mtls-descriptor)
+ endif
+ $(objpfx)tst-audit-tlsdesc-dlopen: $(shared-thread-library)
+ $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-audit-tlsdesc-mod1.so \
+@@ -3055,11 +3055,11 @@ $(objpfx)tst-gnu2-tls2.out: \
+   $(objpfx)tst-gnu2-tls2mod1.so \
+   $(objpfx)tst-gnu2-tls2mod2.so
+ 
+-ifeq (yes,$(have-mtls-dialect-gnu2))
+-CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
+-CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
+-CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
+-CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
+-CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
+-CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
++ifneq (no,$(have-mtls-descriptor))
++CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=$(have-mtls-descriptor)
++CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=$(have-mtls-descriptor)
++CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=$(have-mtls-descriptor)
++CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=$(have-mtls-descriptor)
++CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=$(have-mtls-descriptor)
++CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=$(have-mtls-descriptor)
+ endif
+diff --git a/sysdeps/aarch64/preconfigure b/sysdeps/aarch64/preconfigure
+index d9bd1f8558a079cb..19657b627bc84c4e 100644
+--- a/sysdeps/aarch64/preconfigure
++++ b/sysdeps/aarch64/preconfigure
+@@ -2,5 +2,6 @@ case "$machine" in
+ aarch64*)
+ 	base_machine=aarch64
+ 	machine=aarch64
++	mtls_descriptor=desc
+ 	;;
+ esac
+diff --git a/sysdeps/arm/Makefile b/sysdeps/arm/Makefile
+index d5cea717a9c201aa..619474eca94fe8e4 100644
+--- a/sysdeps/arm/Makefile
++++ b/sysdeps/arm/Makefile
+@@ -13,15 +13,15 @@ $(objpfx)libgcc-stubs.a: $(objpfx)aeabi_unwind_cpp_pr1.os
+ lib-noranlib: $(objpfx)libgcc-stubs.a
+ 
+ ifeq ($(build-shared),yes)
+-ifeq (yes,$(have-mtls-dialect-gnu2))
++ifneq (no,$(have-mtls-descriptor))
+ tests += tst-armtlsdescloc tst-armtlsdescextnow tst-armtlsdescextlazy
+ modules-names += tst-armtlsdesclocmod
+ modules-names += tst-armtlsdescextlazymod tst-armtlsdescextnowmod
+ CPPFLAGS-tst-armtlsdescextnowmod.c += -Dstatic=
+ CPPFLAGS-tst-armtlsdescextlazymod.c += -Dstatic=
+-CFLAGS-tst-armtlsdesclocmod.c += -mtls-dialect=gnu2
+-CFLAGS-tst-armtlsdescextnowmod.c += -mtls-dialect=gnu2
+-CFLAGS-tst-armtlsdescextlazymod.c += -mtls-dialect=gnu2
++CFLAGS-tst-armtlsdesclocmod.c += -mtls-dialect=$(have-mtls-descriptor)
++CFLAGS-tst-armtlsdescextnowmod.c += -mtls-dialect=$(have-mtls-descriptor)
++CFLAGS-tst-armtlsdescextlazymod.c += -mtls-dialect=$(have-mtls-descriptor)
+ LDFLAGS-tst-armtlsdescextnowmod.so += -Wl,-z,now
+ tst-armtlsdescloc-ENV = LD_BIND_NOW=1
+ tst-armtlsdescextnow-ENV = LD_BIND_NOW=1
diff --git a/glibc-upstream-2.39-18.patch b/glibc-upstream-2.39-18.patch
new file mode 100644
index 0000000..60a31ff
--- /dev/null
+++ b/glibc-upstream-2.39-18.patch
@@ -0,0 +1,24 @@
+commit 5a461f2949ded98d8211939f84988bc464c7b4fe
+Author: Andreas Schwab <schwab@suse.de>
+Date:   Tue Mar 19 13:49:50 2024 +0100
+
+    Add tst-gnu2-tls2mod1 to test-internal-extras
+    
+    That allows sysdeps/x86_64/tst-gnu2-tls2mod1.S to use internal headers.
+    
+    Fixes: 717ebfa85c ("x86-64: Allocate state buffer space for RDI, RSI and RBX")
+    (cherry picked from commit fd7ee2e6c5eb49e4a630a9978b4d668bff6354ee)
+
+diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
+index e8babc9a4edbf90b..9d374a329916fc45 100644
+--- a/sysdeps/x86_64/Makefile
++++ b/sysdeps/x86_64/Makefile
+@@ -210,6 +210,8 @@ tst-plt-rewrite2-ENV = GLIBC_TUNABLES=glibc.cpu.plt_rewrite=2
+ $(objpfx)tst-plt-rewrite2: $(objpfx)tst-plt-rewritemod2.so
+ endif
+ 
++test-internal-extras += tst-gnu2-tls2mod1
++
+ endif # $(subdir) == elf
+ 
+ ifeq ($(subdir),csu)
diff --git a/glibc-upstream-2.39-19.patch b/glibc-upstream-2.39-19.patch
new file mode 100644
index 0000000..fa9220e
--- /dev/null
+++ b/glibc-upstream-2.39-19.patch
@@ -0,0 +1,146 @@
+commit aa4249266e9906c4bc833e4847f4d8feef59504f
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date:   Thu Feb 8 10:08:38 2024 -0300
+
+    x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
+    
+    The REP MOVSB usage on memcpy/memmove does not show much performance
+    improvement on Zen3/Zen4 cores compared to the vectorized loops.  Also,
+    as from BZ 30994, if the source is aligned and the destination is not
+    the performance can be 20x slower.
+    
+    The performance difference is noticeable with small buffer sizes, closer
+    to the lower bounds limits when memcpy/memmove starts to use ERMS.  The
+    performance of REP MOVSB is similar to vectorized instruction on the
+    size limit (the L2 cache).  Also, there is no drawback to multiple cores
+    sharing the cache.
+    
+    Checked on x86_64-linux-gnu on Zen3.
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    
+    (cherry picked from commit 0c0d39fe4aeb0f69b26e76337c5dfd5530d5d44e)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index d5101615e348e5c2..f34d12846caf9422 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+   long int data = -1;
+   long int shared = -1;
+   long int shared_per_thread = -1;
+-  long int core = -1;
+   unsigned int threads = 0;
+   unsigned long int level1_icache_size = -1;
+   unsigned long int level1_icache_linesize = -1;
+@@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+   if (cpu_features->basic.kind == arch_kind_intel)
+     {
+       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
+-      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
+       shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
+       shared_per_thread = shared;
+ 
+@@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ 	= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
+       level1_dcache_linesize
+ 	= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
+-      level2_cache_size = core;
++      level2_cache_size
++	= handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
+       level2_cache_assoc
+ 	= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
+       level2_cache_linesize
+@@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+       level4_cache_size
+ 	= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
+ 
+-      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
++      get_common_cache_info (&shared, &shared_per_thread, &threads,
++			     level2_cache_size);
+     }
+   else if (cpu_features->basic.kind == arch_kind_zhaoxin)
+     {
+       data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
+-      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
+       shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
+       shared_per_thread = shared;
+ 
+@@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+       level1_dcache_size = data;
+       level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
+       level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
+-      level2_cache_size = core;
++      level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
+       level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
+       level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
+       level3_cache_size = shared;
+       level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
+       level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
+ 
+-      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
++      get_common_cache_info (&shared, &shared_per_thread, &threads,
++			     level2_cache_size);
+     }
+   else if (cpu_features->basic.kind == arch_kind_amd)
+     {
+       data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
+-      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+ 
+       level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
+@@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+       level1_dcache_size = data;
+       level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
+       level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
+-      level2_cache_size = core;
++      level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);;
+       level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
+       level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
+       level3_cache_size = shared;
+@@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+       if (shared <= 0)
+         {
+            /* No shared L3 cache.  All we have is the L2 cache.  */
+-           shared = core;
++           shared = level2_cache_size;
+         }
+       else if (cpu_features->basic.family < 0x17)
+         {
+            /* Account for exclusive L2 and L3 caches.  */
+-           shared += core;
++           shared += level2_cache_size;
+         }
+ 
+       shared_per_thread = shared;
+@@ -987,6 +986,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+     rep_movsb_threshold = 2112;
+ 
++   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
++      cases slower than the vectorized path (and for some alignments,
++      it is really slow, check BZ #30994).  */
++  if (cpu_features->basic.kind == arch_kind_amd)
++    rep_movsb_threshold = non_temporal_threshold;
++
+   /* The default threshold to use Enhanced REP STOSB.  */
+   unsigned long int rep_stosb_threshold = 2048;
+ 
+@@ -1028,16 +1033,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ 			   SIZE_MAX);
+ 
+   unsigned long int rep_movsb_stop_threshold;
+-  /* ERMS feature is implemented from AMD Zen3 architecture and it is
+-     performing poorly for data above L2 cache size. Henceforth, adding
+-     an upper bound threshold parameter to limit the usage of Enhanced
+-     REP MOVSB operations and setting its value to L2 cache size.  */
+-  if (cpu_features->basic.kind == arch_kind_amd)
+-    rep_movsb_stop_threshold = core;
+   /* Setting the upper bound of ERMS to the computed value of
+-     non-temporal threshold for architectures other than AMD.  */
+-  else
+-    rep_movsb_stop_threshold = non_temporal_threshold;
++     non-temporal threshold for all architectures.  */
++  rep_movsb_stop_threshold = non_temporal_threshold;
+ 
+   cpu_features->data_cache_size = data;
+   cpu_features->shared_cache_size = shared;
diff --git a/glibc-upstream-2.39-20.patch b/glibc-upstream-2.39-20.patch
new file mode 100644
index 0000000..3aac665
--- /dev/null
+++ b/glibc-upstream-2.39-20.patch
@@ -0,0 +1,30 @@
+commit 6484a92698039c4a7a510f0214e22d067b0d78b3
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date:   Thu Feb 8 10:08:39 2024 -0300
+
+    x86: Do not prefer ERMS for memset on Zen3+
+    
+    For AMD Zen3+ architecture, the performance of the vectorized loop is
+    slightly better than ERMS.
+    
+    Checked on x86_64-linux-gnu on Zen3.
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    
+    (cherry picked from commit 272708884cb750f12f5c74a00e6620c19dc6d567)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index f34d12846caf9422..5a98f70364220da4 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -1021,6 +1021,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+      minimum value is fixed.  */
+   rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
+ 				     long int, NULL);
++  if (cpu_features->basic.kind == arch_kind_amd
++      && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
++    /* For AMD Zen3+ architecture, the performance of the vectorized loop is
++       slightly better than ERMS.  */
++    rep_stosb_threshold = SIZE_MAX;
+ 
+   TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
+   TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
diff --git a/glibc-upstream-2.39-21.patch b/glibc-upstream-2.39-21.patch
new file mode 100644
index 0000000..6ceb90c
--- /dev/null
+++ b/glibc-upstream-2.39-21.patch
@@ -0,0 +1,24 @@
+commit 5d070d12b3a52bc44dd1b71743abc4b6243862ae
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date:   Thu Feb 8 10:08:40 2024 -0300
+
+    x86: Expand the comment on when REP STOSB is used on memset
+    
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+    (cherry picked from commit 491e55beab7457ed310a4a47496f4a333c5d1032)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 9984c3ca0fafab6a..97839a22483b0613 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -21,7 +21,9 @@
+    2. If size is less than VEC, use integer register stores.
+    3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+    4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+-   5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
++   5. On machines ERMS feature, if size is greater or equal than
++      __x86_rep_stosb_threshold then REP STOSB will be used.
++   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+       4 VEC stores and store 4 * VEC at a time until done.  */
+ 
+ #include <sysdep.h>
diff --git a/glibc-upstream-2.39-8.patch b/glibc-upstream-2.39-8.patch
new file mode 100644
index 0000000..653d5a1
--- /dev/null
+++ b/glibc-upstream-2.39-8.patch
@@ -0,0 +1,284 @@
+commit ee7f4c54e19738c2c27d3846e1e9b3595c89221f
+Author: Manjunath Matti <mmatti@linux.ibm.com>
+Date:   Tue Mar 19 15:29:48 2024 -0500
+
+    ﻿powerpc: Add HWCAP3/HWCAP4 data to TCB for Power Architecture.
+    
+    This patch adds a new feature for powerpc.  In order to get faster
+    access to the HWCAP3/HWCAP4 masks, similar to HWCAP/HWCAP2 (i.e. for
+    implementing __builtin_cpu_supports() in GCC) without the overhead of
+    reading them from the auxiliary vector, we now reserve space for them
+    in the TCB.
+    
+    Suggested-by: Peter Bergner <bergner@linux.ibm.com>
+    Reviewed-by: Peter Bergner <bergner@linux.ibm.com>
+    (cherry picked from commit 3ab9b88e2ac91062b6d493fe32bd101a55006c6a)
+
+diff --git a/elf/dl-diagnostics.c b/elf/dl-diagnostics.c
+index 7345ebc4e586883f..aaf67b87e81b04c8 100644
+--- a/elf/dl-diagnostics.c
++++ b/elf/dl-diagnostics.c
+@@ -235,6 +235,8 @@ _dl_print_diagnostics (char **environ)
+   _dl_diagnostics_print_labeled_value ("dl_hwcap", GLRO (dl_hwcap));
+   _dl_diagnostics_print_labeled_value ("dl_hwcap_important", HWCAP_IMPORTANT);
+   _dl_diagnostics_print_labeled_value ("dl_hwcap2", GLRO (dl_hwcap2));
++  _dl_diagnostics_print_labeled_value ("dl_hwcap3", GLRO (dl_hwcap3));
++  _dl_diagnostics_print_labeled_value ("dl_hwcap4", GLRO (dl_hwcap4));
+   _dl_diagnostics_print_labeled_string
+     ("dl_hwcaps_subdirs", _dl_hwcaps_subdirs);
+   _dl_diagnostics_print_labeled_value
+diff --git a/elf/dl-support.c b/elf/dl-support.c
+index 2f502c8b0d27b784..451932dd03e971b8 100644
+--- a/elf/dl-support.c
++++ b/elf/dl-support.c
+@@ -158,6 +158,8 @@ const ElfW(Phdr) *_dl_phdr;
+ size_t _dl_phnum;
+ uint64_t _dl_hwcap;
+ uint64_t _dl_hwcap2;
++uint64_t _dl_hwcap3;
++uint64_t _dl_hwcap4;
+ 
+ enum dso_sort_algorithm _dl_dso_sort_algo;
+ 
+diff --git a/elf/elf.h b/elf/elf.h
+index 455731663c6ed339..1c394c64cd5c66ed 100644
+--- a/elf/elf.h
++++ b/elf/elf.h
+@@ -1234,6 +1234,10 @@ typedef struct
+ #define AT_RSEQ_FEATURE_SIZE	27	/* rseq supported feature size.  */
+ #define AT_RSEQ_ALIGN	28		/* rseq allocation alignment.  */
+ 
++/* More machine-dependent hints about processor capabilities.  */
++#define AT_HWCAP3	29		/* extension of AT_HWCAP.  */
++#define AT_HWCAP4	30		/* extension of AT_HWCAP.  */
++
+ #define AT_EXECFN	31		/* Filename of executable.  */
+ 
+ /* Pointer to the global system page used for system calls and other
+diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
+index 117c901ccc5c5f0b..50f58a60e3f02330 100644
+--- a/sysdeps/generic/ldsodefs.h
++++ b/sysdeps/generic/ldsodefs.h
+@@ -646,6 +646,8 @@ struct rtld_global_ro
+   /* Mask for more hardware capabilities that are available on some
+      platforms.  */
+   EXTERN uint64_t _dl_hwcap2;
++  EXTERN uint64_t _dl_hwcap3;
++  EXTERN uint64_t _dl_hwcap4;
+ 
+   EXTERN enum dso_sort_algorithm _dl_dso_sort_algo;
+ 
+diff --git a/sysdeps/powerpc/dl-procinfo.c b/sysdeps/powerpc/dl-procinfo.c
+index a76bb6e5b0895e3f..8cf00aa7e359bb6a 100644
+--- a/sysdeps/powerpc/dl-procinfo.c
++++ b/sysdeps/powerpc/dl-procinfo.c
+@@ -38,6 +38,10 @@
+        needed.
+   */
+ 
++/* The total number of available bits (including those prior to
++   _DL_HWCAP_FIRST).  Some of these bits might not be used.  */
++#define _DL_HWCAP_COUNT         128
++
+ #ifndef PROCINFO_CLASS
+ # define PROCINFO_CLASS
+ #endif
+@@ -61,7 +65,7 @@ PROCINFO_CLASS struct cpu_features _dl_powerpc_cpu_features
+ #if !defined PROCINFO_DECL && defined SHARED
+   ._dl_powerpc_cap_flags
+ #else
+-PROCINFO_CLASS const char _dl_powerpc_cap_flags[64][15]
++PROCINFO_CLASS const char _dl_powerpc_cap_flags[_DL_HWCAP_COUNT][15]
+ #endif
+ #ifndef PROCINFO_DECL
+ = {
+diff --git a/sysdeps/powerpc/dl-procinfo.h b/sysdeps/powerpc/dl-procinfo.h
+index 68f424109501aaef..f8cb343877386402 100644
+--- a/sysdeps/powerpc/dl-procinfo.h
++++ b/sysdeps/powerpc/dl-procinfo.h
+@@ -22,16 +22,17 @@
+ #include <ldsodefs.h>
+ #include <sysdep.h>	/* This defines the PPC_FEATURE[2]_* macros.  */
+ 
+-/* The total number of available bits (including those prior to
+-   _DL_HWCAP_FIRST).  Some of these bits might not be used.  */
+-#define _DL_HWCAP_COUNT		64
++/* Feature masks are all 32-bits in size.  */
++#define _DL_HWCAP_SIZE		32
+ 
+-/* Features started at bit 31 and decremented as new features were added.  */
+-#define _DL_HWCAP_LAST		31
++/* AT_HWCAP2 feature strings follow the AT_HWCAP feature strings.  */
++#define _DL_HWCAP2_OFFSET	_DL_HWCAP_SIZE
+ 
+-/* AT_HWCAP2 features started at bit 31 and decremented as new features were
+-   added.  HWCAP2 feature bits start at bit 0.  */
+-#define _DL_HWCAP2_LAST		31
++/* AT_HWCAP3 feature strings follow the AT_HWCAP2 feature strings.  */
++#define _DL_HWCAP3_OFFSET	(_DL_HWCAP2_OFFSET + _DL_HWCAP_SIZE)
++
++/* AT_HWCAP4 feature strings follow the AT_HWCAP3 feature strings.  */
++#define _DL_HWCAP4_OFFSET	(_DL_HWCAP3_OFFSET + _DL_HWCAP_SIZE)
+ 
+ /* These bits influence library search.  */
+ #define HWCAP_IMPORTANT		(PPC_FEATURE_HAS_ALTIVEC \
+@@ -187,21 +188,42 @@ _dl_procinfo (unsigned int type, unsigned long int word)
+     case AT_HWCAP:
+       _dl_printf ("AT_HWCAP:            ");
+ 
+-      for (int i = 0; i <= _DL_HWCAP_LAST; ++i)
++      for (int i = 0; i < _DL_HWCAP_SIZE; ++i)
+        if (word & (1 << i))
+          _dl_printf (" %s", _dl_hwcap_string (i));
+       break;
+     case AT_HWCAP2:
+       {
+-       unsigned int offset = _DL_HWCAP_LAST + 1;
+ 
+        _dl_printf ("AT_HWCAP2:           ");
+ 
+-        /* We have to go through them all because the kernel added the
+-          AT_HWCAP2 features starting with the high bits.  */
+-       for (int i = 0; i <= _DL_HWCAP2_LAST; ++i)
+-         if (word & (1 << i))
+-           _dl_printf (" %s", _dl_hwcap_string (offset + i));
++       /* We have to go through them all because the kernel added the
++	  AT_HWCAP2 features starting with the high bits.  */
++       for (int i = 0; i < _DL_HWCAP_SIZE; ++i)
++	 if (word & (1 << i))
++	   _dl_printf (" %s", _dl_hwcap_string (_DL_HWCAP2_OFFSET + i));
++       break;
++      }
++    case AT_HWCAP3:
++      {
++       _dl_printf ("AT_HWCAP3:           ");
++
++       /* We have to go through them all because the kernel added the
++	  AT_HWCAP3 features starting with the high bits.  */
++       for (int i = 0; i < _DL_HWCAP_SIZE; ++i)
++	 if (word & (1 << i))
++	   _dl_printf (" %s", _dl_hwcap_string (_DL_HWCAP3_OFFSET + i));
++       break;
++      }
++    case AT_HWCAP4:
++      {
++       _dl_printf ("AT_HWCAP4:           ");
++
++       /* We have to go through them all because the kernel added the
++	  AT_HWCAP4 features starting with the high bits.  */
++       for (int i = 0; i <= _DL_HWCAP_SIZE; ++i)
++	 if (word & (1 << i))
++	   _dl_printf (" %s", _dl_hwcap_string (_DL_HWCAP4_OFFSET + i));
+        break;
+       }
+     case AT_L1I_CACHEGEOMETRY:
+diff --git a/sysdeps/powerpc/hwcapinfo.c b/sysdeps/powerpc/hwcapinfo.c
+index 76344f285a903858..f6fede15a7dfbf6c 100644
+--- a/sysdeps/powerpc/hwcapinfo.c
++++ b/sysdeps/powerpc/hwcapinfo.c
+@@ -31,7 +31,7 @@ void
+ __tcb_parse_hwcap_and_convert_at_platform (void)
+ {
+ 
+-  uint64_t h1, h2;
++  uint64_t h1, h2, h3, h4;
+ 
+   /* Read AT_PLATFORM string from auxv and convert it to a number.  */
+   __tcb.at_platform = _dl_string_platform (GLRO (dl_platform));
+@@ -39,6 +39,8 @@ __tcb_parse_hwcap_and_convert_at_platform (void)
+   /* Read HWCAP and HWCAP2 from auxv.  */
+   h1 = GLRO (dl_hwcap);
+   h2 = GLRO (dl_hwcap2);
++  h3 = GLRO (dl_hwcap3);
++  h4 = GLRO (dl_hwcap4);
+ 
+   /* hwcap contains only the latest supported ISA, the code checks which is
+      and fills the previous supported ones.  */
+@@ -64,13 +66,16 @@ __tcb_parse_hwcap_and_convert_at_platform (void)
+   else if (h1 & PPC_FEATURE_POWER5)
+     h1 |= PPC_FEATURE_POWER4;
+ 
+-  uint64_t array_hwcaps[] = { h1, h2 };
++  uint64_t array_hwcaps[] = { h1, h2, h3, h4 };
+   init_cpu_features (&GLRO(dl_powerpc_cpu_features), array_hwcaps);
+ 
+   /* Consolidate both HWCAP and HWCAP2 into a single doubleword so that
+      we can read both in a single load later.  */
+   __tcb.hwcap = (h1 << 32) | (h2 & 0xffffffff);
+-  __tcb.hwcap_extn = 0x0;
++
++  /* Consolidate both HWCAP3 and HWCAP4 into a single doubleword so that
++     we can read both in a single load later.  */
++  __tcb.hwcap_extn = (h3 << 32) | (h4 & 0xffffffff);
+ 
+ }
+ #if IS_IN (rtld)
+diff --git a/sysdeps/unix/sysv/linux/dl-parse_auxv.h b/sysdeps/unix/sysv/linux/dl-parse_auxv.h
+index e3d758b163c619df..ea2a58ecb1668774 100644
+--- a/sysdeps/unix/sysv/linux/dl-parse_auxv.h
++++ b/sysdeps/unix/sysv/linux/dl-parse_auxv.h
+@@ -47,6 +47,8 @@ void _dl_parse_auxv (ElfW(auxv_t) *av, dl_parse_auxv_t auxv_values)
+   GLRO(dl_platform) = (void *) auxv_values[AT_PLATFORM];
+   GLRO(dl_hwcap) = auxv_values[AT_HWCAP];
+   GLRO(dl_hwcap2) = auxv_values[AT_HWCAP2];
++  GLRO(dl_hwcap3) = auxv_values[AT_HWCAP3];
++  GLRO(dl_hwcap4) = auxv_values[AT_HWCAP4];
+   GLRO(dl_clktck) = auxv_values[AT_CLKTCK];
+   GLRO(dl_fpu_control) = auxv_values[AT_FPUCW];
+   _dl_random = (void *) auxv_values[AT_RANDOM];
+diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c
+index ad3692d73839d7a3..e1b14e9eb34ff5cb 100644
+--- a/sysdeps/unix/sysv/linux/dl-sysdep.c
++++ b/sysdeps/unix/sysv/linux/dl-sysdep.c
+@@ -197,6 +197,8 @@ _dl_show_auxv (void)
+ 	  [AT_SYSINFO_EHDR - 2] =	{ "SYSINFO_EHDR:      0x", hex },
+ 	  [AT_RANDOM - 2] =		{ "RANDOM:            0x", hex },
+ 	  [AT_HWCAP2 - 2] =		{ "HWCAP2:            0x", hex },
++	  [AT_HWCAP3 - 2] =		{ "HWCAP3:            0x", hex },
++	  [AT_HWCAP4 - 2] =		{ "HWCAP4:            0x", hex },
+ 	  [AT_MINSIGSTKSZ - 2] =	{ "MINSIGSTKSZ:       ", dec },
+ 	  [AT_L1I_CACHESIZE - 2] =	{ "L1I_CACHESIZE:     ", dec },
+ 	  [AT_L1I_CACHEGEOMETRY - 2] =	{ "L1I_CACHEGEOMETRY: 0x", hex },
+diff --git a/sysdeps/unix/sysv/linux/powerpc/cpu-features.c b/sysdeps/unix/sysv/linux/powerpc/cpu-features.c
+index 8e8a5ec2eab7e8c6..a947d62db63965b1 100644
+--- a/sysdeps/unix/sysv/linux/powerpc/cpu-features.c
++++ b/sysdeps/unix/sysv/linux/powerpc/cpu-features.c
+@@ -94,6 +94,8 @@ init_cpu_features (struct cpu_features *cpu_features, uint64_t hwcaps[])
+      which are set by __tcb_parse_hwcap_and_convert_at_platform.  */
+   cpu_features->hwcap = hwcaps[0];
+   cpu_features->hwcap2 = hwcaps[1];
++  cpu_features->hwcap3 = hwcaps[2];
++  cpu_features->hwcap4 = hwcaps[3];
+   /* Default is to use aligned memory access on optimized function unless
+      tunables is enable, since for this case user can explicit disable
+      unaligned optimizations.  */
+diff --git a/sysdeps/unix/sysv/linux/powerpc/cpu-features.h b/sysdeps/unix/sysv/linux/powerpc/cpu-features.h
+index 1294f0b601ebf54f..e9eb6a13c8ab11d7 100644
+--- a/sysdeps/unix/sysv/linux/powerpc/cpu-features.h
++++ b/sysdeps/unix/sysv/linux/powerpc/cpu-features.h
+@@ -26,6 +26,8 @@ struct cpu_features
+   bool use_cached_memopt;
+   unsigned long int hwcap;
+   unsigned long int hwcap2;
++  unsigned long int hwcap3;
++  unsigned long int hwcap4;
+ };
+ 
+ static const char hwcap_names[] = {
+diff --git a/sysdeps/unix/sysv/linux/powerpc/libc-start.c b/sysdeps/unix/sysv/linux/powerpc/libc-start.c
+index a4705daf1cdea4de..6a00cd88cd64b992 100644
+--- a/sysdeps/unix/sysv/linux/powerpc/libc-start.c
++++ b/sysdeps/unix/sysv/linux/powerpc/libc-start.c
+@@ -87,6 +87,12 @@ __libc_start_main_impl (int argc, char **argv,
+       case AT_HWCAP2:
+ 	_dl_hwcap2 = (unsigned long int) av->a_un.a_val;
+ 	break;
++      case AT_HWCAP3:
++	_dl_hwcap3 = (unsigned long int) av->a_un.a_val;
++	break;
++      case AT_HWCAP4:
++	_dl_hwcap4 = (unsigned long int) av->a_un.a_val;
++	break;
+       case AT_PLATFORM:
+ 	_dl_platform = (void *) av->a_un.a_val;
+ 	break;
diff --git a/glibc-upstream-2.39-9.patch b/glibc-upstream-2.39-9.patch
new file mode 100644
index 0000000..4289fa2
--- /dev/null
+++ b/glibc-upstream-2.39-9.patch
@@ -0,0 +1,172 @@
+commit aad45c8ac30aa1072e54903ce6aead22702f244a
+Author: Amrita H S <amritahs@linux.ibm.com>
+Date:   Tue Mar 19 19:08:47 2024 -0500
+
+    ﻿powerpc: Placeholder and infrastructure/build support to add Power11 related changes.
+    
+    The following three changes have been added to provide initial Power11 support.
+        1. Add the directories to hold Power11 files.
+        2. Add support to select Power11 libraries based on AT_PLATFORM.
+        3. Let submachine=power11 be set automatically.
+    
+    Reviewed-by: Florian Weimer <fweimer@redhat.com>
+    Reviewed-by: Peter Bergner <bergner@linux.ibm.com>
+    (cherry picked from commit 1ea051145612f199d8716ecdf78b084b00b5a727)
+
+diff --git a/sysdeps/powerpc/dl-procinfo.h b/sysdeps/powerpc/dl-procinfo.h
+index f8cb343877386402..b36697ba440654be 100644
+--- a/sysdeps/powerpc/dl-procinfo.h
++++ b/sysdeps/powerpc/dl-procinfo.h
+@@ -38,7 +38,7 @@
+ #define HWCAP_IMPORTANT		(PPC_FEATURE_HAS_ALTIVEC \
+ 				+ PPC_FEATURE_HAS_DFP)
+ 
+-#define _DL_PLATFORMS_COUNT	16
++#define _DL_PLATFORMS_COUNT	17
+ 
+ #define _DL_FIRST_PLATFORM	32
+ /* Mask to filter out platforms.  */
+@@ -62,6 +62,7 @@
+ #define PPC_PLATFORM_POWER8		13
+ #define PPC_PLATFORM_POWER9		14
+ #define PPC_PLATFORM_POWER10		15
++#define PPC_PLATFORM_POWER11		16
+ 
+ static inline const char *
+ __attribute__ ((unused))
+@@ -89,6 +90,11 @@ _dl_string_platform (const char *str)
+ 	      ret = _DL_FIRST_PLATFORM + PPC_PLATFORM_POWER10;
+ 	      str++;
+ 	    }
++	  else if (str[1] == '1')
++	    {
++	      ret = _DL_FIRST_PLATFORM + PPC_PLATFORM_POWER11;
++	      str++;
++	    }
+ 	  else
+ 	    return -1;
+ 	  break;
+diff --git a/sysdeps/powerpc/powerpc32/power11/Implies b/sysdeps/powerpc/powerpc32/power11/Implies
+new file mode 100644
+index 0000000000000000..051cbe0f7911c93c
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc32/power11/Implies
+@@ -0,0 +1,2 @@
++powerpc/powerpc32/power10/fpu
++powerpc/powerpc32/power10
+diff --git a/sysdeps/powerpc/powerpc32/power11/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc32/power11/fpu/multiarch/Implies
+new file mode 100644
+index 0000000000000000..58edb2861d17f504
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc32/power11/fpu/multiarch/Implies
+@@ -0,0 +1 @@
++powerpc/powerpc32/power10/fpu/multiarch
+diff --git a/sysdeps/powerpc/powerpc32/power11/multiarch/Implies b/sysdeps/powerpc/powerpc32/power11/multiarch/Implies
+new file mode 100644
+index 0000000000000000..c70f0428badbaf14
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc32/power11/multiarch/Implies
+@@ -0,0 +1 @@
++powerpc/powerpc32/power10/multiarch
+diff --git a/sysdeps/powerpc/powerpc64/be/power11/Implies b/sysdeps/powerpc/powerpc64/be/power11/Implies
+new file mode 100644
+index 0000000000000000..de481d1c13db695e
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/be/power11/Implies
+@@ -0,0 +1,2 @@
++powerpc/powerpc64/be/power10/fpu
++powerpc/powerpc64/be/power10
+diff --git a/sysdeps/powerpc/powerpc64/be/power11/fpu/Implies b/sysdeps/powerpc/powerpc64/be/power11/fpu/Implies
+new file mode 100644
+index 0000000000000000..dff0e13064ce8238
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/be/power11/fpu/Implies
+@@ -0,0 +1 @@
++powerpc/powerpc64/be/power10/fpu
+diff --git a/sysdeps/powerpc/powerpc64/be/power11/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/be/power11/fpu/multiarch/Implies
+new file mode 100644
+index 0000000000000000..c3f259e0097386a5
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/be/power11/fpu/multiarch/Implies
+@@ -0,0 +1 @@
++powerpc/powerpc64/be/power10/fpu/multiarch
+diff --git a/sysdeps/powerpc/powerpc64/be/power11/multiarch/Implies b/sysdeps/powerpc/powerpc64/be/power11/multiarch/Implies
+new file mode 100644
+index 0000000000000000..9491a394c9519d01
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/be/power11/multiarch/Implies
+@@ -0,0 +1 @@
++powerpc/powerpc64/be/power10/multiarch
+diff --git a/sysdeps/powerpc/powerpc64/le/power11/Implies b/sysdeps/powerpc/powerpc64/le/power11/Implies
+new file mode 100644
+index 0000000000000000..e18182dcc1f4c25f
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/le/power11/Implies
+@@ -0,0 +1,2 @@
++powerpc/powerpc64/le/power10/fpu
++powerpc/powerpc64/le/power10
+diff --git a/sysdeps/powerpc/powerpc64/le/power11/fpu/Implies b/sysdeps/powerpc/powerpc64/le/power11/fpu/Implies
+new file mode 100644
+index 0000000000000000..e41bd55684dbb5b8
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/le/power11/fpu/Implies
+@@ -0,0 +1 @@
++powerpc/powerpc64/le/power10/fpu
+diff --git a/sysdeps/powerpc/powerpc64/le/power11/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/le/power11/fpu/multiarch/Implies
+new file mode 100644
+index 0000000000000000..c838d5093140eae3
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/le/power11/fpu/multiarch/Implies
+@@ -0,0 +1 @@
++powerpc/powerpc64/le/power10/fpu/multiarch
+diff --git a/sysdeps/powerpc/powerpc64/le/power11/multiarch/Implies b/sysdeps/powerpc/powerpc64/le/power11/multiarch/Implies
+new file mode 100644
+index 0000000000000000..687248c3c267cd8c
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/le/power11/multiarch/Implies
+@@ -0,0 +1 @@
++powerpc/powerpc64/le/power10/multiarch
+diff --git a/sysdeps/powerpc/powerpc64/le/tst-glibc-hwcaps.c b/sysdeps/powerpc/powerpc64/le/tst-glibc-hwcaps.c
+index 77465d9133410267..65d3e69303a1c963 100644
+--- a/sysdeps/powerpc/powerpc64/le/tst-glibc-hwcaps.c
++++ b/sysdeps/powerpc/powerpc64/le/tst-glibc-hwcaps.c
+@@ -36,9 +36,11 @@ compute_level (void)
+     return 9;
+   if (strcmp (platform, "power10") == 0)
+     return 10;
++  if (strcmp (platform, "power11") == 0)
++    return 11;
+   printf ("warning: unrecognized AT_PLATFORM value: %s\n", platform);
+-  /* Assume that the new platform supports POWER10.  */
+-  return 10;
++  /* Assume that the new platform supports POWER11.  */
++  return 11;
+ }
+ 
+ static int
+diff --git a/sysdeps/powerpc/preconfigure b/sysdeps/powerpc/preconfigure
+index 4de94089a3f68532..9e5a07ab6d6767cd 100644
+--- a/sysdeps/powerpc/preconfigure
++++ b/sysdeps/powerpc/preconfigure
+@@ -58,7 +58,7 @@ fi
+ 
+     ;;
+ 
+-  a2|970|power[4-9]|power5x|power6+|power10)
++  a2|970|power[4-9]|power5x|power6+|power10|power11)
+     submachine=${archcpu}
+     if test ${libc_cv_cc_submachine+y}
+ then :
+diff --git a/sysdeps/powerpc/preconfigure.ac b/sysdeps/powerpc/preconfigure.ac
+index 6c63bd8257b7e40a..14b6dafd4a895c3b 100644
+--- a/sysdeps/powerpc/preconfigure.ac
++++ b/sysdeps/powerpc/preconfigure.ac
+@@ -46,7 +46,7 @@ case "${machine}:${submachine}" in
+     AC_CACHE_VAL(libc_cv_cc_submachine,libc_cv_cc_submachine="")
+     ;;
+ 
+-  a2|970|power[[4-9]]|power5x|power6+|power10)
++  a2|970|power[[4-9]]|power5x|power6+|power10|power11)
+     submachine=${archcpu}
+     AC_CACHE_VAL(libc_cv_cc_submachine,libc_cv_cc_submachine="")
+     ;;
diff --git a/glibc.spec b/glibc.spec
index 990213c..493b171 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -171,7 +171,7 @@ Version: %{glibcversion}
 # - It allows using the Release number without the %%dist tag in the dependency
 #   generator to make the generated requires interchangeable between Rawhide
 #   and ELN (.elnYY < .fcXX).
-%global baserelease 6
+%global baserelease 7
 Release: %{baserelease}%{?dist}
 
 # Licenses:
@@ -288,6 +288,20 @@ Patch27: glibc-upstream-2.39-4.patch
 Patch28: glibc-upstream-2.39-5.patch
 Patch29: glibc-upstream-2.39-6.patch
 Patch30: glibc-upstream-2.39-7.patch
+Patch31: glibc-upstream-2.39-8.patch
+Patch32: glibc-upstream-2.39-9.patch
+Patch33: glibc-upstream-2.39-10.patch
+Patch34: glibc-upstream-2.39-11.patch
+Patch35: glibc-upstream-2.39-12.patch
+Patch36: glibc-upstream-2.39-13.patch
+Patch37: glibc-upstream-2.39-14.patch
+Patch38: glibc-upstream-2.39-15.patch
+Patch39: glibc-upstream-2.39-16.patch
+Patch40: glibc-upstream-2.39-17.patch
+Patch41: glibc-upstream-2.39-18.patch
+Patch42: glibc-upstream-2.39-19.patch
+Patch43: glibc-upstream-2.39-20.patch
+Patch44: glibc-upstream-2.39-21.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2464,6 +2478,24 @@ update_gconv_modules_cache ()
 %endif
 
 %changelog
+* Thu Apr 04 2024 Arjun Shankar <arjun@redhat.com> - 2.39-7
+- Sync with upstream branch release/2.39/master,
+  commit 5d070d12b3a52bc44dd1b71743abc4b6243862ae:
+- x86: Expand the comment on when REP STOSB is used on memset
+- x86: Do not prefer ERMS for memset on Zen3+
+- x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
+- Add tst-gnu2-tls2mod1 to test-internal-extras
+- elf: Enable TLS descriptor tests on aarch64
+- arm: Update _dl_tlsdesc_dynamic to preserve caller-saved registers (BZ 31372)
+- Ignore undefined symbols for -mtls-dialect=gnu2
+- x86-64: Allocate state buffer space for RDI, RSI and RBX
+- x86-64: Update _dl_tlsdesc_dynamic to preserve AMX registers
+- x86: Update _dl_tlsdesc_dynamic to preserve caller-saved registers
+- x86-64: Save APX registers in ld.so trampoline
+- LoongArch: Correct {__ieee754, _}_scalb -> {__ieee754, _}_scalbf
+- powerpc: Placeholder and infrastructure/build support to add Power11 related changes.
+- powerpc: Add HWCAP3/HWCAP4 data to TCB for Power Architecture.
+
 * Tue Mar 26 2024 Florian Weimer <fweimer@redhat.com> - 2.39-6
 - Do not generate ELF dependency information for glibc32