diff --git a/glibc-upstream-2.39-10.patch b/glibc-upstream-2.39-10.patch new file mode 100644 index 0000000..858f28e --- /dev/null +++ b/glibc-upstream-2.39-10.patch @@ -0,0 +1,16 @@ +commit 983f34a1252de3ca6f2305c211d86530ea42010e +Author: caiyinyu +Date: Mon Mar 11 16:07:48 2024 +0800 + + LoongArch: Correct {__ieee754, _}_scalb -> {__ieee754, _}_scalbf + +diff --git a/sysdeps/loongarch/fpu/e_scalbf.c b/sysdeps/loongarch/fpu/e_scalbf.c +index 9f054852362e2d76..7c0395fbb5afbc58 100644 +--- a/sysdeps/loongarch/fpu/e_scalbf.c ++++ b/sysdeps/loongarch/fpu/e_scalbf.c +@@ -57,4 +57,4 @@ __ieee754_scalbf (float x, float fn) + + return x; + } +-libm_alias_finite (__ieee754_scalb, __scalb) ++libm_alias_finite (__ieee754_scalbf, __scalbf) diff --git a/glibc-upstream-2.39-11.patch b/glibc-upstream-2.39-11.patch new file mode 100644 index 0000000..d816c3c --- /dev/null +++ b/glibc-upstream-2.39-11.patch @@ -0,0 +1,80 @@ +commit 7fc8242bf87828c935ac5df5cafb9dc7ab635fd9 +Author: H.J. Lu +Date: Fri Feb 16 07:17:10 2024 -0800 + + x86-64: Save APX registers in ld.so trampoline + + Add APX registers to STATE_SAVE_MASK so that APX registers are saved in + ld.so trampoline. This fixes BZ #31371. + + Also update STATE_SAVE_OFFSET and STATE_SAVE_MASK for i386 which will + be used by i386 _dl_tlsdesc_dynamic. + Reviewed-by: Noah Goldstein + + (cherry picked from commit dfb05f8e704edac70db38c4c8ee700769d91a413) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index 85d0a8c943cbb218..837fd28734914a1c 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -21,14 +21,54 @@ + + #include + ++/* The extended state feature IDs in the state component bitmap. */ ++#define X86_XSTATE_X87_ID 0 ++#define X86_XSTATE_SSE_ID 1 ++#define X86_XSTATE_AVX_ID 2 ++#define X86_XSTATE_BNDREGS_ID 3 ++#define X86_XSTATE_BNDCFG_ID 4 ++#define X86_XSTATE_K_ID 5 ++#define X86_XSTATE_ZMM_H_ID 6 ++#define X86_XSTATE_ZMM_ID 7 ++#define X86_XSTATE_PKRU_ID 9 ++#define X86_XSTATE_TILECFG_ID 17 ++#define X86_XSTATE_TILEDATA_ID 18 ++#define X86_XSTATE_APX_F_ID 19 ++ ++#ifdef __x86_64__ + /* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need + space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be +- aligned to 16 bytes for fxsave and 64 bytes for xsave. */ +-#define STATE_SAVE_OFFSET (8 * 7 + 8) +- +-/* Save SSE, AVX, AVX512, mask and bound registers. */ +-#define STATE_SAVE_MASK \ +- ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7)) ++ aligned to 16 bytes for fxsave and 64 bytes for xsave. ++ ++ NB: Is is non-zero because of the 128-byte red-zone. Some registers ++ are saved on stack without adjusting stack pointer first. When we ++ update stack pointer to allocate more space, we need to take the ++ red-zone into account. */ ++# define STATE_SAVE_OFFSET (8 * 7 + 8) ++ ++/* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX ++ registers are mutually exclusive. */ ++# define STATE_SAVE_MASK \ ++ ((1 << X86_XSTATE_SSE_ID) \ ++ | (1 << X86_XSTATE_AVX_ID) \ ++ | (1 << X86_XSTATE_BNDREGS_ID) \ ++ | (1 << X86_XSTATE_K_ID) \ ++ | (1 << X86_XSTATE_ZMM_H_ID) \ ++ | (1 << X86_XSTATE_ZMM_ID) \ ++ | (1 << X86_XSTATE_APX_F_ID)) ++#else ++/* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386 ++ doesn't have red-zone, use 0 here. */ ++# define STATE_SAVE_OFFSET 0 ++ ++/* Save SSE, AVX, AXV512, mask and bound registers. */ ++# define STATE_SAVE_MASK \ ++ ((1 << X86_XSTATE_SSE_ID) \ ++ | (1 << X86_XSTATE_AVX_ID) \ ++ | (1 << X86_XSTATE_BNDREGS_ID) \ ++ | (1 << X86_XSTATE_K_ID) \ ++ | (1 << X86_XSTATE_ZMM_H_ID)) ++#endif + + /* Constants for bits in __x86_string_control: */ + diff --git a/glibc-upstream-2.39-12.patch b/glibc-upstream-2.39-12.patch new file mode 100644 index 0000000..74e0b72 --- /dev/null +++ b/glibc-upstream-2.39-12.patch @@ -0,0 +1,1453 @@ +commit a364304718725a31ab141936322855c76c73e35e +Author: H.J. Lu +Date: Mon Feb 26 06:37:03 2024 -0800 + + x86: Update _dl_tlsdesc_dynamic to preserve caller-saved registers + + Compiler generates the following instruction sequence for GNU2 dynamic + TLS access: + + leaq tls_var@TLSDESC(%rip), %rax + call *tls_var@TLSCALL(%rax) + + or + + leal tls_var@TLSDESC(%ebx), %eax + call *tls_var@TLSCALL(%eax) + + CALL instruction is transparent to compiler which assumes all registers, + except for EFLAGS and RAX/EAX, are unchanged after CALL. When + _dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow + path. __tls_get_addr is a normal function which doesn't preserve any + caller-saved registers. _dl_tlsdesc_dynamic saved and restored integer + caller-saved registers, but didn't preserve any other caller-saved + registers. Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE, + XSAVE and XSAVEC to save and restore all caller-saved registers. This + fixes BZ #31372. + + Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic) + to optimize elf_machine_runtime_setup. + Reviewed-by: Noah Goldstein + + (cherry picked from commit 0aac205a814a8511e98d02b91a8dc908f1c53cde) + +diff --git a/elf/Makefile b/elf/Makefile +index 5d78b659ce813ff9..c5c37a9147e69d83 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -424,6 +424,7 @@ tests += \ + tst-glibc-hwcaps-prepend \ + tst-global1 \ + tst-global2 \ ++ tst-gnu2-tls2 \ + tst-initfinilazyfail \ + tst-initorder \ + tst-initorder2 \ +@@ -846,6 +847,9 @@ modules-names += \ + tst-filterobj-flt \ + tst-finilazyfailmod \ + tst-globalmod2 \ ++ tst-gnu2-tls2mod0 \ ++ tst-gnu2-tls2mod1 \ ++ tst-gnu2-tls2mod2 \ + tst-initlazyfailmod \ + tst-initorder2a \ + tst-initorder2b \ +@@ -3044,8 +3048,22 @@ $(objpfx)tst-tlsgap.out: \ + $(objpfx)tst-tlsgap-mod0.so \ + $(objpfx)tst-tlsgap-mod1.so \ + $(objpfx)tst-tlsgap-mod2.so ++ ++$(objpfx)tst-gnu2-tls2: $(shared-thread-library) ++$(objpfx)tst-gnu2-tls2.out: \ ++ $(objpfx)tst-gnu2-tls2mod0.so \ ++ $(objpfx)tst-gnu2-tls2mod1.so \ ++ $(objpfx)tst-gnu2-tls2mod2.so ++ + ifeq (yes,$(have-mtls-dialect-gnu2)) ++# This test fails if dl_tlsdesc_dynamic doesn't preserve all caller-saved ++# registers. See https://sourceware.org/bugzilla/show_bug.cgi?id=31372 ++test-xfail-tst-gnu2-tls2 = yes ++ + CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2 + CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2 + CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2 ++CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2 ++CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2 ++CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2 + endif +diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c +new file mode 100644 +index 0000000000000000..7ac04d7f3312033e +--- /dev/null ++++ b/elf/tst-gnu2-tls2.c +@@ -0,0 +1,122 @@ ++/* Test TLSDESC relocation. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "tst-gnu2-tls2.h" ++ ++#ifndef IS_SUPPORTED ++# define IS_SUPPORTED() true ++#endif ++ ++/* An architecture can define it to clobber caller-saved registers in ++ malloc below to verify that the implicit TLSDESC call won't change ++ caller-saved registers. */ ++#ifndef PREPARE_MALLOC ++# define PREPARE_MALLOC() ++#endif ++ ++extern void * __libc_malloc (size_t); ++ ++size_t malloc_counter = 0; ++ ++void * ++malloc (size_t n) ++{ ++ PREPARE_MALLOC (); ++ malloc_counter++; ++ return __libc_malloc (n); ++} ++ ++static void *mod[3]; ++#ifndef MOD ++# define MOD(i) "tst-gnu2-tls2mod" #i ".so" ++#endif ++static const char *modname[3] = { MOD(0), MOD(1), MOD(2) }; ++#undef MOD ++ ++static void ++open_mod (int i) ++{ ++ mod[i] = xdlopen (modname[i], RTLD_LAZY); ++ printf ("open %s\n", modname[i]); ++} ++ ++static void ++close_mod (int i) ++{ ++ xdlclose (mod[i]); ++ mod[i] = NULL; ++ printf ("close %s\n", modname[i]); ++} ++ ++static void ++access_mod (int i, const char *sym) ++{ ++ struct tls var = { -1, -1, -1, -1 }; ++ struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym); ++ /* Check that our malloc is called. */ ++ malloc_counter = 0; ++ struct tls *p = f (&var); ++ TEST_VERIFY (malloc_counter != 0); ++ printf ("access %s: %s() = %p\n", modname[i], sym, p); ++ TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0); ++ ++(p->a); ++} ++ ++static void * ++start (void *arg) ++{ ++ /* The DTV generation is at the last dlopen of mod0 and the ++ entry for mod1 is NULL. */ ++ ++ open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS. */ ++ ++ /* Force the slow path in GNU2 TLS descriptor call. */ ++ access_mod (1, "apply_tls"); ++ ++ return arg; ++} ++ ++static int ++do_test (void) ++{ ++ if (!IS_SUPPORTED ()) ++ return EXIT_UNSUPPORTED; ++ ++ open_mod (0); ++ open_mod (1); ++ open_mod (2); ++ close_mod (0); ++ close_mod (1); /* Create modid gap at mod1. */ ++ open_mod (0); /* Reuse modid of mod0, bump generation count. */ ++ ++ /* Create a thread where DTV of mod1 is NULL. */ ++ pthread_t t = xpthread_create (NULL, start, NULL); ++ xpthread_join (t); ++ return 0; ++} ++ ++#include +diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h +new file mode 100644 +index 0000000000000000..77964a57a352e6a4 +--- /dev/null ++++ b/elf/tst-gnu2-tls2.h +@@ -0,0 +1,36 @@ ++/* Test TLSDESC relocation. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++struct tls ++{ ++ int64_t a, b, c, d; ++}; ++ ++extern struct tls *apply_tls (struct tls *); ++ ++/* An architecture can define them to verify that clobber caller-saved ++ registers aren't changed by the implicit TLSDESC call. */ ++#ifndef BEFORE_TLSDESC_CALL ++# define BEFORE_TLSDESC_CALL() ++#endif ++ ++#ifndef AFTER_TLSDESC_CALL ++# define AFTER_TLSDESC_CALL() ++#endif +diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c +new file mode 100644 +index 0000000000000000..45556a0e173922cc +--- /dev/null ++++ b/elf/tst-gnu2-tls2mod0.c +@@ -0,0 +1,31 @@ ++/* DSO used by tst-gnu2-tls2. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include "tst-gnu2-tls2.h" ++ ++__thread struct tls tls_var0 __attribute__ ((visibility ("hidden"))); ++ ++struct tls * ++apply_tls (struct tls *p) ++{ ++ BEFORE_TLSDESC_CALL (); ++ tls_var0 = *p; ++ struct tls *ret = &tls_var0; ++ AFTER_TLSDESC_CALL (); ++ return ret; ++} +diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c +new file mode 100644 +index 0000000000000000..e10b9dbc0a7573c7 +--- /dev/null ++++ b/elf/tst-gnu2-tls2mod1.c +@@ -0,0 +1,31 @@ ++/* DSO used by tst-gnu2-tls2. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include "tst-gnu2-tls2.h" ++ ++__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden"))); ++ ++struct tls * ++apply_tls (struct tls *p) ++{ ++ BEFORE_TLSDESC_CALL (); ++ tls_var1[1] = *p; ++ struct tls *ret = &tls_var1[1]; ++ AFTER_TLSDESC_CALL (); ++ return ret; ++} +diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c +new file mode 100644 +index 0000000000000000..141af51e55b8bf34 +--- /dev/null ++++ b/elf/tst-gnu2-tls2mod2.c +@@ -0,0 +1,31 @@ ++/* DSO used by tst-gnu2-tls2. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include "tst-gnu2-tls2.h" ++ ++__thread struct tls tls_var2 __attribute__ ((visibility ("hidden"))); ++ ++struct tls * ++apply_tls (struct tls *p) ++{ ++ BEFORE_TLSDESC_CALL (); ++ tls_var2 = *p; ++ struct tls *ret = &tls_var2; ++ AFTER_TLSDESC_CALL (); ++ return ret; ++} +diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h +index fc1ef96587e1992e..50d74fe6e9aaa7bb 100644 +--- a/sysdeps/i386/dl-machine.h ++++ b/sysdeps/i386/dl-machine.h +@@ -347,7 +347,7 @@ and creates an unsatisfiable circular dependency.\n", + { + td->arg = _dl_make_tlsdesc_dynamic + (sym_map, sym->st_value + (ElfW(Word))td->arg); +- td->entry = _dl_tlsdesc_dynamic; ++ td->entry = GLRO(dl_x86_tlsdesc_dynamic); + } + else + # endif +diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h +new file mode 100644 +index 0000000000000000..36270285775016ff +--- /dev/null ++++ b/sysdeps/i386/dl-tlsdesc-dynamic.h +@@ -0,0 +1,190 @@ ++/* Thread-local storage handling in the ELF dynamic linker. i386 version. ++ Copyright (C) 2004-2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#undef REGISTER_SAVE_AREA ++ ++#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0 ++# error STATE_SAVE_ALIGNMENT must be multiple of 16 ++#endif ++ ++#if DL_RUNTIME_RESOLVE_REALIGN_STACK ++# ifdef USE_FNSAVE ++# error USE_FNSAVE shouldn't be defined ++# endif ++# ifdef USE_FXSAVE ++/* Use fxsave to save all registers. */ ++# define REGISTER_SAVE_AREA 512 ++# endif ++#else ++# ifdef USE_FNSAVE ++/* Use fnsave to save x87 FPU stack registers. */ ++# define REGISTER_SAVE_AREA 108 ++# else ++# ifndef USE_FXSAVE ++# error USE_FXSAVE must be defined ++# endif ++/* Use fxsave to save all registers. Add 12 bytes to align the stack ++ to 16 bytes. */ ++# define REGISTER_SAVE_AREA (512 + 12) ++# endif ++#endif ++ ++ .hidden _dl_tlsdesc_dynamic ++ .global _dl_tlsdesc_dynamic ++ .type _dl_tlsdesc_dynamic,@function ++ ++ /* This function is used for symbols that need dynamic TLS. ++ ++ %eax points to the TLS descriptor, such that 0(%eax) points to ++ _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct ++ tlsdesc_dynamic_arg object. It must return in %eax the offset ++ between the thread pointer and the object denoted by the ++ argument, without clobbering any registers. ++ ++ The assembly code that follows is a rendition of the following ++ C code, hand-optimized a little bit. ++ ++ptrdiff_t ++__attribute__ ((__regparm__ (1))) ++_dl_tlsdesc_dynamic (struct tlsdesc *tdp) ++{ ++ struct tlsdesc_dynamic_arg *td = tdp->arg; ++ dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET); ++ if (__builtin_expect (td->gen_count <= dtv[0].counter ++ && (dtv[td->tlsinfo.ti_module].pointer.val ++ != TLS_DTV_UNALLOCATED), ++ 1)) ++ return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset ++ - __thread_pointer; ++ ++ return ___tls_get_addr (&td->tlsinfo) - __thread_pointer; ++} ++*/ ++ cfi_startproc ++ .align 16 ++_dl_tlsdesc_dynamic: ++ /* Like all TLS resolvers, preserve call-clobbered registers. ++ We need two scratch regs anyway. */ ++ subl $32, %esp ++ cfi_adjust_cfa_offset (32) ++ movl %ecx, 20(%esp) ++ movl %edx, 24(%esp) ++ movl TLSDESC_ARG(%eax), %eax ++ movl %gs:DTV_OFFSET, %edx ++ movl TLSDESC_GEN_COUNT(%eax), %ecx ++ cmpl (%edx), %ecx ++ ja 2f ++ movl TLSDESC_MODID(%eax), %ecx ++ movl (%edx,%ecx,8), %edx ++ cmpl $-1, %edx ++ je 2f ++ movl TLSDESC_MODOFF(%eax), %eax ++ addl %edx, %eax ++1: ++ movl 20(%esp), %ecx ++ subl %gs:0, %eax ++ movl 24(%esp), %edx ++ addl $32, %esp ++ cfi_adjust_cfa_offset (-32) ++ ret ++ .p2align 4,,7 ++2: ++ cfi_adjust_cfa_offset (32) ++#if DL_RUNTIME_RESOLVE_REALIGN_STACK ++ movl %ebx, -28(%esp) ++ movl %esp, %ebx ++ cfi_def_cfa_register(%ebx) ++ and $-STATE_SAVE_ALIGNMENT, %esp ++#endif ++#ifdef REGISTER_SAVE_AREA ++ subl $REGISTER_SAVE_AREA, %esp ++# if !DL_RUNTIME_RESOLVE_REALIGN_STACK ++ cfi_adjust_cfa_offset(REGISTER_SAVE_AREA) ++# endif ++#else ++# if !DL_RUNTIME_RESOLVE_REALIGN_STACK ++# error DL_RUNTIME_RESOLVE_REALIGN_STACK must be true ++# endif ++ /* Allocate stack space of the required size to save the state. */ ++ LOAD_PIC_REG (cx) ++ subl RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp ++#endif ++#ifdef USE_FNSAVE ++ fnsave (%esp) ++#elif defined USE_FXSAVE ++ fxsave (%esp) ++#else ++ /* Save the argument for ___tls_get_addr in EAX. */ ++ movl %eax, %ecx ++ movl $TLSDESC_CALL_STATE_SAVE_MASK, %eax ++ xorl %edx, %edx ++ /* Clear the XSAVE Header. */ ++# ifdef USE_XSAVE ++ movl %edx, (512)(%esp) ++ movl %edx, (512 + 4 * 1)(%esp) ++ movl %edx, (512 + 4 * 2)(%esp) ++ movl %edx, (512 + 4 * 3)(%esp) ++# endif ++ movl %edx, (512 + 4 * 4)(%esp) ++ movl %edx, (512 + 4 * 5)(%esp) ++ movl %edx, (512 + 4 * 6)(%esp) ++ movl %edx, (512 + 4 * 7)(%esp) ++ movl %edx, (512 + 4 * 8)(%esp) ++ movl %edx, (512 + 4 * 9)(%esp) ++ movl %edx, (512 + 4 * 10)(%esp) ++ movl %edx, (512 + 4 * 11)(%esp) ++ movl %edx, (512 + 4 * 12)(%esp) ++ movl %edx, (512 + 4 * 13)(%esp) ++ movl %edx, (512 + 4 * 14)(%esp) ++ movl %edx, (512 + 4 * 15)(%esp) ++# ifdef USE_XSAVE ++ xsave (%esp) ++# else ++ xsavec (%esp) ++# endif ++ /* Restore the argument for ___tls_get_addr in EAX. */ ++ movl %ecx, %eax ++#endif ++ call HIDDEN_JUMPTARGET (___tls_get_addr) ++ /* Get register content back. */ ++#ifdef USE_FNSAVE ++ frstor (%esp) ++#elif defined USE_FXSAVE ++ fxrstor (%esp) ++#else ++ /* Save and retore ___tls_get_addr return value stored in EAX. */ ++ movl %eax, %ecx ++ movl $TLSDESC_CALL_STATE_SAVE_MASK, %eax ++ xorl %edx, %edx ++ xrstor (%esp) ++ movl %ecx, %eax ++#endif ++#if DL_RUNTIME_RESOLVE_REALIGN_STACK ++ mov %ebx, %esp ++ cfi_def_cfa_register(%esp) ++ movl -28(%esp), %ebx ++ cfi_restore(%ebx) ++#else ++ addl $REGISTER_SAVE_AREA, %esp ++ cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA) ++#endif ++ jmp 1b ++ cfi_endproc ++ .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic ++ ++#undef STATE_SAVE_ALIGNMENT +diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S +index 90d93caa0cdb7442..f002feee56e43f71 100644 +--- a/sysdeps/i386/dl-tlsdesc.S ++++ b/sysdeps/i386/dl-tlsdesc.S +@@ -18,8 +18,27 @@ + + #include + #include ++#include ++#include + #include "tlsdesc.h" + ++#ifndef DL_STACK_ALIGNMENT ++/* Due to GCC bug: ++ ++ https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066 ++ ++ __tls_get_addr may be called with 4-byte stack alignment. Although ++ this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume ++ that stack will be always aligned at 16 bytes. */ ++# define DL_STACK_ALIGNMENT 4 ++#endif ++ ++/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align ++ stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr. */ ++#define DL_RUNTIME_RESOLVE_REALIGN_STACK \ ++ (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \ ++ || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT) ++ + .text + + /* This function is used to compute the TP offset for symbols in +@@ -65,69 +84,35 @@ _dl_tlsdesc_undefweak: + .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak + + #ifdef SHARED +- .hidden _dl_tlsdesc_dynamic +- .global _dl_tlsdesc_dynamic +- .type _dl_tlsdesc_dynamic,@function +- +- /* This function is used for symbols that need dynamic TLS. +- +- %eax points to the TLS descriptor, such that 0(%eax) points to +- _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct +- tlsdesc_dynamic_arg object. It must return in %eax the offset +- between the thread pointer and the object denoted by the +- argument, without clobbering any registers. +- +- The assembly code that follows is a rendition of the following +- C code, hand-optimized a little bit. +- +-ptrdiff_t +-__attribute__ ((__regparm__ (1))) +-_dl_tlsdesc_dynamic (struct tlsdesc *tdp) +-{ +- struct tlsdesc_dynamic_arg *td = tdp->arg; +- dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET); +- if (__builtin_expect (td->gen_count <= dtv[0].counter +- && (dtv[td->tlsinfo.ti_module].pointer.val +- != TLS_DTV_UNALLOCATED), +- 1)) +- return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset +- - __thread_pointer; +- +- return ___tls_get_addr (&td->tlsinfo) - __thread_pointer; +-} +-*/ +- cfi_startproc +- .align 16 +-_dl_tlsdesc_dynamic: +- /* Like all TLS resolvers, preserve call-clobbered registers. +- We need two scratch regs anyway. */ +- subl $28, %esp +- cfi_adjust_cfa_offset (28) +- movl %ecx, 20(%esp) +- movl %edx, 24(%esp) +- movl TLSDESC_ARG(%eax), %eax +- movl %gs:DTV_OFFSET, %edx +- movl TLSDESC_GEN_COUNT(%eax), %ecx +- cmpl (%edx), %ecx +- ja .Lslow +- movl TLSDESC_MODID(%eax), %ecx +- movl (%edx,%ecx,8), %edx +- cmpl $-1, %edx +- je .Lslow +- movl TLSDESC_MODOFF(%eax), %eax +- addl %edx, %eax +-.Lret: +- movl 20(%esp), %ecx +- subl %gs:0, %eax +- movl 24(%esp), %edx +- addl $28, %esp +- cfi_adjust_cfa_offset (-28) +- ret +- .p2align 4,,7 +-.Lslow: +- cfi_adjust_cfa_offset (28) +- call HIDDEN_JUMPTARGET (___tls_get_addr) +- jmp .Lret +- cfi_endproc +- .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic ++# define USE_FNSAVE ++# define MINIMUM_ALIGNMENT 4 ++# define STATE_SAVE_ALIGNMENT 4 ++# define _dl_tlsdesc_dynamic _dl_tlsdesc_dynamic_fnsave ++# include "dl-tlsdesc-dynamic.h" ++# undef _dl_tlsdesc_dynamic ++# undef MINIMUM_ALIGNMENT ++# undef USE_FNSAVE ++ ++# define MINIMUM_ALIGNMENT 16 ++ ++# define USE_FXSAVE ++# define STATE_SAVE_ALIGNMENT 16 ++# define _dl_tlsdesc_dynamic _dl_tlsdesc_dynamic_fxsave ++# include "dl-tlsdesc-dynamic.h" ++# undef _dl_tlsdesc_dynamic ++# undef USE_FXSAVE ++ ++# define USE_XSAVE ++# define STATE_SAVE_ALIGNMENT 64 ++# define _dl_tlsdesc_dynamic _dl_tlsdesc_dynamic_xsave ++# include "dl-tlsdesc-dynamic.h" ++# undef _dl_tlsdesc_dynamic ++# undef USE_XSAVE ++ ++# define USE_XSAVEC ++# define STATE_SAVE_ALIGNMENT 64 ++# define _dl_tlsdesc_dynamic _dl_tlsdesc_dynamic_xsavec ++# include "dl-tlsdesc-dynamic.h" ++# undef _dl_tlsdesc_dynamic ++# undef USE_XSAVEC + #endif /* SHARED */ +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 4d50b327b55ffd65..992aabe43ec60abf 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -1,5 +1,5 @@ + ifeq ($(subdir),csu) +-gen-as-const-headers += cpu-features-offsets.sym ++gen-as-const-headers += cpu-features-offsets.sym features-offsets.sym + endif + + ifeq ($(subdir),elf) +@@ -86,6 +86,11 @@ endif + tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512F + tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV) + tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd) ++ ++CFLAGS-tst-gnu2-tls2.c += -msse ++CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell ++CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell ++CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell + endif + + ifeq ($(subdir),math) +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 25e6622a79bb969f..835113b42f924b83 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -27,8 +27,13 @@ + extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) + attribute_hidden; + +-#if defined SHARED && defined __x86_64__ +-# include ++#if defined SHARED ++extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden; ++extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden; ++extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden; ++ ++# ifdef __x86_64__ ++# include + + static void + TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp) +@@ -47,6 +52,15 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp) + : plt_rewrite_jmp); + } + } ++# else ++extern void _dl_tlsdesc_dynamic_fnsave (void) attribute_hidden; ++# endif ++#endif ++ ++#ifdef __x86_64__ ++extern void _dl_runtime_resolve_fxsave (void) attribute_hidden; ++extern void _dl_runtime_resolve_xsave (void) attribute_hidden; ++extern void _dl_runtime_resolve_xsavec (void) attribute_hidden; + #endif + + #ifdef __LP64__ +@@ -1130,6 +1144,44 @@ no_cpuid: + TUNABLE_CALLBACK (set_x86_shstk)); + #endif + ++ if (GLRO(dl_x86_cpu_features).xsave_state_size != 0) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)) ++ { ++#ifdef __x86_64__ ++ GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec; ++#endif ++#ifdef SHARED ++ GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec; ++#endif ++ } ++ else ++ { ++#ifdef __x86_64__ ++ GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave; ++#endif ++#ifdef SHARED ++ GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave; ++#endif ++ } ++ } ++ else ++ { ++#ifdef __x86_64__ ++ GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave; ++# ifdef SHARED ++ GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave; ++# endif ++#else ++# ifdef SHARED ++ if (CPU_FEATURE_USABLE_P (cpu_features, FXSR)) ++ GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave; ++ else ++ GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fnsave; ++# endif ++#endif ++ } ++ + #ifdef SHARED + # ifdef __x86_64__ + TUNABLE_GET (plt_rewrite, tunable_val_t *, +diff --git a/sysdeps/x86/dl-procinfo.c b/sysdeps/x86/dl-procinfo.c +index ee957b4d70050b1c..5920d4b32034501d 100644 +--- a/sysdeps/x86/dl-procinfo.c ++++ b/sysdeps/x86/dl-procinfo.c +@@ -86,3 +86,19 @@ PROCINFO_CLASS const char _dl_x86_platforms[4][9] + #else + , + #endif ++ ++#if defined SHARED && !IS_IN (ldconfig) ++# if !defined PROCINFO_DECL ++ ._dl_x86_tlsdesc_dynamic ++# else ++PROCINFO_CLASS void * _dl_x86_tlsdesc_dynamic ++# endif ++# ifndef PROCINFO_DECL ++= NULL ++# endif ++# ifdef PROCINFO_DECL ++; ++# else ++, ++# endif ++#endif +diff --git a/sysdeps/x86_64/features-offsets.sym b/sysdeps/x86/features-offsets.sym +similarity index 89% +rename from sysdeps/x86_64/features-offsets.sym +rename to sysdeps/x86/features-offsets.sym +index 9e4be3393a60fd92..77e990c7053a38bf 100644 +--- a/sysdeps/x86_64/features-offsets.sym ++++ b/sysdeps/x86/features-offsets.sym +@@ -3,4 +3,6 @@ + #include + + RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET offsetof (struct rtld_global_ro, _dl_x86_cpu_features) ++#ifdef __x86_64__ + RTLD_GLOBAL_DL_X86_FEATURE_1_OFFSET offsetof (struct rtld_global, _dl_x86_feature_1) ++#endif +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index 837fd28734914a1c..485cad9c0283b334 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -70,6 +70,12 @@ + | (1 << X86_XSTATE_ZMM_H_ID)) + #endif + ++/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL. ++ Compiler assumes that all registers, including x87 FPU stack registers, ++ are unchanged after CALL, except for EFLAGS and RAX/EAX. */ ++#define TLSDESC_CALL_STATE_SAVE_MASK \ ++ (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID)) ++ + /* Constants for bits in __x86_string_control: */ + + /* Avoid short distance REP MOVSB. */ +diff --git a/sysdeps/x86/tst-gnu2-tls2.c b/sysdeps/x86/tst-gnu2-tls2.c +new file mode 100644 +index 0000000000000000..de900a423bb70321 +--- /dev/null ++++ b/sysdeps/x86/tst-gnu2-tls2.c +@@ -0,0 +1,20 @@ ++#ifndef __x86_64__ ++#include ++ ++#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2) ++#endif ++ ++/* Clear XMM0...XMM7 */ ++#define PREPARE_MALLOC() \ ++{ \ ++ asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" ); \ ++ asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" ); \ ++ asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" ); \ ++ asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" ); \ ++ asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" ); \ ++ asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" ); \ ++ asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" ); \ ++ asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" ); \ ++} ++ ++#include +diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile +index 90f4ecfd262cfb87..e8babc9a4edbf90b 100644 +--- a/sysdeps/x86_64/Makefile ++++ b/sysdeps/x86_64/Makefile +@@ -10,7 +10,7 @@ LDFLAGS-rtld += -Wl,-z,nomark-plt + endif + + ifeq ($(subdir),csu) +-gen-as-const-headers += features-offsets.sym link-defines.sym ++gen-as-const-headers += link-defines.sym + endif + + ifeq ($(subdir),gmon) +diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h +index 6d605d0d3293bcd6..ff5d45f7cb7cd81d 100644 +--- a/sysdeps/x86_64/dl-machine.h ++++ b/sysdeps/x86_64/dl-machine.h +@@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], + int lazy, int profile) + { + Elf64_Addr *got; +- extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden; +- extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden; +- extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden; + extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden; +@@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], + /* Identify this shared object. */ + *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l; + +- const struct cpu_features* cpu_features = __get_cpu_features (); +- + #ifdef SHARED + /* The got[2] entry contains the address of a function which gets + called to get the address of a so far unresolved function and +@@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], + end in this function. */ + if (__glibc_unlikely (profile)) + { ++ const struct cpu_features* cpu_features = __get_cpu_features (); + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F)) + *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512; + else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX)) +@@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], + /* This function will get called to fix up the GOT entry + indicated by the offset on the stack, and then jump to + the resolved address. */ +- if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL +- || GLRO(dl_x86_cpu_features).xsave_state_size != 0) +- *(ElfW(Addr) *) (got + 2) +- = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC) +- ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec +- : (ElfW(Addr)) &_dl_runtime_resolve_xsave); +- else +- *(ElfW(Addr) *) (got + 2) +- = (ElfW(Addr)) &_dl_runtime_resolve_fxsave; ++ *(ElfW(Addr) *) (got + 2) ++ = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve); + } + } + +@@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n", + { + td->arg = _dl_make_tlsdesc_dynamic + (sym_map, sym->st_value + reloc->r_addend); +- td->entry = _dl_tlsdesc_dynamic; ++ td->entry = GLRO(dl_x86_tlsdesc_dynamic); + } + else + # endif +diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c +index 4d1d790fbb2f2992..06637a8154d8648f 100644 +--- a/sysdeps/x86_64/dl-procinfo.c ++++ b/sysdeps/x86_64/dl-procinfo.c +@@ -41,5 +41,21 @@ + + #include + ++#if !IS_IN (ldconfig) ++# if !defined PROCINFO_DECL && defined SHARED ++ ._dl_x86_64_runtime_resolve ++# else ++PROCINFO_CLASS void * _dl_x86_64_runtime_resolve ++# endif ++# ifndef PROCINFO_DECL ++= NULL ++# endif ++# if !defined SHARED || defined PROCINFO_DECL ++; ++# else ++, ++# endif ++#endif ++ + #undef PROCINFO_DECL + #undef PROCINFO_CLASS +diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h +new file mode 100644 +index 0000000000000000..0c2e8d5320d0bd26 +--- /dev/null ++++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h +@@ -0,0 +1,166 @@ ++/* Thread-local storage handling in the ELF dynamic linker. x86_64 version. ++ Copyright (C) 2004-2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef SECTION ++# define SECTION(p) p ++#endif ++ ++#undef REGISTER_SAVE_AREA ++#undef LOCAL_STORAGE_AREA ++#undef BASE ++ ++#include "dl-trampoline-state.h" ++ ++ .section SECTION(.text),"ax",@progbits ++ ++ .hidden _dl_tlsdesc_dynamic ++ .global _dl_tlsdesc_dynamic ++ .type _dl_tlsdesc_dynamic,@function ++ ++ /* %rax points to the TLS descriptor, such that 0(%rax) points to ++ _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct ++ tlsdesc_dynamic_arg object. It must return in %rax the offset ++ between the thread pointer and the object denoted by the ++ argument, without clobbering any registers. ++ ++ The assembly code that follows is a rendition of the following ++ C code, hand-optimized a little bit. ++ ++ptrdiff_t ++_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax")) ++{ ++ struct tlsdesc_dynamic_arg *td = tdp->arg; ++ dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET); ++ if (__builtin_expect (td->gen_count <= dtv[0].counter ++ && (dtv[td->tlsinfo.ti_module].pointer.val ++ != TLS_DTV_UNALLOCATED), ++ 1)) ++ return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset ++ - __thread_pointer; ++ ++ return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer; ++} ++*/ ++ cfi_startproc ++ .align 16 ++_dl_tlsdesc_dynamic: ++ _CET_ENDBR ++ /* Preserve call-clobbered registers that we modify. ++ We need two scratch regs anyway. */ ++ movq %rsi, -16(%rsp) ++ mov %fs:DTV_OFFSET, %RSI_LP ++ movq %rdi, -8(%rsp) ++ movq TLSDESC_ARG(%rax), %rdi ++ movq (%rsi), %rax ++ cmpq %rax, TLSDESC_GEN_COUNT(%rdi) ++ ja 2f ++ movq TLSDESC_MODID(%rdi), %rax ++ salq $4, %rax ++ movq (%rax,%rsi), %rax ++ cmpq $-1, %rax ++ je 2f ++ addq TLSDESC_MODOFF(%rdi), %rax ++1: ++ movq -16(%rsp), %rsi ++ sub %fs:0, %RAX_LP ++ movq -8(%rsp), %rdi ++ ret ++2: ++#if DL_RUNTIME_RESOLVE_REALIGN_STACK ++ movq %rbx, -24(%rsp) ++ mov %RSP_LP, %RBX_LP ++ cfi_def_cfa_register(%rbx) ++ and $-STATE_SAVE_ALIGNMENT, %RSP_LP ++#endif ++#ifdef REGISTER_SAVE_AREA ++# if DL_RUNTIME_RESOLVE_REALIGN_STACK ++ /* STATE_SAVE_OFFSET has space for 8 integer registers. But we ++ need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus ++ RBX above. */ ++ sub $(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP ++# else ++ sub $REGISTER_SAVE_AREA, %RSP_LP ++ cfi_adjust_cfa_offset(REGISTER_SAVE_AREA) ++# endif ++#else ++ /* Allocate stack space of the required size to save the state. */ ++ sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP ++#endif ++ /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9, ++ r10 and r11. */ ++ movq %rcx, REGISTER_SAVE_RCX(%rsp) ++ movq %rdx, REGISTER_SAVE_RDX(%rsp) ++ movq %r8, REGISTER_SAVE_R8(%rsp) ++ movq %r9, REGISTER_SAVE_R9(%rsp) ++ movq %r10, REGISTER_SAVE_R10(%rsp) ++ movq %r11, REGISTER_SAVE_R11(%rsp) ++#ifdef USE_FXSAVE ++ fxsave STATE_SAVE_OFFSET(%rsp) ++#else ++ movl $TLSDESC_CALL_STATE_SAVE_MASK, %eax ++ xorl %edx, %edx ++ /* Clear the XSAVE Header. */ ++# ifdef USE_XSAVE ++ movq %rdx, (STATE_SAVE_OFFSET + 512)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp) ++# endif ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp) ++ movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp) ++# ifdef USE_XSAVE ++ xsave STATE_SAVE_OFFSET(%rsp) ++# else ++ xsavec STATE_SAVE_OFFSET(%rsp) ++# endif ++#endif ++ /* %rdi already points to the tlsinfo data structure. */ ++ call HIDDEN_JUMPTARGET (__tls_get_addr) ++ # Get register content back. ++#ifdef USE_FXSAVE ++ fxrstor STATE_SAVE_OFFSET(%rsp) ++#else ++ /* Save and retore __tls_get_addr return value stored in RAX. */ ++ mov %RAX_LP, %RCX_LP ++ movl $TLSDESC_CALL_STATE_SAVE_MASK, %eax ++ xorl %edx, %edx ++ xrstor STATE_SAVE_OFFSET(%rsp) ++ mov %RCX_LP, %RAX_LP ++#endif ++ movq REGISTER_SAVE_R11(%rsp), %r11 ++ movq REGISTER_SAVE_R10(%rsp), %r10 ++ movq REGISTER_SAVE_R9(%rsp), %r9 ++ movq REGISTER_SAVE_R8(%rsp), %r8 ++ movq REGISTER_SAVE_RDX(%rsp), %rdx ++ movq REGISTER_SAVE_RCX(%rsp), %rcx ++#if DL_RUNTIME_RESOLVE_REALIGN_STACK ++ mov %RBX_LP, %RSP_LP ++ cfi_def_cfa_register(%rsp) ++ movq -24(%rsp), %rbx ++ cfi_restore(%rbx) ++#else ++ add $REGISTER_SAVE_AREA, %RSP_LP ++ cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA) ++#endif ++ jmp 1b ++ cfi_endproc ++ .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic ++ ++#undef STATE_SAVE_ALIGNMENT +diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S +index f748af2ece8de09a..ea69f5223a77e0c0 100644 +--- a/sysdeps/x86_64/dl-tlsdesc.S ++++ b/sysdeps/x86_64/dl-tlsdesc.S +@@ -18,7 +18,19 @@ + + #include + #include ++#include ++#include + #include "tlsdesc.h" ++#include "dl-trampoline-save.h" ++ ++/* Area on stack to save and restore registers used for parameter ++ passing when calling _dl_tlsdesc_dynamic. */ ++#define REGISTER_SAVE_RCX 0 ++#define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8) ++#define REGISTER_SAVE_R8 (REGISTER_SAVE_RDX + 8) ++#define REGISTER_SAVE_R9 (REGISTER_SAVE_R8 + 8) ++#define REGISTER_SAVE_R10 (REGISTER_SAVE_R9 + 8) ++#define REGISTER_SAVE_R11 (REGISTER_SAVE_R10 + 8) + + .text + +@@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak: + .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak + + #ifdef SHARED +- .hidden _dl_tlsdesc_dynamic +- .global _dl_tlsdesc_dynamic +- .type _dl_tlsdesc_dynamic,@function +- +- /* %rax points to the TLS descriptor, such that 0(%rax) points to +- _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct +- tlsdesc_dynamic_arg object. It must return in %rax the offset +- between the thread pointer and the object denoted by the +- argument, without clobbering any registers. +- +- The assembly code that follows is a rendition of the following +- C code, hand-optimized a little bit. +- +-ptrdiff_t +-_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax")) +-{ +- struct tlsdesc_dynamic_arg *td = tdp->arg; +- dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET); +- if (__builtin_expect (td->gen_count <= dtv[0].counter +- && (dtv[td->tlsinfo.ti_module].pointer.val +- != TLS_DTV_UNALLOCATED), +- 1)) +- return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset +- - __thread_pointer; +- +- return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer; +-} +-*/ +- cfi_startproc +- .align 16 +-_dl_tlsdesc_dynamic: +- _CET_ENDBR +- /* Preserve call-clobbered registers that we modify. +- We need two scratch regs anyway. */ +- movq %rsi, -16(%rsp) +- mov %fs:DTV_OFFSET, %RSI_LP +- movq %rdi, -8(%rsp) +- movq TLSDESC_ARG(%rax), %rdi +- movq (%rsi), %rax +- cmpq %rax, TLSDESC_GEN_COUNT(%rdi) +- ja .Lslow +- movq TLSDESC_MODID(%rdi), %rax +- salq $4, %rax +- movq (%rax,%rsi), %rax +- cmpq $-1, %rax +- je .Lslow +- addq TLSDESC_MODOFF(%rdi), %rax +-.Lret: +- movq -16(%rsp), %rsi +- sub %fs:0, %RAX_LP +- movq -8(%rsp), %rdi +- ret +-.Lslow: +- /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9, +- r10 and r11. Also, align the stack, that's off by 8 bytes. */ +- subq $72, %rsp +- cfi_adjust_cfa_offset (72) +- movq %rdx, 8(%rsp) +- movq %rcx, 16(%rsp) +- movq %r8, 24(%rsp) +- movq %r9, 32(%rsp) +- movq %r10, 40(%rsp) +- movq %r11, 48(%rsp) +- /* %rdi already points to the tlsinfo data structure. */ +- call HIDDEN_JUMPTARGET (__tls_get_addr) +- movq 8(%rsp), %rdx +- movq 16(%rsp), %rcx +- movq 24(%rsp), %r8 +- movq 32(%rsp), %r9 +- movq 40(%rsp), %r10 +- movq 48(%rsp), %r11 +- addq $72, %rsp +- cfi_adjust_cfa_offset (-72) +- jmp .Lret +- cfi_endproc +- .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic ++# define USE_FXSAVE ++# define STATE_SAVE_ALIGNMENT 16 ++# define _dl_tlsdesc_dynamic _dl_tlsdesc_dynamic_fxsave ++# include "dl-tlsdesc-dynamic.h" ++# undef _dl_tlsdesc_dynamic ++# undef USE_FXSAVE ++ ++# define USE_XSAVE ++# define STATE_SAVE_ALIGNMENT 64 ++# define _dl_tlsdesc_dynamic _dl_tlsdesc_dynamic_xsave ++# include "dl-tlsdesc-dynamic.h" ++# undef _dl_tlsdesc_dynamic ++# undef USE_XSAVE ++ ++# define USE_XSAVEC ++# define STATE_SAVE_ALIGNMENT 64 ++# define _dl_tlsdesc_dynamic _dl_tlsdesc_dynamic_xsavec ++# include "dl-tlsdesc-dynamic.h" ++# undef _dl_tlsdesc_dynamic ++# undef USE_XSAVEC + #endif /* SHARED */ +diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h +new file mode 100644 +index 0000000000000000..84eac4a8ac13ad86 +--- /dev/null ++++ b/sysdeps/x86_64/dl-trampoline-save.h +@@ -0,0 +1,34 @@ ++/* x86-64 PLT trampoline register save macros. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef DL_STACK_ALIGNMENT ++/* Due to GCC bug: ++ ++ https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066 ++ ++ __tls_get_addr may be called with 8-byte stack alignment. Although ++ this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume ++ that stack will be always aligned at 16 bytes. */ ++# define DL_STACK_ALIGNMENT 8 ++#endif ++ ++/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align ++ stack to 16 bytes before calling _dl_fixup. */ ++#define DL_RUNTIME_RESOLVE_REALIGN_STACK \ ++ (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \ ++ || 16 > DL_STACK_ALIGNMENT) +diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h +new file mode 100644 +index 0000000000000000..575f120797860583 +--- /dev/null ++++ b/sysdeps/x86_64/dl-trampoline-state.h +@@ -0,0 +1,51 @@ ++/* x86-64 PLT dl-trampoline state macros. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if (STATE_SAVE_ALIGNMENT % 16) != 0 ++# error STATE_SAVE_ALIGNMENT must be multiple of 16 ++#endif ++ ++#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0 ++# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT ++#endif ++ ++#if DL_RUNTIME_RESOLVE_REALIGN_STACK ++/* Local stack area before jumping to function address: RBX. */ ++# define LOCAL_STORAGE_AREA 8 ++# define BASE rbx ++# ifdef USE_FXSAVE ++/* Use fxsave to save XMM registers. */ ++# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET) ++# if (REGISTER_SAVE_AREA % 16) != 0 ++# error REGISTER_SAVE_AREA must be multiple of 16 ++# endif ++# endif ++#else ++# ifndef USE_FXSAVE ++# error USE_FXSAVE must be defined ++# endif ++/* Use fxsave to save XMM registers. */ ++# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET + 8) ++/* Local stack area before jumping to function address: All saved ++ registers. */ ++# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA ++# define BASE rsp ++# if (REGISTER_SAVE_AREA % 16) != 8 ++# error REGISTER_SAVE_AREA must be odd multiple of 8 ++# endif ++#endif +diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S +index b2e7e0f69b709ffd..87c5137837f01a63 100644 +--- a/sysdeps/x86_64/dl-trampoline.S ++++ b/sysdeps/x86_64/dl-trampoline.S +@@ -22,25 +22,7 @@ + #include + #include + #include +- +-#ifndef DL_STACK_ALIGNMENT +-/* Due to GCC bug: +- +- https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066 +- +- __tls_get_addr may be called with 8-byte stack alignment. Although +- this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume +- that stack will be always aligned at 16 bytes. We use unaligned +- 16-byte move to load and store SSE registers, which has no penalty +- on modern processors if stack is 16-byte aligned. */ +-# define DL_STACK_ALIGNMENT 8 +-#endif +- +-/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align +- stack to 16 bytes before calling _dl_fixup. */ +-#define DL_RUNTIME_RESOLVE_REALIGN_STACK \ +- (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \ +- || 16 > DL_STACK_ALIGNMENT) ++#include "dl-trampoline-save.h" + + /* Area on stack to save and restore registers used for parameter + passing when calling _dl_fixup. */ +diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h +index f55c6ea040b9d319..d9ccfb40d46d3312 100644 +--- a/sysdeps/x86_64/dl-trampoline.h ++++ b/sysdeps/x86_64/dl-trampoline.h +@@ -27,39 +27,7 @@ + # undef LOCAL_STORAGE_AREA + # undef BASE + +-# if (STATE_SAVE_ALIGNMENT % 16) != 0 +-# error STATE_SAVE_ALIGNMENT must be multiple of 16 +-# endif +- +-# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0 +-# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT +-# endif +- +-# if DL_RUNTIME_RESOLVE_REALIGN_STACK +-/* Local stack area before jumping to function address: RBX. */ +-# define LOCAL_STORAGE_AREA 8 +-# define BASE rbx +-# ifdef USE_FXSAVE +-/* Use fxsave to save XMM registers. */ +-# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET) +-# if (REGISTER_SAVE_AREA % 16) != 0 +-# error REGISTER_SAVE_AREA must be multiple of 16 +-# endif +-# endif +-# else +-# ifndef USE_FXSAVE +-# error USE_FXSAVE must be defined +-# endif +-/* Use fxsave to save XMM registers. */ +-# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET + 8) +-/* Local stack area before jumping to function address: All saved +- registers. */ +-# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA +-# define BASE rsp +-# if (REGISTER_SAVE_AREA % 16) != 8 +-# error REGISTER_SAVE_AREA must be odd multiple of 8 +-# endif +-# endif ++# include "dl-trampoline-state.h" + + .globl _dl_runtime_resolve + .hidden _dl_runtime_resolve diff --git a/glibc-upstream-2.39-13.patch b/glibc-upstream-2.39-13.patch new file mode 100644 index 0000000..0a52189 --- /dev/null +++ b/glibc-upstream-2.39-13.patch @@ -0,0 +1,496 @@ +commit 853e915fdd6ae6c5f1a7a68d2594ec8dbfef1286 +Author: H.J. Lu +Date: Wed Feb 28 12:08:03 2024 -0800 + + x86-64: Update _dl_tlsdesc_dynamic to preserve AMX registers + + _dl_tlsdesc_dynamic should also preserve AMX registers which are + caller-saved. Add X86_XSTATE_TILECFG_ID and X86_XSTATE_TILEDATA_ID + to x86-64 TLSDESC_CALL_STATE_SAVE_MASK. Compute the AMX state size + and save it in xsave_state_full_size which is only used by + _dl_tlsdesc_dynamic_xsave and _dl_tlsdesc_dynamic_xsavec. This fixes + the AMX part of BZ #31372. Tested on AMX processor. + + AMX test is enabled only for compilers with the fix for + + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114098 + + GCC 14 and GCC 11/12/13 branches have the bug fix. + Reviewed-by: Sunil K Pandey + + (cherry picked from commit 9b7091415af47082664717210ac49d51551456ab) + +diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile b/sysdeps/unix/sysv/linux/x86_64/Makefile +index 4223feb95f6fe2f5..9a1e7aa6461725af 100644 +--- a/sysdeps/unix/sysv/linux/x86_64/Makefile ++++ b/sysdeps/unix/sysv/linux/x86_64/Makefile +@@ -63,6 +63,33 @@ $(objpfx)libx86-64-isa-level%.os: $(..)/sysdeps/unix/sysv/linux/x86_64/x86-64-is + $(objpfx)libx86-64-isa-level.so: $(objpfx)libx86-64-isa-level-1.so + cp $< $@ + endif ++ ++ifeq (yes,$(have-mamx-tile)) ++tests += \ ++ tst-gnu2-tls2-amx \ ++# tests ++ ++modules-names += \ ++ tst-gnu2-tls2-amx-mod0 \ ++ tst-gnu2-tls2-amx-mod1 \ ++ tst-gnu2-tls2-amx-mod2 \ ++# modules-names ++ ++$(objpfx)tst-gnu2-tls2-amx: $(shared-thread-library) ++$(objpfx)tst-gnu2-tls2-amx.out: \ ++ $(objpfx)tst-gnu2-tls2-amx-mod0.so \ ++ $(objpfx)tst-gnu2-tls2-amx-mod1.so \ ++ $(objpfx)tst-gnu2-tls2-amx-mod2.so ++$(objpfx)tst-gnu2-tls2-amx-mod0.so: $(libsupport) ++$(objpfx)tst-gnu2-tls2-amx-mod1.so: $(libsupport) ++$(objpfx)tst-gnu2-tls2-amx-mod2.so: $(libsupport) ++ ++CFLAGS-tst-gnu2-tls2-amx.c += -mamx-tile ++CFLAGS-tst-gnu2-tls2-amx-mod0.c += -mamx-tile -mtls-dialect=gnu2 ++CFLAGS-tst-gnu2-tls2-amx-mod1.c += -mamx-tile -mtls-dialect=gnu2 ++CFLAGS-tst-gnu2-tls2-amx-mod2.c += -mamx-tile -mtls-dialect=gnu2 ++endif ++ + endif # $(subdir) == elf + + ifneq ($(enable-cet),no) +diff --git a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h +index 2f511321ad3b3ac1..ef4631bf4b2fd9aa 100644 +--- a/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h ++++ b/sysdeps/unix/sysv/linux/x86_64/include/asm/prctl.h +@@ -20,3 +20,8 @@ + # define ARCH_SHSTK_SHSTK 0x1 + # define ARCH_SHSTK_WRSS 0x2 + #endif ++ ++#ifndef ARCH_GET_XCOMP_PERM ++# define ARCH_GET_XCOMP_PERM 0x1022 ++# define ARCH_REQ_XCOMP_PERM 0x1023 ++#endif +diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c +new file mode 100644 +index 0000000000000000..2e0c7b91b7caf3ab +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod0.c +@@ -0,0 +1,2 @@ ++#include "tst-gnu2-tls2-amx.h" ++#include +diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c +new file mode 100644 +index 0000000000000000..b8a8ccf1c119d443 +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod1.c +@@ -0,0 +1,2 @@ ++#include "tst-gnu2-tls2-amx.h" ++#include +diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c +new file mode 100644 +index 0000000000000000..cdf4a8f3635b327c +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx-mod2.c +@@ -0,0 +1,2 @@ ++#include "tst-gnu2-tls2-amx.h" ++#include +diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c +new file mode 100644 +index 0000000000000000..ae4dd82556c9b2ef +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.c +@@ -0,0 +1,83 @@ ++/* Test TLSDESC relocation with AMX. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include "tst-gnu2-tls2-amx.h" ++ ++extern int arch_prctl (int, ...); ++ ++#define X86_XSTATE_TILECFG_ID 17 ++#define X86_XSTATE_TILEDATA_ID 18 ++ ++/* Initialize tile config. */ ++__attribute__ ((noinline, noclone)) ++static void ++init_tile_config (__tilecfg *tileinfo) ++{ ++ int i; ++ tileinfo->palette_id = 1; ++ tileinfo->start_row = 0; ++ ++ tileinfo->colsb[0] = MAX_ROWS; ++ tileinfo->rows[0] = MAX_ROWS; ++ ++ for (i = 1; i < 4; ++i) ++ { ++ tileinfo->colsb[i] = MAX_COLS; ++ tileinfo->rows[i] = MAX_ROWS; ++ } ++ ++ _tile_loadconfig (tileinfo); ++} ++ ++static bool ++enable_amx (void) ++{ ++ uint64_t bitmask; ++ if (arch_prctl (ARCH_GET_XCOMP_PERM, &bitmask) != 0) ++ return false; ++ ++ if ((bitmask & (1 << X86_XSTATE_TILECFG_ID)) == 0) ++ return false; ++ ++ if (arch_prctl (ARCH_REQ_XCOMP_PERM, X86_XSTATE_TILEDATA_ID) != 0) ++ return false; ++ ++ /* Load tile configuration. */ ++ __tilecfg tile_data = { 0 }; ++ init_tile_config (&tile_data); ++ ++ return true; ++} ++ ++/* An architecture can define it to clobber caller-saved registers in ++ malloc below to verify that the implicit TLSDESC call won't change ++ caller-saved registers. */ ++static void ++clear_tile_register (void) ++{ ++ _tile_zero (2); ++} ++ ++#define MOD(i) "tst-gnu2-tls2-amx-mod" #i ".so" ++#define IS_SUPPORTED() enable_amx () ++#define PREPARE_MALLOC() clear_tile_register () ++ ++#include +diff --git a/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h +new file mode 100644 +index 0000000000000000..1845a3caba43a0f1 +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/x86_64/tst-gnu2-tls2-amx.h +@@ -0,0 +1,63 @@ ++/* Test TLSDESC relocation with AMX. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++ ++#define MAX_ROWS 16 ++#define MAX_COLS 64 ++#define MAX 1024 ++#define STRIDE 64 ++ ++typedef struct __tile_config ++{ ++ uint8_t palette_id; ++ uint8_t start_row; ++ uint8_t reserved_0[14]; ++ uint16_t colsb[16]; ++ uint8_t rows[16]; ++} __tilecfg __attribute__ ((aligned (64))); ++ ++/* Initialize int8_t buffer */ ++static inline void ++init_buffer (int8_t *buf, int8_t value) ++{ ++ int rows, colsb, i, j; ++ rows = MAX_ROWS; ++ colsb = MAX_COLS; ++ ++ for (i = 0; i < rows; i++) ++ for (j = 0; j < colsb; j++) ++ buf[i * colsb + j] = value; ++} ++ ++#define BEFORE_TLSDESC_CALL() \ ++ int8_t src[MAX]; \ ++ int8_t res[MAX]; \ ++ /* Initialize src with data */ \ ++ init_buffer (src, 2); \ ++ /* Load tile rows from memory. */ \ ++ _tile_loadd (2, src, STRIDE); ++ ++#define AFTER_TLSDESC_CALL() \ ++ /* Store the tile data to memory. */ \ ++ _tile_stored (2, res, STRIDE); \ ++ _tile_release (); \ ++ TEST_VERIFY_EXIT (memcmp (src, res, sizeof (res)) == 0); +diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym +index 6a8fd298137b7f23..21fc88d6510840e6 100644 +--- a/sysdeps/x86/cpu-features-offsets.sym ++++ b/sysdeps/x86/cpu-features-offsets.sym +@@ -3,3 +3,4 @@ + #include + + XSAVE_STATE_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_size) ++XSAVE_STATE_FULL_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_full_size) +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 835113b42f924b83..d71e8d3d2e0e49f9 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -307,6 +307,8 @@ update_active (struct cpu_features *cpu_features) + __cpuid_count (0xd, 0, eax, ebx, ecx, edx); + if (ebx != 0) + { ++ /* NB: On AMX capable processors, ebx always includes AMX ++ states. */ + unsigned int xsave_state_full_size + = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64); + +@@ -320,6 +322,11 @@ update_active (struct cpu_features *cpu_features) + { + unsigned int xstate_comp_offsets[32]; + unsigned int xstate_comp_sizes[32]; ++#ifdef __x86_64__ ++ unsigned int xstate_amx_comp_offsets[32]; ++ unsigned int xstate_amx_comp_sizes[32]; ++ unsigned int amx_ecx; ++#endif + unsigned int i; + + xstate_comp_offsets[0] = 0; +@@ -327,16 +334,39 @@ update_active (struct cpu_features *cpu_features) + xstate_comp_offsets[2] = 576; + xstate_comp_sizes[0] = 160; + xstate_comp_sizes[1] = 256; ++#ifdef __x86_64__ ++ xstate_amx_comp_offsets[0] = 0; ++ xstate_amx_comp_offsets[1] = 160; ++ xstate_amx_comp_offsets[2] = 576; ++ xstate_amx_comp_sizes[0] = 160; ++ xstate_amx_comp_sizes[1] = 256; ++#endif + + for (i = 2; i < 32; i++) + { +- if ((STATE_SAVE_MASK & (1 << i)) != 0) ++ if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0) + { + __cpuid_count (0xd, i, eax, ebx, ecx, edx); +- xstate_comp_sizes[i] = eax; ++#ifdef __x86_64__ ++ /* Include this in xsave_state_full_size. */ ++ amx_ecx = ecx; ++ xstate_amx_comp_sizes[i] = eax; ++ if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0) ++ { ++ /* Exclude this from xsave_state_size. */ ++ ecx = 0; ++ xstate_comp_sizes[i] = 0; ++ } ++ else ++#endif ++ xstate_comp_sizes[i] = eax; + } + else + { ++#ifdef __x86_64__ ++ amx_ecx = 0; ++ xstate_amx_comp_sizes[i] = 0; ++#endif + ecx = 0; + xstate_comp_sizes[i] = 0; + } +@@ -349,6 +379,15 @@ update_active (struct cpu_features *cpu_features) + if ((ecx & (1 << 1)) != 0) + xstate_comp_offsets[i] + = ALIGN_UP (xstate_comp_offsets[i], 64); ++#ifdef __x86_64__ ++ xstate_amx_comp_offsets[i] ++ = (xstate_amx_comp_offsets[i - 1] ++ + xstate_amx_comp_sizes[i - 1]); ++ if ((amx_ecx & (1 << 1)) != 0) ++ xstate_amx_comp_offsets[i] ++ = ALIGN_UP (xstate_amx_comp_offsets[i], ++ 64); ++#endif + } + } + +@@ -357,6 +396,18 @@ update_active (struct cpu_features *cpu_features) + = xstate_comp_offsets[31] + xstate_comp_sizes[31]; + if (size) + { ++#ifdef __x86_64__ ++ unsigned int amx_size ++ = (xstate_amx_comp_offsets[31] ++ + xstate_amx_comp_sizes[31]); ++ amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET, ++ 64); ++ /* Set xsave_state_full_size to the compact AMX ++ state size for XSAVEC. NB: xsave_state_full_size ++ is only used in _dl_tlsdesc_dynamic_xsave and ++ _dl_tlsdesc_dynamic_xsavec. */ ++ cpu_features->xsave_state_full_size = amx_size; ++#endif + cpu_features->xsave_state_size + = ALIGN_UP (size + STATE_SAVE_OFFSET, 64); + CPU_FEATURE_SET (cpu_features, XSAVEC); +diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h +index b9bf3115b616f05f..cd7bd27cf35959fd 100644 +--- a/sysdeps/x86/include/cpu-features.h ++++ b/sysdeps/x86/include/cpu-features.h +@@ -934,6 +934,8 @@ struct cpu_features + /* The full state size for XSAVE when XSAVEC is disabled by + + GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC ++ ++ and the AMX state size when XSAVEC is available. + */ + unsigned int xsave_state_full_size; + /* Data cache size for use in memory and string routines, typically +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index 485cad9c0283b334..db8e576e91767db5 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -56,6 +56,14 @@ + | (1 << X86_XSTATE_ZMM_H_ID) \ + | (1 << X86_XSTATE_ZMM_ID) \ + | (1 << X86_XSTATE_APX_F_ID)) ++ ++/* AMX state mask. */ ++# define AMX_STATE_SAVE_MASK \ ++ ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID)) ++ ++/* States to be included in xsave_state_full_size. */ ++# define FULL_STATE_SAVE_MASK \ ++ (STATE_SAVE_MASK | AMX_STATE_SAVE_MASK) + #else + /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386 + doesn't have red-zone, use 0 here. */ +@@ -68,13 +76,17 @@ + | (1 << X86_XSTATE_BNDREGS_ID) \ + | (1 << X86_XSTATE_K_ID) \ + | (1 << X86_XSTATE_ZMM_H_ID)) ++ ++/* States to be included in xsave_state_size. */ ++# define FULL_STATE_SAVE_MASK STATE_SAVE_MASK + #endif + + /* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL. +- Compiler assumes that all registers, including x87 FPU stack registers, +- are unchanged after CALL, except for EFLAGS and RAX/EAX. */ ++ Compiler assumes that all registers, including AMX and x87 FPU ++ stack registers, are unchanged after CALL, except for EFLAGS and ++ RAX/EAX. */ + #define TLSDESC_CALL_STATE_SAVE_MASK \ +- (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID)) ++ (FULL_STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID)) + + /* Constants for bits in __x86_string_control: */ + +diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure +index 418cc4a9b862f7e0..04a534fa126a7bf7 100755 +--- a/sysdeps/x86_64/configure ++++ b/sysdeps/x86_64/configure +@@ -134,6 +134,34 @@ fi + config_vars="$config_vars + enable-cet = $enable_cet" + ++# Check if -mamx-tile works properly. ++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mamx-tile works properly" >&5 ++printf %s "checking whether -mamx-tile works properly... " >&6; } ++if test ${libc_cv_x86_have_amx_tile+y} ++then : ++ printf %s "(cached) " >&6 ++else $as_nop ++ cat > conftest.c < ++EOF ++ libc_cv_x86_have_amx_tile=no ++ if { ac_try='${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i' ++ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 ++ (eval $ac_try) 2>&5 ++ ac_status=$? ++ printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 ++ test $ac_status = 0; }; }; then ++ if grep -q __builtin_ia32_ldtilecfg conftest.i; then ++ libc_cv_x86_have_amx_tile=yes ++ fi ++ fi ++ rm -rf conftest* ++fi ++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x86_have_amx_tile" >&5 ++printf "%s\n" "$libc_cv_x86_have_amx_tile" >&6; } ++config_vars="$config_vars ++have-mamx-tile = $libc_cv_x86_have_amx_tile" ++ + test -n "$critic_missing" && as_fn_error $? " + *** $critic_missing" "$LINENO" 5 + +diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac +index d1f803c02ee67fc5..c714c47351e70390 100644 +--- a/sysdeps/x86_64/configure.ac ++++ b/sysdeps/x86_64/configure.ac +@@ -61,5 +61,20 @@ elif test $enable_cet = permissive; then + fi + LIBC_CONFIG_VAR([enable-cet], [$enable_cet]) + ++# Check if -mamx-tile works properly. ++AC_CACHE_CHECK(whether -mamx-tile works properly, ++ libc_cv_x86_have_amx_tile, [dnl ++cat > conftest.c < ++EOF ++ libc_cv_x86_have_amx_tile=no ++ if AC_TRY_COMMAND(${CC-cc} -E $CFLAGS -mamx-tile conftest.c > conftest.i); then ++ if grep -q __builtin_ia32_ldtilecfg conftest.i; then ++ libc_cv_x86_have_amx_tile=yes ++ fi ++ fi ++ rm -rf conftest*]) ++LIBC_CONFIG_VAR([have-mamx-tile], [$libc_cv_x86_have_amx_tile]) ++ + test -n "$critic_missing" && AC_MSG_ERROR([ + *** $critic_missing]) +diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h +index 0c2e8d5320d0bd26..9f02cfc3eb297ed2 100644 +--- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h ++++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h +@@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic: + # endif + #else + /* Allocate stack space of the required size to save the state. */ +- sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP ++ sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP + #endif + /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9, + r10 and r11. */ diff --git a/glibc-upstream-2.39-14.patch b/glibc-upstream-2.39-14.patch new file mode 100644 index 0000000..a49f788 --- /dev/null +++ b/glibc-upstream-2.39-14.patch @@ -0,0 +1,250 @@ +commit 354cabcb2634abe16da7a2ba5e648aac1204b58e +Author: H.J. Lu +Date: Mon Mar 18 06:40:16 2024 -0700 + + x86-64: Allocate state buffer space for RDI, RSI and RBX + + _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack. + After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define + TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX + to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to + STATE_SAVE_OFFSET(%rsp). + + +==================+<- stack frame start aligned at 8 or 16 bytes + | |<- RDI saved in the red zone + | |<- RSI saved in the red zone + | |<- RBX saved in the red zone + | |<- paddings for stack realignment of 64 bytes + |------------------|<- xsave buffer end aligned at 64 bytes + | |<- + | |<- + | |<- + |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) + | |<- 8-byte padding for 64-byte alignment + | |<- 8-byte padding for 64-byte alignment + | |<- R11 + | |<- R10 + | |<- R9 + | |<- R8 + | |<- RDX + | |<- RCX + +==================+<- RSP aligned at 64 bytes + + Define TLSDESC_CALL_REGISTER_SAVE_AREA, the total register save area size + for all integer registers by adding 24 to STATE_SAVE_OFFSET since RDI, RSI + and RBX are saved onto stack without adjusting stack pointer first, using + the red-zone. This fixes BZ #31501. + Reviewed-by: Sunil K Pandey + + (cherry picked from commit 717ebfa85c8240d32d0d19d86a484c31c55c9617) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index d71e8d3d2e0e49f9..6fe1b728c607f39e 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -310,7 +310,7 @@ update_active (struct cpu_features *cpu_features) + /* NB: On AMX capable processors, ebx always includes AMX + states. */ + unsigned int xsave_state_full_size +- = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64); ++ = ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64); + + cpu_features->xsave_state_size + = xsave_state_full_size; +@@ -400,8 +400,10 @@ update_active (struct cpu_features *cpu_features) + unsigned int amx_size + = (xstate_amx_comp_offsets[31] + + xstate_amx_comp_sizes[31]); +- amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET, +- 64); ++ amx_size ++ = ALIGN_UP ((amx_size ++ + TLSDESC_CALL_REGISTER_SAVE_AREA), ++ 64); + /* Set xsave_state_full_size to the compact AMX + state size for XSAVEC. NB: xsave_state_full_size + is only used in _dl_tlsdesc_dynamic_xsave and +@@ -409,7 +411,8 @@ update_active (struct cpu_features *cpu_features) + cpu_features->xsave_state_full_size = amx_size; + #endif + cpu_features->xsave_state_size +- = ALIGN_UP (size + STATE_SAVE_OFFSET, 64); ++ = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, ++ 64); + CPU_FEATURE_SET (cpu_features, XSAVEC); + } + } +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index db8e576e91767db5..7359149e17ccf341 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -38,14 +38,59 @@ + #ifdef __x86_64__ + /* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need + space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be +- aligned to 16 bytes for fxsave and 64 bytes for xsave. +- +- NB: Is is non-zero because of the 128-byte red-zone. Some registers +- are saved on stack without adjusting stack pointer first. When we +- update stack pointer to allocate more space, we need to take the +- red-zone into account. */ ++ aligned to 16 bytes for fxsave and 64 bytes for xsave. It is non-zero ++ because MOV, instead of PUSH, is used to save registers onto stack. ++ ++ +==================+<- stack frame start aligned at 8 or 16 bytes ++ | |<- paddings for stack realignment of 64 bytes ++ |------------------|<- xsave buffer end aligned at 64 bytes ++ | |<- ++ | |<- ++ | |<- ++ |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) ++ | |<- 8-byte padding for 64-byte alignment ++ | |<- R9 ++ | |<- R8 ++ | |<- RDI ++ | |<- RSI ++ | |<- RDX ++ | |<- RCX ++ | |<- RAX ++ +==================+<- RSP aligned at 64 bytes ++ ++ */ + # define STATE_SAVE_OFFSET (8 * 7 + 8) + ++/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning ++ stack. After realigning stack, it saves RCX, RDX, R8, R9, R10 and ++ R11. Allocate space for RDI, RSI and RBX to avoid clobbering saved ++ RDI, RSI and RBX values on stack by xsave. ++ ++ +==================+<- stack frame start aligned at 8 or 16 bytes ++ | |<- RDI saved in the red zone ++ | |<- RSI saved in the red zone ++ | |<- RBX saved in the red zone ++ | |<- paddings for stack realignment of 64 bytes ++ |------------------|<- xsave buffer end aligned at 64 bytes ++ | |<- ++ | |<- ++ | |<- ++ |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) ++ | |<- 8-byte padding for 64-byte alignment ++ | |<- 8-byte padding for 64-byte alignment ++ | |<- R11 ++ | |<- R10 ++ | |<- R9 ++ | |<- R8 ++ | |<- RDX ++ | |<- RCX ++ +==================+<- RSP aligned at 64 bytes ++ ++ Define the total register save area size for all integer registers by ++ adding 24 to STATE_SAVE_OFFSET since RDI, RSI and RBX are saved onto ++ stack without adjusting stack pointer first, using the red-zone. */ ++# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24) ++ + /* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX + registers are mutually exclusive. */ + # define STATE_SAVE_MASK \ +@@ -66,8 +111,9 @@ + (STATE_SAVE_MASK | AMX_STATE_SAVE_MASK) + #else + /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386 +- doesn't have red-zone, use 0 here. */ ++ uses PUSH to save registers onto stack, use 0 here. */ + # define STATE_SAVE_OFFSET 0 ++# define TLSDESC_CALL_REGISTER_SAVE_AREA 0 + + /* Save SSE, AVX, AXV512, mask and bound registers. */ + # define STATE_SAVE_MASK \ +diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S b/sysdeps/x86_64/tst-gnu2-tls2mod1.S +new file mode 100644 +index 0000000000000000..1d636669ba255724 +--- /dev/null ++++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S +@@ -0,0 +1,87 @@ ++/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++/* On AVX512 machines, OFFSET == 40 caused _dl_tlsdesc_dynamic_xsavec ++ to clobber %rdi, %rsi and %rbx. On Intel AVX CPUs, the state size ++ is 960 bytes and this test didn't fail. It may be due to the unused ++ last 128 bytes. On AMD AVX CPUs, the state size is 832 bytes and ++ this test might fail without the fix. */ ++#ifndef OFFSET ++# define OFFSET 40 ++#endif ++ ++ .text ++ .p2align 4 ++ .globl apply_tls ++ .type apply_tls, @function ++apply_tls: ++ cfi_startproc ++ _CET_ENDBR ++ pushq %rbp ++ cfi_def_cfa_offset (16) ++ cfi_offset (6, -16) ++ movdqu (%RDI_LP), %xmm0 ++ lea tls_var1@TLSDESC(%rip), %RAX_LP ++ mov %RSP_LP, %RBP_LP ++ cfi_def_cfa_register (6) ++ /* Align stack to 64 bytes. */ ++ and $-64, %RSP_LP ++ sub $OFFSET, %RSP_LP ++ pushq %rbx ++ /* Set %ebx to 0xbadbeef. */ ++ movl $0xbadbeef, %ebx ++ movl $0xbadbeef, %esi ++ movq %rdi, saved_rdi(%rip) ++ movq %rsi, saved_rsi(%rip) ++ call *tls_var1@TLSCALL(%RAX_LP) ++ /* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx. */ ++ cmpq saved_rdi(%rip), %rdi ++ jne L(hlt) ++ cmpq saved_rsi(%rip), %rsi ++ jne L(hlt) ++ cmpl $0xbadbeef, %ebx ++ jne L(hlt) ++ add %fs:0, %RAX_LP ++ movups %xmm0, 32(%RAX_LP) ++ movdqu 16(%RDI_LP), %xmm1 ++ mov %RAX_LP, %RBX_LP ++ movups %xmm1, 48(%RAX_LP) ++ lea 32(%RBX_LP), %RAX_LP ++ pop %rbx ++ leave ++ cfi_def_cfa (7, 8) ++ ret ++L(hlt): ++ hlt ++ cfi_endproc ++ .size apply_tls, .-apply_tls ++ .hidden tls_var1 ++ .globl tls_var1 ++ .section .tbss,"awT",@nobits ++ .align 16 ++ .type tls_var1, @object ++ .size tls_var1, 3200 ++tls_var1: ++ .zero 3200 ++ .local saved_rdi ++ .comm saved_rdi,8,8 ++ .local saved_rsi ++ .comm saved_rsi,8,8 ++ .section .note.GNU-stack,"",@progbits diff --git a/glibc-upstream-2.39-15.patch b/glibc-upstream-2.39-15.patch new file mode 100644 index 0000000..037a4d3 --- /dev/null +++ b/glibc-upstream-2.39-15.patch @@ -0,0 +1,38 @@ +commit 15aebdbada54098787715448c94701f17033fc92 +Author: Adhemerval Zanella +Date: Tue Mar 12 13:21:18 2024 -0300 + + Ignore undefined symbols for -mtls-dialect=gnu2 + + So it does not fail for arm config that defaults to -mtp=soft (which + issues a call to __aeabi_read_tp). + Reviewed-by: H.J. Lu + + (cherry picked from commit 968b0ca9440040a2b31248a572891f0e55c1ab10) + +diff --git a/configure b/configure +index 59ff1e415dda4fbf..117b48a421792eda 100755 +--- a/configure ++++ b/configure +@@ -7020,7 +7020,7 @@ void foo (void) + } + EOF + if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=gnu2 -nostdlib -nostartfiles +- conftest.c -o conftest 1>&5' ++ -shared conftest.c -o conftest 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? +diff --git a/configure.ac b/configure.ac +index 65799e56852a5356..19b88a47a52508a1 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -1297,7 +1297,7 @@ void foo (void) + } + EOF + if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=gnu2 -nostdlib -nostartfiles +- conftest.c -o conftest 1>&AS_MESSAGE_LOG_FD]) ++ -shared conftest.c -o conftest 1>&AS_MESSAGE_LOG_FD]) + then + libc_cv_mtls_dialect_gnu2=yes + else diff --git a/glibc-upstream-2.39-16.patch b/glibc-upstream-2.39-16.patch new file mode 100644 index 0000000..e5fa640 --- /dev/null +++ b/glibc-upstream-2.39-16.patch @@ -0,0 +1,446 @@ +commit a8ba52bde58c69f2b31da62ad2311f119adf6cb9 +Author: Adhemerval Zanella +Date: Tue Mar 12 13:21:19 2024 -0300 + + arm: Update _dl_tlsdesc_dynamic to preserve caller-saved registers (BZ 31372) + + ARM _dl_tlsdesc_dynamic slow path has two issues: + + * The ip/r12 is defined by AAPCS as a scratch register, and gcc is + used to save the stack pointer before on some function calls. So it + should also be saved/restored as well. It fixes the tst-gnu2-tls2. + + * None of the possible VFP registers are saved/restored. ARM has the + additional complexity to have different VFP bank sizes (depending of + VFP support by the chip). + + The tst-gnu2-tls2 test is extended to check for VFP registers, although + only for hardfp builds. Different than setcontext, _dl_tlsdesc_dynamic + does not have HWCAP_ARM_IWMMXT (I don't have a way to properly test + it and it is almost a decade since newer hardware was released). + + With this patch there is no need to mark tst-gnu2-tls2 as XFAIL. + + Checked on arm-linux-gnueabihf. + Reviewed-by: H.J. Lu + + (cherry picked from commit 64c7e344289ed085517c2227d8e3b06388242c13) + +diff --git a/config.h.in b/config.h.in +index 44a34072a47aa008..4d33c63a841d3d6d 100644 +--- a/config.h.in ++++ b/config.h.in +@@ -141,6 +141,9 @@ + /* LOONGARCH floating-point ABI for ld.so. */ + #undef LOONGARCH_ABI_FRLEN + ++/* Define whether ARM used hard-float and support VFPvX-D32. */ ++#undef HAVE_ARM_PCS_VFP_D32 ++ + /* Linux specific: minimum supported kernel version. */ + #undef __LINUX_KERNEL_VERSION + +diff --git a/elf/Makefile b/elf/Makefile +index c5c37a9147e69d83..030db4d207d3491e 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -3056,10 +3056,6 @@ $(objpfx)tst-gnu2-tls2.out: \ + $(objpfx)tst-gnu2-tls2mod2.so + + ifeq (yes,$(have-mtls-dialect-gnu2)) +-# This test fails if dl_tlsdesc_dynamic doesn't preserve all caller-saved +-# registers. See https://sourceware.org/bugzilla/show_bug.cgi?id=31372 +-test-xfail-tst-gnu2-tls2 = yes +- + CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2 + CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2 + CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2 +diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h +index 77964a57a352e6a4..1ade8151e200af68 100644 +--- a/elf/tst-gnu2-tls2.h ++++ b/elf/tst-gnu2-tls2.h +@@ -27,6 +27,10 @@ extern struct tls *apply_tls (struct tls *); + + /* An architecture can define them to verify that clobber caller-saved + registers aren't changed by the implicit TLSDESC call. */ ++#ifndef INIT_TLSDESC_CALL ++# define INIT_TLSDESC_CALL() ++#endif ++ + #ifndef BEFORE_TLSDESC_CALL + # define BEFORE_TLSDESC_CALL() + #endif +diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c +index 45556a0e173922cc..3fe3c142777abe04 100644 +--- a/elf/tst-gnu2-tls2mod0.c ++++ b/elf/tst-gnu2-tls2mod0.c +@@ -16,13 +16,14 @@ + License along with the GNU C Library; if not, see + . */ + +-#include "tst-gnu2-tls2.h" ++#include + + __thread struct tls tls_var0 __attribute__ ((visibility ("hidden"))); + + struct tls * + apply_tls (struct tls *p) + { ++ INIT_TLSDESC_CALL (); + BEFORE_TLSDESC_CALL (); + tls_var0 = *p; + struct tls *ret = &tls_var0; +diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c +index e10b9dbc0a7573c7..e2105384689e2d2e 100644 +--- a/elf/tst-gnu2-tls2mod1.c ++++ b/elf/tst-gnu2-tls2mod1.c +@@ -16,13 +16,14 @@ + License along with the GNU C Library; if not, see + . */ + +-#include "tst-gnu2-tls2.h" ++#include + + __thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden"))); + + struct tls * + apply_tls (struct tls *p) + { ++ INIT_TLSDESC_CALL (); + BEFORE_TLSDESC_CALL (); + tls_var1[1] = *p; + struct tls *ret = &tls_var1[1]; +diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c +index 141af51e55b8bf34..6d3031dc5fbc1041 100644 +--- a/elf/tst-gnu2-tls2mod2.c ++++ b/elf/tst-gnu2-tls2mod2.c +@@ -16,13 +16,14 @@ + License along with the GNU C Library; if not, see + . */ + +-#include "tst-gnu2-tls2.h" ++#include + + __thread struct tls tls_var2 __attribute__ ((visibility ("hidden"))); + + struct tls * + apply_tls (struct tls *p) + { ++ INIT_TLSDESC_CALL (); + BEFORE_TLSDESC_CALL (); + tls_var2 = *p; + struct tls *ret = &tls_var2; +diff --git a/sysdeps/arm/configure b/sysdeps/arm/configure +index 35e2918922300956..4ef4d46cbd5384e9 100644 +--- a/sysdeps/arm/configure ++++ b/sysdeps/arm/configure +@@ -187,6 +187,38 @@ else + default-abi = soft" + fi + ++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether VFP supports 32 registers" >&5 ++printf %s "checking whether VFP supports 32 registers... " >&6; } ++if test ${libc_cv_arm_pcs_vfp_d32+y} ++then : ++ printf %s "(cached) " >&6 ++else $as_nop ++ ++cat confdefs.h - <<_ACEOF >conftest.$ac_ext ++/* end confdefs.h. */ ++ ++void foo (void) ++{ ++ asm volatile ("vldr d16,=17" : : : "d16"); ++} ++ ++_ACEOF ++if ac_fn_c_try_compile "$LINENO" ++then : ++ libc_cv_arm_pcs_vfp_d32=yes ++else $as_nop ++ libc_cv_arm_pcs_vfp_d32=no ++fi ++rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ++fi ++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_arm_pcs_vfp_d32" >&5 ++printf "%s\n" "$libc_cv_arm_pcs_vfp_d32" >&6; } ++if test "$libc_cv_arm_pcs_vfp_d32" = yes ; ++then ++ printf "%s\n" "#define HAVE_ARM_PCS_VFP_D32 1" >>confdefs.h ++ ++fi ++ + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether PC-relative relocs in movw/movt work properly" >&5 + printf %s "checking whether PC-relative relocs in movw/movt work properly... " >&6; } + if test ${libc_cv_arm_pcrel_movw+y} +diff --git a/sysdeps/arm/configure.ac b/sysdeps/arm/configure.ac +index 5172e30bbe79f995..cd00ddc9d9ade5d7 100644 +--- a/sysdeps/arm/configure.ac ++++ b/sysdeps/arm/configure.ac +@@ -21,6 +21,21 @@ else + LIBC_CONFIG_VAR([default-abi], [soft]) + fi + ++AC_CACHE_CHECK([whether VFP supports 32 registers], ++ libc_cv_arm_pcs_vfp_d32, [ ++AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ ++void foo (void) ++{ ++ asm volatile ("vldr d16,=17" : : : "d16"); ++} ++]])], ++ [libc_cv_arm_pcs_vfp_d32=yes], ++ [libc_cv_arm_pcs_vfp_d32=no])]) ++if test "$libc_cv_arm_pcs_vfp_d32" = yes ; ++then ++ AC_DEFINE(HAVE_ARM_PCS_VFP_D32) ++fi ++ + AC_CACHE_CHECK([whether PC-relative relocs in movw/movt work properly], + libc_cv_arm_pcrel_movw, [ + cat > conftest.s <<\EOF +diff --git a/sysdeps/arm/dl-tlsdesc.S b/sysdeps/arm/dl-tlsdesc.S +index 764c56e70f046b03..ada106521da8971a 100644 +--- a/sysdeps/arm/dl-tlsdesc.S ++++ b/sysdeps/arm/dl-tlsdesc.S +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include "tlsdesc.h" + + .text +@@ -83,14 +84,20 @@ _dl_tlsdesc_dynamic(struct tlsdesc *tdp) + .align 2 + _dl_tlsdesc_dynamic: + /* Our calling convention is to clobber r0, r1 and the processor +- flags. All others that are modified must be saved */ +- eabi_save ({r2,r3,r4,lr}) +- push {r2,r3,r4,lr} +- cfi_adjust_cfa_offset (16) ++ flags. All others that are modified must be saved. r5 is ++ used as the hwcap value to avoid reload after __tls_get_addr ++ call. If required we will save the vector register on the slow ++ path. */ ++ eabi_save ({r2,r3,r4,r5,ip,lr}) ++ push {r2,r3,r4,r5,ip,lr} ++ cfi_adjust_cfa_offset (24) + cfi_rel_offset (r2,0) + cfi_rel_offset (r3,4) + cfi_rel_offset (r4,8) +- cfi_rel_offset (lr,12) ++ cfi_rel_offset (r5,12) ++ cfi_rel_offset (ip,16) ++ cfi_rel_offset (lr,20) ++ + ldr r1, [r0] /* td */ + GET_TLS (lr) + mov r4, r0 /* r4 = tp */ +@@ -113,22 +120,69 @@ _dl_tlsdesc_dynamic: + rsbne r0, r4, r3 + bne 2f + 1: mov r0, r1 ++ ++ /* Load the hwcap to check for vector support. */ ++ ldr r2, 3f ++ ldr r1, .Lrtld_global_ro ++0: add r2, pc, r2 ++ ldr r2, [r2, r1] ++ ldr r5, [r2, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET] ++ ++#ifdef __SOFTFP__ ++ tst r5, #HWCAP_ARM_VFP ++ beq .Lno_vfp ++#endif ++ ++ /* Store the VFP registers. Don't use VFP instructions directly ++ because this code is used in non-VFP multilibs. */ ++#define VFP_STACK_REQ (32*8 + 8) ++ sub sp, sp, VFP_STACK_REQ ++ cfi_adjust_cfa_offset (VFP_STACK_REQ) ++ mov r3, sp ++ .inst 0xeca30b20 /* vstmia r3!, {d0-d15} */ ++ tst r5, #HWCAP_ARM_VFPD32 ++ beq 4f ++ .inst 0xece30b20 /* vstmia r3!, {d16-d31} */ ++ /* Store the floating-point status register. */ ++4: .inst 0xeef12a10 /* vmrs r2, fpscr */ ++ str r2, [r3] ++.Lno_vfp: + bl __tls_get_addr + rsb r0, r4, r0 ++#ifdef __SOFTFP__ ++ tst r5, #HWCAP_ARM_VFP ++ beq 2f ++#endif ++ mov r3, sp ++ .inst 0xecb30b20 /* vldmia r3!, {d0-d15} */ ++ tst r5, #HWCAP_ARM_VFPD32 ++ beq 5f ++ .inst 0xecf30b20 /* vldmia r3!, {d16-d31} */ ++ ldr r4, [r3] ++5: .inst 0xeee14a10 /* vmsr fpscr, r4 */ ++ add sp, sp, VFP_STACK_REQ ++ cfi_adjust_cfa_offset (-VFP_STACK_REQ) ++ + 2: + #if ((defined (__ARM_ARCH_4T__) && defined (__THUMB_INTERWORK__)) \ + || defined (ARM_ALWAYS_BX)) +- pop {r2,r3,r4, lr} +- cfi_adjust_cfa_offset (-16) ++ pop {r2,r3,r4,r5,ip, lr} ++ cfi_adjust_cfa_offset (-20) + cfi_restore (lr) ++ cfi_restore (ip) ++ cfi_restore (r5) + cfi_restore (r4) + cfi_restore (r3) + cfi_restore (r2) + bx lr + #else +- pop {r2,r3,r4, pc} ++ pop {r2,r3,r4,r5,ip, pc} + #endif + eabi_fnend + cfi_endproc + .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic ++ ++3: .long _GLOBAL_OFFSET_TABLE_ - 0b - PC_OFS ++.Lrtld_global_ro: ++ .long C_SYMBOL_NAME(_rtld_global_ro)(GOT) + #endif /* SHARED */ +diff --git a/sysdeps/arm/tst-gnu2-tls2.h b/sysdeps/arm/tst-gnu2-tls2.h +new file mode 100644 +index 0000000000000000..e413ac21fb9ed9bf +--- /dev/null ++++ b/sysdeps/arm/tst-gnu2-tls2.h +@@ -0,0 +1,128 @@ ++/* Test TLSDESC relocation. ARM version. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifndef __SOFTFP__ ++ ++# ifdef HAVE_ARM_PCS_VFP_D32 ++# define SAVE_VFP_D32 \ ++ asm volatile ("vldr d16,=17" : : : "d16"); \ ++ asm volatile ("vldr d17,=18" : : : "d17"); \ ++ asm volatile ("vldr d18,=19" : : : "d18"); \ ++ asm volatile ("vldr d19,=20" : : : "d19"); \ ++ asm volatile ("vldr d20,=21" : : : "d20"); \ ++ asm volatile ("vldr d21,=22" : : : "d21"); \ ++ asm volatile ("vldr d22,=23" : : : "d22"); \ ++ asm volatile ("vldr d23,=24" : : : "d23"); \ ++ asm volatile ("vldr d24,=25" : : : "d24"); \ ++ asm volatile ("vldr d25,=26" : : : "d25"); \ ++ asm volatile ("vldr d26,=27" : : : "d26"); \ ++ asm volatile ("vldr d27,=28" : : : "d27"); \ ++ asm volatile ("vldr d28,=29" : : : "d28"); \ ++ asm volatile ("vldr d29,=30" : : : "d29"); \ ++ asm volatile ("vldr d30,=31" : : : "d30"); \ ++ asm volatile ("vldr d31,=32" : : : "d31"); ++# else ++# define SAVE_VFP_D32 ++# endif ++ ++# define INIT_TLSDESC_CALL() \ ++ unsigned long hwcap = getauxval (AT_HWCAP) ++ ++/* Set each vector register to a value from 1 to 32 before the TLS access, ++ dump to memory after TLS access, and compare with the expected values. */ ++ ++# define BEFORE_TLSDESC_CALL() \ ++ if (hwcap & HWCAP_ARM_VFP) \ ++ { \ ++ asm volatile ("vldr d0,=1" : : : "d0"); \ ++ asm volatile ("vldr d1,=2" : : : "d1"); \ ++ asm volatile ("vldr d2,=3" : : : "d1"); \ ++ asm volatile ("vldr d3,=4" : : : "d3"); \ ++ asm volatile ("vldr d4,=5" : : : "d4"); \ ++ asm volatile ("vldr d5,=6" : : : "d5"); \ ++ asm volatile ("vldr d6,=7" : : : "d6"); \ ++ asm volatile ("vldr d7,=8" : : : "d7"); \ ++ asm volatile ("vldr d8,=9" : : : "d8"); \ ++ asm volatile ("vldr d9,=10" : : : "d9"); \ ++ asm volatile ("vldr d10,=11" : : : "d10"); \ ++ asm volatile ("vldr d11,=12" : : : "d11"); \ ++ asm volatile ("vldr d12,=13" : : : "d12"); \ ++ asm volatile ("vldr d13,=14" : : : "d13"); \ ++ asm volatile ("vldr d14,=15" : : : "d14"); \ ++ asm volatile ("vldr d15,=16" : : : "d15"); \ ++ } \ ++ if (hwcap & HWCAP_ARM_VFPD32) \ ++ { \ ++ SAVE_VFP_D32 \ ++ } ++ ++# define VFP_STACK_REQ (16*8) ++# if __BYTE_ORDER == __BIG_ENDIAN ++# define DISP 7 ++# else ++# define DISP 0 ++# endif ++ ++# ifdef HAVE_ARM_PCS_VFP_D32 ++# define CHECK_VFP_D32 \ ++ char vfp[VFP_STACK_REQ]; \ ++ asm volatile ("vstmia %0, {d16-d31}\n" \ ++ : \ ++ : "r" (vfp) \ ++ : "memory"); \ ++ \ ++ char expected[VFP_STACK_REQ] = { 0 }; \ ++ for (int i = 0; i < 16; ++i) \ ++ expected[i * 8 + DISP] = i + 17; \ ++ \ ++ if (memcmp (vfp, expected, VFP_STACK_REQ) != 0) \ ++ abort (); ++# else ++# define CHECK_VFP_D32 ++# endif ++ ++# define AFTER_TLSDESC_CALL() \ ++ if (hwcap & HWCAP_ARM_VFP) \ ++ { \ ++ char vfp[VFP_STACK_REQ]; \ ++ asm volatile ("vstmia %0, {d0-d15}\n" \ ++ : \ ++ : "r" (vfp) \ ++ : "memory"); \ ++ \ ++ char expected[VFP_STACK_REQ] = { 0 }; \ ++ for (int i = 0; i < 16; ++i) \ ++ expected[i * 8 + DISP] = i + 1; \ ++ \ ++ if (memcmp (vfp, expected, VFP_STACK_REQ) != 0) \ ++ abort (); \ ++ } \ ++ if (hwcap & HWCAP_ARM_VFPD32) \ ++ { \ ++ CHECK_VFP_D32 \ ++ } ++ ++#endif /* __SOFTFP__ */ ++ ++#include_next diff --git a/glibc-upstream-2.39-17.patch b/glibc-upstream-2.39-17.patch new file mode 100644 index 0000000..050355c --- /dev/null +++ b/glibc-upstream-2.39-17.patch @@ -0,0 +1,221 @@ +commit aded2fc004e7ee85cf0b45b1382552d41e555a23 +Author: Adhemerval Zanella +Date: Tue Mar 12 13:21:20 2024 -0300 + + elf: Enable TLS descriptor tests on aarch64 + + The aarch64 uses 'trad' for traditional tls and 'desc' for tls + descriptors, but unlike other targets it defaults to 'desc'. The + gnutls2 configure check does not set aarch64 as an ABI that uses + TLS descriptors, which then disable somes stests. + + Also rename the internal machinery fron gnu2 to tls descriptors. + + Checked on aarch64-linux-gnu. + Reviewed-by: H.J. Lu + + (cherry picked from commit 3d53d18fc71c5d9ef4773b8bce04d54b80181926) + +diff --git a/configure b/configure +index 117b48a421792eda..432e40a59295cffd 100755 +--- a/configure ++++ b/configure +@@ -653,7 +653,7 @@ LIBGD + libc_cv_cc_loop_to_function + libc_cv_cc_submachine + libc_cv_cc_nofma +-libc_cv_mtls_dialect_gnu2 ++libc_cv_mtls_descriptor + libc_cv_has_glob_dat + libc_cv_fpie + libc_cv_z_execstack +@@ -4760,6 +4760,9 @@ libc_config_ok=no + # whether to use such directories. + with_fp_cond=1 + ++# A preconfigure script may define another name to TLS descriptor variant ++mtls_descriptor=gnu2 ++ + if frags=`ls -d $srcdir/sysdeps/*/preconfigure 2> /dev/null` + then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for sysdeps preconfigure fragments" >&5 +@@ -7006,9 +7009,9 @@ fi + printf "%s\n" "$libc_cv_has_glob_dat" >&6; } + + +-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for -mtls-dialect=gnu2" >&5 +-printf %s "checking for -mtls-dialect=gnu2... " >&6; } +-if test ${libc_cv_mtls_dialect_gnu2+y} ++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for tls descriptor support" >&5 ++printf %s "checking for tls descriptor support... " >&6; } ++if test ${libc_cv_mtls_descriptor+y} + then : + printf %s "(cached) " >&6 + else $as_nop +@@ -7019,7 +7022,7 @@ void foo (void) + i = 10; + } + EOF +-if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=gnu2 -nostdlib -nostartfiles ++if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=$mtls_descriptor -nostdlib -nostartfiles + -shared conftest.c -o conftest 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 +@@ -7027,17 +7030,17 @@ if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=gnu2 -nostdlib -nost + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } + then +- libc_cv_mtls_dialect_gnu2=yes ++ libc_cv_mtls_descriptor=$mtls_descriptor + else +- libc_cv_mtls_dialect_gnu2=no ++ libc_cv_mtls_descriptor=no + fi + rm -f conftest* + fi +-{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_mtls_dialect_gnu2" >&5 +-printf "%s\n" "$libc_cv_mtls_dialect_gnu2" >&6; } ++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_mtls_descriptor" >&5 ++printf "%s\n" "$libc_cv_mtls_descriptor" >&6; } + + config_vars="$config_vars +-have-mtls-dialect-gnu2 = $libc_cv_mtls_dialect_gnu2" ++have-mtls-descriptor = $libc_cv_mtls_descriptor" + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if -Wno-ignored-attributes is required for aliases" >&5 + printf %s "checking if -Wno-ignored-attributes is required for aliases... " >&6; } +diff --git a/configure.ac b/configure.ac +index 19b88a47a52508a1..bdc385d03c3dc7f5 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -442,6 +442,9 @@ libc_config_ok=no + # whether to use such directories. + with_fp_cond=1 + ++# A preconfigure script may define another name to TLS descriptor variant ++mtls_descriptor=gnu2 ++ + dnl Let sysdeps/*/preconfigure act here. + LIBC_PRECONFIGURE([$srcdir], [for sysdeps]) + +@@ -1287,7 +1290,7 @@ fi + rm -f conftest*]) + AC_SUBST(libc_cv_has_glob_dat) + +-AC_CACHE_CHECK([for -mtls-dialect=gnu2], libc_cv_mtls_dialect_gnu2, ++AC_CACHE_CHECK([for tls descriptor support], libc_cv_mtls_descriptor, + [dnl + cat > conftest.c <&AS_MESSAGE_LOG_FD]) + then +- libc_cv_mtls_dialect_gnu2=yes ++ libc_cv_mtls_descriptor=$mtls_descriptor + else +- libc_cv_mtls_dialect_gnu2=no ++ libc_cv_mtls_descriptor=no + fi + rm -f conftest*]) +-AC_SUBST(libc_cv_mtls_dialect_gnu2) +-LIBC_CONFIG_VAR([have-mtls-dialect-gnu2], [$libc_cv_mtls_dialect_gnu2]) ++AC_SUBST(libc_cv_mtls_descriptor) ++LIBC_CONFIG_VAR([have-mtls-descriptor], [$libc_cv_mtls_descriptor]) + + dnl clang emits an warning for a double alias redirection, to warn the + dnl original symbol is sed even when weak definition overrides it. +diff --git a/elf/Makefile b/elf/Makefile +index 030db4d207d3491e..69aa423c4b90127d 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -999,13 +999,13 @@ modules-names-tests = $(filter-out ifuncmod% tst-tlsmod%,\ + # For +depfiles in Makerules. + extra-test-objs += tst-auditmod17.os + +-ifeq (yes,$(have-mtls-dialect-gnu2)) ++ifneq (no,$(have-mtls-descriptor)) + tests += tst-gnu2-tls1 + modules-names += tst-gnu2-tls1mod + $(objpfx)tst-gnu2-tls1: $(objpfx)tst-gnu2-tls1mod.so + tst-gnu2-tls1mod.so-no-z-defs = yes +-CFLAGS-tst-gnu2-tls1mod.c += -mtls-dialect=gnu2 +-endif # $(have-mtls-dialect-gnu2) ++CFLAGS-tst-gnu2-tls1mod.c += -mtls-dialect=$(have-mtls-descriptor) ++endif # $(have-mtls-descriptor) + + ifeq (yes,$(have-protected-data)) + modules-names += tst-protected1moda tst-protected1modb +@@ -2972,11 +2972,11 @@ $(objpfx)tst-tls-allocation-failure-static-patched.out: \ + $(objpfx)tst-audit-tlsdesc: $(objpfx)tst-audit-tlsdesc-mod1.so \ + $(objpfx)tst-audit-tlsdesc-mod2.so \ + $(shared-thread-library) +-ifeq (yes,$(have-mtls-dialect-gnu2)) ++ifneq (no,$(have-mtls-descriptor)) + # The test is valid for all TLS types, but we want to exercise GNU2 + # TLS if possible. +-CFLAGS-tst-audit-tlsdesc-mod1.c += -mtls-dialect=gnu2 +-CFLAGS-tst-audit-tlsdesc-mod2.c += -mtls-dialect=gnu2 ++CFLAGS-tst-audit-tlsdesc-mod1.c += -mtls-dialect=$(have-mtls-descriptor) ++CFLAGS-tst-audit-tlsdesc-mod2.c += -mtls-dialect=$(have-mtls-descriptor) + endif + $(objpfx)tst-audit-tlsdesc-dlopen: $(shared-thread-library) + $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-audit-tlsdesc-mod1.so \ +@@ -3055,11 +3055,11 @@ $(objpfx)tst-gnu2-tls2.out: \ + $(objpfx)tst-gnu2-tls2mod1.so \ + $(objpfx)tst-gnu2-tls2mod2.so + +-ifeq (yes,$(have-mtls-dialect-gnu2)) +-CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2 +-CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2 +-CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2 +-CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2 +-CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2 +-CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2 ++ifneq (no,$(have-mtls-descriptor)) ++CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=$(have-mtls-descriptor) ++CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=$(have-mtls-descriptor) ++CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=$(have-mtls-descriptor) ++CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=$(have-mtls-descriptor) ++CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=$(have-mtls-descriptor) ++CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=$(have-mtls-descriptor) + endif +diff --git a/sysdeps/aarch64/preconfigure b/sysdeps/aarch64/preconfigure +index d9bd1f8558a079cb..19657b627bc84c4e 100644 +--- a/sysdeps/aarch64/preconfigure ++++ b/sysdeps/aarch64/preconfigure +@@ -2,5 +2,6 @@ case "$machine" in + aarch64*) + base_machine=aarch64 + machine=aarch64 ++ mtls_descriptor=desc + ;; + esac +diff --git a/sysdeps/arm/Makefile b/sysdeps/arm/Makefile +index d5cea717a9c201aa..619474eca94fe8e4 100644 +--- a/sysdeps/arm/Makefile ++++ b/sysdeps/arm/Makefile +@@ -13,15 +13,15 @@ $(objpfx)libgcc-stubs.a: $(objpfx)aeabi_unwind_cpp_pr1.os + lib-noranlib: $(objpfx)libgcc-stubs.a + + ifeq ($(build-shared),yes) +-ifeq (yes,$(have-mtls-dialect-gnu2)) ++ifneq (no,$(have-mtls-descriptor)) + tests += tst-armtlsdescloc tst-armtlsdescextnow tst-armtlsdescextlazy + modules-names += tst-armtlsdesclocmod + modules-names += tst-armtlsdescextlazymod tst-armtlsdescextnowmod + CPPFLAGS-tst-armtlsdescextnowmod.c += -Dstatic= + CPPFLAGS-tst-armtlsdescextlazymod.c += -Dstatic= +-CFLAGS-tst-armtlsdesclocmod.c += -mtls-dialect=gnu2 +-CFLAGS-tst-armtlsdescextnowmod.c += -mtls-dialect=gnu2 +-CFLAGS-tst-armtlsdescextlazymod.c += -mtls-dialect=gnu2 ++CFLAGS-tst-armtlsdesclocmod.c += -mtls-dialect=$(have-mtls-descriptor) ++CFLAGS-tst-armtlsdescextnowmod.c += -mtls-dialect=$(have-mtls-descriptor) ++CFLAGS-tst-armtlsdescextlazymod.c += -mtls-dialect=$(have-mtls-descriptor) + LDFLAGS-tst-armtlsdescextnowmod.so += -Wl,-z,now + tst-armtlsdescloc-ENV = LD_BIND_NOW=1 + tst-armtlsdescextnow-ENV = LD_BIND_NOW=1 diff --git a/glibc-upstream-2.39-18.patch b/glibc-upstream-2.39-18.patch new file mode 100644 index 0000000..60a31ff --- /dev/null +++ b/glibc-upstream-2.39-18.patch @@ -0,0 +1,24 @@ +commit 5a461f2949ded98d8211939f84988bc464c7b4fe +Author: Andreas Schwab +Date: Tue Mar 19 13:49:50 2024 +0100 + + Add tst-gnu2-tls2mod1 to test-internal-extras + + That allows sysdeps/x86_64/tst-gnu2-tls2mod1.S to use internal headers. + + Fixes: 717ebfa85c ("x86-64: Allocate state buffer space for RDI, RSI and RBX") + (cherry picked from commit fd7ee2e6c5eb49e4a630a9978b4d668bff6354ee) + +diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile +index e8babc9a4edbf90b..9d374a329916fc45 100644 +--- a/sysdeps/x86_64/Makefile ++++ b/sysdeps/x86_64/Makefile +@@ -210,6 +210,8 @@ tst-plt-rewrite2-ENV = GLIBC_TUNABLES=glibc.cpu.plt_rewrite=2 + $(objpfx)tst-plt-rewrite2: $(objpfx)tst-plt-rewritemod2.so + endif + ++test-internal-extras += tst-gnu2-tls2mod1 ++ + endif # $(subdir) == elf + + ifeq ($(subdir),csu) diff --git a/glibc-upstream-2.39-19.patch b/glibc-upstream-2.39-19.patch new file mode 100644 index 0000000..fa9220e --- /dev/null +++ b/glibc-upstream-2.39-19.patch @@ -0,0 +1,146 @@ +commit aa4249266e9906c4bc833e4847f4d8feef59504f +Author: Adhemerval Zanella +Date: Thu Feb 8 10:08:38 2024 -0300 + + x86: Fix Zen3/Zen4 ERMS selection (BZ 30994) + + The REP MOVSB usage on memcpy/memmove does not show much performance + improvement on Zen3/Zen4 cores compared to the vectorized loops. Also, + as from BZ 30994, if the source is aligned and the destination is not + the performance can be 20x slower. + + The performance difference is noticeable with small buffer sizes, closer + to the lower bounds limits when memcpy/memmove starts to use ERMS. The + performance of REP MOVSB is similar to vectorized instruction on the + size limit (the L2 cache). Also, there is no drawback to multiple cores + sharing the cache. + + Checked on x86_64-linux-gnu on Zen3. + Reviewed-by: H.J. Lu + + (cherry picked from commit 0c0d39fe4aeb0f69b26e76337c5dfd5530d5d44e) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index d5101615e348e5c2..f34d12846caf9422 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + long int data = -1; + long int shared = -1; + long int shared_per_thread = -1; +- long int core = -1; + unsigned int threads = 0; + unsigned long int level1_icache_size = -1; + unsigned long int level1_icache_linesize = -1; +@@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (cpu_features->basic.kind == arch_kind_intel) + { + data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); +- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); + shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); + shared_per_thread = shared; + +@@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features); + level1_dcache_linesize + = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features); +- level2_cache_size = core; ++ level2_cache_size ++ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); + level2_cache_assoc + = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features); + level2_cache_linesize +@@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + level4_cache_size + = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features); + +- get_common_cache_info (&shared, &shared_per_thread, &threads, core); ++ get_common_cache_info (&shared, &shared_per_thread, &threads, ++ level2_cache_size); + } + else if (cpu_features->basic.kind == arch_kind_zhaoxin) + { + data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); +- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); + shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); + shared_per_thread = shared; + +@@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + level1_dcache_size = data; + level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC); + level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE); +- level2_cache_size = core; ++ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); + level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC); + level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE); + level3_cache_size = shared; + level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC); + level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE); + +- get_common_cache_info (&shared, &shared_per_thread, &threads, core); ++ get_common_cache_info (&shared, &shared_per_thread, &threads, ++ level2_cache_size); + } + else if (cpu_features->basic.kind == arch_kind_amd) + { + data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); +- core = handle_amd (_SC_LEVEL2_CACHE_SIZE); + shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); + + level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE); +@@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + level1_dcache_size = data; + level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC); + level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE); +- level2_cache_size = core; ++ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);; + level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC); + level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE); + level3_cache_size = shared; +@@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (shared <= 0) + { + /* No shared L3 cache. All we have is the L2 cache. */ +- shared = core; ++ shared = level2_cache_size; + } + else if (cpu_features->basic.family < 0x17) + { + /* Account for exclusive L2 and L3 caches. */ +- shared += core; ++ shared += level2_cache_size; + } + + shared_per_thread = shared; +@@ -987,6 +986,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) + rep_movsb_threshold = 2112; + ++ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of ++ cases slower than the vectorized path (and for some alignments, ++ it is really slow, check BZ #30994). */ ++ if (cpu_features->basic.kind == arch_kind_amd) ++ rep_movsb_threshold = non_temporal_threshold; ++ + /* The default threshold to use Enhanced REP STOSB. */ + unsigned long int rep_stosb_threshold = 2048; + +@@ -1028,16 +1033,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + SIZE_MAX); + + unsigned long int rep_movsb_stop_threshold; +- /* ERMS feature is implemented from AMD Zen3 architecture and it is +- performing poorly for data above L2 cache size. Henceforth, adding +- an upper bound threshold parameter to limit the usage of Enhanced +- REP MOVSB operations and setting its value to L2 cache size. */ +- if (cpu_features->basic.kind == arch_kind_amd) +- rep_movsb_stop_threshold = core; + /* Setting the upper bound of ERMS to the computed value of +- non-temporal threshold for architectures other than AMD. */ +- else +- rep_movsb_stop_threshold = non_temporal_threshold; ++ non-temporal threshold for all architectures. */ ++ rep_movsb_stop_threshold = non_temporal_threshold; + + cpu_features->data_cache_size = data; + cpu_features->shared_cache_size = shared; diff --git a/glibc-upstream-2.39-20.patch b/glibc-upstream-2.39-20.patch new file mode 100644 index 0000000..3aac665 --- /dev/null +++ b/glibc-upstream-2.39-20.patch @@ -0,0 +1,30 @@ +commit 6484a92698039c4a7a510f0214e22d067b0d78b3 +Author: Adhemerval Zanella +Date: Thu Feb 8 10:08:39 2024 -0300 + + x86: Do not prefer ERMS for memset on Zen3+ + + For AMD Zen3+ architecture, the performance of the vectorized loop is + slightly better than ERMS. + + Checked on x86_64-linux-gnu on Zen3. + Reviewed-by: H.J. Lu + + (cherry picked from commit 272708884cb750f12f5c74a00e6620c19dc6d567) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index f34d12846caf9422..5a98f70364220da4 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -1021,6 +1021,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + minimum value is fixed. */ + rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, + long int, NULL); ++ if (cpu_features->basic.kind == arch_kind_amd ++ && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold)) ++ /* For AMD Zen3+ architecture, the performance of the vectorized loop is ++ slightly better than ERMS. */ ++ rep_stosb_threshold = SIZE_MAX; + + TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); diff --git a/glibc-upstream-2.39-21.patch b/glibc-upstream-2.39-21.patch new file mode 100644 index 0000000..6ceb90c --- /dev/null +++ b/glibc-upstream-2.39-21.patch @@ -0,0 +1,24 @@ +commit 5d070d12b3a52bc44dd1b71743abc4b6243862ae +Author: Adhemerval Zanella +Date: Thu Feb 8 10:08:40 2024 -0300 + + x86: Expand the comment on when REP STOSB is used on memset + + Reviewed-by: H.J. Lu + (cherry picked from commit 491e55beab7457ed310a4a47496f4a333c5d1032) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 9984c3ca0fafab6a..97839a22483b0613 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -21,7 +21,9 @@ + 2. If size is less than VEC, use integer register stores. + 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. + 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. +- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with ++ 5. On machines ERMS feature, if size is greater or equal than ++ __x86_rep_stosb_threshold then REP STOSB will be used. ++ 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with + 4 VEC stores and store 4 * VEC at a time until done. */ + + #include diff --git a/glibc-upstream-2.39-8.patch b/glibc-upstream-2.39-8.patch new file mode 100644 index 0000000..653d5a1 --- /dev/null +++ b/glibc-upstream-2.39-8.patch @@ -0,0 +1,284 @@ +commit ee7f4c54e19738c2c27d3846e1e9b3595c89221f +Author: Manjunath Matti +Date: Tue Mar 19 15:29:48 2024 -0500 + + powerpc: Add HWCAP3/HWCAP4 data to TCB for Power Architecture. + + This patch adds a new feature for powerpc. In order to get faster + access to the HWCAP3/HWCAP4 masks, similar to HWCAP/HWCAP2 (i.e. for + implementing __builtin_cpu_supports() in GCC) without the overhead of + reading them from the auxiliary vector, we now reserve space for them + in the TCB. + + Suggested-by: Peter Bergner + Reviewed-by: Peter Bergner + (cherry picked from commit 3ab9b88e2ac91062b6d493fe32bd101a55006c6a) + +diff --git a/elf/dl-diagnostics.c b/elf/dl-diagnostics.c +index 7345ebc4e586883f..aaf67b87e81b04c8 100644 +--- a/elf/dl-diagnostics.c ++++ b/elf/dl-diagnostics.c +@@ -235,6 +235,8 @@ _dl_print_diagnostics (char **environ) + _dl_diagnostics_print_labeled_value ("dl_hwcap", GLRO (dl_hwcap)); + _dl_diagnostics_print_labeled_value ("dl_hwcap_important", HWCAP_IMPORTANT); + _dl_diagnostics_print_labeled_value ("dl_hwcap2", GLRO (dl_hwcap2)); ++ _dl_diagnostics_print_labeled_value ("dl_hwcap3", GLRO (dl_hwcap3)); ++ _dl_diagnostics_print_labeled_value ("dl_hwcap4", GLRO (dl_hwcap4)); + _dl_diagnostics_print_labeled_string + ("dl_hwcaps_subdirs", _dl_hwcaps_subdirs); + _dl_diagnostics_print_labeled_value +diff --git a/elf/dl-support.c b/elf/dl-support.c +index 2f502c8b0d27b784..451932dd03e971b8 100644 +--- a/elf/dl-support.c ++++ b/elf/dl-support.c +@@ -158,6 +158,8 @@ const ElfW(Phdr) *_dl_phdr; + size_t _dl_phnum; + uint64_t _dl_hwcap; + uint64_t _dl_hwcap2; ++uint64_t _dl_hwcap3; ++uint64_t _dl_hwcap4; + + enum dso_sort_algorithm _dl_dso_sort_algo; + +diff --git a/elf/elf.h b/elf/elf.h +index 455731663c6ed339..1c394c64cd5c66ed 100644 +--- a/elf/elf.h ++++ b/elf/elf.h +@@ -1234,6 +1234,10 @@ typedef struct + #define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size. */ + #define AT_RSEQ_ALIGN 28 /* rseq allocation alignment. */ + ++/* More machine-dependent hints about processor capabilities. */ ++#define AT_HWCAP3 29 /* extension of AT_HWCAP. */ ++#define AT_HWCAP4 30 /* extension of AT_HWCAP. */ ++ + #define AT_EXECFN 31 /* Filename of executable. */ + + /* Pointer to the global system page used for system calls and other +diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h +index 117c901ccc5c5f0b..50f58a60e3f02330 100644 +--- a/sysdeps/generic/ldsodefs.h ++++ b/sysdeps/generic/ldsodefs.h +@@ -646,6 +646,8 @@ struct rtld_global_ro + /* Mask for more hardware capabilities that are available on some + platforms. */ + EXTERN uint64_t _dl_hwcap2; ++ EXTERN uint64_t _dl_hwcap3; ++ EXTERN uint64_t _dl_hwcap4; + + EXTERN enum dso_sort_algorithm _dl_dso_sort_algo; + +diff --git a/sysdeps/powerpc/dl-procinfo.c b/sysdeps/powerpc/dl-procinfo.c +index a76bb6e5b0895e3f..8cf00aa7e359bb6a 100644 +--- a/sysdeps/powerpc/dl-procinfo.c ++++ b/sysdeps/powerpc/dl-procinfo.c +@@ -38,6 +38,10 @@ + needed. + */ + ++/* The total number of available bits (including those prior to ++ _DL_HWCAP_FIRST). Some of these bits might not be used. */ ++#define _DL_HWCAP_COUNT 128 ++ + #ifndef PROCINFO_CLASS + # define PROCINFO_CLASS + #endif +@@ -61,7 +65,7 @@ PROCINFO_CLASS struct cpu_features _dl_powerpc_cpu_features + #if !defined PROCINFO_DECL && defined SHARED + ._dl_powerpc_cap_flags + #else +-PROCINFO_CLASS const char _dl_powerpc_cap_flags[64][15] ++PROCINFO_CLASS const char _dl_powerpc_cap_flags[_DL_HWCAP_COUNT][15] + #endif + #ifndef PROCINFO_DECL + = { +diff --git a/sysdeps/powerpc/dl-procinfo.h b/sysdeps/powerpc/dl-procinfo.h +index 68f424109501aaef..f8cb343877386402 100644 +--- a/sysdeps/powerpc/dl-procinfo.h ++++ b/sysdeps/powerpc/dl-procinfo.h +@@ -22,16 +22,17 @@ + #include + #include /* This defines the PPC_FEATURE[2]_* macros. */ + +-/* The total number of available bits (including those prior to +- _DL_HWCAP_FIRST). Some of these bits might not be used. */ +-#define _DL_HWCAP_COUNT 64 ++/* Feature masks are all 32-bits in size. */ ++#define _DL_HWCAP_SIZE 32 + +-/* Features started at bit 31 and decremented as new features were added. */ +-#define _DL_HWCAP_LAST 31 ++/* AT_HWCAP2 feature strings follow the AT_HWCAP feature strings. */ ++#define _DL_HWCAP2_OFFSET _DL_HWCAP_SIZE + +-/* AT_HWCAP2 features started at bit 31 and decremented as new features were +- added. HWCAP2 feature bits start at bit 0. */ +-#define _DL_HWCAP2_LAST 31 ++/* AT_HWCAP3 feature strings follow the AT_HWCAP2 feature strings. */ ++#define _DL_HWCAP3_OFFSET (_DL_HWCAP2_OFFSET + _DL_HWCAP_SIZE) ++ ++/* AT_HWCAP4 feature strings follow the AT_HWCAP3 feature strings. */ ++#define _DL_HWCAP4_OFFSET (_DL_HWCAP3_OFFSET + _DL_HWCAP_SIZE) + + /* These bits influence library search. */ + #define HWCAP_IMPORTANT (PPC_FEATURE_HAS_ALTIVEC \ +@@ -187,21 +188,42 @@ _dl_procinfo (unsigned int type, unsigned long int word) + case AT_HWCAP: + _dl_printf ("AT_HWCAP: "); + +- for (int i = 0; i <= _DL_HWCAP_LAST; ++i) ++ for (int i = 0; i < _DL_HWCAP_SIZE; ++i) + if (word & (1 << i)) + _dl_printf (" %s", _dl_hwcap_string (i)); + break; + case AT_HWCAP2: + { +- unsigned int offset = _DL_HWCAP_LAST + 1; + + _dl_printf ("AT_HWCAP2: "); + +- /* We have to go through them all because the kernel added the +- AT_HWCAP2 features starting with the high bits. */ +- for (int i = 0; i <= _DL_HWCAP2_LAST; ++i) +- if (word & (1 << i)) +- _dl_printf (" %s", _dl_hwcap_string (offset + i)); ++ /* We have to go through them all because the kernel added the ++ AT_HWCAP2 features starting with the high bits. */ ++ for (int i = 0; i < _DL_HWCAP_SIZE; ++i) ++ if (word & (1 << i)) ++ _dl_printf (" %s", _dl_hwcap_string (_DL_HWCAP2_OFFSET + i)); ++ break; ++ } ++ case AT_HWCAP3: ++ { ++ _dl_printf ("AT_HWCAP3: "); ++ ++ /* We have to go through them all because the kernel added the ++ AT_HWCAP3 features starting with the high bits. */ ++ for (int i = 0; i < _DL_HWCAP_SIZE; ++i) ++ if (word & (1 << i)) ++ _dl_printf (" %s", _dl_hwcap_string (_DL_HWCAP3_OFFSET + i)); ++ break; ++ } ++ case AT_HWCAP4: ++ { ++ _dl_printf ("AT_HWCAP4: "); ++ ++ /* We have to go through them all because the kernel added the ++ AT_HWCAP4 features starting with the high bits. */ ++ for (int i = 0; i <= _DL_HWCAP_SIZE; ++i) ++ if (word & (1 << i)) ++ _dl_printf (" %s", _dl_hwcap_string (_DL_HWCAP4_OFFSET + i)); + break; + } + case AT_L1I_CACHEGEOMETRY: +diff --git a/sysdeps/powerpc/hwcapinfo.c b/sysdeps/powerpc/hwcapinfo.c +index 76344f285a903858..f6fede15a7dfbf6c 100644 +--- a/sysdeps/powerpc/hwcapinfo.c ++++ b/sysdeps/powerpc/hwcapinfo.c +@@ -31,7 +31,7 @@ void + __tcb_parse_hwcap_and_convert_at_platform (void) + { + +- uint64_t h1, h2; ++ uint64_t h1, h2, h3, h4; + + /* Read AT_PLATFORM string from auxv and convert it to a number. */ + __tcb.at_platform = _dl_string_platform (GLRO (dl_platform)); +@@ -39,6 +39,8 @@ __tcb_parse_hwcap_and_convert_at_platform (void) + /* Read HWCAP and HWCAP2 from auxv. */ + h1 = GLRO (dl_hwcap); + h2 = GLRO (dl_hwcap2); ++ h3 = GLRO (dl_hwcap3); ++ h4 = GLRO (dl_hwcap4); + + /* hwcap contains only the latest supported ISA, the code checks which is + and fills the previous supported ones. */ +@@ -64,13 +66,16 @@ __tcb_parse_hwcap_and_convert_at_platform (void) + else if (h1 & PPC_FEATURE_POWER5) + h1 |= PPC_FEATURE_POWER4; + +- uint64_t array_hwcaps[] = { h1, h2 }; ++ uint64_t array_hwcaps[] = { h1, h2, h3, h4 }; + init_cpu_features (&GLRO(dl_powerpc_cpu_features), array_hwcaps); + + /* Consolidate both HWCAP and HWCAP2 into a single doubleword so that + we can read both in a single load later. */ + __tcb.hwcap = (h1 << 32) | (h2 & 0xffffffff); +- __tcb.hwcap_extn = 0x0; ++ ++ /* Consolidate both HWCAP3 and HWCAP4 into a single doubleword so that ++ we can read both in a single load later. */ ++ __tcb.hwcap_extn = (h3 << 32) | (h4 & 0xffffffff); + + } + #if IS_IN (rtld) +diff --git a/sysdeps/unix/sysv/linux/dl-parse_auxv.h b/sysdeps/unix/sysv/linux/dl-parse_auxv.h +index e3d758b163c619df..ea2a58ecb1668774 100644 +--- a/sysdeps/unix/sysv/linux/dl-parse_auxv.h ++++ b/sysdeps/unix/sysv/linux/dl-parse_auxv.h +@@ -47,6 +47,8 @@ void _dl_parse_auxv (ElfW(auxv_t) *av, dl_parse_auxv_t auxv_values) + GLRO(dl_platform) = (void *) auxv_values[AT_PLATFORM]; + GLRO(dl_hwcap) = auxv_values[AT_HWCAP]; + GLRO(dl_hwcap2) = auxv_values[AT_HWCAP2]; ++ GLRO(dl_hwcap3) = auxv_values[AT_HWCAP3]; ++ GLRO(dl_hwcap4) = auxv_values[AT_HWCAP4]; + GLRO(dl_clktck) = auxv_values[AT_CLKTCK]; + GLRO(dl_fpu_control) = auxv_values[AT_FPUCW]; + _dl_random = (void *) auxv_values[AT_RANDOM]; +diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c +index ad3692d73839d7a3..e1b14e9eb34ff5cb 100644 +--- a/sysdeps/unix/sysv/linux/dl-sysdep.c ++++ b/sysdeps/unix/sysv/linux/dl-sysdep.c +@@ -197,6 +197,8 @@ _dl_show_auxv (void) + [AT_SYSINFO_EHDR - 2] = { "SYSINFO_EHDR: 0x", hex }, + [AT_RANDOM - 2] = { "RANDOM: 0x", hex }, + [AT_HWCAP2 - 2] = { "HWCAP2: 0x", hex }, ++ [AT_HWCAP3 - 2] = { "HWCAP3: 0x", hex }, ++ [AT_HWCAP4 - 2] = { "HWCAP4: 0x", hex }, + [AT_MINSIGSTKSZ - 2] = { "MINSIGSTKSZ: ", dec }, + [AT_L1I_CACHESIZE - 2] = { "L1I_CACHESIZE: ", dec }, + [AT_L1I_CACHEGEOMETRY - 2] = { "L1I_CACHEGEOMETRY: 0x", hex }, +diff --git a/sysdeps/unix/sysv/linux/powerpc/cpu-features.c b/sysdeps/unix/sysv/linux/powerpc/cpu-features.c +index 8e8a5ec2eab7e8c6..a947d62db63965b1 100644 +--- a/sysdeps/unix/sysv/linux/powerpc/cpu-features.c ++++ b/sysdeps/unix/sysv/linux/powerpc/cpu-features.c +@@ -94,6 +94,8 @@ init_cpu_features (struct cpu_features *cpu_features, uint64_t hwcaps[]) + which are set by __tcb_parse_hwcap_and_convert_at_platform. */ + cpu_features->hwcap = hwcaps[0]; + cpu_features->hwcap2 = hwcaps[1]; ++ cpu_features->hwcap3 = hwcaps[2]; ++ cpu_features->hwcap4 = hwcaps[3]; + /* Default is to use aligned memory access on optimized function unless + tunables is enable, since for this case user can explicit disable + unaligned optimizations. */ +diff --git a/sysdeps/unix/sysv/linux/powerpc/cpu-features.h b/sysdeps/unix/sysv/linux/powerpc/cpu-features.h +index 1294f0b601ebf54f..e9eb6a13c8ab11d7 100644 +--- a/sysdeps/unix/sysv/linux/powerpc/cpu-features.h ++++ b/sysdeps/unix/sysv/linux/powerpc/cpu-features.h +@@ -26,6 +26,8 @@ struct cpu_features + bool use_cached_memopt; + unsigned long int hwcap; + unsigned long int hwcap2; ++ unsigned long int hwcap3; ++ unsigned long int hwcap4; + }; + + static const char hwcap_names[] = { +diff --git a/sysdeps/unix/sysv/linux/powerpc/libc-start.c b/sysdeps/unix/sysv/linux/powerpc/libc-start.c +index a4705daf1cdea4de..6a00cd88cd64b992 100644 +--- a/sysdeps/unix/sysv/linux/powerpc/libc-start.c ++++ b/sysdeps/unix/sysv/linux/powerpc/libc-start.c +@@ -87,6 +87,12 @@ __libc_start_main_impl (int argc, char **argv, + case AT_HWCAP2: + _dl_hwcap2 = (unsigned long int) av->a_un.a_val; + break; ++ case AT_HWCAP3: ++ _dl_hwcap3 = (unsigned long int) av->a_un.a_val; ++ break; ++ case AT_HWCAP4: ++ _dl_hwcap4 = (unsigned long int) av->a_un.a_val; ++ break; + case AT_PLATFORM: + _dl_platform = (void *) av->a_un.a_val; + break; diff --git a/glibc-upstream-2.39-9.patch b/glibc-upstream-2.39-9.patch new file mode 100644 index 0000000..4289fa2 --- /dev/null +++ b/glibc-upstream-2.39-9.patch @@ -0,0 +1,172 @@ +commit aad45c8ac30aa1072e54903ce6aead22702f244a +Author: Amrita H S +Date: Tue Mar 19 19:08:47 2024 -0500 + + powerpc: Placeholder and infrastructure/build support to add Power11 related changes. + + The following three changes have been added to provide initial Power11 support. + 1. Add the directories to hold Power11 files. + 2. Add support to select Power11 libraries based on AT_PLATFORM. + 3. Let submachine=power11 be set automatically. + + Reviewed-by: Florian Weimer + Reviewed-by: Peter Bergner + (cherry picked from commit 1ea051145612f199d8716ecdf78b084b00b5a727) + +diff --git a/sysdeps/powerpc/dl-procinfo.h b/sysdeps/powerpc/dl-procinfo.h +index f8cb343877386402..b36697ba440654be 100644 +--- a/sysdeps/powerpc/dl-procinfo.h ++++ b/sysdeps/powerpc/dl-procinfo.h +@@ -38,7 +38,7 @@ + #define HWCAP_IMPORTANT (PPC_FEATURE_HAS_ALTIVEC \ + + PPC_FEATURE_HAS_DFP) + +-#define _DL_PLATFORMS_COUNT 16 ++#define _DL_PLATFORMS_COUNT 17 + + #define _DL_FIRST_PLATFORM 32 + /* Mask to filter out platforms. */ +@@ -62,6 +62,7 @@ + #define PPC_PLATFORM_POWER8 13 + #define PPC_PLATFORM_POWER9 14 + #define PPC_PLATFORM_POWER10 15 ++#define PPC_PLATFORM_POWER11 16 + + static inline const char * + __attribute__ ((unused)) +@@ -89,6 +90,11 @@ _dl_string_platform (const char *str) + ret = _DL_FIRST_PLATFORM + PPC_PLATFORM_POWER10; + str++; + } ++ else if (str[1] == '1') ++ { ++ ret = _DL_FIRST_PLATFORM + PPC_PLATFORM_POWER11; ++ str++; ++ } + else + return -1; + break; +diff --git a/sysdeps/powerpc/powerpc32/power11/Implies b/sysdeps/powerpc/powerpc32/power11/Implies +new file mode 100644 +index 0000000000000000..051cbe0f7911c93c +--- /dev/null ++++ b/sysdeps/powerpc/powerpc32/power11/Implies +@@ -0,0 +1,2 @@ ++powerpc/powerpc32/power10/fpu ++powerpc/powerpc32/power10 +diff --git a/sysdeps/powerpc/powerpc32/power11/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc32/power11/fpu/multiarch/Implies +new file mode 100644 +index 0000000000000000..58edb2861d17f504 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc32/power11/fpu/multiarch/Implies +@@ -0,0 +1 @@ ++powerpc/powerpc32/power10/fpu/multiarch +diff --git a/sysdeps/powerpc/powerpc32/power11/multiarch/Implies b/sysdeps/powerpc/powerpc32/power11/multiarch/Implies +new file mode 100644 +index 0000000000000000..c70f0428badbaf14 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc32/power11/multiarch/Implies +@@ -0,0 +1 @@ ++powerpc/powerpc32/power10/multiarch +diff --git a/sysdeps/powerpc/powerpc64/be/power11/Implies b/sysdeps/powerpc/powerpc64/be/power11/Implies +new file mode 100644 +index 0000000000000000..de481d1c13db695e +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/be/power11/Implies +@@ -0,0 +1,2 @@ ++powerpc/powerpc64/be/power10/fpu ++powerpc/powerpc64/be/power10 +diff --git a/sysdeps/powerpc/powerpc64/be/power11/fpu/Implies b/sysdeps/powerpc/powerpc64/be/power11/fpu/Implies +new file mode 100644 +index 0000000000000000..dff0e13064ce8238 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/be/power11/fpu/Implies +@@ -0,0 +1 @@ ++powerpc/powerpc64/be/power10/fpu +diff --git a/sysdeps/powerpc/powerpc64/be/power11/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/be/power11/fpu/multiarch/Implies +new file mode 100644 +index 0000000000000000..c3f259e0097386a5 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/be/power11/fpu/multiarch/Implies +@@ -0,0 +1 @@ ++powerpc/powerpc64/be/power10/fpu/multiarch +diff --git a/sysdeps/powerpc/powerpc64/be/power11/multiarch/Implies b/sysdeps/powerpc/powerpc64/be/power11/multiarch/Implies +new file mode 100644 +index 0000000000000000..9491a394c9519d01 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/be/power11/multiarch/Implies +@@ -0,0 +1 @@ ++powerpc/powerpc64/be/power10/multiarch +diff --git a/sysdeps/powerpc/powerpc64/le/power11/Implies b/sysdeps/powerpc/powerpc64/le/power11/Implies +new file mode 100644 +index 0000000000000000..e18182dcc1f4c25f +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/le/power11/Implies +@@ -0,0 +1,2 @@ ++powerpc/powerpc64/le/power10/fpu ++powerpc/powerpc64/le/power10 +diff --git a/sysdeps/powerpc/powerpc64/le/power11/fpu/Implies b/sysdeps/powerpc/powerpc64/le/power11/fpu/Implies +new file mode 100644 +index 0000000000000000..e41bd55684dbb5b8 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/le/power11/fpu/Implies +@@ -0,0 +1 @@ ++powerpc/powerpc64/le/power10/fpu +diff --git a/sysdeps/powerpc/powerpc64/le/power11/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/le/power11/fpu/multiarch/Implies +new file mode 100644 +index 0000000000000000..c838d5093140eae3 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/le/power11/fpu/multiarch/Implies +@@ -0,0 +1 @@ ++powerpc/powerpc64/le/power10/fpu/multiarch +diff --git a/sysdeps/powerpc/powerpc64/le/power11/multiarch/Implies b/sysdeps/powerpc/powerpc64/le/power11/multiarch/Implies +new file mode 100644 +index 0000000000000000..687248c3c267cd8c +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/le/power11/multiarch/Implies +@@ -0,0 +1 @@ ++powerpc/powerpc64/le/power10/multiarch +diff --git a/sysdeps/powerpc/powerpc64/le/tst-glibc-hwcaps.c b/sysdeps/powerpc/powerpc64/le/tst-glibc-hwcaps.c +index 77465d9133410267..65d3e69303a1c963 100644 +--- a/sysdeps/powerpc/powerpc64/le/tst-glibc-hwcaps.c ++++ b/sysdeps/powerpc/powerpc64/le/tst-glibc-hwcaps.c +@@ -36,9 +36,11 @@ compute_level (void) + return 9; + if (strcmp (platform, "power10") == 0) + return 10; ++ if (strcmp (platform, "power11") == 0) ++ return 11; + printf ("warning: unrecognized AT_PLATFORM value: %s\n", platform); +- /* Assume that the new platform supports POWER10. */ +- return 10; ++ /* Assume that the new platform supports POWER11. */ ++ return 11; + } + + static int +diff --git a/sysdeps/powerpc/preconfigure b/sysdeps/powerpc/preconfigure +index 4de94089a3f68532..9e5a07ab6d6767cd 100644 +--- a/sysdeps/powerpc/preconfigure ++++ b/sysdeps/powerpc/preconfigure +@@ -58,7 +58,7 @@ fi + + ;; + +- a2|970|power[4-9]|power5x|power6+|power10) ++ a2|970|power[4-9]|power5x|power6+|power10|power11) + submachine=${archcpu} + if test ${libc_cv_cc_submachine+y} + then : +diff --git a/sysdeps/powerpc/preconfigure.ac b/sysdeps/powerpc/preconfigure.ac +index 6c63bd8257b7e40a..14b6dafd4a895c3b 100644 +--- a/sysdeps/powerpc/preconfigure.ac ++++ b/sysdeps/powerpc/preconfigure.ac +@@ -46,7 +46,7 @@ case "${machine}:${submachine}" in + AC_CACHE_VAL(libc_cv_cc_submachine,libc_cv_cc_submachine="") + ;; + +- a2|970|power[[4-9]]|power5x|power6+|power10) ++ a2|970|power[[4-9]]|power5x|power6+|power10|power11) + submachine=${archcpu} + AC_CACHE_VAL(libc_cv_cc_submachine,libc_cv_cc_submachine="") + ;; diff --git a/glibc.spec b/glibc.spec index 990213c..493b171 100644 --- a/glibc.spec +++ b/glibc.spec @@ -171,7 +171,7 @@ Version: %{glibcversion} # - It allows using the Release number without the %%dist tag in the dependency # generator to make the generated requires interchangeable between Rawhide # and ELN (.elnYY < .fcXX). -%global baserelease 6 +%global baserelease 7 Release: %{baserelease}%{?dist} # Licenses: @@ -288,6 +288,20 @@ Patch27: glibc-upstream-2.39-4.patch Patch28: glibc-upstream-2.39-5.patch Patch29: glibc-upstream-2.39-6.patch Patch30: glibc-upstream-2.39-7.patch +Patch31: glibc-upstream-2.39-8.patch +Patch32: glibc-upstream-2.39-9.patch +Patch33: glibc-upstream-2.39-10.patch +Patch34: glibc-upstream-2.39-11.patch +Patch35: glibc-upstream-2.39-12.patch +Patch36: glibc-upstream-2.39-13.patch +Patch37: glibc-upstream-2.39-14.patch +Patch38: glibc-upstream-2.39-15.patch +Patch39: glibc-upstream-2.39-16.patch +Patch40: glibc-upstream-2.39-17.patch +Patch41: glibc-upstream-2.39-18.patch +Patch42: glibc-upstream-2.39-19.patch +Patch43: glibc-upstream-2.39-20.patch +Patch44: glibc-upstream-2.39-21.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2464,6 +2478,24 @@ update_gconv_modules_cache () %endif %changelog +* Thu Apr 04 2024 Arjun Shankar - 2.39-7 +- Sync with upstream branch release/2.39/master, + commit 5d070d12b3a52bc44dd1b71743abc4b6243862ae: +- x86: Expand the comment on when REP STOSB is used on memset +- x86: Do not prefer ERMS for memset on Zen3+ +- x86: Fix Zen3/Zen4 ERMS selection (BZ 30994) +- Add tst-gnu2-tls2mod1 to test-internal-extras +- elf: Enable TLS descriptor tests on aarch64 +- arm: Update _dl_tlsdesc_dynamic to preserve caller-saved registers (BZ 31372) +- Ignore undefined symbols for -mtls-dialect=gnu2 +- x86-64: Allocate state buffer space for RDI, RSI and RBX +- x86-64: Update _dl_tlsdesc_dynamic to preserve AMX registers +- x86: Update _dl_tlsdesc_dynamic to preserve caller-saved registers +- x86-64: Save APX registers in ld.so trampoline +- LoongArch: Correct {__ieee754, _}_scalb -> {__ieee754, _}_scalbf +- powerpc: Placeholder and infrastructure/build support to add Power11 related changes. +- powerpc: Add HWCAP3/HWCAP4 data to TCB for Power Architecture. + * Tue Mar 26 2024 Florian Weimer - 2.39-6 - Do not generate ELF dependency information for glibc32