diff --git a/glibc-upstream-2.39-147.patch b/glibc-upstream-2.39-147.patch new file mode 100644 index 0000000..1da3b7c --- /dev/null +++ b/glibc-upstream-2.39-147.patch @@ -0,0 +1,136 @@ +commit c1f7bfbe081ebf807b6374a497ad5d5a9f499574 +Author: H.J. Lu +Date: Tue Dec 17 18:41:45 2024 +0800 + + Hide all malloc functions from compiler [BZ #32366] + + Since -1 isn't a power of two, compiler may reject it, hide memalign from + Clang 19 which issues an error: + + tst-memalign.c:86:31: error: requested alignment is not a power of 2 [-Werror,-Wnon-power-of-two-alignment] + 86 | p = memalign (-1, pagesize); + | ^~ + tst-memalign.c:86:31: error: requested alignment must be 4294967296 bytes or smaller; maximum alignment assumed [-Werror,-Wbuiltin-assume-aligned-alignment] + 86 | p = memalign (-1, pagesize); + | ^~ + + Update tst-malloc-aux.h to hide all malloc functions and include it in + all malloc tests to prevent compiler from optimizing out any malloc + functions. + + Tested with Clang 19.1.5 and GCC 15 20241206 for BZ #32366. + + Signed-off-by: H.J. Lu + Reviewed-by: Sam James + (cherry picked from commit f9493a15ea9cfb63a815c00c23142369ec09d8ce) + +diff --git a/malloc/tst-mallinfo2.c b/malloc/tst-mallinfo2.c +index 2c02f5f700f5051e..f072b9f24b575792 100644 +--- a/malloc/tst-mallinfo2.c ++++ b/malloc/tst-mallinfo2.c +@@ -23,6 +23,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + /* This is not specifically needed for the test, but (1) does + something to the data so gcc doesn't optimize it away, and (2) may + help when developing future tests. */ +diff --git a/malloc/tst-malloc-aux.h b/malloc/tst-malloc-aux.h +index 54908b4a2464d510..3e1b61ce3414dad4 100644 +--- a/malloc/tst-malloc-aux.h ++++ b/malloc/tst-malloc-aux.h +@@ -22,20 +22,35 @@ + + #include + #include +- +-static void *(*volatile aligned_alloc_indirect)(size_t, size_t) = aligned_alloc; +-static void *(*volatile calloc_indirect)(size_t, size_t) = calloc; +-static void *(*volatile malloc_indirect)(size_t) = malloc; +-static void *(*volatile realloc_indirect)(void*, size_t) = realloc; ++#include ++ ++static __typeof (aligned_alloc) * volatile aligned_alloc_indirect ++ = aligned_alloc; ++static __typeof (calloc) * volatile calloc_indirect = calloc; ++static __typeof (malloc) * volatile malloc_indirect = malloc; ++static __typeof (memalign) * volatile memalign_indirect = memalign; ++static __typeof (posix_memalign) * volatile posix_memalign_indirect ++ = posix_memalign; ++static __typeof (pvalloc) * volatile pvalloc_indirect = pvalloc; ++static __typeof (realloc) * volatile realloc_indirect = realloc; ++static __typeof (valloc) * volatile valloc_indirect = valloc; + + #undef aligned_alloc + #undef calloc + #undef malloc ++#undef memalign ++#undef posix_memalign ++#undef pvalloc + #undef realloc ++#undef valloc + + #define aligned_alloc aligned_alloc_indirect + #define calloc calloc_indirect + #define malloc malloc_indirect ++#define memalign memalign_indirect ++#define posix_memalign posix_memalign_indirect ++#define pvalloc pvalloc_indirect + #define realloc realloc_indirect ++#define valloc valloc_indirect + + #endif /* TST_MALLOC_AUX_H */ +diff --git a/malloc/tst-malloc-backtrace.c b/malloc/tst-malloc-backtrace.c +index c7b1d65e5c95c437..65fa91f6fdbdce91 100644 +--- a/malloc/tst-malloc-backtrace.c ++++ b/malloc/tst-malloc-backtrace.c +@@ -22,6 +22,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + #define SIZE 4096 + + /* Wrap free with a function to prevent gcc from optimizing it out. 
*/ +diff --git a/malloc/tst-memalign.c b/malloc/tst-memalign.c +index 563f6413d2da506b..ac9770d3f96313a7 100644 +--- a/malloc/tst-memalign.c ++++ b/malloc/tst-memalign.c +@@ -23,6 +23,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + static int errors = 0; + + static void +diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c +index 01dd07004d65a767..63a7e2bc8e8ff536 100644 +--- a/malloc/tst-safe-linking.c ++++ b/malloc/tst-safe-linking.c +@@ -26,6 +26,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + /* Run CALLBACK and check that the data on standard error equals + EXPECTED. */ + static void +diff --git a/malloc/tst-valloc.c b/malloc/tst-valloc.c +index 9bab8c6470d4fd95..0243d3dfd494d329 100644 +--- a/malloc/tst-valloc.c ++++ b/malloc/tst-valloc.c +@@ -23,6 +23,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + static int errors = 0; + + static void diff --git a/glibc-upstream-2.39-148.patch b/glibc-upstream-2.39-148.patch new file mode 100644 index 0000000..2975507 --- /dev/null +++ b/glibc-upstream-2.39-148.patch @@ -0,0 +1,55 @@ +commit 1432850ad8fbef6dea82d137e491b53840dc7f4d +Author: Sam James +Date: Fri Jan 10 03:03:47 2025 +0000 + + malloc: obscure calloc use in tst-calloc + + Similar to a9944a52c967ce76a5894c30d0274b824df43c7a and + f9493a15ea9cfb63a815c00c23142369ec09d8ce, we need to hide calloc use from + the compiler to accommodate GCC's r15-6566-g804e9d55d9e54c change. + + First, include tst-malloc-aux.h, but then use `volatile` variables + for size. + + The test passes without the tst-malloc-aux.h change but IMO we want + it there for consistency and to avoid future problems (possibly silent). + + Reviewed-by: H.J. Lu + (cherry picked from commit c3d1dac96bdd10250aa37bb367d5ef8334a093a1) + +diff --git a/malloc/tst-calloc.c b/malloc/tst-calloc.c +index 01f17f9e65591659..5a8c7ab121ef2d00 100644 +--- a/malloc/tst-calloc.c ++++ b/malloc/tst-calloc.c +@@ -23,6 +23,7 @@ + #include + #include + ++#include "tst-malloc-aux.h" + + /* Number of samples per size. */ + #define N 50000 +@@ -94,16 +95,19 @@ random_test (void) + static void + null_test (void) + { ++ /* Obscure allocation size from the compiler. */ ++ volatile size_t max_size = UINT_MAX; ++ volatile size_t zero_size = 0; + /* If the size is 0 the result is implementation defined. Just make + sure the program doesn't crash. The result of calloc is + deliberately ignored, so do not warn about that. */ + DIAG_PUSH_NEEDS_COMMENT; + DIAG_IGNORE_NEEDS_COMMENT (10, "-Wunused-result"); + calloc (0, 0); +- calloc (0, UINT_MAX); +- calloc (UINT_MAX, 0); +- calloc (0, ~((size_t) 0)); +- calloc (~((size_t) 0), 0); ++ calloc (0, max_size); ++ calloc (max_size, 0); ++ calloc (0, ~((size_t) zero_size)); ++ calloc (~((size_t) zero_size), 0); + DIAG_POP_NEEDS_COMMENT; + } + diff --git a/glibc-upstream-2.39-149.patch b/glibc-upstream-2.39-149.patch new file mode 100644 index 0000000..d02b958 --- /dev/null +++ b/glibc-upstream-2.39-149.patch @@ -0,0 +1,67 @@ +commit 662516aca8b6bf6aa6555f471055d5eb512b1ddc +Author: H.J. Lu +Date: Fri Jan 24 18:53:13 2025 +0800 + + stdlib: Test using setenv with updated environ [BZ #32588] + + Add a test for setenv with updated environ. Verify that BZ #32588 is + fixed. + + Signed-off-by: H.J. 
Lu <hjl.tools@gmail.com>
+    Reviewed-by: Florian Weimer <fweimer@redhat.com>
+    (cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da)
+
+diff --git a/stdlib/Makefile b/stdlib/Makefile
+index f4dec9be46a573b9..12f8820fd0668039 100644
+--- a/stdlib/Makefile
++++ b/stdlib/Makefile
+@@ -316,6 +316,7 @@ tests := \
+   tst-setcontext9 \
+   tst-setcontext10 \
+   tst-setcontext11 \
++  tst-setenv-environ \
+   tst-stdbit-Wconversion \
+   tst-stdbit-builtins \
+   tst-stdc_bit_ceil \
+diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c
+new file mode 100644
+index 0000000000000000..02fcef96d098f1b7
+--- /dev/null
++++ b/stdlib/tst-setenv-environ.c
+@@ -0,0 +1,36 @@
++/* Test using setenv with updated environ.
++   Copyright (C) 2025 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <stdlib.h>
++#include <support/check.h>
++
++extern char **environ;
++
++int
++do_test (void)
++{
++  char *valp;
++  static char *dummy_environ[] = { NULL };
++  environ = dummy_environ;
++  setenv ("A", "1", 0);
++  valp = getenv ("A");
++  TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0');
++  return 0;
++}
++
++#include <support/test-driver.c>
diff --git a/glibc-upstream-2.39-150.patch b/glibc-upstream-2.39-150.patch
new file mode 100644
index 0000000..d45425f
--- /dev/null
+++ b/glibc-upstream-2.39-150.patch
@@ -0,0 +1,125 @@
+commit f6d48470aef9264d2d56f4c4533eb76db7f9c2e4
+Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
+Date:   Fri Jan 31 12:16:30 2025 -0500
+
+    assert: Add test for CVE-2025-0395
+
+    Use the __progname symbol to override the program name to induce the
+    failure that CVE-2025-0395 describes.
+
+    This is related to BZ #32582
+
+    Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
+    Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+    (cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2)
+
+diff --git a/assert/Makefile b/assert/Makefile
+index 35dc908ddb8a14f2..c0fe660bd69f9ec8 100644
+--- a/assert/Makefile
++++ b/assert/Makefile
+@@ -38,6 +38,7 @@ tests := \
+   test-assert-perr \
+   tst-assert-c++ \
+   tst-assert-g++ \
++  tst-assert-sa-2025-0001 \
+   # tests
+
+ ifeq ($(have-cxx-thread_local),yes)
+diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c
+new file mode 100644
+index 0000000000000000..102cb0078dafa9c1
+--- /dev/null
++++ b/assert/tst-assert-sa-2025-0001.c
+@@ -0,0 +1,92 @@
++/* Test for CVE-2025-0395.
++   Copyright The GNU Toolchain Authors.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Test that a large enough __progname does not result in a buffer overflow ++ when printing an assertion failure. This was CVE-2025-0395. */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++extern const char *__progname; ++ ++int ++do_test (int argc, char **argv) ++{ ++ ++ support_need_proc ("Reads /proc/self/maps to add guards to writable maps."); ++ ignore_stderr (); ++ ++ /* XXX assumes that the assert is on a 2 digit line number. */ ++ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n"; ++ ++ int ret = fprintf (stderr, prompt, __FILE__); ++ if (ret < 0) ++ FAIL_EXIT1 ("fprintf failed: %m\n"); ++ ++ size_t pagesize = getpagesize (); ++ size_t namesize = pagesize - 1 - ret; ++ ++ /* Alter the progname so that the assert message fills the entire page. */ ++ char progname[namesize]; ++ memset (progname, 'A', namesize - 1); ++ progname[namesize - 1] = '\0'; ++ __progname = progname; ++ ++ FILE *f = xfopen ("/proc/self/maps", "r"); ++ char *line = NULL; ++ size_t len = 0; ++ uintptr_t prev_to = 0; ++ ++ /* Pad the beginning of every writable mapping with a PROT_NONE map. This ++ ensures that the mmap in the assert_fail path never ends up below a ++ writable map and will terminate immediately in case of a buffer ++ overflow. */ ++ while (xgetline (&line, &len, f)) ++ { ++ uintptr_t from, to; ++ char perm[4]; ++ ++ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ", ++ &from, &to, ++ &perm[0], &perm[1], &perm[2], &perm[3]); ++ ++ bool writable = (memchr (perm, 'w', 4) != NULL); ++ ++ if (prev_to != 0 && from - prev_to > pagesize && writable) ++ xmmap ((void *) from - pagesize, pagesize, PROT_NONE, ++ MAP_ANONYMOUS | MAP_PRIVATE, 0); ++ ++ prev_to = to; ++ } ++ ++ xfclose (f); ++ ++ assert (argc < 1); ++ return 0; ++} ++ ++#define EXPECTED_SIGNAL SIGABRT ++#define TEST_FUNCTION_ARGV do_test ++#include diff --git a/glibc-upstream-2.39-151.patch b/glibc-upstream-2.39-151.patch new file mode 100644 index 0000000..d930fc9 --- /dev/null +++ b/glibc-upstream-2.39-151.patch @@ -0,0 +1,230 @@ +commit d591876303e368fde0b03e1536efb69b64d9d483 +Author: Joe Ramsay +Date: Thu May 2 16:43:13 2024 +0100 + + aarch64: Fix AdvSIMD libmvec routines for big-endian + + Previously many routines used * to load from vector types stored + in the data table. This is emitted as ldr, which byte-swaps the + entire vector register, and causes bugs for big-endian when not + all lanes contain the same value. When a vector is to be used + this way, it has been replaced with an array and the load with an + explicit ld1 intrinsic, which byte-swaps only within lanes. + + As well, many routines previously used non-standard GCC syntax + for vector operations such as indexing into vectors types with [] + and assembling vectors using {}. This syntax should not be mixed + with ACLE, as the former does not respect endianness whereas the + latter does. Such examples have been replaced with, for instance, + vcombine_* and vgetq_lane* intrinsics. Helpers which only use the + GCC syntax, such as the v_call helpers, do not need changing as + they do not use intrinsics. 
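A minimal endian-safety sketch of the pattern this commit message describes (editorial illustration, not part of the patch; the constant and helper names are hypothetical). Keeping paired constants in a plain array and loading them with vld1q_f64 (ld1) preserves ACLE lane numbering on both endiannesses, whereas loading a vector-typed constant is emitted as ldr, which byte-swaps the whole register on big-endian:

    /* Sketch only: assumes an AArch64 target with <arm_neon.h>.  */
    #include <arm_neon.h>

    /* Paired constants kept as a plain array rather than a float64x2_t.  */
    static const double half_pi_pair[2]
      = { 0x1.921fb54442d18p+0, 0x1.1a62633145c06p-54 };

    static inline float64x2_t
    reduce_step (float64x2_t r, float64x2_t q)
    {
      /* ld1 byte-swaps within lanes only, so lane 0 is half_pi_pair[0]
         regardless of endianness.  */
      float64x2_t half_pi = vld1q_f64 (half_pi_pair);
      /* Lane-indexed FMS via ACLE intrinsics instead of GCC's q[0] syntax.  */
      r = vfmsq_laneq_f64 (r, q, half_pi, 0);
      r = vfmsq_laneq_f64 (r, q, half_pi, 1);
      return r;
    }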
+ + Reviewed-by: Szabolcs Nagy + (cherry picked from commit 90a6ca8b28bf34e361e577e526e1b0f4c39a32a5) + +diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c +index ab117b69da23e5f3..cf53e73290fcedb6 100644 +--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c ++++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c +@@ -25,7 +25,8 @@ + static const struct data + { + float32x4_t poly[5]; +- float32x4_t log10_2_and_inv, shift; ++ float log10_2_and_inv[4]; ++ float32x4_t shift; + + #if !WANT_SIMD_EXCEPT + float32x4_t scale_thresh; +@@ -111,10 +112,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) + /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */ +- float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0); ++ float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv); ++ float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0); + float32x4_t n = vsubq_f32 (z, d->shift); +- float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1); +- r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2); ++ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1); ++ r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); +diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c +index 3628398674468131..3db3b80c49292947 100644 +--- a/sysdeps/aarch64/fpu/expm1_advsimd.c ++++ b/sysdeps/aarch64/fpu/expm1_advsimd.c +@@ -23,7 +23,9 @@ + static const struct data + { + float64x2_t poly[11]; +- float64x2_t invln2, ln2, shift; ++ float64x2_t invln2; ++ double ln2[2]; ++ float64x2_t shift; + int64x2_t exponent_bias; + #if WANT_SIMD_EXCEPT + uint64x2_t thresh, tiny_bound; +@@ -92,8 +94,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x) + where 2^i is exact because i is an integer. */ + float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift); + int64x2_t i = vcvtq_s64_f64 (n); +- float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0); +- f = vfmsq_laneq_f64 (f, n, d->ln2, 1); ++ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); ++ float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0); ++ f = vfmsq_laneq_f64 (f, n, ln2, 1); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: +diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c +index 93db200f618379be..a0616ec7542cbfce 100644 +--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c ++++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c +@@ -23,7 +23,7 @@ + static const struct data + { + float32x4_t poly[5]; +- float32x4_t invln2_and_ln2; ++ float invln2_and_ln2[4]; + float32x4_t shift; + int32x4_t exponent_bias; + #if WANT_SIMD_EXCEPT +@@ -88,11 +88,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. 
*/ +- float32x4_t j = vsubq_f32 ( +- vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift); ++ float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); ++ float32x4_t j ++ = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift); + int32x4_t i = vcvtq_s32_f32 (j); +- float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1); +- f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2); ++ float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1); ++ f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: +diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c +index 1e5ef99e8907068b..c065aaebae8600fb 100644 +--- a/sysdeps/aarch64/fpu/log10_advsimd.c ++++ b/sysdeps/aarch64/fpu/log10_advsimd.c +@@ -58,8 +58,10 @@ static inline struct entry + lookup (uint64x2_t i) + { + struct entry e; +- uint64_t i0 = (i[0] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; +- uint64_t i1 = (i[1] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; ++ uint64_t i0 ++ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; ++ uint64_t i1 ++ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); +diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c +index a34978f6cf1cdb44..4057c552d8dfc0bb 100644 +--- a/sysdeps/aarch64/fpu/log2_advsimd.c ++++ b/sysdeps/aarch64/fpu/log2_advsimd.c +@@ -55,8 +55,10 @@ static inline struct entry + lookup (uint64x2_t i) + { + struct entry e; +- uint64_t i0 = (i[0] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; +- uint64_t i1 = (i[1] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; ++ uint64_t i0 ++ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; ++ uint64_t i1 ++ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); +diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c +index 21df61728ca87374..015a6da7d7fd693e 100644 +--- a/sysdeps/aarch64/fpu/log_advsimd.c ++++ b/sysdeps/aarch64/fpu/log_advsimd.c +@@ -54,17 +54,12 @@ lookup (uint64x2_t i) + { + /* Since N is a power of 2, n % N = n & (N - 1). 
*/ + struct entry e; +- uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; +- uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; ++ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; ++ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); +-#if __BYTE_ORDER == __LITTLE_ENDIAN + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); +-#else +- e.invc = vuzp1q_f64 (e1, e0); +- e.logc = vuzp2q_f64 (e1, e0); +-#endif + return e; + } + +diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c +index 0459821ab25487a8..d56a102dd17a3463 100644 +--- a/sysdeps/aarch64/fpu/tan_advsimd.c ++++ b/sysdeps/aarch64/fpu/tan_advsimd.c +@@ -23,7 +23,8 @@ + static const struct data + { + float64x2_t poly[9]; +- float64x2_t half_pi, two_over_pi, shift; ++ double half_pi[2]; ++ float64x2_t two_over_pi, shift; + #if !WANT_SIMD_EXCEPT + float64x2_t range_val; + #endif +@@ -81,8 +82,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x) + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ + float64x2_t r = x; +- r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0); +- r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1); ++ float64x2_t half_pi = vld1q_f64 (dat->half_pi); ++ r = vfmsq_laneq_f64 (r, q, half_pi, 0); ++ r = vfmsq_laneq_f64 (r, q, half_pi, 1); + /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle + formula. */ + r = vmulq_n_f64 (r, 0.5); +diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c +index 5a7489390a9692c6..705586f0c0b664c1 100644 +--- a/sysdeps/aarch64/fpu/tanf_advsimd.c ++++ b/sysdeps/aarch64/fpu/tanf_advsimd.c +@@ -23,7 +23,7 @@ + static const struct data + { + float32x4_t poly[6]; +- float32x4_t pi_consts; ++ float pi_consts[4]; + float32x4_t shift; + #if !WANT_SIMD_EXCEPT + float32x4_t range_val; +@@ -95,16 +95,17 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x) + #endif + + /* n = rint(x/(pi/2)). */ +- float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3); ++ float32x4_t pi_consts = vld1q_f32 (d->pi_consts); ++ float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3); + float32x4_t n = vsubq_f32 (q, d->shift); + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ + uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1)); + + /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */ + float32x4_t r; +- r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0); +- r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1); +- r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2); ++ r = vfmaq_laneq_f32 (x, n, pi_consts, 0); ++ r = vfmaq_laneq_f32 (r, n, pi_consts, 1); ++ r = vfmaq_laneq_f32 (r, n, pi_consts, 2); + + /* If x lives in an interval, where |tan(x)| + - is finite, then use a polynomial approximation of the form diff --git a/glibc-upstream-2.39-152.patch b/glibc-upstream-2.39-152.patch new file mode 100644 index 0000000..6f249e1 --- /dev/null +++ b/glibc-upstream-2.39-152.patch @@ -0,0 +1,268 @@ +commit 80df456112d67e27660563b9540cbc1bb5475c84 +Author: Joe Ramsay +Date: Mon Sep 9 13:00:01 2024 +0100 + + aarch64: Avoid redundant MOVs in AdvSIMD F32 logs + + Since the last operation is destructive, the first argument to the FMA + also has to be the first argument to the special-case in order to + avoid unnecessary MOVs. 
Reorder arguments and adjust special-case + bounds to facilitate this. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 8b09af572b208bfde4d31c6abbae047dcc217675) + +diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c +index 9347422a771e3d4e..82228b599a5c061b 100644 +--- a/sysdeps/aarch64/fpu/log10f_advsimd.c ++++ b/sysdeps/aarch64/fpu/log10f_advsimd.c +@@ -22,11 +22,11 @@ + + static const struct data + { +- uint32x4_t min_norm; ++ uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; ++ uint32x4_t mantissa_mask; + float32x4_t poly[8]; + float32x4_t inv_ln10, ln2; +- uint32x4_t off, mantissa_mask; + } data = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ +@@ -35,18 +35,22 @@ static const struct data + V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) }, + .ln2 = V4 (0x1.62e43p-1f), + .inv_ln10 = V4 (0x1.bcb7b2p-2f), +- .min_norm = V4 (0x00800000), +- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ ++ /* Lower bound is the smallest positive normal float 0x00800000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ ++ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), ++ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), + }; + + static float32x4_t VPCS_ATTR NOINLINE +-special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, +- uint16x4_t cmp) ++special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, ++ uint16x4_t cmp, const struct data *d) + { + /* Fall back to scalar code. */ +- return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); ++ return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), ++ vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); + } + + /* Fast implementation of AdvSIMD log10f, +@@ -58,15 +62,21 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- uint32x4_t u = vreinterpretq_u32_f32 (x); +- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), +- vget_low_u16 (d->special_bound)); ++ ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ uint32x4_t u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- u = vsubq_u32 (u, d->off); ++ u_off = vsubq_u32 (u_off, d->off); + float32x4_t n = vcvtq_f32_s32 ( +- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ +- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); ++ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ ++ ++ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), ++ vget_low_u16 (d->special_bound)); ++ ++ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log10(1+r) + n * log10(2). 
*/ +@@ -77,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) + y = vmulq_f32 (y, d->inv_ln10); + + if (__glibc_unlikely (v_any_u16h (special))) +- return special_case (x, y, poly, r2, special); ++ return special_case (y, u_off, poly, r2, special, d); + return vfmaq_f32 (y, poly, r2); + } + libmvec_hidden_def (V_NAME_F1 (log10)) +diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c +index db218367495dc567..84effe4fe9492d08 100644 +--- a/sysdeps/aarch64/fpu/log2f_advsimd.c ++++ b/sysdeps/aarch64/fpu/log2f_advsimd.c +@@ -22,9 +22,9 @@ + + static const struct data + { +- uint32x4_t min_norm; ++ uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; +- uint32x4_t off, mantissa_mask; ++ uint32x4_t mantissa_mask; + float32x4_t poly[9]; + } data = { + /* Coefficients generated using Remez algorithm approximate +@@ -34,18 +34,22 @@ static const struct data + V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), + V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), + V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, +- .min_norm = V4 (0x00800000), +- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ ++ /* Lower bound is the smallest positive normal float 0x00800000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ ++ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), ++ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), + }; + + static float32x4_t VPCS_ATTR NOINLINE +-special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, +- uint16x4_t cmp) ++special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r, ++ uint16x4_t cmp, const struct data *d) + { + /* Fall back to scalar code. */ +- return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); ++ return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), ++ vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); + } + + /* Fast implementation for single precision AdvSIMD log2, +@@ -56,15 +60,21 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- uint32x4_t u = vreinterpretq_u32_f32 (x); +- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), +- vget_low_u16 (d->special_bound)); ++ ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ uint32x4_t u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- u = vsubq_u32 (u, d->off); ++ u_off = vsubq_u32 (u_off, d->off); + float32x4_t n = vcvtq_f32_s32 ( +- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ +- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); ++ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ ++ ++ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), ++ vget_low_u16 (d->special_bound)); ++ ++ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log2(1+r) + n. 
*/ +@@ -72,7 +82,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) + float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); + + if (__glibc_unlikely (v_any_u16h (special))) +- return special_case (x, n, p, r, special); ++ return special_case (n, u_off, p, r, special, d); + return vfmaq_f32 (n, p, r); + } + libmvec_hidden_def (V_NAME_F1 (log2)) +diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c +index 3c0d0fcdc76f1004..c20dbfd6c088c0af 100644 +--- a/sysdeps/aarch64/fpu/logf_advsimd.c ++++ b/sysdeps/aarch64/fpu/logf_advsimd.c +@@ -21,20 +21,22 @@ + + static const struct data + { +- uint32x4_t min_norm; ++ uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; ++ uint32x4_t mantissa_mask; + float32x4_t poly[7]; +- float32x4_t ln2, tiny_bound; +- uint32x4_t off, mantissa_mask; ++ float32x4_t ln2; + } data = { + /* 3.34 ulp error. */ + .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), + V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), + V4 (-0x1.ffffc8p-2f) }, + .ln2 = V4 (0x1.62e43p-1f), +- .tiny_bound = V4 (0x1p-126), +- .min_norm = V4 (0x00800000), +- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ ++ /* Lower bound is the smallest positive normal float 0x00800000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ ++ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), ++ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff) + }; +@@ -42,32 +44,37 @@ static const struct data + #define P(i) d->poly[7 - i] + + static float32x4_t VPCS_ATTR NOINLINE +-special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, +- uint16x4_t cmp) ++special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, ++ uint16x4_t cmp, const struct data *d) + { + /* Fall back to scalar code. */ +- return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); ++ return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), ++ vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); + } + + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); + float32x4_t n, p, q, r, r2, y; +- uint32x4_t u; ++ uint32x4_t u, u_off; + uint16x4_t cmp; + +- u = vreinterpretq_u32_f32 (x); +- cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), +- vget_low_u16 (d->special_bound)); ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- u = vsubq_u32 (u, d->off); ++ u_off = vsubq_u32 (u_off, d->off); + n = vcvtq_f32_s32 ( +- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ +- u = vandq_u32 (u, d->mantissa_mask); ++ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ ++ u = vandq_u32 (u_off, d->mantissa_mask); + u = vaddq_u32 (u, d->off); + r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + ++ cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), ++ vget_low_u16 (d->special_bound)); ++ + /* y = log(1+r) + n*ln2. */ + r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). 
*/ +@@ -80,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) + p = vfmaq_f32 (r, d->ln2, n); + + if (__glibc_unlikely (v_any_u16h (cmp))) +- return special_case (x, y, r2, p, cmp); ++ return special_case (p, u_off, y, r2, cmp, d); + return vfmaq_f32 (p, y, r2); + } + libmvec_hidden_def (V_NAME_F1 (log)) diff --git a/glibc-upstream-2.39-153.patch b/glibc-upstream-2.39-153.patch new file mode 100644 index 0000000..32bee5b --- /dev/null +++ b/glibc-upstream-2.39-153.patch @@ -0,0 +1,240 @@ +commit 5e354bf4e20ca3ccc15bda63c7b56ea0e97efa81 +Author: Joe Ramsay +Date: Mon Sep 23 15:33:31 2024 +0100 + + AArch64: Simplify rounding-multiply pattern in several AdvSIMD routines + + This operation can be simplified to use simpler multiply-round-convert + sequence, which uses fewer instructions and constants. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 16a59571e4e9fd019d3fc23a2e7d73c1df8bb5cb) + +diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c +index 3924c9ce44c30d4d..11a89b1530825b5f 100644 +--- a/sysdeps/aarch64/fpu/cos_advsimd.c ++++ b/sysdeps/aarch64/fpu/cos_advsimd.c +@@ -22,7 +22,7 @@ + static const struct data + { + float64x2_t poly[7]; +- float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; ++ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; + } data = { + /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */ + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), +@@ -30,11 +30,9 @@ static const struct data + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .inv_pi = V2 (0x1.45f306dc9c883p-2), +- .half_pi = V2 (0x1.921fb54442d18p+0), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), +- .shift = V2 (0x1.8p52), + .range_val = V2 (0x1p23) + }; + +@@ -68,10 +66,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) + #endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ +- n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); +- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); +- n = vsubq_f64 (n, d->shift); +- n = vsubq_f64 (n, v_f64 (0.5)); ++ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi)); ++ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); ++ n = vsubq_f64 (n, v_f64 (0.5f)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); +diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c +index d0c285b03a8bfe22..85a1b373733123fa 100644 +--- a/sysdeps/aarch64/fpu/cosf_advsimd.c ++++ b/sysdeps/aarch64/fpu/cosf_advsimd.c +@@ -22,7 +22,7 @@ + static const struct data + { + float32x4_t poly[4]; +- float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; ++ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; + } data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), +@@ -33,8 +33,6 @@ static const struct data + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), +- .shift = V4 (0x1.8p+23f), +- .half_pi = V4 (0x1.921fb6p0f), + .range_val = V4 (0x1p20f) + }; + +@@ -69,9 +67,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x) + #endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. 
*/ +- n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); +- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); +- n = vsubq_f32 (n, d->shift); ++ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi)); ++ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); + n = vsubq_f32 (n, v_f32 (0.5f)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ +diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c +index 99d2e647aab70260..5c9cb726205ece6e 100644 +--- a/sysdeps/aarch64/fpu/expf_advsimd.c ++++ b/sysdeps/aarch64/fpu/expf_advsimd.c +@@ -22,7 +22,7 @@ + static const struct data + { + float32x4_t poly[5]; +- float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; ++ float32x4_t inv_ln2, ln2_hi, ln2_lo; + uint32x4_t exponent_bias; + #if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +@@ -31,7 +31,6 @@ static const struct data + /* maxerr: 1.45358 +0.5 ulp. */ + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, +- .shift = V4 (0x1.8p23f), + .inv_ln2 = V4 (0x1.715476p+0f), + .ln2_hi = V4 (0x1.62e4p-1f), + .ln2_lo = V4 (0x1.7f7d1cp-20f), +@@ -85,7 +84,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- float32x4_t n, r, r2, scale, p, q, poly, z; ++ float32x4_t n, r, r2, scale, p, q, poly; + uint32x4_t cmp, e; + + #if WANT_SIMD_EXCEPT +@@ -104,11 +103,10 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ +- z = vfmaq_f32 (d->shift, x, d->inv_ln2); +- n = vsubq_f32 (z, d->shift); ++ n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); + r = vfmsq_f32 (x, n, d->ln2_hi); + r = vfmsq_f32 (r, n, d->ln2_lo); +- e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); ++ e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + + #if !WANT_SIMD_EXCEPT +diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c +index a0d9d3b81965db76..718125cbad81db41 100644 +--- a/sysdeps/aarch64/fpu/sin_advsimd.c ++++ b/sysdeps/aarch64/fpu/sin_advsimd.c +@@ -22,7 +22,7 @@ + static const struct data + { + float64x2_t poly[7]; +- float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; ++ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; + } data = { + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), +@@ -34,12 +34,13 @@ static const struct data + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), +- .shift = V2 (0x1.8p52), + }; + + #if WANT_SIMD_EXCEPT +-# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ +-# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */ ++/* asuint64(0x1p-253)), below which multiply by inv_pi underflows. */ ++# define TinyBound v_u64 (0x3020000000000000) ++/* RangeVal - TinyBound. */ ++# define Thresh v_u64 (0x1160000000000000) + #endif + + #define C(i) d->poly[i] +@@ -72,16 +73,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) + fenv). These lanes will be fixed by special-case handler later. 
*/ + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); +- r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); ++ r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp)); + #else + r = x; + cmp = vcageq_f64 (x, d->range_val); + #endif + + /* n = rint(|x|/pi). */ +- n = vfmaq_f64 (d->shift, d->inv_pi, r); +- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); +- n = vsubq_f64 (n, d->shift); ++ n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi)); ++ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); +diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c +index 375dfc3331fa6a9c..6ee9a23d5b7fd13f 100644 +--- a/sysdeps/aarch64/fpu/sinf_advsimd.c ++++ b/sysdeps/aarch64/fpu/sinf_advsimd.c +@@ -22,7 +22,7 @@ + static const struct data + { + float32x4_t poly[4]; +- float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; ++ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; + } data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), +@@ -33,13 +33,14 @@ static const struct data + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), +- .shift = V4 (0x1.8p+23f), + .range_val = V4 (0x1p20f) + }; + + #if WANT_SIMD_EXCEPT +-# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ +-# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ ++/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */ ++# define TinyBound v_u32 (0x22000000) ++/* RangeVal - TinyBound. */ ++# define Thresh v_u32 (0x27800000) + #endif + + #define C(i) d->poly[i] +@@ -64,23 +65,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ +- r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); ++ r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp)); + #else + r = x; + cmp = vcageq_f32 (x, d->range_val); + #endif + +- /* n = rint(|x|/pi) */ +- n = vfmaq_f32 (d->shift, d->inv_pi, r); +- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); +- n = vsubq_f32 (n, d->shift); ++ /* n = rint(|x|/pi). */ ++ n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi)); ++ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); + +- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ ++ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + +- /* y = sin(r) */ ++ /* y = sin(r). */ + r2 = vmulq_f32 (r, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); diff --git a/glibc-upstream-2.39-154.patch b/glibc-upstream-2.39-154.patch new file mode 100644 index 0000000..b192513 --- /dev/null +++ b/glibc-upstream-2.39-154.patch @@ -0,0 +1,404 @@ +commit 72156cb90bb845eddf3acd59dd1599cec365942e +Author: Pierre Blanchard +Date: Mon Dec 9 15:54:34 2024 +0000 + + AArch64: Improve codegen in AdvSIMD logs + + Remove spurious ADRP and a few MOVs. + Reduce memory access by using more indexed MLAs in polynomial. + Align notation so that algorithms are easier to compare. + Speedup on Neoverse V1 for log10 (8%), log (8.5%), and log2 (10%). + Update error threshold in AdvSIMD log (now matches SVE log). 
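To make the indexed-MLA idea above concrete, a sketch (editorial, not from the commit; coefficient names and values are placeholders): storing the odd-order scalar coefficients adjacently lets one vld1q_f64 feed two lane-indexed fused multiply-adds, instead of loading each coefficient as a separate full vector:

    /* Sketch only: assumes an AArch64 target with <arm_neon.h>.  */
    #include <arm_neon.h>

    /* Placeholder coefficients, stored adjacently so one load serves both.  */
    static const struct { double c1, c3; } odd = { 0x1.5555p-2, 0x1.999bp-3 };

    static inline float64x2_t
    poly_even_odd (float64x2_t c0, float64x2_t c2, float64x2_t r,
                   float64x2_t r2)
    {
      float64x2_t odd_coeffs = vld1q_f64 (&odd.c1); /* one load for c1, c3.  */
      float64x2_t y = vfmaq_laneq_f64 (c2, r, odd_coeffs, 1); /* c2 + r*c3.  */
      float64x2_t p = vfmaq_laneq_f64 (c0, r, odd_coeffs, 0); /* c0 + r*c1.  */
      return vfmaq_f64 (p, r2, y); /* p + r2*y.  */
    }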
+ + (cherry picked from commit 8eb5ad2ebc94cc5bedbac57c226c02ec254479c7) + +diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c +index c065aaebae8600fb..f69ed21c3938d9a9 100644 +--- a/sysdeps/aarch64/fpu/log10_advsimd.c ++++ b/sysdeps/aarch64/fpu/log10_advsimd.c +@@ -18,36 +18,36 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f64.h" +- +-#define N (1 << V_LOG10_TABLE_BITS) + + static const struct data + { +- uint64x2_t min_norm; ++ uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; +- float64x2_t poly[5]; +- float64x2_t invln10, log10_2, ln2; +- uint64x2_t sign_exp_mask; ++ double invln10, log10_2; ++ double c1, c3; ++ float64x2_t c0, c2, c4; + } data = { + /* Computed from log coefficients divided by log(10) then rounded to double + precision. */ +- .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3), +- V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4), +- V2 (-0x1.287461742fee4p-4) }, +- .ln2 = V2 (0x1.62e42fefa39efp-1), +- .invln10 = V2 (0x1.bcb7b1526e50ep-2), +- .log10_2 = V2 (0x1.34413509f79ffp-2), +- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ +- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ ++ .c0 = V2 (-0x1.bcb7b1526e506p-3), ++ .c1 = 0x1.287a7636be1d1p-3, ++ .c2 = V2 (-0x1.bcb7b158af938p-4), ++ .c3 = 0x1.63c78734e6d07p-4, ++ .c4 = V2 (-0x1.287461742fee4p-4), ++ .invln10 = 0x1.bcb7b1526e50ep-2, ++ .log10_2 = 0x1.34413509f79ffp-2, ++ .off = V2 (0x3fe6900900000000), + .sign_exp_mask = V2 (0xfff0000000000000), ++ /* Lower bound is 0x0010000000000000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound - offset (which wraps around). */ ++ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), ++ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */ + }; + +-#define Off v_u64 (0x3fe6900900000000) ++#define N (1 << V_LOG10_TABLE_BITS) + #define IndexMask (N - 1) + +-#define T(s, i) __v_log10_data.s[i] +- + struct entry + { + float64x2_t invc; +@@ -70,10 +70,11 @@ lookup (uint64x2_t i) + } + + static float64x2_t VPCS_ATTR NOINLINE +-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, +- uint32x2_t special) ++special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, ++ uint32x2_t special, const struct data *d) + { +- return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special)); ++ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); ++ return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); + } + + /* Fast implementation of double-precision vector log10 +@@ -85,19 +86,24 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, + float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +- uint64x2_t ix = vreinterpretq_u64_f64 (x); +- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), +- vget_low_u32 (d->special_bound)); ++ ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ uint64x2_t u = vreinterpretq_u64_f64 (x); ++ uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
*/ +- uint64x2_t tmp = vsubq_u64 (ix, Off); +- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); +- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); ++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); ++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + +- struct entry e = lookup (tmp); ++ struct entry e = lookup (u_off); ++ ++ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), ++ vget_low_u32 (d->special_bound)); + + /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); +@@ -105,17 +111,22 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) + + /* hi = r / log(10) + log10(c) + k*log10(2). + Constants in v_log10_data.c are computed (in extended precision) as +- e.log10c := e.logc * ivln10. */ +- float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10); ++ e.log10c := e.logc * invln10. */ ++ float64x2_t cte = vld1q_f64 (&d->invln10); ++ float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0); + + /* y = log10(1+r) + n * log10(2). */ +- float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2); ++ hi = vfmaq_laneq_f64 (hi, kd, cte, 1); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + float64x2_t r2 = vmulq_f64 (r, r); +- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); ++ float64x2_t odd_coeffs = vld1q_f64 (&d->c1); ++ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); ++ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); ++ y = vfmaq_f64 (y, d->c4, r2); ++ y = vfmaq_f64 (p, y, r2); + + if (__glibc_unlikely (v_any_u32h (special))) +- return special_case (x, y, hi, r2, special); +- return vfmaq_f64 (hi, r2, y); ++ return special_case (hi, u_off, y, r2, special, d); ++ return vfmaq_f64 (hi, y, r2); + } +diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c +index 4057c552d8dfc0bb..1eea1f86ebdeab34 100644 +--- a/sysdeps/aarch64/fpu/log2_advsimd.c ++++ b/sysdeps/aarch64/fpu/log2_advsimd.c +@@ -18,31 +18,33 @@ + . */ + + #include "v_math.h" +-#include "poly_advsimd_f64.h" +- +-#define N (1 << V_LOG2_TABLE_BITS) + + static const struct data + { +- uint64x2_t min_norm; ++ uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; +- float64x2_t poly[5]; +- float64x2_t invln2; +- uint64x2_t sign_exp_mask; ++ float64x2_t c0, c2; ++ double c1, c3, invln2, c4; + } data = { + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 + and N = 128, then scaled by log2(e) in extended precision and rounded back + to double precision. */ +- .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2), +- V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2), +- V2 (-0x1.ec738d616fe26p-3) }, +- .invln2 = V2 (0x1.71547652b82fep0), +- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ +- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ ++ .c0 = V2 (-0x1.71547652b8300p-1), ++ .c1 = 0x1.ec709dc340953p-2, ++ .c2 = V2 (-0x1.71547651c8f35p-2), ++ .c3 = 0x1.2777ebe12dda5p-2, ++ .c4 = -0x1.ec738d616fe26p-3, ++ .invln2 = 0x1.71547652b82fep0, ++ .off = V2 (0x3fe6900900000000), + .sign_exp_mask = V2 (0xfff0000000000000), ++ /* Lower bound is 0x0010000000000000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound - offset (which wraps around). 
*/ ++ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), ++ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */ + }; + +-#define Off v_u64 (0x3fe6900900000000) ++#define N (1 << V_LOG2_TABLE_BITS) + #define IndexMask (N - 1) + + struct entry +@@ -67,10 +69,11 @@ lookup (uint64x2_t i) + } + + static float64x2_t VPCS_ATTR NOINLINE +-special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, +- uint32x2_t special) ++special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, ++ uint32x2_t special, const struct data *d) + { +- return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special)); ++ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); ++ return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); + } + + /* Double-precision vector log2 routine. Implements the same algorithm as +@@ -81,31 +84,41 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, + float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +- uint64x2_t ix = vreinterpretq_u64_f64 (x); +- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), +- vget_low_u32 (d->special_bound)); ++ ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ uint64x2_t u = vreinterpretq_u64_f64 (x); ++ uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ +- uint64x2_t tmp = vsubq_u64 (ix, Off); +- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); +- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); ++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); ++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + +- struct entry e = lookup (tmp); ++ struct entry e = lookup (u_off); + +- /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ ++ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), ++ vget_low_u32 (d->special_bound)); + ++ /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); +- float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2); ++ ++ float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2); ++ float64x2_t hi ++ = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0); + + float64x2_t r2 = vmulq_f64 (r, r); +- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); +- w = vaddq_f64 (kd, w); ++ float64x2_t odd_coeffs = vld1q_f64 (&d->c1); ++ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); ++ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); ++ y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1); ++ y = vfmaq_f64 (p, r2, y); + + if (__glibc_unlikely (v_any_u32h (special))) +- return special_case (x, y, w, r2, special); +- return vfmaq_f64 (w, r2, y); ++ return special_case (hi, u_off, y, r2, special, d); ++ return vfmaq_f64 (hi, y, r2); + } +diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c +index 015a6da7d7fd693e..b1a27fbc290d918c 100644 +--- a/sysdeps/aarch64/fpu/log_advsimd.c ++++ b/sysdeps/aarch64/fpu/log_advsimd.c +@@ -21,27 +21,29 @@ + + static const struct data + { +- uint64x2_t min_norm; ++ uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; +- float64x2_t poly[5]; +- float64x2_t ln2; +- uint64x2_t sign_exp_mask; ++ float64x2_t c0, c2; ++ double c1, c3, ln2, c4; + } data = { +- /* Worst-case error: 1.17 + 0.5 ulp. +- Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ +- .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), +- V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), +- V2 (-0x1.554e550bd501ep-3) }, +- .ln2 = V2 (0x1.62e42fefa39efp-1), +- .min_norm = V2 (0x0010000000000000), +- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ +- .sign_exp_mask = V2 (0xfff0000000000000) ++ /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ ++ .c0 = V2 (-0x1.ffffffffffff7p-2), ++ .c1 = 0x1.55555555170d4p-2, ++ .c2 = V2 (-0x1.0000000399c27p-2), ++ .c3 = 0x1.999b2e90e94cap-3, ++ .c4 = -0x1.554e550bd501ep-3, ++ .ln2 = 0x1.62e42fefa39efp-1, ++ .sign_exp_mask = V2 (0xfff0000000000000), ++ .off = V2 (0x3fe6900900000000), ++ /* Lower bound is 0x0010000000000000. For ++ optimised register use subnormals are detected after offset has been ++ subtracted, so lower bound - offset (which wraps around). */ ++ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), ++ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-126). */ + }; + +-#define A(i) d->poly[i] + #define N (1 << V_LOG_TABLE_BITS) + #define IndexMask (N - 1) +-#define Off v_u64 (0x3fe6900900000000) + + struct entry + { +@@ -64,48 +66,56 @@ lookup (uint64x2_t i) + } + + static float64x2_t VPCS_ATTR NOINLINE +-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, +- uint32x2_t cmp) ++special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, ++ uint32x2_t special, const struct data *d) + { +- return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); ++ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); ++ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); + } + ++/* Double-precision vector log routine. ++ The maximum observed error is 2.17 ULP: ++ _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 ++ want 0x1.ffffff1cca045p-2. 
*/ + float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); +- float64x2_t z, r, r2, p, y, kd, hi; +- uint64x2_t ix, iz, tmp; +- uint32x2_t cmp; +- int64x2_t k; +- struct entry e; + +- ix = vreinterpretq_u64_f64 (x); +- cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), +- vget_low_u32 (d->special_bound)); ++ /* To avoid having to mov x out of the way, keep u after offset has been ++ applied, and recover x by adding the offset back in the special-case ++ handler. */ ++ uint64x2_t u = vreinterpretq_u64_f64 (x); ++ uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ +- tmp = vsubq_u64 (ix, Off); +- k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ +- iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); +- z = vreinterpretq_f64_u64 (iz); +- e = lookup (tmp); ++ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); ++ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); ++ float64x2_t z = vreinterpretq_f64_u64 (iz); ++ ++ struct entry e = lookup (u_off); ++ ++ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), ++ vget_low_u32 (d->special_bound)); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ +- r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); +- kd = vcvtq_f64_s64 (k); ++ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); ++ float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ +- hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); ++ float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2); ++ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0); ++ + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ +- r2 = vmulq_f64 (r, r); +- y = vfmaq_f64 (A (2), A (3), r); +- p = vfmaq_f64 (A (0), A (1), r); +- y = vfmaq_f64 (y, A (4), r2); +- y = vfmaq_f64 (p, y, r2); +- +- if (__glibc_unlikely (v_any_u32h (cmp))) +- return special_case (x, y, hi, r2, cmp); ++ float64x2_t odd_coeffs = vld1q_f64 (&d->c1); ++ float64x2_t r2 = vmulq_f64 (r, r); ++ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); ++ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); ++ y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1); ++ y = vfmaq_f64 (p, r2, y); ++ ++ if (__glibc_unlikely (v_any_u32h (special))) ++ return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); + } diff --git a/glibc-upstream-2.39-155.patch b/glibc-upstream-2.39-155.patch new file mode 100644 index 0000000..d8ba7a5 --- /dev/null +++ b/glibc-upstream-2.39-155.patch @@ -0,0 +1,224 @@ +commit dcd1229e5bbc8c899cb35b22aaf89d8babc5af5a +Author: Joana Cruz +Date: Tue Dec 17 14:47:31 2024 +0000 + + AArch64: Improve codegen of AdvSIMD logf function family + + Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs. + 8% improvement in throughput microbenchmark on Neoverse V1 for log2 and log, + and 2% for log10. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit d6e034f5b222a9ed1aeb5de0c0c7d0dda8b63da3) + +diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c +index 82228b599a5c061b..0d792c3df9a7216e 100644 +--- a/sysdeps/aarch64/fpu/log10f_advsimd.c ++++ b/sysdeps/aarch64/fpu/log10f_advsimd.c +@@ -18,21 +18,25 @@ + . 
*/
+
+ #include "v_math.h"
+-#include "poly_advsimd_f32.h"
+
+ static const struct data
+ {
++ float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
+ uint32x4_t off, offset_lower_bound;
+ uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
+- float32x4_t poly[8];
+- float32x4_t inv_ln10, ln2;
++ float c1, c3, c5, c7;
+ } data = {
+ /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
+ [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
+- .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f),
+- V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f),
+- V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
++ .c0 = V4 (-0x1.bcb79cp-3f),
++ .c1 = 0x1.2879c8p-3f,
++ .c2 = V4 (-0x1.bcd472p-4f),
++ .c3 = 0x1.6408f8p-4f,
++ .c4 = V4 (-0x1.246f8p-4f),
++ .c5 = 0x1.f0e514p-5f,
++ .c6 = V4 (-0x1.0fc92cp-4f),
++ .c7 = 0x1.f5f76ap-5f,
+ .ln2 = V4 (0x1.62e43p-1f),
+ .inv_ln10 = V4 (0x1.bcb7b2p-2f),
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
+@@ -62,7 +66,7 @@ special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
+ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
+ {
+ const struct data *d = ptr_barrier (&data);
+-
++ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+@@ -81,7 +85,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
+
+ /* y = log10(1+r) + n * log10(2). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+- float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly);
++
++ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
++ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
++ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
++ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
++
++ float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
++ float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
++ float32x4_t poly = vfmaq_f32 (c01, r2, p27);
++
+ /* y = Log10(2) * n + poly * InvLn(10). */
+ float32x4_t y = vfmaq_f32 (r, d->ln2, n);
+ y = vmulq_f32 (y, d->inv_ln10);
+diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
+index 84effe4fe9492d08..116c36c8e2889f99 100644
+--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
++++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
+@@ -18,22 +18,27 @@
+ <https://www.gnu.org/licenses/>. */
+
+ #include "v_math.h"
+-#include "poly_advsimd_f32.h"
+
+ static const struct data
+ {
++ float32x4_t c0, c2, c4, c6, c8;
+ uint32x4_t off, offset_lower_bound;
+ uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
+- float32x4_t poly[9];
++ float c1, c3, c5, c7;
+ } data = {
+ /* Coefficients generated using Remez algorithm approximate
+ log2(1+r)/r for r in [ -1/3, 1/3 ].
+ rel error: 0x1.c4c4b0cp-26. */
+- .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
+- V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
+- V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
+- V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
++ .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
++ .c1 = -0x1.715458p-1f,
++ .c2 = V4 (0x1.ec701cp-2f),
++ .c3 = -0x1.7171a4p-2f,
++ .c4 = V4 (0x1.27a0b8p-2f),
++ .c5 = -0x1.e5143ep-3f,
++ .c6 = V4 (0x1.9d8ecap-3f),
++ .c7 = -0x1.c675bp-3f,
++ .c8 = V4 (0x1.9e495p-3f),
+ /* Lower bound is the smallest positive normal float 0x00800000. 
For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ +@@ -79,11 +84,21 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) + + /* y = log2(1+r) + n. */ + float32x4_t r2 = vmulq_f32 (r, r); +- float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); ++ ++ float32x4_t c1357 = vld1q_f32 (&d->c1); ++ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); ++ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); ++ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); ++ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); ++ float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8); ++ float32x4_t p48 = vfmaq_f32 (c45, r2, p68); ++ float32x4_t p28 = vfmaq_f32 (c23, r2, p48); ++ float32x4_t p = vfmaq_f32 (c01, r2, p28); + + if (__glibc_unlikely (v_any_u16h (special))) + return special_case (n, u_off, p, r, special, d); + return vfmaq_f32 (n, p, r); + } ++ + libmvec_hidden_def (V_NAME_F1 (log2)) + HALF_WIDTH_ALIAS_F1 (log2) +diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c +index c20dbfd6c088c0af..d9e64c732d7d8d28 100644 +--- a/sysdeps/aarch64/fpu/logf_advsimd.c ++++ b/sysdeps/aarch64/fpu/logf_advsimd.c +@@ -21,16 +21,19 @@ + + static const struct data + { +- uint32x4_t off, offset_lower_bound; ++ float32x4_t c2, c4, c6, ln2; ++ uint32x4_t off, offset_lower_bound, mantissa_mask; + uint16x8_t special_bound; +- uint32x4_t mantissa_mask; +- float32x4_t poly[7]; +- float32x4_t ln2; ++ float c1, c3, c5, c0; + } data = { + /* 3.34 ulp error. */ +- .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), +- V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), +- V4 (-0x1.ffffc8p-2f) }, ++ .c0 = -0x1.3e737cp-3f, ++ .c1 = 0x1.5a9aa2p-3f, ++ .c2 = V4 (-0x1.4f9934p-3f), ++ .c3 = 0x1.961348p-3f, ++ .c4 = V4 (-0x1.00187cp-2f), ++ .c5 = 0x1.555d7cp-2f, ++ .c6 = V4 (-0x1.ffffc8p-2f), + .ln2 = V4 (0x1.62e43p-1f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been +@@ -41,8 +44,6 @@ static const struct data + .mantissa_mask = V4 (0x007fffff) + }; + +-#define P(i) d->poly[7 - i] +- + static float32x4_t VPCS_ATTR NOINLINE + special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, + uint16x4_t cmp, const struct data *d) +@@ -55,33 +56,30 @@ special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, + float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) + { + const struct data *d = ptr_barrier (&data); +- float32x4_t n, p, q, r, r2, y; +- uint32x4_t u, u_off; +- uint16x4_t cmp; ++ float32x4_t c1350 = vld1q_f32 (&d->c1); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ +- u_off = vreinterpretq_u32_f32 (x); ++ uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ +- u_off = vsubq_u32 (u_off, d->off); +- n = vcvtq_f32_s32 ( ++ float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. 
*/ +- u = vandq_u32 (u_off, d->mantissa_mask); +- u = vaddq_u32 (u, d->off); +- r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); ++ uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), ++ vget_low_u16 (d->special_bound)); + +- cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), +- vget_low_u16 (d->special_bound)); ++ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); ++ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. */ +- r2 = vmulq_f32 (r, r); ++ float32x4_t r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ +- p = vfmaq_f32 (P (5), P (6), r); +- q = vfmaq_f32 (P (3), P (4), r); +- y = vfmaq_f32 (P (1), P (2), r); +- p = vfmaq_f32 (p, P (7), r2); ++ float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0); ++ float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1); ++ float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2); ++ p = vfmaq_laneq_f32 (p, r2, c1350, 3); ++ + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); diff --git a/glibc-upstream-2.39-156.patch b/glibc-upstream-2.39-156.patch new file mode 100644 index 0000000..12fa961 --- /dev/null +++ b/glibc-upstream-2.39-156.patch @@ -0,0 +1,404 @@ +commit a10183b6338baf4b2643b92cce1b0fba0e3ab62f +Author: Joana Cruz +Date: Tue Dec 17 14:49:30 2024 +0000 + + AArch64: Improve codegen of AdvSIMD atan(2)(f) + + Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs. + 8% improvement in throughput microbenchmark on Neoverse V1. + + Reviewed-by: Wilco Dijkstra + (cherry picked from commit 6914774b9d3460876d9ad4482782213ec01a752e) + +diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c +index 2fd61641340c0315..5df4b99ff4277c6a 100644 +--- a/sysdeps/aarch64/fpu/atan2_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan2_advsimd.c +@@ -22,40 +22,57 @@ + + static const struct data + { ++ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; +- float64x2_t poly[20]; ++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; ++ uint64x2_t zeroinfnan, minustwo; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on +- the interval [2**-1022, 1.0]. */ +- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), +- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), +- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), +- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), +- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), +- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), +- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), +- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), +- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), +- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, ++ [2**-1022, 1.0]. 
*/ ++ .c0 = V2 (-0x1.5555555555555p-2), ++ .c1 = 0x1.99999999996c1p-3, ++ .c2 = V2 (-0x1.2492492478f88p-3), ++ .c3 = 0x1.c71c71bc3951cp-4, ++ .c4 = V2 (-0x1.745d160a7e368p-4), ++ .c5 = 0x1.3b139b6a88ba1p-4, ++ .c6 = V2 (-0x1.11100ee084227p-4), ++ .c7 = 0x1.e1d0f9696f63bp-5, ++ .c8 = V2 (-0x1.aebfe7b418581p-5), ++ .c9 = 0x1.842dbe9b0d916p-5, ++ .c10 = V2 (-0x1.5d30140ae5e99p-5), ++ .c11 = 0x1.338e31eb2fbbcp-5, ++ .c12 = V2 (-0x1.00e6eece7de8p-5), ++ .c13 = 0x1.860897b29e5efp-6, ++ .c14 = V2 (-0x1.0051381722a59p-6), ++ .c15 = 0x1.14e9dc19a4a4ep-7, ++ .c16 = V2 (-0x1.d0062b42fe3bfp-9), ++ .c17 = 0x1.17739e210171ap-10, ++ .c18 = V2 (-0x1.ab24da7be7402p-13), ++ .c19 = 0x1.358851160a528p-16, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), ++ .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), ++ .minustwo = V2 (0xc000000000000000), + }; + + #define SignMask v_u64 (0x8000000000000000) + + /* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ + static float64x2_t VPCS_ATTR NOINLINE +-special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp) ++special_case (float64x2_t y, float64x2_t x, float64x2_t ret, ++ uint64x2_t sign_xy, uint64x2_t cmp) + { ++ /* Account for the sign of x and y. */ ++ ret = vreinterpretq_f64_u64 ( ++ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + return v_call2_f64 (atan2, y, x, ret, cmp); + } + + /* Returns 1 if input is the bit representation of 0, infinity or nan. */ + static inline uint64x2_t +-zeroinfnan (uint64x2_t i) ++zeroinfnan (uint64x2_t i, const struct data *d) + { + /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ +- return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), +- v_u64 (2 * asuint64 (INFINITY) - 1)); ++ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan); + } + + /* Fast implementation of vector atan2. +@@ -65,12 +82,13 @@ zeroinfnan (uint64x2_t i) + want 0x1.92d628ab678cfp-1. */ + float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + { +- const struct data *data_ptr = ptr_barrier (&data); ++ const struct data *d = ptr_barrier (&data); + + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t iy = vreinterpretq_u64_f64 (y); + +- uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy)); ++ uint64x2_t special_cases ++ = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint64x2_t sign_x = vandq_u64 (ix, SignMask); + uint64x2_t sign_y = vandq_u64 (iy, SignMask); +@@ -80,18 +98,18 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t pred_xlt0 = vcltzq_f64 (x); +- uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax); ++ uint64x2_t pred_aygtax = vcagtq_f64 (y, x); + + /* Set up z for call to atan. */ + float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); +- float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax); +- float64x2_t z = vdivq_f64 (n, d); ++ float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); ++ float64x2_t z = vdivq_f64 (n, q); + + /* Work out the correct shift. */ +- float64x2_t shift = vreinterpretq_f64_u64 ( +- vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); ++ float64x2_t shift ++ = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); + shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); +- shift = vmulq_f64 (shift, data_ptr->pi_over_2); ++ shift = vmulq_f64 (shift, d->pi_over_2); + + /* Calculate the polynomial approximation. + Use split Estrin scheme for P(z^2) with deg(P)=19. 
Use split instead of +@@ -102,20 +120,52 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); +- float64x2_t ret +- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly), +- v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8); ++ ++ float64x2_t c13 = vld1q_f64 (&d->c1); ++ float64x2_t c57 = vld1q_f64 (&d->c5); ++ float64x2_t c911 = vld1q_f64 (&d->c9); ++ float64x2_t c1315 = vld1q_f64 (&d->c13); ++ float64x2_t c1719 = vld1q_f64 (&d->c17); ++ ++ /* estrin_7. */ ++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); ++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); ++ float64x2_t p03 = vfmaq_f64 (p01, x2, p23); ++ ++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); ++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); ++ float64x2_t p47 = vfmaq_f64 (p45, x2, p67); ++ ++ float64x2_t p07 = vfmaq_f64 (p03, x4, p47); ++ ++ /* estrin_11. */ ++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); ++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); ++ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); ++ ++ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); ++ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); ++ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); ++ ++ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); ++ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); ++ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); ++ ++ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); ++ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); ++ ++ float64x2_t ret = vfmaq_f64 (p07, p819, x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); + ret = vaddq_f64 (ret, shift); + ++ if (__glibc_unlikely (v_any_u64 (special_cases))) ++ return special_case (y, x, ret, sign_xy, special_cases); ++ + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + +- if (__glibc_unlikely (v_any_u64 (special_cases))) +- return special_case (y, x, ret, special_cases); +- + return ret; + } +diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c +index 56e610caf18f6d77..88daacd76cdd3998 100644 +--- a/sysdeps/aarch64/fpu/atan2f_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c +@@ -22,34 +22,39 @@ + + static const struct data + { +- float32x4_t poly[8]; +- float32x4_t pi_over_2; ++ float32x4_t c0, pi_over_2, c4, c6, c2; ++ float c1, c3, c5, c7; ++ uint32x4_t comp_const; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ +- .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), +- V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), +- V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, +- .pi_over_2 = V4 (0x1.921fb6p+0f), ++ .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, ++ .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, ++ .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, ++ .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, ++ .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), + }; + + #define SignMask v_u32 (0x80000000) + + /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). 
*/ + static float32x4_t VPCS_ATTR NOINLINE +-special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp) ++special_case (float32x4_t y, float32x4_t x, float32x4_t ret, ++ uint32x4_t sign_xy, uint32x4_t cmp) + { ++ /* Account for the sign of y. */ ++ ret = vreinterpretq_f32_u32 ( ++ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); + return v_call2_f32 (atan2f, y, x, ret, cmp); + } + + /* Returns 1 if input is the bit representation of 0, infinity or nan. */ + static inline uint32x4_t +-zeroinfnan (uint32x4_t i) ++zeroinfnan (uint32x4_t i, const struct data *d) + { + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ +- return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), +- v_u32 (2 * 0x7f800000lu - 1)); ++ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); + } + + /* Fast implementation of vector atan2f. Maximum observed error is +@@ -58,12 +63,13 @@ zeroinfnan (uint32x4_t i) + want 0x1.967f00p-1. */ + float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + { +- const struct data *data_ptr = ptr_barrier (&data); ++ const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t iy = vreinterpretq_u32_f32 (y); + +- uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy)); ++ uint32x4_t special_cases ++ = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint32x4_t sign_x = vandq_u32 (ix, SignMask); + uint32x4_t sign_y = vandq_u32 (iy, SignMask); +@@ -77,14 +83,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + + /* Set up z for call to atanf. */ + float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); +- float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax); +- float32x4_t z = vdivq_f32 (n, d); ++ float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); ++ float32x4_t z = vdivq_f32 (n, q); + + /* Work out the correct shift. */ + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); + shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); +- shift = vmulq_f32 (shift, data_ptr->pi_over_2); ++ shift = vmulq_f32 (shift, d->pi_over_2); + + /* Calculate the polynomial approximation. + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, +@@ -96,23 +102,27 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + +- float32x4_t ret = vfmaq_f32 ( +- v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4, +- vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4))); ++ float32x4_t c1357 = vld1q_f32 (&d->c1); ++ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); ++ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); ++ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); ++ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3); ++ float32x4_t p03 = vfmaq_f32 (p01, z4, p23); ++ float32x4_t p47 = vfmaq_f32 (p45, z4, p67); ++ ++ float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); + + /* y = shift + z * P(z^2). */ + ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); + +- /* Account for the sign of y. */ +- ret = vreinterpretq_f32_u32 ( +- veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); +- + if (__glibc_unlikely (v_any_u32 (special_cases))) + { +- return special_case (y, x, ret, special_cases); ++ return special_case (y, x, ret, sign_xy, special_cases); + } + +- return ret; ++ /* Account for the sign of y. 
*/ ++ return vreinterpretq_f32_u32 ( ++ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); + } + libmvec_hidden_def (V_NAME_F2 (atan2)) + HALF_WIDTH_ALIAS_F2(atan2) +diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c +index a962be0f78e4a9c7..14f1809796f05246 100644 +--- a/sysdeps/aarch64/fpu/atan_advsimd.c ++++ b/sysdeps/aarch64/fpu/atan_advsimd.c +@@ -22,21 +22,22 @@ + + static const struct data + { ++ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; +- float64x2_t poly[20]; ++ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + } data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ +- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), +- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), +- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), +- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), +- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), +- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), +- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), +- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), +- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), +- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, ++ .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, ++ .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, ++ .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, ++ .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, ++ .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, ++ .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, ++ .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, ++ .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, ++ .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, ++ .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + }; + +@@ -52,6 +53,11 @@ static const struct data + float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) + { + const struct data *d = ptr_barrier (&data); ++ float64x2_t c13 = vld1q_f64 (&d->c1); ++ float64x2_t c57 = vld1q_f64 (&d->c5); ++ float64x2_t c911 = vld1q_f64 (&d->c9); ++ float64x2_t c1315 = vld1q_f64 (&d->c13); ++ float64x2_t c1719 = vld1q_f64 (&d->c17); + + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need +@@ -90,9 +96,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); +- float64x2_t y +- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly), +- v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8); ++ ++ /* estrin_7. */ ++ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); ++ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); ++ float64x2_t p03 = vfmaq_f64 (p01, x2, p23); ++ ++ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); ++ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); ++ float64x2_t p47 = vfmaq_f64 (p45, x2, p67); ++ ++ float64x2_t p07 = vfmaq_f64 (p03, x4, p47); ++ ++ /* estrin_11. 
*/ ++ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); ++ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); ++ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); ++ ++ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); ++ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); ++ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); ++ ++ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); ++ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); ++ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); ++ ++ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); ++ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); ++ ++ float64x2_t y = vfmaq_f64 (p07, p819, x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); diff --git a/glibc-upstream-2.39-157.patch b/glibc-upstream-2.39-157.patch new file mode 100644 index 0000000..3f1d87d --- /dev/null +++ b/glibc-upstream-2.39-157.patch @@ -0,0 +1,211 @@ +commit 78abd3ef6e607853def82a97bf34a3c632db04e2 +Author: Luna Lamb +Date: Fri Jan 3 19:02:52 2025 +0000 + + AArch64: Improve codegen in SVE tans + + Improves memory access. + Tan: MOVPRFX 7 -> 2, LD1RD 12 -> 5, move MOV away from return. + Tanf: MOV 2 -> 1, MOVPRFX 6 -> 3, LD1RW 5 -> 4, move mov away from return. + + (cherry picked from commit aa6609feb20ebf8653db639dabe2a6afc77b02cc) + +diff --git a/sysdeps/aarch64/fpu/tan_sve.c b/sysdeps/aarch64/fpu/tan_sve.c +index b2e44473166845d0..a7318fd417dc7064 100644 +--- a/sysdeps/aarch64/fpu/tan_sve.c ++++ b/sysdeps/aarch64/fpu/tan_sve.c +@@ -22,24 +22,38 @@ + + static const struct data + { +- double poly[9]; +- double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift; ++ double c2, c4, c6, c8; ++ double poly_1357[4]; ++ double c0, inv_half_pi; ++ double half_pi_hi, half_pi_lo, range_val; + } data = { + /* Polynomial generated with FPMinimax. */ +- .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, +- 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, +- 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, +- 0x1.4e4fd14147622p-12, }, ++ .c2 = 0x1.ba1ba1bb46414p-5, ++ .c4 = 0x1.226e5e5ecdfa3p-7, ++ .c6 = 0x1.7ea75d05b583ep-10, ++ .c8 = 0x1.4e4fd14147622p-12, ++ .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6, ++ 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 }, ++ .c0 = 0x1.5555555555556p-2, ++ .inv_half_pi = 0x1.45f306dc9c883p-1, + .half_pi_hi = 0x1.921fb54442d18p0, + .half_pi_lo = 0x1.1a62633145c07p-54, +- .inv_half_pi = 0x1.45f306dc9c883p-1, + .range_val = 0x1p23, +- .shift = 0x1.8p52, + }; + + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t special) ++special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg, ++ svbool_t special) + { ++ svbool_t use_recip = svcmpeq ( ++ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); ++ ++ svfloat64_t n = svmad_x (pg, p, p, -1); ++ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); ++ svfloat64_t swap = n; ++ n = svneg_m (n, use_recip, d); ++ d = svsel (use_recip, swap, d); ++ svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d); + return sv_call_f64 (tan, x, y, special); + } + +@@ -50,15 +64,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) + svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) + { + const struct data *dat = ptr_barrier (&data); +- +- /* Invert condition to catch NaNs and Infs as well as large values. 
*/ +- svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); +- ++ svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0); + /* q = nearest integer to 2 * x / pi. */ +- svfloat64_t shift = sv_f64 (dat->shift); +- svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi); +- q = svsub_x (pg, q, shift); +- svint64_t qi = svcvt_s64_x (pg, q); ++ svfloat64_t q = svmul_lane (x, half_pi_c0, 1); ++ q = svrinta_x (pg, q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ +@@ -68,7 +77,7 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) + r = svmls_lane (r, q, half_pi, 1); + /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle + formula. */ +- r = svmul_x (pg, r, 0.5); ++ r = svmul_x (svptrue_b64 (), r, 0.5); + + /* Approximate tan(r) using order 8 polynomial. + tan(x) is odd, so polynomial has the form: +@@ -76,29 +85,51 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) + Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... + Then compute the approximation by: + tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t r4 = svmul_x (pg, r2, r2); +- svfloat64_t r8 = svmul_x (pg, r4, r4); ++ ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2); ++ svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4); + /* Use offset version coeff array by 1 to evaluate from C1 onwards. */ +- svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1); +- p = svmad_x (pg, p, r2, dat->poly[0]); +- p = svmla_x (pg, r, r2, svmul_x (pg, p, r)); ++ svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2); ++ svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6); ++ ++ /* Use offset version coeff array by 1 to evaluate from C1 onwards. */ ++ svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0); ++ svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1); ++ svfloat64_t p03 = svmla_x (pg, p01, p23, r4); ++ ++ svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0); ++ svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1); ++ svfloat64_t p47 = svmla_x (pg, p45, p67, r4); ++ ++ svfloat64_t p = svmla_x (pg, p03, p47, r8); ++ ++ svfloat64_t z = svmul_x (svptrue_b64 (), p, r); ++ z = svmul_x (svptrue_b64 (), r2, z); ++ z = svmla_lane (z, r, half_pi_c0, 0); ++ p = svmla_x (pg, r, r2, z); + + /* Recombination uses double-angle formula: + tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) + and reciprocity around pi/2: + tan(x) = 1 / (tan(pi/2 - x)) + to assemble result using change-of-sign and conditional selection of +- numerator/denominator dependent on odd/even-ness of q (hence quadrant). */ +- svbool_t use_recip +- = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0); ++ numerator/denominator dependent on odd/even-ness of q (quadrant). */ ++ ++ /* Invert condition to catch NaNs and Infs as well as large values. 
*/ ++ svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); ++ ++ if (__glibc_unlikely (svptest_any (pg, special))) ++ { ++ return special_case (x, p, q, pg, special); ++ } ++ svbool_t use_recip = svcmpeq ( ++ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); + + svfloat64_t n = svmad_x (pg, p, p, -1); +- svfloat64_t d = svmul_x (pg, p, 2); ++ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); + svfloat64_t swap = n; + n = svneg_m (n, use_recip, d); + d = svsel (use_recip, swap, d); +- if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special); + return svdiv_x (pg, n, d); + } +diff --git a/sysdeps/aarch64/fpu/tanf_sve.c b/sysdeps/aarch64/fpu/tanf_sve.c +index f34258324114a360..e850fb4882e88380 100644 +--- a/sysdeps/aarch64/fpu/tanf_sve.c ++++ b/sysdeps/aarch64/fpu/tanf_sve.c +@@ -60,21 +60,16 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) + { + const struct data *d = ptr_barrier (&data); + +- /* Determine whether input is too large to perform fast regression. */ +- svbool_t cmp = svacge (pg, x, d->range_val); +- + svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1); + svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1); + + /* n = rint(x/(pi/2)). */ +- svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3); +- svfloat32_t n = svsub_x (pg, q, d->shift); ++ svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3)); + /* n is already a signed integer, simply convert it. */ + svint32_t in = svcvt_s32_x (pg, n); + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ + svint32_t alt = svand_x (pg, in, 1); + svbool_t pred_alt = svcmpne (pg, alt, 0); +- + /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ + svfloat32_t r; + r = svmls_lane (x, n, pi_vals, 0); +@@ -93,7 +88,7 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) + + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4], + using Estrin on z^2. */ +- svfloat32_t z2 = svmul_x (pg, z, z); ++ svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); +@@ -106,13 +101,14 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) + + svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2)); + +- /* Transform result back, if necessary. */ +- svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); +- + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ ++ ++ /* Determine whether input is too large to perform fast regression. */ ++ svbool_t cmp = svacge (pg, x, d->range_val); + if (__glibc_unlikely (svptest_any (pg, cmp))) +- return special_case (x, svsel (pred_alt, inv_y, y), cmp); ++ return special_case (x, svdivr_x (pg, y, 1.0f), cmp); + ++ svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); + return svsel (pred_alt, inv_y, y); + } diff --git a/glibc-upstream-2.39-158.patch b/glibc-upstream-2.39-158.patch new file mode 100644 index 0000000..0c02585 --- /dev/null +++ b/glibc-upstream-2.39-158.patch @@ -0,0 +1,301 @@ +commit 4073e4ee2c68de89b7220afba8d0780f86d9c60e +Author: Yat Long Poon +Date: Fri Jan 3 19:07:30 2025 +0000 + + AArch64: Improve codegen for SVE logs + + Reduce memory access by using lanewise MLA and moving constants to struct + and reduce number of MOVPRFXs. 
+ Update maximum ULP error for double log_sve from 1 to 2. + Speedup on Neoverse V1 for log (3%), log2 (5%), and log10 (4%). + + (cherry picked from commit 32d193a372feb28f9da247bb7283d404b84429c6) + +diff --git a/sysdeps/aarch64/fpu/log10_sve.c b/sysdeps/aarch64/fpu/log10_sve.c +index ab7362128d9b3ffb..f1cad2759a31a178 100644 +--- a/sysdeps/aarch64/fpu/log10_sve.c ++++ b/sysdeps/aarch64/fpu/log10_sve.c +@@ -23,28 +23,49 @@ + #define Min 0x0010000000000000 + #define Max 0x7ff0000000000000 + #define Thres 0x7fe0000000000000 /* Max - Min. */ +-#define Off 0x3fe6900900000000 + #define N (1 << V_LOG10_TABLE_BITS) + ++static const struct data ++{ ++ double c0, c2; ++ double c1, c3; ++ double invln10, log10_2; ++ double c4; ++ uint64_t off; ++} data = { ++ .c0 = -0x1.bcb7b1526e506p-3, ++ .c1 = 0x1.287a7636be1d1p-3, ++ .c2 = -0x1.bcb7b158af938p-4, ++ .c3 = 0x1.63c78734e6d07p-4, ++ .c4 = -0x1.287461742fee4p-4, ++ .invln10 = 0x1.bcb7b1526e50ep-2, ++ .log10_2 = 0x1.34413509f79ffp-2, ++ .off = 0x3fe6900900000000, ++}; ++ + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t special) ++special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, ++ svbool_t special, const struct data *d) + { +- return sv_call_f64 (log10, x, y, special); ++ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); ++ return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special); + } + +-/* SVE log10 algorithm. ++/* Double-precision SVE log10 routine. + Maximum measured error is 2.46 ulps. + SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 + want 0x1.fffbdf6eaa667p-6. */ + svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) + { ++ const struct data *d = ptr_barrier (&data); ++ + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ +- svuint64_t tmp = svsub_x (pg, ix, Off); ++ svuint64_t tmp = svsub_x (pg, ix, d->off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); +@@ -62,15 +83,19 @@ svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) + svfloat64_t r = svmad_x (pg, invc, z, -1.0); + + /* hi = log(c) + k*log(2). */ +- svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10); +- svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2); ++ svfloat64_t invln10_log10_2 = svld1rq_f64 (svptrue_b64 (), &d->invln10); ++ svfloat64_t w = svmla_lane_f64 (logc, r, invln10_log10_2, 0); ++ svfloat64_t hi = svmla_lane_f64 (w, k, invln10_log10_2, 1); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly); ++ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); ++ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); ++ y = svmla_x (pg, y, r2, d->c4); ++ y = svmla_x (pg, p, r2, y); + + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), +- special); ++ return special_case (hi, tmp, y, r2, special, d); + return svmla_x (pg, hi, r2, y); + } +diff --git a/sysdeps/aarch64/fpu/log2_sve.c b/sysdeps/aarch64/fpu/log2_sve.c +index 743fa2a91392093b..908e638246abc13d 100644 +--- a/sysdeps/aarch64/fpu/log2_sve.c ++++ b/sysdeps/aarch64/fpu/log2_sve.c +@@ -21,15 +21,32 @@ + #include "poly_sve_f64.h" + + #define N (1 << V_LOG2_TABLE_BITS) +-#define Off 0x3fe6900900000000 + #define Max (0x7ff0000000000000) + #define Min (0x0010000000000000) + #define Thresh (0x7fe0000000000000) /* Max - Min. */ + ++static const struct data ++{ ++ double c0, c2; ++ double c1, c3; ++ double invln2, c4; ++ uint64_t off; ++} data = { ++ .c0 = -0x1.71547652b83p-1, ++ .c1 = 0x1.ec709dc340953p-2, ++ .c2 = -0x1.71547651c8f35p-2, ++ .c3 = 0x1.2777ebe12dda5p-2, ++ .c4 = -0x1.ec738d616fe26p-3, ++ .invln2 = 0x1.71547652b82fep0, ++ .off = 0x3fe6900900000000, ++}; ++ + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) ++special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, ++ svbool_t special, const struct data *d) + { +- return sv_call_f64 (log2, x, y, cmp); ++ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); ++ return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special); + } + + /* Double-precision SVE log2 routine. +@@ -40,13 +57,15 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) + want 0x1.fffb34198d9ddp-5. */ + svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) + { ++ const struct data *d = ptr_barrier (&data); ++ + svuint64_t ix = svreinterpret_u64 (x); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ +- svuint64_t tmp = svsub_x (pg, ix, Off); ++ svuint64_t tmp = svsub_x (pg, ix, d->off); + svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS); + i = svand_x (pg, i, (N - 1) << 1); + svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); +@@ -59,15 +78,19 @@ svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ + ++ svfloat64_t invln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->invln2); + svfloat64_t r = svmad_x (pg, invc, z, -1.0); +- svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2); +- +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly); ++ svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0); + w = svadd_x (pg, k, w); + ++ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); ++ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); ++ y = svmla_lane_f64 (y, r2, invln2_and_c4, 1); ++ y = svmla_x (pg, p, r2, y); ++ + if (__glibc_unlikely (svptest_any (pg, special))) +- return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y), +- special); ++ return special_case (w, tmp, y, r2, special, d); + return svmla_x (pg, w, r2, y); + } +diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c +index 9b689f2ec7190338..044223400ba2463b 100644 +--- a/sysdeps/aarch64/fpu/log_sve.c ++++ b/sysdeps/aarch64/fpu/log_sve.c +@@ -19,39 +19,54 @@ + + #include "sv_math.h" + +-#define P(i) sv_f64 (__v_log_data.poly[i]) + #define N (1 << V_LOG_TABLE_BITS) +-#define Off (0x3fe6900900000000) +-#define MaxTop (0x7ff) +-#define MinTop (0x001) +-#define ThreshTop (0x7fe) /* MaxTop - MinTop. */ ++#define Max (0x7ff0000000000000) ++#define Min (0x0010000000000000) ++#define Thresh (0x7fe0000000000000) /* Max - Min. */ ++ ++static const struct data ++{ ++ double c0, c2; ++ double c1, c3; ++ double ln2, c4; ++ uint64_t off; ++} data = { ++ .c0 = -0x1.ffffffffffff7p-2, ++ .c1 = 0x1.55555555170d4p-2, ++ .c2 = -0x1.0000000399c27p-2, ++ .c3 = 0x1.999b2e90e94cap-3, ++ .c4 = -0x1.554e550bd501ep-3, ++ .ln2 = 0x1.62e42fefa39efp-1, ++ .off = 0x3fe6900900000000, ++}; + + static svfloat64_t NOINLINE +-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) ++special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, ++ svbool_t special, const struct data *d) + { +- return sv_call_f64 (log, x, y, cmp); ++ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); ++ return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special); + } + +-/* SVE port of AdvSIMD log algorithm. +- Maximum measured error is 2.17 ulp: +- SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 +- want 0x1.ffffff1cca045p-2. */ ++/* Double-precision SVE log routine. ++ Maximum measured error is 2.64 ulp: ++ SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6 ++ want 0x1.fffffffe88cafp+6. */ + svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) + { ++ const struct data *d = ptr_barrier (&data); ++ + svuint64_t ix = svreinterpret_u64 (x); +- svuint64_t top = svlsr_x (pg, ix, 52); +- svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop)); ++ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ +- svuint64_t tmp = svsub_x (pg, ix, Off); ++ svuint64_t tmp = svsub_x (pg, ix, d->off); + /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. + The actual value of i is double this due to table layout. 
*/ + svuint64_t i + = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); +- svint64_t k +- = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */ + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svfloat64_t z = svreinterpret_f64 (iz); + /* Lookup in 2 global lists (length N). */ +@@ -59,18 +74,22 @@ svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) + svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ +- svfloat64_t r = svmad_x (pg, invc, z, -1); +- svfloat64_t kd = svcvt_f64_x (pg, k); ++ svfloat64_t kd = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); + /* hi = r + log(c) + k*Ln2. */ +- svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2); ++ svfloat64_t ln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->ln2); ++ svfloat64_t r = svmad_x (pg, invc, z, -1); ++ svfloat64_t hi = svmla_lane_f64 (logc, kd, ln2_and_c4, 0); ++ hi = svadd_x (pg, r, hi); ++ + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ +- svfloat64_t r2 = svmul_x (pg, r, r); +- svfloat64_t y = svmla_x (pg, P (2), r, P (3)); +- svfloat64_t p = svmla_x (pg, P (0), r, P (1)); +- y = svmla_x (pg, y, r2, P (4)); ++ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); ++ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); ++ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); ++ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); ++ y = svmla_lane_f64 (y, r2, ln2_and_c4, 1); + y = svmla_x (pg, p, r2, y); + +- if (__glibc_unlikely (svptest_any (pg, cmp))) +- return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp); ++ if (__glibc_unlikely (svptest_any (pg, special))) ++ return special_case (hi, tmp, y, r2, special, d); + return svmla_x (pg, hi, r2, y); + } +diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps +index 1d52bf9ebf534f1a..10788b790a963918 100644 +--- a/sysdeps/aarch64/libm-test-ulps ++++ b/sysdeps/aarch64/libm-test-ulps +@@ -1316,7 +1316,7 @@ float: 2 + ldouble: 1 + + Function: "log_sve": +-double: 1 ++double: 2 + float: 3 + + Function: "log_towardzero": diff --git a/glibc-upstream-2.39-159.patch b/glibc-upstream-2.39-159.patch new file mode 100644 index 0000000..355d2e3 --- /dev/null +++ b/glibc-upstream-2.39-159.patch @@ -0,0 +1,87 @@ +commit 65a96a6f2bb9f6f6f896394662279d263d59cdd2 +Author: Wilco Dijkstra +Date: Wed Aug 7 14:43:47 2024 +0100 + + AArch64: Improve generic strlen + + Improve performance by handling another 16 bytes before entering the loop. + Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final + size computation to avoid increasing latency. On Neoverse V1 performance + of the random strlen benchmark improves by 4.6%. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7) + +diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S +index ab2a576cdb5665e5..352fb40d3abbb44b 100644 +--- a/sysdeps/aarch64/strlen.S ++++ b/sysdeps/aarch64/strlen.S +@@ -1,4 +1,5 @@ +-/* Copyright (C) 2012-2024 Free Software Foundation, Inc. ++/* Generic optimized strlen using SIMD. ++ Copyright (C) 2012-2024 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ +@@ -56,36 +57,50 @@ ENTRY (STRLEN) + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift +- cbz synd, L(loop) ++ cbz synd, L(next16) + + rbit synd, synd + clz result, synd + lsr result, result, 2 + ret + ++L(next16): ++ ldr data, [src, 16] ++ cmeq vhas_nul.16b, vdata.16b, 0 ++ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ ++ fmov synd, dend ++ cbz synd, L(loop) ++ add src, src, 16 ++#ifndef __AARCH64EB__ ++ rbit synd, synd ++#endif ++ sub result, src, srcin ++ clz tmp, synd ++ add result, result, tmp, lsr 2 ++ ret ++ + .p2align 5 + L(loop): +- ldr data, [src, 16] ++ ldr data, [src, 32]! + cmeq vhas_nul.16b, vdata.16b, 0 +- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b ++ addhn vend.8b, vhas_nul.8h, vhas_nul.8h + fmov synd, dend + cbnz synd, L(loop_end) +- ldr data, [src, 32]! ++ ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 +- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b ++ addhn vend.8b, vhas_nul.8h, vhas_nul.8h + fmov synd, dend + cbz synd, L(loop) +- sub src, src, 16 ++ add src, src, 16 + L(loop_end): +- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ +- sub result, src, srcin +- fmov synd, dend ++ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */ + #ifndef __AARCH64EB__ + rbit synd, synd ++ sub result, result, 3 + #endif +- add result, result, 16 + clz tmp, synd +- add result, result, tmp, lsr 2 ++ sub result, tmp, result ++ lsr result, result, 2 + ret + + END (STRLEN) diff --git a/glibc-upstream-2.39-160.patch b/glibc-upstream-2.39-160.patch new file mode 100644 index 0000000..eeeaa42 --- /dev/null +++ b/glibc-upstream-2.39-160.patch @@ -0,0 +1,282 @@ +commit dd1e63ab580d801926265007796f290b84747ec8 +Author: Wilco Dijkstra +Date: Mon Sep 9 15:26:47 2024 +0100 + + AArch64: Optimize memset + + Improve small memsets by avoiding branches and use overlapping stores. + Use DC ZVA for copies over 128 bytes. Remove unnecessary code for ZVA sizes + other than 64 and 128. Performance of random memset benchmark improves by 24% + on Neoverse N1. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8) + +diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S +index 7ef77ee8c926de21..caafb019e2b6217b 100644 +--- a/sysdeps/aarch64/memset.S ++++ b/sysdeps/aarch64/memset.S +@@ -1,4 +1,5 @@ +-/* Copyright (C) 2012-2024 Free Software Foundation, Inc. ++/* Generic optimized memset using SIMD. ++ Copyright (C) 2012-2024 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + +@@ -17,7 +18,6 @@ + . */ + + #include +-#include "memset-reg.h" + + #ifndef MEMSET + # define MEMSET memset +@@ -25,130 +25,132 @@ + + /* Assumptions: + * +- * ARMv8-a, AArch64, unaligned accesses ++ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * + */ + +-ENTRY (MEMSET) ++#define dstin x0 ++#define val x1 ++#define valw w1 ++#define count x2 ++#define dst x3 ++#define dstend x4 ++#define zva_val x5 ++#define off x3 ++#define dstend2 x5 + ++ENTRY (MEMSET) + PTR_ARG (0) + SIZE_ARG (2) + + dup v0.16B, valw ++ cmp count, 16 ++ b.lo L(set_small) ++ + add dstend, dstin, count ++ cmp count, 64 ++ b.hs L(set_128) + +- cmp count, 96 +- b.hi L(set_long) +- cmp count, 16 +- b.hs L(set_medium) +- mov val, v0.D[0] ++ /* Set 16..63 bytes. */ ++ mov off, 16 ++ and off, off, count, lsr 1 ++ sub dstend2, dstend, off ++ str q0, [dstin] ++ str q0, [dstin, off] ++ str q0, [dstend2, -16] ++ str q0, [dstend, -16] ++ ret + ++ .p2align 4 + /* Set 0..15 bytes. 
*/ +- tbz count, 3, 1f +- str val, [dstin] +- str val, [dstend, -8] +- ret +- nop +-1: tbz count, 2, 2f +- str valw, [dstin] +- str valw, [dstend, -4] ++L(set_small): ++ add dstend, dstin, count ++ cmp count, 4 ++ b.lo 2f ++ lsr off, count, 3 ++ sub dstend2, dstend, off, lsl 2 ++ str s0, [dstin] ++ str s0, [dstin, off, lsl 2] ++ str s0, [dstend2, -4] ++ str s0, [dstend, -4] + ret ++ ++ /* Set 0..3 bytes. */ + 2: cbz count, 3f ++ lsr off, count, 1 + strb valw, [dstin] +- tbz count, 1, 3f +- strh valw, [dstend, -2] ++ strb valw, [dstin, off] ++ strb valw, [dstend, -1] + 3: ret + +- /* Set 17..96 bytes. */ +-L(set_medium): +- str q0, [dstin] +- tbnz count, 6, L(set96) +- str q0, [dstend, -16] +- tbz count, 5, 1f +- str q0, [dstin, 16] +- str q0, [dstend, -32] +-1: ret +- + .p2align 4 +- /* Set 64..96 bytes. Write 64 bytes from the start and +- 32 bytes from the end. */ +-L(set96): +- str q0, [dstin, 16] ++L(set_128): ++ bic dst, dstin, 15 ++ cmp count, 128 ++ b.hi L(set_long) ++ stp q0, q0, [dstin] + stp q0, q0, [dstin, 32] ++ stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +- .p2align 3 +- nop ++ .p2align 4 + L(set_long): +- and valw, valw, 255 +- bic dst, dstin, 15 + str q0, [dstin] +- cmp count, 256 +- ccmp valw, 0, 0, cs +- b.eq L(try_zva) +-L(no_zva): +- sub count, dstend, dst /* Count is 16 too large. */ +- sub dst, dst, 16 /* Dst is biased by -32. */ +- sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +-1: stp q0, q0, [dst, 32] +- stp q0, q0, [dst, 64]! +-L(tail64): +- subs count, count, 64 +- b.hi 1b +-2: stp q0, q0, [dstend, -64] ++ str q0, [dst, 16] ++ tst valw, 255 ++ b.ne L(no_zva) ++#ifndef ZVA64_ONLY ++ mrs zva_val, dczid_el0 ++ and zva_val, zva_val, 31 ++ cmp zva_val, 4 /* ZVA size is 64 bytes. */ ++ b.ne L(zva_128) ++#endif ++ stp q0, q0, [dst, 32] ++ bic dst, dstin, 63 ++ sub count, dstend, dst /* Count is now 64 too large. */ ++ sub count, count, 64 + 64 /* Adjust count and bias for loop. */ ++ ++ /* Write last bytes before ZVA loop. */ ++ stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] ++ ++ .p2align 4 ++L(zva64_loop): ++ add dst, dst, 64 ++ dc zva, dst ++ subs count, count, 64 ++ b.hi L(zva64_loop) + ret + +-L(try_zva): +-#ifndef ZVA64_ONLY + .p2align 3 +- mrs tmp1, dczid_el0 +- tbnz tmp1w, 4, L(no_zva) +- and tmp1w, tmp1w, 15 +- cmp tmp1w, 4 /* ZVA size is 64 bytes. */ +- b.ne L(zva_128) +- nop +-#endif +- /* Write the first and last 64 byte aligned block using stp rather +- than using DC ZVA. This is faster on some cores. +- */ +- .p2align 4 +-L(zva_64): +- str q0, [dst, 16] ++L(no_zva): ++ sub count, dstend, dst /* Count is 32 too large. */ ++ sub count, count, 64 + 32 /* Adjust count and bias for loop. */ ++L(no_zva_loop): + stp q0, q0, [dst, 32] +- bic dst, dst, 63 + stp q0, q0, [dst, 64] +- stp q0, q0, [dst, 96] +- sub count, dstend, dst /* Count is now 128 too large. */ +- sub count, count, 128+64+64 /* Adjust count and bias for loop. */ +- add dst, dst, 128 +-1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 +- b.hi 1b +- stp q0, q0, [dst, 0] +- stp q0, q0, [dst, 32] ++ b.hi L(no_zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + + #ifndef ZVA64_ONLY +- .p2align 3 ++ .p2align 4 + L(zva_128): +- cmp tmp1w, 5 /* ZVA size is 128 bytes. */ +- b.ne L(zva_other) ++ cmp zva_val, 5 /* ZVA size is 128 bytes. */ ++ b.ne L(no_zva) + +- str q0, [dst, 16] + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + bic dst, dst, 127 + sub count, dstend, dst /* Count is now 128 too large. 
*/ +- sub count, count, 128+128 /* Adjust count and bias for loop. */ +- add dst, dst, 128 +-1: dc zva, dst +- add dst, dst, 128 ++ sub count, count, 128 + 128 /* Adjust count and bias for loop. */ ++1: add dst, dst, 128 ++ dc zva, dst + subs count, count, 128 + b.hi 1b + stp q0, q0, [dstend, -128] +@@ -156,35 +158,6 @@ L(zva_128): + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +- +-L(zva_other): +- mov tmp2w, 4 +- lsl zva_lenw, tmp2w, tmp1w +- add tmp1, zva_len, 64 /* Max alignment bytes written. */ +- cmp count, tmp1 +- blo L(no_zva) +- +- sub tmp2, zva_len, 1 +- add tmp1, dst, zva_len +- add dst, dst, 16 +- subs count, tmp1, dst /* Actual alignment bytes to write. */ +- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ +- beq 2f +-1: stp q0, q0, [dst], 64 +- stp q0, q0, [dst, -32] +- subs count, count, 64 +- b.hi 1b +-2: mov dst, tmp1 +- sub count, dstend, tmp1 /* Remaining bytes to write. */ +- subs count, count, zva_len +- b.lo 4f +-3: dc zva, dst +- add dst, dst, zva_len +- subs count, count, zva_len +- b.hs 3b +-4: add count, count, zva_len +- sub dst, dst, 32 /* Bias dst for tail loop. */ +- b L(tail64) + #endif + + END (MEMSET) diff --git a/glibc-upstream-2.39-161.patch b/glibc-upstream-2.39-161.patch new file mode 100644 index 0000000..6d902f1 --- /dev/null +++ b/glibc-upstream-2.39-161.patch @@ -0,0 +1,60 @@ +commit 0cd10047bf046a658f32e12833ccc42304b3b152 +Author: Wilco Dijkstra +Date: Mon Nov 25 18:43:08 2024 +0000 + + AArch64: Remove zva_128 from memset + + Remove ZVA 128 support from memset - the new memset no longer + guarantees count >= 256, which can result in underflow and a + crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA + size of 128 and its memcpy implementation was removed in commit + e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special + case too. + + [1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html + + Reviewed-by: Andrew Pinski + (cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d) + +diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S +index caafb019e2b6217b..71814d0b2f6dd3a7 100644 +--- a/sysdeps/aarch64/memset.S ++++ b/sysdeps/aarch64/memset.S +@@ -104,7 +104,7 @@ L(set_long): + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ +- b.ne L(zva_128) ++ b.ne L(no_zva) + #endif + stp q0, q0, [dst, 32] + bic dst, dstin, 63 +@@ -137,28 +137,5 @@ L(no_zva_loop): + stp q0, q0, [dstend, -32] + ret + +-#ifndef ZVA64_ONLY +- .p2align 4 +-L(zva_128): +- cmp zva_val, 5 /* ZVA size is 128 bytes. */ +- b.ne L(no_zva) +- +- stp q0, q0, [dst, 32] +- stp q0, q0, [dst, 64] +- stp q0, q0, [dst, 96] +- bic dst, dst, 127 +- sub count, dstend, dst /* Count is now 128 too large. */ +- sub count, count, 128 + 128 /* Adjust count and bias for loop. */ +-1: add dst, dst, 128 +- dc zva, dst +- subs count, count, 128 +- b.hi 1b +- stp q0, q0, [dstend, -128] +- stp q0, q0, [dstend, -96] +- stp q0, q0, [dstend, -64] +- stp q0, q0, [dstend, -32] +- ret +-#endif +- + END (MEMSET) + libc_hidden_builtin_def (MEMSET) diff --git a/glibc-upstream-2.39-162.patch b/glibc-upstream-2.39-162.patch new file mode 100644 index 0000000..d71b6b6 --- /dev/null +++ b/glibc-upstream-2.39-162.patch @@ -0,0 +1,29 @@ +commit 0cc12d9c47eb97d82c8f5af3724b4a4bc01df74a +Author: Wilco Dijkstra +Date: Wed Jul 24 15:17:47 2024 +0100 + + math: Improve layout of expf data + + GCC aligns global data to 16 bytes if their size is >= 16 bytes. 
This patch
+ changes the exp2f_data struct slightly so that the fields are better aligned.
+ As a result on targets that support them, load-pair instructions accessing
+ poly_scaled and invln2_scaled are now 16-byte aligned.
+
+ Reviewed-by: Adhemerval Zanella
+ (cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a)
+
+diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
+index 729f22cd4f7dd9e4..dc07ebd45977e511 100644
+--- a/sysdeps/ieee754/flt-32/math_config.h
++++ b/sysdeps/ieee754/flt-32/math_config.h
+@@ -166,9 +166,9 @@ extern const struct exp2f_data
+ uint64_t tab[1 << EXP2F_TABLE_BITS];
+ double shift_scaled;
+ double poly[EXP2F_POLY_ORDER];
+- double shift;
+ double invln2_scaled;
+ double poly_scaled[EXP2F_POLY_ORDER];
++ double shift;
+ } __exp2f_data attribute_hidden;
+
+ #define LOGF_TABLE_BITS 4
diff --git a/glibc-upstream-2.39-163.patch b/glibc-upstream-2.39-163.patch
new file mode 100644
index 0000000..3ec9e32
--- /dev/null
+++ b/glibc-upstream-2.39-163.patch
@@ -0,0 +1,189 @@
+commit d0e2133470d848e80eb4ba79ecd5d8c8b11fd2bb
+Author: Wilco Dijkstra
+Date: Tue Dec 24 18:01:59 2024 +0000
+
+ AArch64: Add SVE memset
+
+ Add SVE memset based on the generic memset with predicated load for sizes < 16.
+ Unaligned memsets of 128-1024 are improved by ~20% on average by using aligned
+ stores for the last 64 bytes. Performance of random memset benchmark improves
+ by ~2% on Neoverse V1.
+
+ Reviewed-by: Yury Khrustalev
+ (cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548)
+
+diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
+index e4720b746859f515..214b6137b0bc63a2 100644
+--- a/sysdeps/aarch64/multiarch/Makefile
++++ b/sysdeps/aarch64/multiarch/Makefile
+@@ -14,6 +14,7 @@ sysdep_routines += \
+ memset_generic \
+ memset_kunpeng \
+ memset_mops \
++ memset_sve_zva64 \
+ memset_zva64 \
+ strlen_asimd \
+ strlen_generic \
+diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+index ecd0f87de6a5b254..f8544fe3b525f775 100644
+--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+@@ -57,6 +57,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
+ #if HAVE_AARCH64_SVE_ASM
+ IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx)
++ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64)
+ #endif
+ IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
+index 34bce045dd64ba9b..9d98664e6bc32212 100644
+--- a/sysdeps/aarch64/multiarch/memset.c
++++ b/sysdeps/aarch64/multiarch/memset.c
+@@ -34,6 +34,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
+ extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
+ extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
+ extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
++extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden;
+
+ static inline __typeof (__redirect_memset) *
+ select_memset_ifunc (void)
+@@ -47,6 +48,9 @@ select_memset_ifunc (void)
+ {
+ if (IS_A64FX (midr) && zva_size == 256)
+ return __memset_a64fx;
++
++ if (zva_size == 64)
++ return __memset_sve_zva64;
+ }
+
+ if (IS_KUNPENG920 
(midr)) +diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S +new file mode 100644 +index 0000000000000000..7fb40fdd9e927bb3 +--- /dev/null ++++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S +@@ -0,0 +1,123 @@ ++/* Optimized memset for SVE. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . */ ++ ++#include ++ ++/* Assumptions: ++ * ++ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. ++ * ZVA size is 64. ++ */ ++ ++#if HAVE_AARCH64_SVE_ASM ++ ++.arch armv8.2-a+sve ++ ++#define dstin x0 ++#define val x1 ++#define valw w1 ++#define count x2 ++#define dst x3 ++#define dstend x4 ++#define zva_val x5 ++#define vlen x5 ++#define off x3 ++#define dstend2 x5 ++ ++ENTRY (__memset_sve_zva64) ++ dup v0.16B, valw ++ cmp count, 16 ++ b.lo L(set_16) ++ ++ add dstend, dstin, count ++ cmp count, 64 ++ b.hs L(set_128) ++ ++ /* Set 16..63 bytes. */ ++ mov off, 16 ++ and off, off, count, lsr 1 ++ sub dstend2, dstend, off ++ str q0, [dstin] ++ str q0, [dstin, off] ++ str q0, [dstend2, -16] ++ str q0, [dstend, -16] ++ ret ++ ++ .p2align 4 ++L(set_16): ++ whilelo p0.b, xzr, count ++ st1b z0.b, p0, [dstin] ++ ret ++ ++ .p2align 4 ++L(set_128): ++ bic dst, dstin, 15 ++ cmp count, 128 ++ b.hi L(set_long) ++ stp q0, q0, [dstin] ++ stp q0, q0, [dstin, 32] ++ stp q0, q0, [dstend, -64] ++ stp q0, q0, [dstend, -32] ++ ret ++ ++ .p2align 4 ++L(set_long): ++ cmp count, 256 ++ b.lo L(no_zva) ++ tst valw, 255 ++ b.ne L(no_zva) ++ ++ str q0, [dstin] ++ str q0, [dst, 16] ++ bic dst, dstin, 31 ++ stp q0, q0, [dst, 32] ++ bic dst, dstin, 63 ++ sub count, dstend, dst /* Count is now 64 too large. */ ++ sub count, count, 128 /* Adjust count and bias for loop. */ ++ ++ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */ ++ bic x8, x8, 15 ++ stp q0, q0, [x8, -48] ++ str q0, [x8, -16] ++ str q0, [dstend, -16] ++ ++ .p2align 4 ++L(zva64_loop): ++ add dst, dst, 64 ++ dc zva, dst ++ subs count, count, 64 ++ b.hi L(zva64_loop) ++ ret ++ ++L(no_zva): ++ str q0, [dstin] ++ sub count, dstend, dst /* Count is 16 too large. */ ++ sub count, count, 64 + 16 /* Adjust count and bias for loop. */ ++L(no_zva_loop): ++ stp q0, q0, [dst, 16] ++ stp q0, q0, [dst, 48] ++ add dst, dst, 64 ++ subs count, count, 64 ++ b.hi L(no_zva_loop) ++ stp q0, q0, [dstend, -64] ++ stp q0, q0, [dstend, -32] ++ ret ++ ++END (__memset_sve_zva64) ++#endif diff --git a/glibc-upstream-2.39-164.patch b/glibc-upstream-2.39-164.patch new file mode 100644 index 0000000..4c47935 --- /dev/null +++ b/glibc-upstream-2.39-164.patch @@ -0,0 +1,24 @@ +commit a1b09e59e2de9a5634a864e1a915f9f46e2cdd3a +Author: Wilco Dijkstra +Date: Thu Feb 27 16:28:52 2025 +0000 + + AArch64: Use prefer_sve_ifuncs for SVE memset + + Use prefer_sve_ifuncs for SVE memset just like memcpy. 
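+
+    As an illustrative sketch only (mirroring the one-line change in the
+    memset.c hunk below), the ifunc selector now requires the SVE preference
+    flag -- the same CPU-feature knob the memcpy selector already consults --
+    in addition to the 64-byte ZVA size:
+
+      /* In select_memset_ifunc (); both conditions must now hold.  */
+      if (prefer_sve_ifuncs && zva_size == 64)
+        return __memset_sve_zva64;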
+ + Reviewed-by: Yury Khrustalev + (cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4) + +diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c +index 9d98664e6bc32212..161624fe6028d9e9 100644 +--- a/sysdeps/aarch64/multiarch/memset.c ++++ b/sysdeps/aarch64/multiarch/memset.c +@@ -49,7 +49,7 @@ select_memset_ifunc (void) + if (IS_A64FX (midr) && zva_size == 256) + return __memset_a64fx; + +- if (zva_size == 64) ++ if (prefer_sve_ifuncs && zva_size == 64) + return __memset_sve_zva64; + } + diff --git a/glibc-upstream-2.39-165.patch b/glibc-upstream-2.39-165.patch new file mode 100644 index 0000000..8410684 --- /dev/null +++ b/glibc-upstream-2.39-165.patch @@ -0,0 +1,43 @@ +commit dd8c0c3bbd4e22e00a7275c75dc0d40f24bb0d68 +Author: Wilco Dijkstra +Date: Fri Dec 13 15:43:07 2024 +0000 + + math: Improve layout of exp/exp10 data + + GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch + changes the exp_data struct slightly so that the fields are better aligned + and without gaps. As a result on targets that support them, more load-pair + instructions are used in exp. Exp10 is improved by moving invlog10_2N later + so that neglog10_2hiN and neglog10_2loN can be loaded using load-pair. + + The exp benchmark improves 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on + Neoverse V2. Exp10 improves by 1.5%. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7) + +diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h +index ef87cfa6be9860e3..05515fd95ad15d52 100644 +--- a/sysdeps/ieee754/dbl-64/math_config.h ++++ b/sysdeps/ieee754/dbl-64/math_config.h +@@ -195,16 +195,18 @@ check_uflow (double x) + extern const struct exp_data + { + double invln2N; +- double shift; + double negln2hiN; + double negln2loN; + double poly[4]; /* Last four coefficients. */ ++ double shift; ++ + double exp2_shift; + double exp2_poly[EXP2_POLY_ORDER]; +- double invlog10_2N; ++ + double neglog10_2hiN; + double neglog10_2loN; + double exp10_poly[5]; ++ double invlog10_2N; + uint64_t tab[2*(1 << EXP_TABLE_BITS)]; + } __exp_data attribute_hidden; + diff --git a/glibc-upstream-2.39-166.patch b/glibc-upstream-2.39-166.patch new file mode 100644 index 0000000..1fb60eb --- /dev/null +++ b/glibc-upstream-2.39-166.patch @@ -0,0 +1,54 @@ +commit e1fe22368e4fbc13ce300d89802b7fcc0d5cfb38 +Author: Michael Jeanson +Date: Fri Feb 14 13:54:22 2025 -0500 + + nptl: clear the whole rseq area before registration + + Due to the extensible nature of the rseq area we can't explicitly + initialize fields that are not part of the ABI yet. It was agreed with + upstream that all new fields will be documented as zero initialized by + userspace. Future kernels configured with CONFIG_DEBUG_RSEQ will + validate the content of all fields during registration. + + Replace the explicit field initialization with a memset of the whole + rseq area which will cover fields as they are added to future kernels.
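+
+    A minimal sketch of the resulting registration path (names as in the
+    rseq-internal.h hunk below; RSEQ_AREA_SIZE_INITIAL is the original
+    32-byte ABI size):
+
+      size = RSEQ_AREA_SIZE_INITIAL;
+      /* Zero the whole area, covering fields future kernels may add.  */
+      memset (&self->rseq_area, 0, size);
+      /* Sentinel the kernel checks at registration under CONFIG_DEBUG_RSEQ.  */
+      THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED);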
+ + Signed-off-by: Michael Jeanson + Reviewed-by: Florian Weimer + (cherry picked from commit 689a62a4217fae78b9ce0db781dc2a421f2b1ab4) + +diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c +index 7803e19fd16ad803..ed10185e3708f4b6 100644 +--- a/sysdeps/nptl/dl-tls_init_tp.c ++++ b/sysdeps/nptl/dl-tls_init_tp.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #define TUNABLE_NAMESPACE pthread + #include +diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h +index ef3eab1fefd4d90d..76de2b7ff079eb0f 100644 +--- a/sysdeps/unix/sysv/linux/rseq-internal.h ++++ b/sysdeps/unix/sysv/linux/rseq-internal.h +@@ -52,13 +52,12 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq) + but still expected size 32. */ + size = RSEQ_AREA_SIZE_INITIAL; + +- /* Initialize the rseq fields that are read by the kernel on +- registration, there is no guarantee that struct pthread is +- cleared on all architectures. */ ++ /* Initialize the whole rseq area to zero prior to registration. */ ++ memset (&self->rseq_area, 0, size); ++ ++ /* Set the cpu_id field to RSEQ_CPU_ID_UNINITIALIZED, this is checked by ++ the kernel at registration when CONFIG_DEBUG_RSEQ is enabled. */ + THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED); +- THREAD_SETMEM (self, rseq_area.cpu_id_start, 0); +- THREAD_SETMEM (self, rseq_area.rseq_cs, 0); +- THREAD_SETMEM (self, rseq_area.flags, 0); + + int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area, + size, 0, RSEQ_SIG); diff --git a/glibc-upstream-2.39-167.patch b/glibc-upstream-2.39-167.patch new file mode 100644 index 0000000..ef92f21 --- /dev/null +++ b/glibc-upstream-2.39-167.patch @@ -0,0 +1,193 @@ +commit 7ecf0d3bde54e4f9e6f025d2f43eff565ed97414 +Author: H.J. Lu +Date: Thu Apr 4 15:43:50 2024 -0700 + + x86-64: Exclude FMA4 IFUNC functions for -mapxf + + When -mapxf is used to build glibc, the resulting glibc will never run + on FMA4 machines. Exclude FMA4 IFUNC functions when -mapxf is used. + This requires GCC which defines __APX_F__ for -mapxf with commit: + + 1df56719bd8 x86: Define __APX_F__ for -mapxf + + Reviewed-by: Sunil K Pandey + (cherry picked from commit 9e1f4aef865ddeffeb4b5f6578fefab606783120) + +diff --git a/config.h.in b/config.h.in +index 1e647de58580bc2d..a5fdea0c3c7b070e 100644 +--- a/config.h.in ++++ b/config.h.in +@@ -295,4 +295,7 @@ + /* Define if -mmovbe is enabled by default on x86. */ + #undef HAVE_X86_MOVBE + ++/* Define if -mapxf is enabled by default on x86. */ ++#undef HAVE_X86_APX ++ + #endif +diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure +index 04a534fa126a7bf7..07bdd40a37247c7b 100755 +--- a/sysdeps/x86_64/configure ++++ b/sysdeps/x86_64/configure +@@ -162,6 +162,38 @@ printf "%s\n" "$libc_cv_x86_have_amx_tile" >&6; } + config_vars="$config_vars + have-mamx-tile = $libc_cv_x86_have_amx_tile" + ++# Check if -mapxf is enabled. ++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mapxf is enabled" >&5 ++printf %s "checking whether -mapxf is enabled... " >&6; } ++if test ${libc_cv_x86_have_apx+y} ++then : ++ printf %s "(cached) " >&6 ++else $as_nop ++ cat > conftest.c <&5 ++ (eval $ac_try) 2>&5 ++ ac_status=$? ++ printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 ++ test $ac_status = 0; }; }; then ++ libc_cv_x86_have_apx=yes ++ fi ++ rm -rf conftest* ++fi ++{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x86_have_apx" >&5 ++printf "%s\n" "$libc_cv_x86_have_apx" >&6; } ++if test $libc_cv_x86_have_apx = yes; then ++ printf "%s\n" "#define HAVE_X86_APX 1" >>confdefs.h ++ ++fi ++config_vars="$config_vars ++have-x86-apx = $libc_cv_x86_have_apx" ++ + test -n "$critic_missing" && as_fn_error $? " + *** $critic_missing" "$LINENO" 5 + +diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac +index c714c47351e70390..c7b68544a2c79ae5 100644 +--- a/sysdeps/x86_64/configure.ac ++++ b/sysdeps/x86_64/configure.ac +@@ -76,5 +76,23 @@ EOF + rm -rf conftest*]) + LIBC_CONFIG_VAR([have-mamx-tile], [$libc_cv_x86_have_amx_tile]) + ++# Check if -mapxf is enabled. ++AC_CACHE_CHECK(whether -mapxf is enabled, ++ libc_cv_x86_have_apx, [dnl ++cat > conftest.c <&AS_MESSAGE_LOG_FD); then ++ libc_cv_x86_have_apx=yes ++ fi ++ rm -rf conftest*]) ++if test $libc_cv_x86_have_apx = yes; then ++ AC_DEFINE(HAVE_X86_APX) ++fi ++LIBC_CONFIG_VAR([have-x86-apx], [$libc_cv_x86_have_apx]) ++ + test -n "$critic_missing" && AC_MSG_ERROR([ + *** $critic_missing]) +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index 6ddd50240ce33d22..cbe09d49f49581f1 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -38,29 +38,36 @@ libm-sysdep_routines += \ + s_truncf-avx \ + # libm-sysdep_routines + else ++ifeq (no,$(have-x86-apx)) + libm-sysdep_routines += \ +- e_asin-fma \ + e_asin-fma4 \ ++ e_atan2-fma4 \ ++ e_exp-fma4 \ ++ e_log-fma4 \ ++ e_pow-fma4 \ ++ s_atan-fma4 \ ++ s_sin-fma4 \ ++ s_sincos-fma4 \ ++ s_tan-fma4 \ ++# libm-sysdep_routines ++endif ++libm-sysdep_routines += \ ++ e_asin-fma \ + e_atan2-avx \ + e_atan2-fma \ +- e_atan2-fma4 \ + e_exp-avx \ + e_exp-fma \ +- e_exp-fma4 \ + e_exp2f-fma \ + e_expf-fma \ + e_log-avx \ + e_log-fma \ +- e_log-fma4 \ + e_log2-fma \ + e_log2f-fma \ + e_logf-fma \ + e_pow-fma \ +- e_pow-fma4 \ + e_powf-fma \ + s_atan-avx \ + s_atan-fma \ +- s_atan-fma4 \ + s_ceil-sse4_1 \ + s_ceilf-sse4_1 \ + s_cosf-fma \ +@@ -77,17 +84,14 @@ libm-sysdep_routines += \ + s_roundevenf-sse4_1 \ + s_sin-avx \ + s_sin-fma \ +- s_sin-fma4 \ + s_sincos-avx \ + s_sincos-fma \ +- s_sincos-fma4 \ + s_sincosf-fma \ + s_sincosf-sse2 \ + s_sinf-fma \ + s_sinf-sse2 \ + s_tan-avx \ + s_tan-fma \ +- s_tan-fma4 \ + s_trunc-sse4_1 \ + s_truncf-sse4_1 \ + # libm-sysdep_routines +diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h +index 7719188888fbec38..d126cf9cd5ae55e4 100644 +--- a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h ++++ b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h +@@ -33,8 +33,10 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + return OPTIMIZE (fma); + ++#ifndef HAVE_X86_APX + if (CPU_FEATURE_USABLE_P (cpu_features, FMA4)) + return OPTIMIZE (fma4); ++#endif + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) + return OPTIMIZE (avx); +diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h +index c35ba13845b7914b..18d372d25cb598f2 100644 +--- a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h ++++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h +@@ -32,8 +32,10 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + return OPTIMIZE (fma); + ++#ifndef HAVE_X86_APX + if (CPU_FEATURE_USABLE_P (cpu_features, FMA4)) + 
return OPTIMIZE (fma4); ++#endif + + return OPTIMIZE (sse2); + } diff --git a/glibc-upstream-2.39-168.patch b/glibc-upstream-2.39-168.patch new file mode 100644 index 0000000..565eb14 --- /dev/null +++ b/glibc-upstream-2.39-168.patch @@ -0,0 +1,106 @@ +commit 0edcc77fe7e13b29d99e7f4d7fe3373b3666468e +Author: Sunil K Pandey +Date: Mon Mar 10 10:24:07 2025 -0700 + + x86_64: Add tanh with FMA + + On Skylake, it improves tanh bench performance by: + + Before After Improvement + max 110.89 95.826 14% + min 20.966 20.157 4% + mean 30.9601 29.8431 4% + + Reviewed-by: H.J. Lu + (cherry picked from commit c6352111c72a20b3588ae304dd99b63e25dd6d85) + +diff --git a/sysdeps/ieee754/dbl-64/s_tanh.c b/sysdeps/ieee754/dbl-64/s_tanh.c +index 673a97102de292fd..13063db04ebb198c 100644 +--- a/sysdeps/ieee754/dbl-64/s_tanh.c ++++ b/sysdeps/ieee754/dbl-64/s_tanh.c +@@ -46,6 +46,11 @@ static char rcsid[] = "$NetBSD: s_tanh.c,v 1.7 1995/05/10 20:48:22 jtc Exp $"; + + static const double one = 1.0, two = 2.0, tiny = 1.0e-300; + ++#ifndef SECTION ++# define SECTION ++#endif ++ ++SECTION + double + __tanh (double x) + { +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index cbe09d49f49581f1..0f69f7089c06af73 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -10,6 +10,7 @@ CFLAGS-s_expm1-fma.c = -mfma -mavx2 + CFLAGS-s_log1p-fma.c = -mfma -mavx2 + CFLAGS-s_sin-fma.c = -mfma -mavx2 + CFLAGS-s_tan-fma.c = -mfma -mavx2 ++CFLAGS-s_tanh-fma.c = -mfma -mavx2 + CFLAGS-s_sincos-fma.c = -mfma -mavx2 + + CFLAGS-e_exp2f-fma.c = -mfma -mavx2 +@@ -92,6 +93,7 @@ libm-sysdep_routines += \ + s_sinf-sse2 \ + s_tan-avx \ + s_tan-fma \ ++ s_tanh-fma \ + s_trunc-sse4_1 \ + s_truncf-sse4_1 \ + # libm-sysdep_routines +diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c +new file mode 100644 +index 0000000000000000..1b808b1227f50cf5 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c +@@ -0,0 +1,11 @@ ++#define __tanh __tanh_fma ++#define __expm1 __expm1_fma ++ ++/* NB: __expm1 may be expanded to __expm1_fma in the following ++ prototypes. */ ++extern long double __expm1l (long double); ++extern long double __expm1f128 (long double); ++ ++#define SECTION __attribute__ ((section (".text.fma"))) ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh.c b/sysdeps/x86_64/fpu/multiarch/s_tanh.c +new file mode 100644 +index 0000000000000000..5539b6c61c63548d +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_tanh.c +@@ -0,0 +1,31 @@ ++/* Multiple versions of tanh. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL ++ ++extern double __redirect_tanh (double); ++ ++# define SYMBOL_NAME tanh ++# include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_tanh, __tanh, IFUNC_SELECTOR ()); ++ ++# define __tanh __tanh_sse2 ++#endif ++#include diff --git a/glibc-upstream-2.39-169.patch b/glibc-upstream-2.39-169.patch new file mode 100644 index 0000000..d1df5b9 --- /dev/null +++ b/glibc-upstream-2.39-169.patch @@ -0,0 +1,130 @@ +commit 01ed435e2ee8df18f107ac9d999e1c4db922f564 +Author: Sunil K Pandey +Date: Sat Mar 8 08:51:10 2025 -0800 + + x86_64: Add sinh with FMA + + On SPR, it improves sinh bench performance by: + + Before After Improvement + reciprocal-throughput 14.2017 11.815 17% + latency 36.4917 35.2114 4% + + Reviewed-by: H.J. Lu + (cherry picked from commit dded0d20f67ba1925ccbcb9cf28f0c75febe0dbe) + +diff --git a/benchtests/sinh-inputs b/benchtests/sinh-inputs +index 7b1ac46a39c0a0b0..2fcb2fabf82ce778 100644 +--- a/benchtests/sinh-inputs ++++ b/benchtests/sinh-inputs +@@ -1,6 +1,7 @@ + ## args: double + ## ret: double + ## includes: math.h ++## name: workload-random + 0x1.bcb6129b5ff2bp8 + -0x1.63057386325ebp9 + 0x1.62f1d7dc4e8bfp9 +diff --git a/sysdeps/ieee754/dbl-64/e_sinh.c b/sysdeps/ieee754/dbl-64/e_sinh.c +index b4b5857dddf90f7a..3f787967f93d72f0 100644 +--- a/sysdeps/ieee754/dbl-64/e_sinh.c ++++ b/sysdeps/ieee754/dbl-64/e_sinh.c +@@ -41,6 +41,11 @@ static char rcsid[] = "$NetBSD: e_sinh.c,v 1.7 1995/05/10 20:46:13 jtc Exp $"; + + static const double one = 1.0, shuge = 1.0e307; + ++#ifndef SECTION ++# define SECTION ++#endif ++ ++SECTION + double + __ieee754_sinh (double x) + { +@@ -90,4 +95,7 @@ __ieee754_sinh (double x) + /* |x| > overflowthresold, sinh(x) overflow */ + return math_narrow_eval (x * shuge); + } ++ ++#ifndef __ieee754_sinh + libm_alias_finite (__ieee754_sinh, __sinh) ++#endif +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index 0f69f7089c06af73..b527cab8d134be21 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -5,6 +5,7 @@ CFLAGS-e_exp-fma.c = -mfma -mavx2 + CFLAGS-e_log-fma.c = -mfma -mavx2 + CFLAGS-e_log2-fma.c = -mfma -mavx2 + CFLAGS-e_pow-fma.c = -mfma -mavx2 ++CFLAGS-e_sinh-fma.c = -mfma -mavx2 + CFLAGS-s_atan-fma.c = -mfma -mavx2 + CFLAGS-s_expm1-fma.c = -mfma -mavx2 + CFLAGS-s_log1p-fma.c = -mfma -mavx2 +@@ -67,6 +68,7 @@ libm-sysdep_routines += \ + e_logf-fma \ + e_pow-fma \ + e_powf-fma \ ++ e_sinh-fma \ + s_atan-avx \ + s_atan-fma \ + s_ceil-sse4_1 \ +diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c +new file mode 100644 +index 0000000000000000..e0e1e39a7a606dc8 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c +@@ -0,0 +1,12 @@ ++#define __ieee754_sinh __ieee754_sinh_fma ++#define __ieee754_exp __ieee754_exp_fma ++#define __expm1 __expm1_fma ++ ++/* NB: __expm1 may be expanded to __expm1_fma in the following ++ prototypes. */ ++extern long double __expm1l (long double); ++extern long double __expm1f128 (long double); ++ ++#define SECTION __attribute__ ((section (".text.fma"))) ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh.c b/sysdeps/x86_64/fpu/multiarch/e_sinh.c +new file mode 100644 +index 0000000000000000..3d3c18ccdf1d437a +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_sinh.c +@@ -0,0 +1,35 @@ ++/* Multiple versions of sinh. ++ Copyright (C) 2025 Free Software Foundation, Inc. 
++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL ++# include ++ ++extern double __redirect_ieee754_sinh (double); ++ ++# define SYMBOL_NAME ieee754_sinh ++# include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_ieee754_sinh, __ieee754_sinh, ++ IFUNC_SELECTOR ()); ++ ++libm_alias_finite (__ieee754_sinh, __sinh) ++ ++# define __ieee754_sinh __ieee754_sinh_sse2 ++#endif ++#include diff --git a/glibc-upstream-2.39-170.patch b/glibc-upstream-2.39-170.patch new file mode 100644 index 0000000..8ea22b1 --- /dev/null +++ b/glibc-upstream-2.39-170.patch @@ -0,0 +1,123 @@ +commit 4cf3f9df544a6f3dc27ea097b43bd2fb73113c3f +Author: Sunil K Pandey +Date: Wed Mar 5 16:13:38 2025 -0800 + + x86_64: Add atanh with FMA + + On SPR, it improves atanh bench performance by: + + Before After Improvement + reciprocal-throughput 15.1715 14.8628 2% + latency 57.1941 56.1883 2% + + Reviewed-by: H.J. Lu + (cherry picked from commit c7c4a5906f326f1290b1c2413a83c530564ec4b8) + +diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs +index 455aa65b6500bccb..498529325436d48f 100644 +--- a/benchtests/atanh-inputs ++++ b/benchtests/atanh-inputs +@@ -1,6 +1,7 @@ + ## args: double + ## ret: double + ## includes: math.h ++## name: workload-random + 0x1.5a2730bacd94ap-1 + -0x1.b57eb40fc048ep-21 + -0x1.c0b185fb450e2p-17 +diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c +index 11a2a45799d09f63..05ac0a1b30c164a7 100644 +--- a/sysdeps/ieee754/dbl-64/e_atanh.c ++++ b/sysdeps/ieee754/dbl-64/e_atanh.c +@@ -44,6 +44,11 @@ + + static const double huge = 1e300; + ++#ifndef SECTION ++# define SECTION ++#endif ++ ++SECTION + double + __ieee754_atanh (double x) + { +@@ -73,4 +78,7 @@ __ieee754_atanh (double x) + + return copysign (t, x); + } ++ ++#ifndef __ieee754_atanh + libm_alias_finite (__ieee754_atanh, __atanh) ++#endif +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index b527cab8d134be21..bc479b42d279825b 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -1,6 +1,7 @@ + ifeq ($(subdir),math) + CFLAGS-e_asin-fma.c = -mfma -mavx2 + CFLAGS-e_atan2-fma.c = -mfma -mavx2 ++CFLAGS-e_atanh-fma.c = -mfma -mavx2 + CFLAGS-e_exp-fma.c = -mfma -mavx2 + CFLAGS-e_log-fma.c = -mfma -mavx2 + CFLAGS-e_log2-fma.c = -mfma -mavx2 +@@ -57,6 +58,7 @@ libm-sysdep_routines += \ + e_asin-fma \ + e_atan2-avx \ + e_atan2-fma \ ++ e_atanh-fma \ + e_exp-avx \ + e_exp-fma \ + e_exp2f-fma \ +diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c +new file mode 100644 +index 0000000000000000..c3f2f9e5506ae363 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c +@@ -0,0 +1,6 @@ ++#define __ieee754_atanh __ieee754_atanh_fma ++#define __log1p __log1p_fma ++ ++#define 
SECTION __attribute__ ((section (".text.fma"))) ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh.c b/sysdeps/x86_64/fpu/multiarch/e_atanh.c +new file mode 100644 +index 0000000000000000..d2b785dfc0268df8 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c +@@ -0,0 +1,34 @@ ++/* Multiple versions of atanh. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL ++# include ++ ++extern double __redirect_ieee754_atanh (double); ++ ++# define SYMBOL_NAME ieee754_atanh ++# include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ()); ++ ++libm_alias_finite (__ieee754_atanh, __atanh) ++ ++# define __ieee754_atanh __ieee754_atanh_sse2 ++#endif ++#include diff --git a/glibc-upstream-2.39-171.patch b/glibc-upstream-2.39-171.patch new file mode 100644 index 0000000..bdeedbe --- /dev/null +++ b/glibc-upstream-2.39-171.patch @@ -0,0 +1,47 @@ +commit 60cd7123a6c4441a509c22cc1d5da60df2c1dfeb +Author: Florian Weimer +Date: Fri Mar 28 09:26:06 2025 +0100 + + x86: Skip XSAVE state size reset if ISA level requires XSAVE + + If we have to use XSAVE or XSAVEC trampolines, do not adjust the size + information they need. Technically, it is an operator error to try to + run with -XSAVE,-XSAVEC on such builds, but this change here disables + some unnecessary code with higher ISA levels and simplifies testing. + + Related to commit befe2d3c4dec8be2cdd01a47132e47bdb7020922 + ("x86-64: Don't use SSE resolvers for ISA level 3 or above"). + + Reviewed-by: H.J. Lu + (cherry picked from commit 59585ddaa2d44f22af04bb4b8bd4ad1e302c4c02) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 3d7c2819d7cc6643..4c535970d10a2d67 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) + attribute_hidden; +@@ -1092,6 +1093,9 @@ no_cpuid: + TUNABLE_CALLBACK (set_prefer_map_32bit_exec)); + #endif + ++ /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build ++ requires AVX and therefore XSAVE or XSAVEC support. 
*/ ++#ifndef GCCMACRO__AVX__ + bool disable_xsave_features = false; + + if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE)) +@@ -1145,6 +1149,7 @@ no_cpuid: + + CPU_FEATURE_UNSET (cpu_features, FMA4); + } ++#endif + + #ifdef __x86_64__ + GLRO(dl_hwcap) = HWCAP_X86_64; diff --git a/glibc-upstream-2.39-172.patch b/glibc-upstream-2.39-172.patch new file mode 100644 index 0000000..f850918 --- /dev/null +++ b/glibc-upstream-2.39-172.patch @@ -0,0 +1,183 @@ +commit 87ab0c7f7f7c4bc16cda782c703b61cd28f383a3 +Author: Florian Weimer +Date: Fri Mar 28 09:26:59 2025 +0100 + + x86: Use separate variable for TLSDESC XSAVE/XSAVEC state size (bug 32810) + + Previously, the initialization code reused the xsave_state_full_size + member of struct cpu_features for the TLSDESC state size. However, + the tunable processing code assumes that this member has the + original XSAVE (non-compact) state size, so that it can use its + value if XSAVEC is disabled via tunable. + + This change uses a separate variable and not a struct member because + the value is only needed in ld.so and the static libc, but not in + libc.so. As a result, struct cpu_features layout does not change, + helping a future backport of this change. + + Fixes commit 9b7091415af47082664717210ac49d51551456ab ("x86-64: + Update _dl_tlsdesc_dynamic to preserve AMX registers"). + + Reviewed-by: H.J. Lu + (cherry picked from commit 145097dff170507fe73190e8e41194f5b5f7e6bf) + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 5311b594aff62f7c..8819fba1b7164f45 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -21,6 +21,9 @@ tests += \ + tst-cpu-features-supports-static \ + tst-get-cpu-features \ + tst-get-cpu-features-static \ ++ tst-gnu2-tls2-x86-noxsave \ ++ tst-gnu2-tls2-x86-noxsavec \ ++ tst-gnu2-tls2-x86-noxsavexsavec \ + tst-hwcap-tunables \ + # tests + tests-static += \ +@@ -91,6 +94,22 @@ CFLAGS-tst-gnu2-tls2.c += -msse + CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell + CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell + CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell ++ ++LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy ++LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy ++LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy ++ ++# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled ++# via tunable. ++tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE ++tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC ++tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC ++$(objpfx)tst-gnu2-tls2-x86-noxsave.out \ ++$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \ ++$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \ ++ $(objpfx)tst-gnu2-tls2mod0.so \ ++ $(objpfx)tst-gnu2-tls2mod1.so \ ++ $(objpfx)tst-gnu2-tls2mod2.so + endif + + ifeq ($(subdir),math) +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 4c535970d10a2d67..3be69558a4c3aa2d 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -84,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *) + # include + #endif + ++unsigned long int _dl_x86_features_tlsdesc_state_size; ++ + static void + update_active (struct cpu_features *cpu_features) + { +@@ -318,6 +320,7 @@ update_active (struct cpu_features *cpu_features) + = xsave_state_full_size; + cpu_features->xsave_state_full_size + = xsave_state_full_size; ++ _dl_x86_features_tlsdesc_state_size = xsave_state_full_size; + + /* Check if XSAVEC is available. 
*/ + if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC)) +@@ -406,11 +409,9 @@ update_active (struct cpu_features *cpu_features) + = ALIGN_UP ((amx_size + + TLSDESC_CALL_REGISTER_SAVE_AREA), + 64); +- /* Set xsave_state_full_size to the compact AMX +- state size for XSAVEC. NB: xsave_state_full_size +- is only used in _dl_tlsdesc_dynamic_xsave and +- _dl_tlsdesc_dynamic_xsavec. */ +- cpu_features->xsave_state_full_size = amx_size; ++ /* Set TLSDESC state size to the compact AMX ++ state size for XSAVEC. */ ++ _dl_x86_features_tlsdesc_state_size = amx_size; + #endif + cpu_features->xsave_state_size + = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 89da7a03daa665f6..a72ba61d837c6383 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + /* Update xsave_state_size to XSAVE state size. */ + cpu_features->xsave_state_size + = cpu_features->xsave_state_full_size; ++ _dl_x86_features_tlsdesc_state_size ++ = cpu_features->xsave_state_full_size; + CPU_FEATURE_UNSET (cpu_features, XSAVEC); + } + } +diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c +index c76ea3be16f6bead..9f10645ee9778741 100644 +--- a/sysdeps/x86/dl-diagnostics-cpu.c ++++ b/sysdeps/x86/dl-diagnostics-cpu.c +@@ -78,6 +78,8 @@ _dl_diagnostics_cpu (void) + cpu_features->xsave_state_size); + print_cpu_features_value ("xsave_state_full_size", + cpu_features->xsave_state_full_size); ++ print_cpu_features_value ("tlsdesc_state_full_size", ++ _dl_x86_features_tlsdesc_state_size); + print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size); + print_cpu_features_value ("shared_cache_size", + cpu_features->shared_cache_size); +diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h +index cd7bd27cf35959fd..a11d4be30b696ac3 100644 +--- a/sysdeps/x86/include/cpu-features.h ++++ b/sysdeps/x86/include/cpu-features.h +@@ -934,8 +934,6 @@ struct cpu_features + /* The full state size for XSAVE when XSAVEC is disabled by + + GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC +- +- and the AMX state size when XSAVEC is available. + */ + unsigned int xsave_state_full_size; + /* Data cache size for use in memory and string routines, typically +@@ -987,6 +985,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void) + + #define __get_cpu_features() _dl_x86_get_cpu_features() + ++#if IS_IN (rtld) || IS_IN (libc) ++/* XSAVE/XSAVEC state size used by TLS descriptors. Compared to ++ xsave_state_size from struct cpu_features, this includes additional ++ registers. */ ++extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden; ++#endif ++ + #if defined (_LIBC) && !IS_IN (nonlib) + /* Unused for x86. 
*/ + # define INIT_ARCH() +diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c +new file mode 100644 +index 0000000000000000..f0024c143d1a1df5 +--- /dev/null ++++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c +@@ -0,0 +1 @@ ++#include +diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c +new file mode 100644 +index 0000000000000000..f0024c143d1a1df5 +--- /dev/null ++++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c +@@ -0,0 +1 @@ ++#include +diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c +new file mode 100644 +index 0000000000000000..f0024c143d1a1df5 +--- /dev/null ++++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c +@@ -0,0 +1 @@ ++#include +diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h +index 9f02cfc3eb297ed2..44d948696fbe44af 100644 +--- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h ++++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h +@@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic: + # endif + #else + /* Allocate stack space of the required size to save the state. */ +- sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP ++ sub _dl_x86_features_tlsdesc_state_size(%rip), %RSP_LP + #endif + /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9, + r10 and r11. */ diff --git a/glibc-upstream-2.39-173.patch b/glibc-upstream-2.39-173.patch new file mode 100644 index 0000000..c81f9e7 --- /dev/null +++ b/glibc-upstream-2.39-173.patch @@ -0,0 +1,28 @@ +commit 837a36c371f18a3152d032e8060f4e5120c25e2b +Author: Florian Weimer +Date: Mon Mar 31 21:33:18 2025 +0200 + + x86: Link tst-gnu2-tls2-x86-noxsave{,c,xsavec} with libpthread + + This fixes a test build failure on Hurd. + + Fixes commit 145097dff170507fe73190e8e41194f5b5f7e6bf ("x86: Use separate + variable for TLSDESC XSAVE/XSAVEC state size (bug 32810)"). + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit c6e2895695118ab59c7b17feb0fcb75a53e3478c) + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 8819fba1b7164f45..01b0192ddf5e23ca 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -104,6 +104,9 @@ LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy + tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE + tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC + tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC ++$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library) ++$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library) ++$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library) + $(objpfx)tst-gnu2-tls2-x86-noxsave.out \ + $(objpfx)tst-gnu2-tls2-x86-noxsavec.out \ + $(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \ diff --git a/glibc-upstream-2.39-174.patch b/glibc-upstream-2.39-174.patch new file mode 100644 index 0000000..152c6ae --- /dev/null +++ b/glibc-upstream-2.39-174.patch @@ -0,0 +1,202 @@ +commit 0da58e8be087ca7011ec918977c2ffac9034d1d4 +Author: Noah Goldstein +Date: Fri May 24 12:38:51 2024 -0500 + + x86: Add separate non-temporal tunable for memset + + The tuning for non-temporal stores for memset vs memcpy is not always + the same. This includes both the exact value and whether non-temporal + stores are profitable at all for a given arch. + + This patch adds `x86_memset_non_temporal_threshold`.
Currently we + disable non-temporal stores for non-Intel vendors as the only + benchmarks showing its benefit have been on Intel hardware. + Reviewed-by: H.J. Lu + + (cherry picked from commit 46b5e98ef6f1b9f4b53851f152ecb8209064b26c) + +diff --git a/manual/tunables.texi b/manual/tunables.texi +index be97190d67b1c82e..b255a149d10aecf6 100644 +--- a/manual/tunables.texi ++++ b/manual/tunables.texi +@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647) + glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff) + glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff) + glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) ++glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) + glibc.cpu.x86_shstk: + glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff) + glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff) +@@ -485,7 +486,8 @@ thread stack originally backup by Huge Pages to default pages. + @cindex shared_cache_size tunables + @cindex tunables, shared_cache_size + @cindex non_temporal_threshold tunables +-@cindex tunables, non_temporal_threshold ++@cindex memset_non_temporal_threshold tunables ++@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold + + @deftp {Tunable namespace} glibc.cpu + Behavior of @theglibc{} can be tuned to assume specific hardware capabilities +@@ -561,6 +563,18 @@ like memmove and memcpy. + This tunable is specific to i386 and x86-64. + @end deftp + ++@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold ++The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows ++the user to set threshold in bytes for non temporal store in ++memset. Non temporal stores give a hint to the hardware to move data ++directly to memory without displacing other data from the cache. This ++tunable is used by some platforms to determine when to use non ++temporal stores in memset. ++ ++This tunable is specific to i386 and x86-64. ++@end deftp ++ ++ + @deftp Tunable glibc.cpu.x86_rep_movsb_threshold + The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to + set threshold in bytes to start using "rep movsb". The value must be +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index ab73556772209402..83491607c761ccc6 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024; + long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; + long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; + +-/* Threshold to use non temporal store. */ ++/* Threshold to use non temporal store in memmove. */ + long int __x86_shared_non_temporal_threshold attribute_hidden; + ++/* Threshold to use non temporal store in memset. */ ++long int __x86_memset_non_temporal_threshold attribute_hidden; ++ + /* Threshold to use Enhanced REP MOVSB. 
*/ + long int __x86_rep_movsb_threshold attribute_hidden = 2048; + +@@ -77,6 +80,9 @@ init_cacheinfo (void) + __x86_shared_non_temporal_threshold + = cpu_features->non_temporal_threshold; + ++ __x86_memset_non_temporal_threshold ++ = cpu_features->memset_non_temporal_threshold; ++ + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; + __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; + __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold; +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 1f68968a9a457586..0e7c1e0415d4137b 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) + rep_movsb_threshold = 2112; + ++ /* Non-temporal stores in memset have only been tested on Intel hardware. ++ Until we benchmark data on other x86 processor, disable non-temporal ++ stores in memset. */ ++ unsigned long int memset_non_temporal_threshold = SIZE_MAX; ++ if (cpu_features->basic.kind == arch_kind_intel) ++ memset_non_temporal_threshold = non_temporal_threshold; ++ + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of + cases slower than the vectorized path (and for some alignments, + it is really slow, check BZ #30994). */ +@@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + && tunable_size <= maximum_non_temporal_threshold) + non_temporal_threshold = tunable_size; + ++ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); ++ if (tunable_size > minimum_non_temporal_threshold ++ && tunable_size <= maximum_non_temporal_threshold) ++ memset_non_temporal_threshold = tunable_size; ++ + tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); + if (tunable_size > minimum_rep_movsb_threshold) + rep_movsb_threshold = tunable_size; +@@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, + minimum_non_temporal_threshold, + maximum_non_temporal_threshold); ++ TUNABLE_SET_WITH_BOUNDS ( ++ x86_memset_non_temporal_threshold, memset_non_temporal_threshold, ++ minimum_non_temporal_threshold, maximum_non_temporal_threshold); + TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, + minimum_rep_movsb_threshold, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, +@@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + cpu_features->data_cache_size = data; + cpu_features->shared_cache_size = shared; + cpu_features->non_temporal_threshold = non_temporal_threshold; ++ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold; + cpu_features->rep_movsb_threshold = rep_movsb_threshold; + cpu_features->rep_stosb_threshold = rep_stosb_threshold; + cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold; +diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c +index 9f10645ee9778741..8113a93883cfe7a2 100644 +--- a/sysdeps/x86/dl-diagnostics-cpu.c ++++ b/sysdeps/x86/dl-diagnostics-cpu.c +@@ -85,6 +85,8 @@ _dl_diagnostics_cpu (void) + cpu_features->shared_cache_size); + print_cpu_features_value ("non_temporal_threshold", + cpu_features->non_temporal_threshold); ++ print_cpu_features_value ("memset_non_temporal_threshold", ++ cpu_features->memset_non_temporal_threshold); + print_cpu_features_value 
("rep_movsb_threshold", + cpu_features->rep_movsb_threshold); + print_cpu_features_value ("rep_movsb_stop_threshold", +diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list +index 7d82da0dece49c45..a0a12995927dc4f1 100644 +--- a/sysdeps/x86/dl-tunables.list ++++ b/sysdeps/x86/dl-tunables.list +@@ -30,6 +30,9 @@ glibc { + x86_non_temporal_threshold { + type: SIZE_T + } ++ x86_memset_non_temporal_threshold { ++ type: SIZE_T ++ } + x86_rep_movsb_threshold { + type: SIZE_T + # Since there is overhead to set up REP MOVSB operation, REP +diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h +index a11d4be30b696ac3..03c71387dd08982b 100644 +--- a/sysdeps/x86/include/cpu-features.h ++++ b/sysdeps/x86/include/cpu-features.h +@@ -942,8 +942,10 @@ struct cpu_features + /* Shared cache size for use in memory and string routines, typically + L2 or L3 size. */ + unsigned long int shared_cache_size; +- /* Threshold to use non temporal store. */ ++ /* Threshold to use non temporal store in memmove. */ + unsigned long int non_temporal_threshold; ++ /* Threshold to use non temporal store in memset. */ ++ unsigned long int memset_non_temporal_threshold; + /* Threshold to use "rep movsb". */ + unsigned long int rep_movsb_threshold; + /* Threshold to stop using "rep movsb". */ +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 637caadb406b2544..88bf08e4f4a2260e 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -24,9 +24,9 @@ + 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with + 4 VEC stores and store 4 * VEC at a time until done. + 6. On machines ERMS feature, if size is range +- [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold) ++ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold) + then REP STOSB will be used. +- 7. If size >= __x86_shared_non_temporal_threshold, use a ++ 7. If size >= __x86_memset_non_temporal_threshold, use a + non-temporal stores. */ + + #include +@@ -318,7 +318,7 @@ L(return_vzeroupper): + /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in + range for 2-byte jump encoding. */ + L(stosb_local): +- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP ++ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP + jae L(nt_memset) + movzbl %sil, %eax + mov %RDX_LP, %RCX_LP diff --git a/glibc-upstream-2.39-175.patch b/glibc-upstream-2.39-175.patch new file mode 100644 index 0000000..1858d00 --- /dev/null +++ b/glibc-upstream-2.39-175.patch @@ -0,0 +1,43 @@ +commit cc59fa5dbc4db7c6d1fb792c55a5d83c54ee72bf +Author: Joe Damato +Date: Fri Jun 7 23:04:47 2024 +0000 + + x86: Enable non-temporal memset tunable for AMD + + In commit 46b5e98ef6f1 ("x86: Add seperate non-temporal tunable for + memset") a tunable threshold for enabling non-temporal memset was added, + but only for Intel hardware. + + Since that commit, new benchmark results suggest that non-temporal + memset is beneficial on AMD, as well, so allow this tunable to be set + for AMD. + + See: + https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing + which has been updated to include data using different stategies for + large memset on AMD Zen2, Zen3, and Zen4. 
+ + Signed-off-by: Joe Damato + Reviewed-by: Noah Goldstein + (cherry picked from commit bef2a827a55fc759693ccc5b0f614353b8ad712d) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 0e7c1e0415d4137b..9916c5d951361c90 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -986,11 +986,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) + rep_movsb_threshold = 2112; + +- /* Non-temporal stores in memset have only been tested on Intel hardware. +- Until we benchmark data on other x86 processor, disable non-temporal +- stores in memset. */ ++ /* Non-temporal stores are more performant on Intel and AMD hardware above ++ non_temporal_threshold. Enable this for both Intel and AMD hardware. */ + unsigned long int memset_non_temporal_threshold = SIZE_MAX; +- if (cpu_features->basic.kind == arch_kind_intel) ++ if (cpu_features->basic.kind == arch_kind_intel ++ || cpu_features->basic.kind == arch_kind_amd) + memset_non_temporal_threshold = non_temporal_threshold; + + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of diff --git a/glibc-upstream-2.39-176.patch b/glibc-upstream-2.39-176.patch new file mode 100644 index 0000000..130210b --- /dev/null +++ b/glibc-upstream-2.39-176.patch @@ -0,0 +1,37 @@ +commit 38a7632f2d1ec86445904b356c54129591e8519b +Author: Noah Goldstein +Date: Fri Jun 14 13:01:58 2024 -0500 + + x86: Fix value for `x86_memset_non_temporal_threshold` when it is undesirable + + When we don't want to use non-temporal stores for memset, we set + `x86_memset_non_temporal_threshold` to SIZE_MAX. + + The current code, however, was using `maximum_non_temporal_threshold` + as the upper bound which is `SIZE_MAX >> 4` so we ended up with a + value of `0`. + + Fix is to just use `SIZE_MAX` as the upper bound for when setting the + tunable. + Tested-by: Borislav Petkov (AMD) + Reviewed-by: H.J. Lu + + (cherry picked from commit 5b54a33435e5533653a9956728f2de9d16a3b4ee) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 9916c5d951361c90..9b6f68e46de4bdaa 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -1044,9 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, + minimum_non_temporal_threshold, + maximum_non_temporal_threshold); +- TUNABLE_SET_WITH_BOUNDS ( +- x86_memset_non_temporal_threshold, memset_non_temporal_threshold, +- minimum_non_temporal_threshold, maximum_non_temporal_threshold); ++ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, ++ memset_non_temporal_threshold, ++ minimum_non_temporal_threshold, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, + minimum_rep_movsb_threshold, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, diff --git a/glibc-upstream-2.39-177.patch b/glibc-upstream-2.39-177.patch new file mode 100644 index 0000000..b550ed7 --- /dev/null +++ b/glibc-upstream-2.39-177.patch @@ -0,0 +1,121 @@ +commit bde201e92c1e64934f8ffe3e5b7d769100677037 +Author: Noah Goldstein +Date: Mon Jul 15 16:19:17 2024 +0800 + + x86: Disable non-temporal memset on Skylake Server + + The original commit enabling non-temporal memset on Skylake Server had + erroneous benchmarks (actually done on ICX). + + Further benchmarks indicate non-temporal stores may in fact be a + regression on Skylake Server.
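+
+    For reference, deployments that still want non-temporal memset on these
+    machines can override the default at run time, either through the
+    documented threshold tunable (0xc0000 is only an illustrative value,
+    taken from the manual's default) or by clearing the new preference bit
+    wired up in the cpu-tunables.c hunk below:
+
+      GLIBC_TUNABLES=glibc.cpu.x86_memset_non_temporal_threshold=0xc0000
+      GLIBC_TUNABLES=glibc.cpu.hwcaps=-Avoid_Non_Temporal_Memset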
+ + This commit may be over-cautious in some cases, but should avoid any + regressions for 2.40. + + Tested using qemu on all x86_64 cpu archs supported by both qemu + + GLIBC. + + Reviewed-by: DJ Delorie + Reviewed-by: H.J. Lu + (cherry picked from commit 5bcf6265f215326d14dfacdce8532792c2c7f8f8) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 3be69558a4c3aa2d..77b5638daafe9a1e 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -872,11 +872,18 @@ init_cpu_features (struct cpu_features *cpu_features) + + /* Newer Bigcore microarch (larger non-temporal store + threshold). */ +- case INTEL_BIGCORE_SKYLAKE: +- case INTEL_BIGCORE_KABYLAKE: +- case INTEL_BIGCORE_COMETLAKE: + case INTEL_BIGCORE_SKYLAKE_AVX512: + case INTEL_BIGCORE_CANNONLAKE: ++ /* Benchmarks indicate non-temporal memset is not ++ necessarily profitable on SKX (and in some cases much ++ worse). This is likely unique to SKX due to its unique ++ mesh interconnect (not present on ICX or BWD). Disable ++ non-temporal on all Skylake servers. */ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ |= bit_arch_Avoid_Non_Temporal_Memset; ++ case INTEL_BIGCORE_COMETLAKE: ++ case INTEL_BIGCORE_SKYLAKE: ++ case INTEL_BIGCORE_KABYLAKE: + case INTEL_BIGCORE_ICELAKE: + case INTEL_BIGCORE_TIGERLAKE: + case INTEL_BIGCORE_ROCKETLAKE: +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index a72ba61d837c6383..a71772c9c07d01d7 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -245,6 +245,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + (n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24); + } + break; ++ case 25: ++ { ++ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, ++ Avoid_Non_Temporal_Memset, 25); ++ } + case 26: + { + CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 9b6f68e46de4bdaa..66e2b83fea0dc744 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -989,13 +989,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + /* Non-temporal stores are more performant on Intel and AMD hardware above + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ + unsigned long int memset_non_temporal_threshold = SIZE_MAX; +- if (cpu_features->basic.kind == arch_kind_intel +- || cpu_features->basic.kind == arch_kind_amd) +- memset_non_temporal_threshold = non_temporal_threshold; +- +- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of +- cases slower than the vectorized path (and for some alignments, +- it is really slow, check BZ #30994). */ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) ++ && (cpu_features->basic.kind == arch_kind_intel ++ || cpu_features->basic.kind == arch_kind_amd)) ++ memset_non_temporal_threshold = non_temporal_threshold; ++ ++ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of ++ cases slower than the vectorized path (and for some alignments, ++ it is really slow, check BZ #30994). 
*/ + if (cpu_features->basic.kind == arch_kind_amd) + rep_movsb_threshold = non_temporal_threshold; + +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index 85e7f54ec8204328..61bbbc2e8983482e 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512) + BIT (MathVec_Prefer_No_AVX512) + BIT (Prefer_FSRM) + BIT (Avoid_Short_Distance_REP_MOVSB) ++BIT (Avoid_Non_Temporal_Memset) +diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c +index f6a65b88dea6d9dc..bc573c7435130dee 100644 +--- a/sysdeps/x86/tst-hwcap-tunables.c ++++ b/sysdeps/x86/tst-hwcap-tunables.c +@@ -60,7 +60,7 @@ static const struct test_t + /* Disable everything. */ + "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," + "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," +- "-AVX_Fast_Unaligned_Load", ++ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", + test_1, + array_length (test_1) + }, +@@ -68,7 +68,7 @@ static const struct test_t + /* Same as before, but with some empty suboptions. */ + ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," + "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," +- "-ERMS,-AVX_Fast_Unaligned_Load,-,", ++ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", + test_1, + array_length (test_1) + } diff --git a/glibc-upstream-2.39-178.patch b/glibc-upstream-2.39-178.patch new file mode 100644 index 0000000..476939e --- /dev/null +++ b/glibc-upstream-2.39-178.patch @@ -0,0 +1,24 @@ +commit 2be36448c46e9ef712e5f3d5381f38bf3138efdf +Author: Florian Weimer +Date: Fri Aug 2 15:22:14 2024 +0200 + + x86: Tunables may incorrectly set Prefer_PMINUB_for_stringop (bug 32047) + + Fixes commit 5bcf6265f215326d14dfacdce8532792c2c7f8f8 ("x86: + Disable non-temporal memset on Skylake Server"). + + Reviewed-by: Noah Goldstein + (cherry picked from commit 7a630f7d3392ca391a399486ce2846f9e4b4ee63) + +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index a71772c9c07d01d7..a0b31d80f64127c5 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -250,6 +250,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, + Avoid_Non_Temporal_Memset, 25); + } ++ break; + case 26: + { + CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH diff --git a/glibc-upstream-2.39-179.patch b/glibc-upstream-2.39-179.patch new file mode 100644 index 0000000..b07abba --- /dev/null +++ b/glibc-upstream-2.39-179.patch @@ -0,0 +1,90 @@ +commit 65ae73be01604699493d387d8ea6bba41df004ab +Author: Noah Goldstein +Date: Wed Aug 14 14:37:30 2024 +0800 + + x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path + + This is just a refactor and there should be no behavioral change from + this commit. + + The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob + for controlling whether we use non-temporal memset rather than having + extra logic based on vendor. + Reviewed-by: H.J. 
Lu + + (cherry picked from commit b93dddfaf440aa12f45d7c356f6ffe9f27d35577) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 77b5638daafe9a1e..4490c0a782e25d8d 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -758,6 +758,12 @@ init_cpu_features (struct cpu_features *cpu_features) + unsigned int stepping = 0; + enum cpu_features_kind kind; + ++ /* Default is to avoid non-temporal memset for non Intel/AMD hardware. This is ++ because, as of writing this, we only have benchmarks indicating its ++ profitability on Intel/AMD. */ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ |= bit_arch_Avoid_Non_Temporal_Memset; ++ + cpu_features->cachesize_non_temporal_divisor = 4; + #if !HAS_CPUID + if (__get_cpuid_max (0, 0) == 0) +@@ -783,6 +789,11 @@ init_cpu_features (struct cpu_features *cpu_features) + + update_active (cpu_features); + ++ /* Benchmarks indicate non-temporal memset can be profitable on Intel ++ hardware. */ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ &= ~bit_arch_Avoid_Non_Temporal_Memset; ++ + if (family == 0x06) + { + model += extended_model; +@@ -993,6 +1004,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht + + ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx; + ++ /* Benchmarks indicate non-temporal memset can be profitable on AMD ++ hardware. */ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ &= ~bit_arch_Avoid_Non_Temporal_Memset; ++ + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) + { + /* Since the FMA4 bit is in CPUID_INDEX_80000001 and +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 66e2b83fea0dc744..10ad18061a1b47af 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -986,14 +986,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) + rep_movsb_threshold = 2112; + +- /* Non-temporal stores are more performant on Intel and AMD hardware above +- non_temporal_threshold. Enable this for both Intel and AMD hardware. */ +- unsigned long int memset_non_temporal_threshold = SIZE_MAX; +- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) +- && (cpu_features->basic.kind == arch_kind_intel +- || cpu_features->basic.kind == arch_kind_amd)) +- memset_non_temporal_threshold = non_temporal_threshold; +- + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of + cases slower than the vectorized path (and for some alignments, + it is really slow, check BZ #30994). */ + if (cpu_features->basic.kind == arch_kind_amd) + rep_movsb_threshold = non_temporal_threshold; + +@@ -1015,6 +1007,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (tunable_size != 0) + shared = tunable_size; + ++ /* Non-temporal stores are more performant on some hardware above ++ non_temporal_threshold. Currently Prefer_Non_Temporal is set for both ++ Intel and AMD hardware. 
*/ ++ unsigned long int memset_non_temporal_threshold = SIZE_MAX; ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)) ++ memset_non_temporal_threshold = non_temporal_threshold; ++ + tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); + if (tunable_size > minimum_non_temporal_threshold + && tunable_size <= maximum_non_temporal_threshold) diff --git a/glibc-upstream-2.39-180.patch b/glibc-upstream-2.39-180.patch new file mode 100644 index 0000000..2140061 --- /dev/null +++ b/glibc-upstream-2.39-180.patch @@ -0,0 +1,160 @@ +commit 765ff3d0d49f039575dd20961e745fb2876339a7 +Author: Sunil K Pandey +Date: Thu Apr 3 13:00:45 2025 -0700 + + x86: Optimize xstate size calculation + + Scan xstate IDs up to the maximum supported xstate ID. Remove the + separate AMX xstate calculation. Instead, exclude the AMX space from + the start of TILECFG to the end of TILEDATA in xsave_state_size. + + Completed validation on SKL/SKX/SPR/SDE and compared xsave state size + with "ld.so --list-diagnostics" option, no regression. + + Co-Authored-By: H.J. Lu + Reviewed-by: Sunil K Pandey + (cherry picked from commit 70b648855185e967e54668b101d24704c3fb869d) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 4490c0a782e25d8d..dc5cd01d489851b8 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -325,13 +325,8 @@ update_active (struct cpu_features *cpu_features) + /* Check if XSAVEC is available. */ + if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC)) + { +- unsigned int xstate_comp_offsets[32]; +- unsigned int xstate_comp_sizes[32]; +-#ifdef __x86_64__ +- unsigned int xstate_amx_comp_offsets[32]; +- unsigned int xstate_amx_comp_sizes[32]; +- unsigned int amx_ecx; +-#endif ++ unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1]; ++ unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1]; + unsigned int i; + + xstate_comp_offsets[0] = 0; +@@ -339,39 +334,16 @@ update_active (struct cpu_features *cpu_features) + xstate_comp_offsets[2] = 576; + xstate_comp_sizes[0] = 160; + xstate_comp_sizes[1] = 256; +-#ifdef __x86_64__ +- xstate_amx_comp_offsets[0] = 0; +- xstate_amx_comp_offsets[1] = 160; +- xstate_amx_comp_offsets[2] = 576; +- xstate_amx_comp_sizes[0] = 160; +- xstate_amx_comp_sizes[1] = 256; +-#endif + +- for (i = 2; i < 32; i++) ++ for (i = 2; i <= X86_XSTATE_MAX_ID; i++) + { + if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0) + { + __cpuid_count (0xd, i, eax, ebx, ecx, edx); +-#ifdef __x86_64__ +- /* Include this in xsave_state_full_size. */ +- amx_ecx = ecx; +- xstate_amx_comp_sizes[i] = eax; +- if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0) +- { +- /* Exclude this from xsave_state_size. */ +- ecx = 0; +- xstate_comp_sizes[i] = 0; +- } +- else +-#endif +- xstate_comp_sizes[i] = eax; ++ xstate_comp_sizes[i] = eax; + } + else + { +-#ifdef __x86_64__ +- amx_ecx = 0; +- xstate_amx_comp_sizes[i] = 0; +-#endif + ecx = 0; + xstate_comp_sizes[i] = 0; + } +@@ -380,42 +352,32 @@ update_active (struct cpu_features *cpu_features) + { + xstate_comp_offsets[i] + = (xstate_comp_offsets[i - 1] +- + xstate_comp_sizes[i -1]); ++ + xstate_comp_sizes[i - 1]); + if ((ecx & (1 << 1)) != 0) + xstate_comp_offsets[i] + = ALIGN_UP (xstate_comp_offsets[i], 64); +-#ifdef __x86_64__ +- xstate_amx_comp_offsets[i] +- = (xstate_amx_comp_offsets[i - 1] +- + xstate_amx_comp_sizes[i - 1]); +- if ((amx_ecx & (1 << 1)) != 0) +- xstate_amx_comp_offsets[i] +- = ALIGN_UP (xstate_amx_comp_offsets[i], +- 64); +-#endif + } + } + + /* Use XSAVEC. 
*/ + unsigned int size +- = xstate_comp_offsets[31] + xstate_comp_sizes[31]; ++ = (xstate_comp_offsets[X86_XSTATE_MAX_ID] ++ + xstate_comp_sizes[X86_XSTATE_MAX_ID]); + if (size) + { ++ size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, ++ 64); + #ifdef __x86_64__ +- unsigned int amx_size +- = (xstate_amx_comp_offsets[31] +- + xstate_amx_comp_sizes[31]); +- amx_size +- = ALIGN_UP ((amx_size +- + TLSDESC_CALL_REGISTER_SAVE_AREA), +- 64); +- /* Set TLSDESC state size to the compact AMX +- state size for XSAVEC. */ +- _dl_x86_features_tlsdesc_state_size = amx_size; ++ _dl_x86_features_tlsdesc_state_size = size; ++ /* Exclude the AMX space from the start of TILECFG ++ space to the end of TILEDATA space. If CPU ++ doesn't support AMX, TILECFG offset is the same ++ as TILEDATA + 1 offset. Otherwise, they are ++ multiples of 64. */ ++ size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1] ++ - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]); + #endif +- cpu_features->xsave_state_size +- = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, +- 64); ++ cpu_features->xsave_state_size = size; + CPU_FEATURE_SET (cpu_features, XSAVEC); + } + } +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index 7359149e17ccf341..1d6cabd816bf84cc 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -102,6 +102,9 @@ + | (1 << X86_XSTATE_ZMM_ID) \ + | (1 << X86_XSTATE_APX_F_ID)) + ++/* The maximum supported xstate ID. */ ++# define X86_XSTATE_MAX_ID X86_XSTATE_APX_F_ID ++ + /* AMX state mask. */ + # define AMX_STATE_SAVE_MASK \ + ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID)) +@@ -123,6 +126,9 @@ + | (1 << X86_XSTATE_K_ID) \ + | (1 << X86_XSTATE_ZMM_H_ID)) + ++/* The maximum supported xstate ID. */ ++# define X86_XSTATE_MAX_ID X86_XSTATE_ZMM_H_ID ++ + /* States to be included in xsave_state_size. */ + # define FULL_STATE_SAVE_MASK STATE_SAVE_MASK + #endif diff --git a/glibc-upstream-2.39-181.patch b/glibc-upstream-2.39-181.patch new file mode 100644 index 0000000..5842775 --- /dev/null +++ b/glibc-upstream-2.39-181.patch @@ -0,0 +1,76 @@ +commit 7620d98186fc23e216773dbec5dc5da1fd8daf0f +Author: Sunil K Pandey +Date: Thu Apr 3 18:14:20 2025 -0700 + + x86: Add ARL/PTL/CWF model detection support + + - Add ARROWLAKE model detection. + - Add PANTHERLAKE model detection. + - Add CLEARWATERFOREST model detection. + + Intel® Architecture Instruction Set Extensions Programming Reference + https://cdrdv2.intel.com/v1/dl/getContent/671368 Section 1.2. + + No regression, validated model detection on SDE. + + Reviewed-by: H.J. Lu + (cherry picked from commit e53eb952b970ac94c97d74fb447418fb327ca096) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index dc5cd01d489851b8..fb94477dad08ab02 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -512,6 +512,7 @@ enum + INTEL_ATOM_GOLDMONT, + INTEL_ATOM_GOLDMONT_PLUS, + INTEL_ATOM_SIERRAFOREST, ++ INTEL_ATOM_CLEARWATERFOREST, + INTEL_ATOM_GRANDRIDGE, + INTEL_ATOM_TREMONT, + +@@ -539,6 +540,7 @@ enum + INTEL_BIGCORE_METEORLAKE, + INTEL_BIGCORE_LUNARLAKE, + INTEL_BIGCORE_ARROWLAKE, ++ INTEL_BIGCORE_PANTHERLAKE, + INTEL_BIGCORE_GRANITERAPIDS, + + /* Mixed (bigcore + atom SOC). 
*/ +@@ -584,6 +586,8 @@ intel_get_fam6_microarch (unsigned int model, + return INTEL_ATOM_GOLDMONT_PLUS; + case 0xAF: + return INTEL_ATOM_SIERRAFOREST; ++ case 0xDD: ++ return INTEL_ATOM_CLEARWATERFOREST; + case 0xB6: + return INTEL_ATOM_GRANDRIDGE; + case 0x86: +@@ -691,8 +695,12 @@ intel_get_fam6_microarch (unsigned int model, + return INTEL_BIGCORE_METEORLAKE; + case 0xbd: + return INTEL_BIGCORE_LUNARLAKE; ++ case 0xb5: ++ case 0xc5: + case 0xc6: + return INTEL_BIGCORE_ARROWLAKE; ++ case 0xCC: ++ return INTEL_BIGCORE_PANTHERLAKE; + case 0xAD: + case 0xAE: + return INTEL_BIGCORE_GRANITERAPIDS; +@@ -808,6 +816,7 @@ init_cpu_features (struct cpu_features *cpu_features) + Default tuned atom microarch. + case INTEL_ATOM_SIERRAFOREST: + case INTEL_ATOM_GRANDRIDGE: ++ case INTEL_ATOM_CLEARWATERFOREST: + */ + + /* Bigcore/Default Tuning. */ +@@ -864,6 +873,7 @@ init_cpu_features (struct cpu_features *cpu_features) + case INTEL_BIGCORE_METEORLAKE: + case INTEL_BIGCORE_LUNARLAKE: + case INTEL_BIGCORE_ARROWLAKE: ++ case INTEL_BIGCORE_PANTHERLAKE: + case INTEL_BIGCORE_SAPPHIRERAPIDS: + case INTEL_BIGCORE_EMERALDRAPIDS: + case INTEL_BIGCORE_GRANITERAPIDS: diff --git a/glibc-upstream-2.39-182.patch b/glibc-upstream-2.39-182.patch new file mode 100644 index 0000000..cdec3f3 --- /dev/null +++ b/glibc-upstream-2.39-182.patch @@ -0,0 +1,352 @@ +commit e09436c2cb5b6453d922c5af6a30e2de0255cd61 +Author: Sunil K Pandey +Date: Fri Apr 11 08:52:52 2025 -0700 + + x86: Handle unknown Intel processor with default tuning + + Enable default tuning for unknown Intel processor. + + Tested on x86, no regression. + + Co-Authored-By: H.J. Lu + Reviewed-by: H.J. Lu + (cherry picked from commit 9f0deff558d1d6b08c425c157f50de85013ada9c) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index fb94477dad08ab02..6d2e660b4b20ff06 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -502,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load + "Incorrect index_arch_Fast_Unaligned_Load"); + + +-/* Intel Family-6 microarch list. */ +-enum ++/* Intel microarch list. */ ++enum intel_microarch + { + /* Atom processors. */ + INTEL_ATOM_BONNELL, +@@ -555,7 +555,7 @@ enum + INTEL_UNKNOWN, + }; + +-static unsigned int ++static enum intel_microarch + intel_get_fam6_microarch (unsigned int model, + __attribute__ ((unused)) unsigned int stepping) + { +@@ -764,134 +764,20 @@ init_cpu_features (struct cpu_features *cpu_features) + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + &= ~bit_arch_Avoid_Non_Temporal_Memset; + ++ enum intel_microarch microarch = INTEL_UNKNOWN; + if (family == 0x06) + { + model += extended_model; +- unsigned int microarch +- = intel_get_fam6_microarch (model, stepping); ++ microarch = intel_get_fam6_microarch (model, stepping); + ++ /* Disable TSX on some processors to avoid TSX on kernels that ++ weren't updated with the latest microcode package (which ++ disables broken feature by default). */ + switch (microarch) + { +- /* Atom / KNL tuning. */ +- case INTEL_ATOM_BONNELL: +- /* BSF is slow on Bonnell. */ +- cpu_features->preferred[index_arch_Slow_BSF] +- |= bit_arch_Slow_BSF; +- break; +- +- /* Unaligned load versions are faster than SSSE3 +- on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ +- case INTEL_ATOM_AIRMONT: +- case INTEL_ATOM_SILVERMONT: +- case INTEL_ATOM_GOLDMONT: +- case INTEL_ATOM_GOLDMONT_PLUS: +- +- /* Knights Landing. Enable Silvermont optimizations. 
*/ +- case INTEL_KNIGHTS_LANDING: +- +- cpu_features->preferred[index_arch_Fast_Unaligned_Load] +- |= (bit_arch_Fast_Unaligned_Load +- | bit_arch_Fast_Unaligned_Copy +- | bit_arch_Prefer_PMINUB_for_stringop +- | bit_arch_Slow_SSE4_2); +- break; +- +- case INTEL_ATOM_TREMONT: +- /* Enable rep string instructions, unaligned load, unaligned +- copy, pminub and avoid SSE 4.2 on Tremont. */ +- cpu_features->preferred[index_arch_Fast_Rep_String] +- |= (bit_arch_Fast_Rep_String +- | bit_arch_Fast_Unaligned_Load +- | bit_arch_Fast_Unaligned_Copy +- | bit_arch_Prefer_PMINUB_for_stringop +- | bit_arch_Slow_SSE4_2); +- break; +- +- /* +- Default tuned Knights microarch. +- case INTEL_KNIGHTS_MILL: +- */ +- +- /* +- Default tuned atom microarch. +- case INTEL_ATOM_SIERRAFOREST: +- case INTEL_ATOM_GRANDRIDGE: +- case INTEL_ATOM_CLEARWATERFOREST: +- */ +- +- /* Bigcore/Default Tuning. */ + default: +- default_tuning: +- /* Unknown family 0x06 processors. Assuming this is one +- of Core i3/i5/i7 processors if AVX is available. */ +- if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) +- break; +- +- enable_modern_features: +- /* Rep string instructions, unaligned load, unaligned copy, +- and pminub are fast on Intel Core i3, i5 and i7. */ +- cpu_features->preferred[index_arch_Fast_Rep_String] +- |= (bit_arch_Fast_Rep_String +- | bit_arch_Fast_Unaligned_Load +- | bit_arch_Fast_Unaligned_Copy +- | bit_arch_Prefer_PMINUB_for_stringop); + break; + +- case INTEL_BIGCORE_NEHALEM: +- case INTEL_BIGCORE_WESTMERE: +- /* Older CPUs prefer non-temporal stores at lower threshold. */ +- cpu_features->cachesize_non_temporal_divisor = 8; +- goto enable_modern_features; +- +- /* Older Bigcore microarch (smaller non-temporal store +- threshold). */ +- case INTEL_BIGCORE_SANDYBRIDGE: +- case INTEL_BIGCORE_IVYBRIDGE: +- case INTEL_BIGCORE_HASWELL: +- case INTEL_BIGCORE_BROADWELL: +- cpu_features->cachesize_non_temporal_divisor = 8; +- goto default_tuning; +- +- /* Newer Bigcore microarch (larger non-temporal store +- threshold). */ +- case INTEL_BIGCORE_SKYLAKE_AVX512: +- case INTEL_BIGCORE_CANNONLAKE: +- /* Benchmarks indicate non-temporal memset is not +- necessarily profitable on SKX (and in some cases much +- worse). This is likely unique to SKX due its it unique +- mesh interconnect (not present on ICX or BWD). Disable +- non-temporal on all Skylake servers. */ +- cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] +- |= bit_arch_Avoid_Non_Temporal_Memset; +- case INTEL_BIGCORE_COMETLAKE: +- case INTEL_BIGCORE_SKYLAKE: +- case INTEL_BIGCORE_KABYLAKE: +- case INTEL_BIGCORE_ICELAKE: +- case INTEL_BIGCORE_TIGERLAKE: +- case INTEL_BIGCORE_ROCKETLAKE: +- case INTEL_BIGCORE_RAPTORLAKE: +- case INTEL_BIGCORE_METEORLAKE: +- case INTEL_BIGCORE_LUNARLAKE: +- case INTEL_BIGCORE_ARROWLAKE: +- case INTEL_BIGCORE_PANTHERLAKE: +- case INTEL_BIGCORE_SAPPHIRERAPIDS: +- case INTEL_BIGCORE_EMERALDRAPIDS: +- case INTEL_BIGCORE_GRANITERAPIDS: +- cpu_features->cachesize_non_temporal_divisor = 2; +- goto default_tuning; +- +- /* Default tuned Mixed (bigcore + atom SOC). */ +- case INTEL_MIXED_LAKEFIELD: +- case INTEL_MIXED_ALDERLAKE: +- cpu_features->cachesize_non_temporal_divisor = 2; +- goto default_tuning; +- } +- +- /* Disable TSX on some processors to avoid TSX on kernels that +- weren't updated with the latest microcode package (which +- disables broken feature by default). */ +- switch (microarch) +- { + case INTEL_BIGCORE_SKYLAKE_AVX512: + /* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. 
*/ + if (stepping <= 5) +@@ -900,38 +786,152 @@ init_cpu_features (struct cpu_features *cpu_features) + + case INTEL_BIGCORE_KABYLAKE: + /* NB: Although the errata documents that for model == 0x8e +- (kabylake skylake client), only 0xb stepping or lower are +- impacted, the intention of the errata was to disable TSX on +- all client processors on all steppings. Include 0xc +- stepping which is an Intel Core i7-8665U, a client mobile +- processor. */ ++ (kabylake skylake client), only 0xb stepping or lower are ++ impacted, the intention of the errata was to disable TSX on ++ all client processors on all steppings. Include 0xc ++ stepping which is an Intel Core i7-8665U, a client mobile ++ processor. */ + if (stepping > 0xc) + break; + /* Fall through. */ + case INTEL_BIGCORE_SKYLAKE: +- /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for +- processors listed in: +- +-https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html +- */ +- disable_tsx: +- CPU_FEATURE_UNSET (cpu_features, HLE); +- CPU_FEATURE_UNSET (cpu_features, RTM); +- CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); +- break; ++ /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for ++ processors listed in: ++ ++ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html ++ */ ++disable_tsx: ++ CPU_FEATURE_UNSET (cpu_features, HLE); ++ CPU_FEATURE_UNSET (cpu_features, RTM); ++ CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); ++ break; + + case INTEL_BIGCORE_HASWELL: +- /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working +- TSX. Haswell also include other model numbers that have +- working TSX. */ +- if (model == 0x3f && stepping >= 4) ++ /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working ++ TSX. Haswell also includes other model numbers that have ++ working TSX. */ ++ if (model == 0x3f && stepping >= 4) + break; + +- CPU_FEATURE_UNSET (cpu_features, RTM); +- break; ++ CPU_FEATURE_UNSET (cpu_features, RTM); ++ break; + } + } + ++ switch (microarch) ++ { ++ /* Atom / KNL tuning. */ ++ case INTEL_ATOM_BONNELL: ++ /* BSF is slow on Bonnell. */ ++ cpu_features->preferred[index_arch_Slow_BSF] ++ |= bit_arch_Slow_BSF; ++ break; ++ ++ /* Unaligned load versions are faster than SSSE3 ++ on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ ++ case INTEL_ATOM_AIRMONT: ++ case INTEL_ATOM_SILVERMONT: ++ case INTEL_ATOM_GOLDMONT: ++ case INTEL_ATOM_GOLDMONT_PLUS: ++ ++ /* Knights Landing. Enable Silvermont optimizations. */ ++ case INTEL_KNIGHTS_LANDING: ++ ++ cpu_features->preferred[index_arch_Fast_Unaligned_Load] ++ |= (bit_arch_Fast_Unaligned_Load ++ | bit_arch_Fast_Unaligned_Copy ++ | bit_arch_Prefer_PMINUB_for_stringop ++ | bit_arch_Slow_SSE4_2); ++ break; ++ ++ case INTEL_ATOM_TREMONT: ++ /* Enable rep string instructions, unaligned load, unaligned ++ copy, pminub and avoid SSE 4.2 on Tremont. */ ++ cpu_features->preferred[index_arch_Fast_Rep_String] ++ |= (bit_arch_Fast_Rep_String ++ | bit_arch_Fast_Unaligned_Load ++ | bit_arch_Fast_Unaligned_Copy ++ | bit_arch_Prefer_PMINUB_for_stringop ++ | bit_arch_Slow_SSE4_2); ++ break; ++ ++ /* ++ Default tuned Knights microarch. ++ case INTEL_KNIGHTS_MILL: ++ */ ++ ++ /* ++ Default tuned atom microarch. ++ case INTEL_ATOM_SIERRAFOREST: ++ case INTEL_ATOM_GRANDRIDGE: ++ case INTEL_ATOM_CLEARWATERFOREST: ++ */ ++ ++ /* Bigcore/Default Tuning. */ ++ default: ++ default_tuning: ++ /* Unknown Intel processors. Assuming this is one of Core ++ i3/i5/i7 processors if AVX is available. 
*/ ++ if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) ++ break; ++ ++ enable_modern_features: ++ /* Rep string instructions, unaligned load, unaligned copy, ++ and pminub are fast on Intel Core i3, i5 and i7. */ ++ cpu_features->preferred[index_arch_Fast_Rep_String] ++ |= (bit_arch_Fast_Rep_String ++ | bit_arch_Fast_Unaligned_Load ++ | bit_arch_Fast_Unaligned_Copy ++ | bit_arch_Prefer_PMINUB_for_stringop); ++ break; ++ ++ case INTEL_BIGCORE_NEHALEM: ++ case INTEL_BIGCORE_WESTMERE: ++ /* Older CPUs prefer non-temporal stores at lower threshold. */ ++ cpu_features->cachesize_non_temporal_divisor = 8; ++ goto enable_modern_features; ++ ++ /* Older Bigcore microarch (smaller non-temporal store ++ threshold). */ ++ case INTEL_BIGCORE_SANDYBRIDGE: ++ case INTEL_BIGCORE_IVYBRIDGE: ++ case INTEL_BIGCORE_HASWELL: ++ case INTEL_BIGCORE_BROADWELL: ++ cpu_features->cachesize_non_temporal_divisor = 8; ++ goto default_tuning; ++ ++ /* Newer Bigcore microarch (larger non-temporal store ++ threshold). */ ++ case INTEL_BIGCORE_SKYLAKE_AVX512: ++ case INTEL_BIGCORE_CANNONLAKE: ++ /* Benchmarks indicate non-temporal memset is not ++ necessarily profitable on SKX (and in some cases much ++ worse). This is likely unique to SKX due to its unique ++ mesh interconnect (not present on ICX or BWD). Disable ++ non-temporal on all Skylake servers. */ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ |= bit_arch_Avoid_Non_Temporal_Memset; ++ /* fallthrough */ ++ case INTEL_BIGCORE_COMETLAKE: ++ case INTEL_BIGCORE_SKYLAKE: ++ case INTEL_BIGCORE_KABYLAKE: ++ case INTEL_BIGCORE_ICELAKE: ++ case INTEL_BIGCORE_TIGERLAKE: ++ case INTEL_BIGCORE_ROCKETLAKE: ++ case INTEL_BIGCORE_RAPTORLAKE: ++ case INTEL_BIGCORE_METEORLAKE: ++ case INTEL_BIGCORE_LUNARLAKE: ++ case INTEL_BIGCORE_ARROWLAKE: ++ case INTEL_BIGCORE_PANTHERLAKE: ++ case INTEL_BIGCORE_SAPPHIRERAPIDS: ++ case INTEL_BIGCORE_EMERALDRAPIDS: ++ case INTEL_BIGCORE_GRANITERAPIDS: ++ /* Default tuned Mixed (bigcore + atom SOC). */ ++ case INTEL_MIXED_LAKEFIELD: ++ case INTEL_MIXED_ALDERLAKE: ++ cpu_features->cachesize_non_temporal_divisor = 2; ++ goto default_tuning; ++ } + + /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER + if AVX512ER is available. Don't use AVX512 to avoid lower CPU diff --git a/glibc-upstream-2.39-183.patch b/glibc-upstream-2.39-183.patch new file mode 100644 index 0000000..d8d52fa --- /dev/null +++ b/glibc-upstream-2.39-183.patch @@ -0,0 +1,49 @@ +commit 3463100f2d47f2897a24ba8023a5c7aaf2d26550 +Author: H.J. Lu +Date: Sat Apr 12 08:37:29 2025 -0700 + + x86: Detect Intel Diamond Rapids + + Detect Intel Diamond Rapids and tune it similar to Intel Granite Rapids. + + Signed-off-by: H.J. Lu + Reviewed-by: Sunil K Pandey + (cherry picked from commit de14f1959ee5f9b845a7cae43bee03068b8136f0) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 6d2e660b4b20ff06..47dc3b1510a68fc9 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -542,6 +542,7 @@ enum intel_microarch + INTEL_BIGCORE_ARROWLAKE, + INTEL_BIGCORE_PANTHERLAKE, + INTEL_BIGCORE_GRANITERAPIDS, ++ INTEL_BIGCORE_DIAMONDRAPIDS, + + /* Mixed (bigcore + atom SOC). 
*/
+   INTEL_MIXED_LAKEFIELD,
+@@ -817,6 +818,16 @@ disable_tsx:
+              break;
+            }
+        }
++      else if (family == 19)
++        switch (model)
++          {
++          case 0x01:
++            microarch = INTEL_BIGCORE_DIAMONDRAPIDS;
++            break;
++
++          default:
++            break;
++          }
+ 
+       switch (microarch)
+         {
+@@ -926,6 +937,7 @@ disable_tsx:
+         case INTEL_BIGCORE_SAPPHIRERAPIDS:
+         case INTEL_BIGCORE_EMERALDRAPIDS:
+         case INTEL_BIGCORE_GRANITERAPIDS:
++        case INTEL_BIGCORE_DIAMONDRAPIDS:
+           /* Default tuned Mixed (bigcore + atom SOC).  */
+         case INTEL_MIXED_LAKEFIELD:
+         case INTEL_MIXED_ALDERLAKE:
diff --git a/glibc-upstream-2.39-184.patch b/glibc-upstream-2.39-184.patch
new file mode 100644
index 0000000..269e62b
--- /dev/null
+++ b/glibc-upstream-2.39-184.patch
@@ -0,0 +1,444 @@
+commit 2451ef5c4a92e774c56111b3708eede7f98fe940
+Author: Frank Barrus
+Date:   Wed Dec 4 07:55:02 2024 -0500
+
+    pthreads NPTL: lost wakeup fix 2
+
+    This fixes the lost wakeup (from a bug in signal stealing) with a change
+    in the usage of g_signals[] in the condition variable internal state.
+    It also completely eliminates the concept and handling of signal stealing,
+    as well as the need for signalers to block to wait for waiters to wake
+    up every time there is a G1/G2 switch.  This greatly reduces the average
+    and maximum latency for pthread_cond_signal.
+
+    The g_signals[] field now contains a signal count that is relative to
+    the current g1_start value.  Since it is a 32-bit field, and the LSB is
+    still reserved (though not currently used anymore), it has a 31-bit value
+    that corresponds to the low 31 bits of the sequence number in g1_start.
+    (since g1_start also has an LSB flag, this means bits 31:1 in g_signals
+    correspond to bits 31:1 in g1_start, plus the current signal count)
+
+    By making the signal count relative to g1_start, there is no longer
+    any ambiguity or A/B/A issue, and thus any checks before blocking,
+    including the futex call itself, are guaranteed not to block if the G1/G2
+    switch occurs, even if the signal count remains the same.  This allows
+    initially safely blocking in G2 until the switch to G1 occurs, and
+    then transitioning from G1 to a new G1 or G2, and always being able to
+    distinguish the state change.  This removes the race condition and A/B/A
+    problems that otherwise occurred if a late (pre-empted) waiter were to
+    resume just as the futex call attempted to block on g_signal since
+    otherwise there was no last opportunity to re-check things like whether
+    the current G1 group was already closed.
+
+    By fixing these issues, the signal stealing code can be eliminated,
+    since there is no concept of signal stealing anymore.  The code to block
+    for all waiters to exit g_refs can also be removed, since any waiters
+    that are still in the g_refs region can be guaranteed to safely wake
+    up and exit.  If there are still any left at this time, they are all
+    sent one final futex wakeup to ensure that they are not blocked any
+    longer, but there is no need for the signaller to block and wait for
+    them to wake up and exit the g_refs region.
+
+    The signal count is then effectively "zeroed" but since it is now
+    relative to g1_start, this is done by advancing it to a new value that
+    can be observed by any pending blocking waiters.  Any late waiters can
+    always tell the difference, and can thus just cleanly exit if they are
+    in a stale G1 or G2.
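+
+    To make the encoding concrete, the availability check can be modeled
+    by a small self-contained C program (an illustrative sketch only; the
+    names mirror the fields used in the diff below, but this toy is not
+    the glibc source):
+
+        #include <stdint.h>
+        #include <stdio.h>
+
+        /* Bits 31:1 of g_signals mirror bits 31:1 of g1_start plus the
+           pending signal count; the LSB of each field stays reserved.  */
+        static unsigned int
+        signals_available (uint64_t g1_start, unsigned int g,
+                           unsigned int g_signals)
+        {
+          /* lowseq is the group's "zero point".  The LSB of g1_start names
+             the current G2 slot; G2 can never have pending signals.  */
+          unsigned int lowseq = ((g1_start & 1) == g)
+            ? g_signals : (unsigned int) (g1_start & ~1UL);
+          /* Each signal adds 2 because the LSB is reserved; unsigned
+             subtraction keeps the comparison safe across wrap-around.  */
+          return (g_signals - lowseq) >> 1;
+        }
+
+        int
+        main (void)
+        {
+          uint64_t g1_start = 0x41;  /* G1 begins at seq 0x20; G2 is slot 1.  */
+          unsigned int sigs = 0x40 + 2 * 3;  /* Three signals since the switch.  */
+          printf ("%u signals available\n", signals_available (g1_start, 0, sigs));
+          return 0;
+        }
+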
They can never steal a signal from the current + G1 if they are not in the current G1, since the signal value that has + to match in the cmpxchg has the low 31 bits of the g1_start value + contained in it, and that's first checked, and then it won't match if + there's a G1/G2 change. + + Note: the 31-bit sequence number used in g_signals is designed to + handle wrap-around when checking the signal count, but if the entire + 31-bit wraparound (2 billion signals) occurs while there is still a + late waiter that has not yet resumed, and it happens to then match + the current g1_start low bits, and the pre-emption occurs after the + normal "closed group" checks (which are 64-bit) but then hits the + futex syscall and signal consuming code, then an A/B/A issue could + still result and cause an incorrect assumption about whether it + should block. This particular scenario seems unlikely in practice. + Note that once awake from the futex, the waiter would notice the + closed group before consuming the signal (since that's still a 64-bit + check that would not be aliased in the wrap-around in g_signals), + so the biggest impact would be blocking on the futex until the next + full wakeup from a G1/G2 switch. + + Signed-off-by: Frank Barrus + Reviewed-by: Carlos O'Donell + (cherry picked from commit 1db84775f831a1494993ce9c118deaf9537cc50a) + +diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c +index 3487557bb86c8186..4855b8899f887ad0 100644 +--- a/nptl/pthread_cond_common.c ++++ b/nptl/pthread_cond_common.c +@@ -201,7 +201,6 @@ static bool __attribute__ ((unused)) + __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + unsigned int *g1index, int private) + { +- const unsigned int maxspin = 0; + unsigned int g1 = *g1index; + + /* If there is no waiter in G2, we don't do anything. The expression may +@@ -222,84 +221,46 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + * New waiters arriving concurrently with the group switching will all go + into G2 until we atomically make the switch. Waiters existing in G2 + are not affected. +- * Waiters in G1 will be closed out immediately by setting a flag in +- __g_signals, which will prevent waiters from blocking using a futex on +- __g_signals and also notifies them that the group is closed. As a +- result, they will eventually remove their group reference, allowing us +- to close switch group roles. */ +- +- /* First, set the closed flag on __g_signals. This tells waiters that are +- about to wait that they shouldn't do that anymore. This basically +- serves as an advance notification of the upcoming change to __g1_start; +- waiters interpret it as if __g1_start was larger than their waiter +- sequence position. This allows us to change __g1_start after waiting +- for all existing waiters with group references to leave, which in turn +- makes recovery after stealing a signal simpler because it then can be +- skipped if __g1_start indicates that the group is closed (otherwise, +- we would have to recover always because waiters don't know how big their +- groups are). Relaxed MO is fine. */ +- atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1); +- +- /* Wait until there are no group references anymore. 
The fetch-or operation +- injects us into the modification order of __g_refs; release MO ensures +- that waiters incrementing __g_refs after our fetch-or see the previous +- changes to __g_signals and to __g1_start that had to happen before we can +- switch this G1 and alias with an older group (we have two groups, so +- aliasing requires switching group roles twice). Note that nobody else +- can have set the wake-request flag, so we do not have to act upon it. +- +- Also note that it is harmless if older waiters or waiters from this G1 +- get a group reference after we have quiesced the group because it will +- remain closed for them either because of the closed flag in __g_signals +- or the later update to __g1_start. New waiters will never arrive here +- but instead continue to go into the still current G2. */ +- unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0); +- while ((r >> 1) > 0) +- { +- for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--) +- { +- /* TODO Back off. */ +- r = atomic_load_relaxed (cond->__data.__g_refs + g1); +- } +- if ((r >> 1) > 0) +- { +- /* There is still a waiter after spinning. Set the wake-request +- flag and block. Relaxed MO is fine because this is just about +- this futex word. +- +- Update r to include the set wake-request flag so that the upcoming +- futex_wait only blocks if the flag is still set (otherwise, we'd +- violate the basic client-side futex protocol). */ +- r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1) | 1; +- +- if ((r >> 1) > 0) +- futex_wait_simple (cond->__data.__g_refs + g1, r, private); +- /* Reload here so we eventually see the most recent value even if we +- do not spin. */ +- r = atomic_load_relaxed (cond->__data.__g_refs + g1); +- } +- } +- /* Acquire MO so that we synchronize with the release operation that waiters +- use to decrement __g_refs and thus happen after the waiters we waited +- for. */ +- atomic_thread_fence_acquire (); ++ * Waiters in G1 will be closed out immediately by the advancing of ++ __g_signals to the next "lowseq" (low 31 bits of the new g1_start), ++ which will prevent waiters from blocking using a futex on ++ __g_signals since it provides enough signals for all possible ++ remaining waiters. As a result, they can each consume a signal ++ and they will eventually remove their group reference. */ + + /* Update __g1_start, which finishes closing this group. The value we add + will never be negative because old_orig_size can only be zero when we + switch groups the first time after a condvar was initialized, in which +- case G1 will be at index 1 and we will add a value of 1. See above for +- why this takes place after waiting for quiescence of the group. ++ case G1 will be at index 1 and we will add a value of 1. + Relaxed MO is fine because the change comes with no additional + constraints that others would have to observe. */ + __condvar_add_g1_start_relaxed (cond, + (old_orig_size << 1) + (g1 == 1 ? 1 : - 1)); + +- /* Now reopen the group, thus enabling waiters to again block using the +- futex controlled by __g_signals. Release MO so that observers that see +- no signals (and thus can block) also see the write __g1_start and thus +- that this is now a new group (see __pthread_cond_wait_common for the +- matching acquire MO loads). 
*/
+-  atomic_store_release (cond->__data.__g_signals + g1, 0);
++  unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
++
++  /* If any waiters still hold group references (and thus could be blocked),
++     then wake them all up now and prevent any running ones from blocking.
++     This is effectively a catch-all for any possible current or future
++     bugs that can allow the group size to reach 0 before all G1 waiters
++     have been awakened or at least given signals to consume, or any
++     other case that can leave blocked (or about to block) older waiters.  */
++  if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0)
++    {
++      /* First advance signals to the end of the group (i.e. enough signals
++         for the entire G1 group) to ensure that waiters which have not
++         yet blocked in the futex will not block.
++         Note that in the vast majority of cases, this should never
++         actually be necessary, since __g_signals will have enough
++         signals for the remaining g_refs waiters.  As an optimization,
++         we could check this first before proceeding, although that
++         could still leave the potential for futex lost wakeup bugs
++         if the signal count was non-zero but the futex wakeup
++         was somehow lost.  */
++      atomic_store_release (cond->__data.__g_signals + g1, lowseq);
++
++      futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
++    }
+ 
+   /* At this point, the old G1 is now a valid new G2 (but not in use yet).
+      No old waiter can neither grab a signal nor acquire a reference without
+@@ -311,6 +272,10 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
+   g1 ^= 1;
+   *g1index ^= 1;
+ 
++  /* Now advance the new G1 g_signals to the new lowseq, giving it
++     an effective signal count of 0 to start.  */
++  atomic_store_release (cond->__data.__g_signals + g1, lowseq);
++
+   /* These values are just observed by signalers, and thus protected by the
+      lock.  */
+   unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
+diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
+index 66786c7b9022b26c..3d290e39c8ccebb7 100644
+--- a/nptl/pthread_cond_wait.c
++++ b/nptl/pthread_cond_wait.c
+@@ -238,9 +238,7 @@ __condvar_cleanup_waiting (void *arg)
+      signaled), and a reference count.
+ 
+      The group reference count is used to maintain the number of waiters that
+-     are using the group's futex.  Before a group can change its role, the
+-     reference count must show that no waiters are using the futex anymore; this
+-     prevents ABA issues on the futex word.
++     are using the group's futex.
+ 
+      To represent which intervals in the waiter sequence the groups cover (and
+      thus also which group slot contains G1 or G2), we use a 64b counter to
+@@ -300,11 +298,12 @@ __condvar_cleanup_waiting (void *arg)
+        last reference.
+      * Reference count used by waiters concurrently with signalers that have
+        acquired the condvar-internal lock.
+-   __g_signals: The number of signals that can still be consumed.
++   __g_signals: The number of signals that can still be consumed, relative to
++        the current g1_start.  (i.e. bits 31 to 1 of __g_signals are bits
++        31 to 1 of g1_start with the signal count added)
+      * Used as a futex word by waiters.  Used concurrently by waiters and
+        signalers.
+-     * LSB is true iff this group has been completely signaled (i.e., it is
+-       closed).
++     * LSB is currently reserved and 0.
+    __g_size: Waiters remaining in this group (i.e., which have not been
+      signaled yet.
+ * Accessed by signalers and waiters that cancel waiting (both do so only +@@ -328,18 +327,6 @@ __condvar_cleanup_waiting (void *arg) + sufficient because if a waiter can see a sufficiently large value, it could + have also consume a signal in the waiters group. + +- Waiters try to grab a signal from __g_signals without holding a reference +- count, which can lead to stealing a signal from a more recent group after +- their own group was already closed. They cannot always detect whether they +- in fact did because they do not know when they stole, but they can +- conservatively add a signal back to the group they stole from; if they +- did so unnecessarily, all that happens is a spurious wake-up. To make this +- even less likely, __g1_start contains the index of the current g2 too, +- which allows waiters to check if there aliasing on the group slots; if +- there wasn't, they didn't steal from the current G1, which means that the +- G1 they stole from must have been already closed and they do not need to +- fix anything. +- + It is essential that the last field in pthread_cond_t is __g_signals[1]: + The previous condvar used a pointer-sized field in pthread_cond_t, so a + PTHREAD_COND_INITIALIZER from that condvar implementation might only +@@ -435,6 +422,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + { + while (1) + { ++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); ++ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; ++ + /* Spin-wait first. + Note that spinning first without checking whether a timeout + passed might lead to what looks like a spurious wake-up even +@@ -446,35 +436,45 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + having to compare against the current time seems to be the right + choice from a performance perspective for most use cases. */ + unsigned int spin = maxspin; +- while (signals == 0 && spin > 0) ++ while (spin > 0 && ((int)(signals - lowseq) < 2)) + { + /* Check that we are not spinning on a group that's already + closed. */ +- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) +- goto done; ++ if (seq < (g1_start >> 1)) ++ break; + + /* TODO Back off. */ + + /* Reload signals. See above for MO. */ + signals = atomic_load_acquire (cond->__data.__g_signals + g); ++ g1_start = __condvar_load_g1_start_relaxed (cond); ++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + spin--; + } + +- /* If our group will be closed as indicated by the flag on signals, +- don't bother grabbing a signal. */ +- if (signals & 1) +- goto done; +- +- /* If there is an available signal, don't block. */ +- if (signals != 0) ++ if (seq < (g1_start >> 1)) ++ { ++ /* If the group is closed already, ++ then this waiter originally had enough extra signals to ++ consume, up until the time its group was closed. */ ++ goto done; ++ } ++ ++ /* If there is an available signal, don't block. ++ If __g1_start has advanced at all, then we must be in G1 ++ by now, perhaps in the process of switching back to an older ++ G2, but in either case we're allowed to consume the available ++ signal and should not block anymore. */ ++ if ((int)(signals - lowseq) >= 2) + break; + + /* No signals available after spinning, so prepare to block. + We first acquire a group reference and use acquire MO for that so + that we synchronize with the dummy read-modify-write in + __condvar_quiesce_and_switch_g1 if we read from that. 
In turn, +- in this case this will make us see the closed flag on __g_signals +- that designates a concurrent attempt to reuse the group's slot. ++ in this case this will make us see the advancement of __g_signals ++ to the upcoming new g1_start that occurs with a concurrent ++ attempt to reuse the group's slot. + We use acquire MO for the __g_signals check to make the + __g1_start check work (see spinning above). + Note that the group reference acquisition will not mask the +@@ -482,15 +482,24 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + an atomic read-modify-write operation and thus extend the release + sequence. */ + atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); +- if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0) +- || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))) ++ signals = atomic_load_acquire (cond->__data.__g_signals + g); ++ g1_start = __condvar_load_g1_start_relaxed (cond); ++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; ++ ++ if (seq < (g1_start >> 1)) + { +- /* Our group is closed. Wake up any signalers that might be +- waiting. */ ++ /* group is closed already, so don't block */ + __condvar_dec_grefs (cond, g, private); + goto done; + } + ++ if ((int)(signals - lowseq) >= 2) ++ { ++ /* a signal showed up or G1/G2 switched after we grabbed the refcount */ ++ __condvar_dec_grefs (cond, g, private); ++ break; ++ } ++ + // Now block. + struct _pthread_cleanup_buffer buffer; + struct _condvar_cleanup_buffer cbuffer; +@@ -501,7 +510,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); + + err = __futex_abstimed_wait_cancelable64 ( +- cond->__data.__g_signals + g, 0, clockid, abstime, private); ++ cond->__data.__g_signals + g, signals, clockid, abstime, private); + + __pthread_cleanup_pop (&buffer, 0); + +@@ -524,6 +533,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + signals = atomic_load_acquire (cond->__data.__g_signals + g); + } + ++ if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) ++ goto done; + } + /* Try to grab a signal. Use acquire MO so that we see an up-to-date value + of __g1_start below (see spinning above for a similar case). In +@@ -532,69 +543,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, + &signals, signals - 2)); + +- /* We consumed a signal but we could have consumed from a more recent group +- that aliased with ours due to being in the same group slot. If this +- might be the case our group must be closed as visible through +- __g1_start. */ +- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); +- if (seq < (g1_start >> 1)) +- { +- /* We potentially stole a signal from a more recent group but we do not +- know which group we really consumed from. +- We do not care about groups older than current G1 because they are +- closed; we could have stolen from these, but then we just add a +- spurious wake-up for the current groups. +- We will never steal a signal from current G2 that was really intended +- for G2 because G2 never receives signals (until it becomes G1). We +- could have stolen a signal from G2 that was conservatively added by a +- previous waiter that also thought it stole a signal -- but given that +- that signal was added unnecessarily, it's not a problem if we steal +- it. 
+- Thus, the remaining case is that we could have stolen from the current +- G1, where "current" means the __g1_start value we observed. However, +- if the current G1 does not have the same slot index as we do, we did +- not steal from it and do not need to undo that. This is the reason +- for putting a bit with G2's index into__g1_start as well. */ +- if (((g1_start & 1) ^ 1) == g) +- { +- /* We have to conservatively undo our potential mistake of stealing +- a signal. We can stop trying to do that when the current G1 +- changes because other spinning waiters will notice this too and +- __condvar_quiesce_and_switch_g1 has checked that there are no +- futex waiters anymore before switching G1. +- Relaxed MO is fine for the __g1_start load because we need to +- merely be able to observe this fact and not have to observe +- something else as well. +- ??? Would it help to spin for a little while to see whether the +- current G1 gets closed? This might be worthwhile if the group is +- small or close to being closed. */ +- unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g); +- while (__condvar_load_g1_start_relaxed (cond) == g1_start) +- { +- /* Try to add a signal. We don't need to acquire the lock +- because at worst we can cause a spurious wake-up. If the +- group is in the process of being closed (LSB is true), this +- has an effect similar to us adding a signal. */ +- if (((s & 1) != 0) +- || atomic_compare_exchange_weak_relaxed +- (cond->__data.__g_signals + g, &s, s + 2)) +- { +- /* If we added a signal, we also need to add a wake-up on +- the futex. We also need to do that if we skipped adding +- a signal because the group is being closed because +- while __condvar_quiesce_and_switch_g1 could have closed +- the group, it might still be waiting for futex waiters to +- leave (and one of those waiters might be the one we stole +- the signal from, which cause it to block using the +- futex). */ +- futex_wake (cond->__data.__g_signals + g, 1, private); +- break; +- } +- /* TODO Back off. */ +- } +- } +- } +- + done: + + /* Confirm that we have been woken. We do that before acquiring the mutex diff --git a/glibc-upstream-2.39-185.patch b/glibc-upstream-2.39-185.patch new file mode 100644 index 0000000..bfea4d7 --- /dev/null +++ b/glibc-upstream-2.39-185.patch @@ -0,0 +1,134 @@ +commit ea13a35e37932cabeef7d7b018aaef1136287a5e +Author: Malte Skarupke +Date: Wed Dec 4 07:55:22 2024 -0500 + + nptl: Update comments and indentation for new condvar implementation + + Some comments were wrong after the most recent commit. This fixes that. + + Also fixing indentation where it was using spaces instead of tabs. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit 0cc973160c23bb67f895bc887dd6942d29f8fee3) + +diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c +index 4855b8899f887ad0..3475d1512354be3c 100644 +--- a/nptl/pthread_cond_common.c ++++ b/nptl/pthread_cond_common.c +@@ -221,8 +221,9 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + * New waiters arriving concurrently with the group switching will all go + into G2 until we atomically make the switch. Waiters existing in G2 + are not affected. +- * Waiters in G1 will be closed out immediately by the advancing of +- __g_signals to the next "lowseq" (low 31 bits of the new g1_start), ++ * Waiters in G1 have already received a signal and been woken. 
If they ++ haven't woken yet, they will be closed out immediately by the advancing ++ of __g_signals to the next "lowseq" (low 31 bits of the new g1_start), + which will prevent waiters from blocking using a futex on + __g_signals since it provides enough signals for all possible + remaining waiters. As a result, they can each consume a signal +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 3d290e39c8ccebb7..ad2cee7d59ddc093 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -249,7 +249,7 @@ __condvar_cleanup_waiting (void *arg) + figure out whether they are in a group that has already been completely + signaled (i.e., if the current G1 starts at a later position that the + waiter's position). Waiters cannot determine whether they are currently +- in G2 or G1 -- but they do not have too because all they are interested in ++ in G2 or G1 -- but they do not have to because all they are interested in + is whether there are available signals, and they always start in G2 (whose + group slot they know because of the bit in the waiter sequence. Signalers + will simply fill the right group until it is completely signaled and can +@@ -412,7 +412,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + } + + /* Now wait until a signal is available in our group or it is closed. +- Acquire MO so that if we observe a value of zero written after group ++ Acquire MO so that if we observe (signals == lowseq) after group + switching in __condvar_quiesce_and_switch_g1, we synchronize with that + store and will see the prior update of __g1_start done while switching + groups too. */ +@@ -422,8 +422,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + { + while (1) + { +- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); +- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; ++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); ++ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + + /* Spin-wait first. + Note that spinning first without checking whether a timeout +@@ -447,21 +447,21 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + + /* Reload signals. See above for MO. */ + signals = atomic_load_acquire (cond->__data.__g_signals + g); +- g1_start = __condvar_load_g1_start_relaxed (cond); +- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; ++ g1_start = __condvar_load_g1_start_relaxed (cond); ++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + spin--; + } + +- if (seq < (g1_start >> 1)) ++ if (seq < (g1_start >> 1)) + { +- /* If the group is closed already, ++ /* If the group is closed already, + then this waiter originally had enough extra signals to + consume, up until the time its group was closed. */ + goto done; +- } ++ } + + /* If there is an available signal, don't block. +- If __g1_start has advanced at all, then we must be in G1 ++ If __g1_start has advanced at all, then we must be in G1 + by now, perhaps in the process of switching back to an older + G2, but in either case we're allowed to consume the available + signal and should not block anymore. */ +@@ -483,22 +483,23 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + sequence. */ + atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); + signals = atomic_load_acquire (cond->__data.__g_signals + g); +- g1_start = __condvar_load_g1_start_relaxed (cond); +- lowseq = (g1_start & 1) == g ? 
signals : g1_start & ~1U; ++ g1_start = __condvar_load_g1_start_relaxed (cond); ++ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + +- if (seq < (g1_start >> 1)) ++ if (seq < (g1_start >> 1)) + { +- /* group is closed already, so don't block */ ++ /* group is closed already, so don't block */ + __condvar_dec_grefs (cond, g, private); + goto done; + } + + if ((int)(signals - lowseq) >= 2) + { +- /* a signal showed up or G1/G2 switched after we grabbed the refcount */ ++ /* a signal showed up or G1/G2 switched after we grabbed the ++ refcount */ + __condvar_dec_grefs (cond, g, private); + break; +- } ++ } + + // Now block. + struct _pthread_cleanup_buffer buffer; +@@ -536,10 +537,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) + goto done; + } +- /* Try to grab a signal. Use acquire MO so that we see an up-to-date value +- of __g1_start below (see spinning above for a similar case). In +- particular, if we steal from a more recent group, we will also see a +- more recent __g1_start below. */ ++ /* Try to grab a signal. See above for MO. (if we do another loop ++ iteration we need to see the correct value of g1_start) */ + while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, + &signals, signals - 2)); + diff --git a/glibc-upstream-2.39-186.patch b/glibc-upstream-2.39-186.patch new file mode 100644 index 0000000..e66963e --- /dev/null +++ b/glibc-upstream-2.39-186.patch @@ -0,0 +1,68 @@ +commit d0da34ad302df61c4e4c3030845cbe9b986196bf +Author: Malte Skarupke +Date: Wed Dec 4 07:55:50 2024 -0500 + + nptl: Remove unnecessary catch-all-wake in condvar group switch + + This wake is unnecessary. We only switch groups after every sleeper in a group + has been woken. Sure, they may take a while to actually wake up and may still + hold a reference, but waking them a second time doesn't speed that up. Instead + this just makes the code more complicated and may hide problems. + + In particular this safety wake wouldn't even have helped with the bug that was + fixed by Barrus' patch: The bug there was that pthread_cond_signal would not + switch g1 when it should, so we wouldn't even have entered this code path. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit b42cc6af11062c260c7dfa91f1c89891366fed3e) + +diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c +index 3475d1512354be3c..30b8eee149cee195 100644 +--- a/nptl/pthread_cond_common.c ++++ b/nptl/pthread_cond_common.c +@@ -221,13 +221,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + * New waiters arriving concurrently with the group switching will all go + into G2 until we atomically make the switch. Waiters existing in G2 + are not affected. +- * Waiters in G1 have already received a signal and been woken. If they +- haven't woken yet, they will be closed out immediately by the advancing +- of __g_signals to the next "lowseq" (low 31 bits of the new g1_start), +- which will prevent waiters from blocking using a futex on +- __g_signals since it provides enough signals for all possible +- remaining waiters. As a result, they can each consume a signal +- and they will eventually remove their group reference. */ ++ * Waiters in G1 have already received a signal and been woken. */ + + /* Update __g1_start, which finishes closing this group. 
The value we add + will never be negative because old_orig_size can only be zero when we +@@ -240,29 +234,6 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + + unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U; + +- /* If any waiters still hold group references (and thus could be blocked), +- then wake them all up now and prevent any running ones from blocking. +- This is effectively a catch-all for any possible current or future +- bugs that can allow the group size to reach 0 before all G1 waiters +- have been awakened or at least given signals to consume, or any +- other case that can leave blocked (or about to block) older waiters.. */ +- if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0) +- { +- /* First advance signals to the end of the group (i.e. enough signals +- for the entire G1 group) to ensure that waiters which have not +- yet blocked in the futex will not block. +- Note that in the vast majority of cases, this should never +- actually be necessary, since __g_signals will have enough +- signals for the remaining g_refs waiters. As an optimization, +- we could check this first before proceeding, although that +- could still leave the potential for futex lost wakeup bugs +- if the signal count was non-zero but the futex wakeup +- was somehow lost. */ +- atomic_store_release (cond->__data.__g_signals + g1, lowseq); +- +- futex_wake (cond->__data.__g_signals + g1, INT_MAX, private); +- } +- + /* At this point, the old G1 is now a valid new G2 (but not in use yet). + No old waiter can neither grab a signal nor acquire a reference without + noticing that __g1_start is larger. diff --git a/glibc-upstream-2.39-187.patch b/glibc-upstream-2.39-187.patch new file mode 100644 index 0000000..27bb735 --- /dev/null +++ b/glibc-upstream-2.39-187.patch @@ -0,0 +1,108 @@ +commit 6f5ba03968339122e11d5185fed5ff6f99ee4f28 +Author: Malte Skarupke +Date: Wed Dec 4 07:56:13 2024 -0500 + + nptl: Remove unnecessary quadruple check in pthread_cond_wait + + pthread_cond_wait was checking whether it was in a closed group no less than + four times. Checking once is enough. Here are the four checks: + + 1. While spin-waiting. This was dead code: maxspin is set to 0 and has been + for years. + 2. Before deciding to go to sleep, and before incrementing grefs: I kept this + 3. After incrementing grefs. There is no reason to think that the group would + close while we do an atomic increment. Obviously it could close at any + point, but that doesn't mean we have to recheck after every step. This + check was equally good as check 2, except it has to do more work. + 4. When we find ourselves in a group that has a signal. We only get here after + we check that we're not in a closed group. There is no need to check again. + The check would only have helped in cases where the compare_exchange in the + next line would also have failed. Relying on the compare_exchange is fine. + + Removing the duplicate checks clarifies the code. 
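+
+    For reference, the wait loop that survives this change looks roughly
+    as follows (abridged from the resulting code in the diff below; only
+    the control flow is shown and the elided parts are marked):
+
+        while (1)
+          {
+            uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
+            unsigned int lowseq
+              = (g1_start & 1) == g ? signals : g1_start & ~1U;
+
+            if (seq < (g1_start >> 1))
+              goto done;              /* Check 2: group already closed.  */
+
+            if ((int)(signals - lowseq) >= 2)
+              break;                  /* A signal is available.  */
+
+            /* ... block on the futex with expected value "signals"; the
+               consuming cmpxchg after the loop detects any racing G1/G2
+               switch, which is what makes checks 3 and 4 redundant.  */
+            err = __futex_abstimed_wait_cancelable64
+              (cond->__data.__g_signals + g, signals, clockid, abstime, private);
+            /* ... */
+            signals = atomic_load_acquire (cond->__data.__g_signals + g);
+          }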
+ + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit 4f7b051f8ee3feff1b53b27a906f245afaa9cee1) + +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index ad2cee7d59ddc093..cfdd13bb87c72fa5 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -366,7 +366,6 @@ static __always_inline int + __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + clockid_t clockid, const struct __timespec64 *abstime) + { +- const int maxspin = 0; + int err; + int result = 0; + +@@ -425,33 +424,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + +- /* Spin-wait first. +- Note that spinning first without checking whether a timeout +- passed might lead to what looks like a spurious wake-up even +- though we should return ETIMEDOUT (e.g., if the caller provides +- an absolute timeout that is clearly in the past). However, +- (1) spurious wake-ups are allowed, (2) it seems unlikely that a +- user will (ab)use pthread_cond_wait as a check for whether a +- point in time is in the past, and (3) spinning first without +- having to compare against the current time seems to be the right +- choice from a performance perspective for most use cases. */ +- unsigned int spin = maxspin; +- while (spin > 0 && ((int)(signals - lowseq) < 2)) +- { +- /* Check that we are not spinning on a group that's already +- closed. */ +- if (seq < (g1_start >> 1)) +- break; +- +- /* TODO Back off. */ +- +- /* Reload signals. See above for MO. */ +- signals = atomic_load_acquire (cond->__data.__g_signals + g); +- g1_start = __condvar_load_g1_start_relaxed (cond); +- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; +- spin--; +- } +- + if (seq < (g1_start >> 1)) + { + /* If the group is closed already, +@@ -482,24 +454,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + an atomic read-modify-write operation and thus extend the release + sequence. */ + atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); +- signals = atomic_load_acquire (cond->__data.__g_signals + g); +- g1_start = __condvar_load_g1_start_relaxed (cond); +- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; +- +- if (seq < (g1_start >> 1)) +- { +- /* group is closed already, so don't block */ +- __condvar_dec_grefs (cond, g, private); +- goto done; +- } +- +- if ((int)(signals - lowseq) >= 2) +- { +- /* a signal showed up or G1/G2 switched after we grabbed the +- refcount */ +- __condvar_dec_grefs (cond, g, private); +- break; +- } + + // Now block. + struct _pthread_cleanup_buffer buffer; +@@ -533,9 +487,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + /* Reload signals. See above for MO. */ + signals = atomic_load_acquire (cond->__data.__g_signals + g); + } +- +- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) +- goto done; + } + /* Try to grab a signal. See above for MO. 
(if we do another loop + iteration we need to see the correct value of g1_start) */ diff --git a/glibc-upstream-2.39-188.patch b/glibc-upstream-2.39-188.patch new file mode 100644 index 0000000..37b779f --- /dev/null +++ b/glibc-upstream-2.39-188.patch @@ -0,0 +1,175 @@ +commit fc2a25417df71a1ef3613216269227b7721b21c8 +Author: Malte Skarupke +Date: Wed Dec 4 07:56:38 2024 -0500 + + nptl: Remove g_refs from condition variables + + This variable used to be needed to wait in group switching until all sleepers + have confirmed that they have woken. This is no longer needed. Nothing waits + on this variable so there is no need to track how many threads are currently + asleep in each group. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit c36fc50781995e6758cae2b6927839d0157f213c) + +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index cfdd13bb87c72fa5..411fc0380b78f482 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -143,23 +143,6 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, + } + } + +-/* Wake up any signalers that might be waiting. */ +-static void +-__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private) +-{ +- /* Release MO to synchronize-with the acquire load in +- __condvar_quiesce_and_switch_g1. */ +- if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3) +- { +- /* Clear the wake-up request flag before waking up. We do not need more +- than relaxed MO and it doesn't matter if we apply this for an aliased +- group because we wake all futex waiters right after clearing the +- flag. */ +- atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1); +- futex_wake (cond->__data.__g_refs + g, INT_MAX, private); +- } +-} +- + /* Clean-up for cancellation of waiters waiting for normal signals. We cancel + our registration as a waiter, confirm we have woken up, and re-acquire the + mutex. */ +@@ -171,8 +154,6 @@ __condvar_cleanup_waiting (void *arg) + pthread_cond_t *cond = cbuffer->cond; + unsigned g = cbuffer->wseq & 1; + +- __condvar_dec_grefs (cond, g, cbuffer->private); +- + __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private); + /* FIXME With the current cancellation implementation, it is possible that + a thread is cancelled after it has returned from a syscall. This could +@@ -327,15 +308,6 @@ __condvar_cleanup_waiting (void *arg) + sufficient because if a waiter can see a sufficiently large value, it could + have also consume a signal in the waiters group. + +- It is essential that the last field in pthread_cond_t is __g_signals[1]: +- The previous condvar used a pointer-sized field in pthread_cond_t, so a +- PTHREAD_COND_INITIALIZER from that condvar implementation might only +- initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes +- in total instead of the 48 we need). __g_signals[1] is not accessed before +- the first group switch (G2 starts at index 0), which will set its value to +- zero after a harmless fetch-or whose return value is ignored. This +- effectively completes initialization. +- + + Limitations: + * This condvar isn't designed to allow for more than +@@ -440,21 +412,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + if ((int)(signals - lowseq) >= 2) + break; + +- /* No signals available after spinning, so prepare to block. 
+- We first acquire a group reference and use acquire MO for that so +- that we synchronize with the dummy read-modify-write in +- __condvar_quiesce_and_switch_g1 if we read from that. In turn, +- in this case this will make us see the advancement of __g_signals +- to the upcoming new g1_start that occurs with a concurrent +- attempt to reuse the group's slot. +- We use acquire MO for the __g_signals check to make the +- __g1_start check work (see spinning above). +- Note that the group reference acquisition will not mask the +- release MO when decrementing the reference count because we use +- an atomic read-modify-write operation and thus extend the release +- sequence. */ +- atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); +- + // Now block. + struct _pthread_cleanup_buffer buffer; + struct _condvar_cleanup_buffer cbuffer; +@@ -471,18 +428,11 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + + if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) + { +- __condvar_dec_grefs (cond, g, private); +- /* If we timed out, we effectively cancel waiting. Note that +- we have decremented __g_refs before cancellation, so that a +- deadlock between waiting for quiescence of our group in +- __condvar_quiesce_and_switch_g1 and us trying to acquire +- the lock during cancellation is not possible. */ ++ /* If we timed out, we effectively cancel waiting. */ + __condvar_cancel_waiting (cond, seq, g, private); + result = err; + goto done; + } +- else +- __condvar_dec_grefs (cond, g, private); + + /* Reload signals. See above for MO. */ + signals = atomic_load_acquire (cond->__data.__g_signals + g); +diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c +index 1336e9c79d97ca70..bdcb45c53674a5fd 100644 +--- a/nptl/tst-cond22.c ++++ b/nptl/tst-cond22.c +@@ -106,13 +106,13 @@ do_test (void) + status = 1; + } + +- printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n", ++ printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n", + c.__data.__wseq.__value32.__high, + c.__data.__wseq.__value32.__low, + c.__data.__g1_start.__value32.__high, + c.__data.__g1_start.__value32.__low, +- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0], +- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1], ++ c.__data.__g_signals[0], c.__data.__g_size[0], ++ c.__data.__g_signals[1], c.__data.__g_size[1], + c.__data.__g1_orig_size, c.__data.__wrefs); + + if (pthread_create (&th, NULL, tf, (void *) 1l) != 0) +@@ -152,13 +152,13 @@ do_test (void) + status = 1; + } + +- printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n", ++ printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n", + c.__data.__wseq.__value32.__high, + c.__data.__wseq.__value32.__low, + c.__data.__g1_start.__value32.__high, + c.__data.__g1_start.__value32.__low, +- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0], +- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1], ++ c.__data.__g_signals[0], c.__data.__g_size[0], ++ c.__data.__g_signals[1], c.__data.__g_size[1], + c.__data.__g1_orig_size, c.__data.__wrefs); + + return status; +diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h +index df54eef6f71f2cee..a3d482f80f7d0d35 100644 +--- a/sysdeps/nptl/bits/thread-shared-types.h ++++ b/sysdeps/nptl/bits/thread-shared-types.h +@@ -95,8 +95,7 @@ struct __pthread_cond_s + { + __atomic_wide_counter __wseq; + __atomic_wide_counter __g1_start; +- unsigned int __g_refs[2] __LOCK_ALIGNMENT; 
+- unsigned int __g_size[2]; ++ unsigned int __g_size[2] __LOCK_ALIGNMENT; + unsigned int __g1_orig_size; + unsigned int __wrefs; + unsigned int __g_signals[2]; +diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h +index 3d4f4a756c66750d..9af75d6eae090218 100644 +--- a/sysdeps/nptl/pthread.h ++++ b/sysdeps/nptl/pthread.h +@@ -152,7 +152,7 @@ enum + + + /* Conditional variable handling. */ +-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, {0, 0}, 0, 0, {0, 0} } } ++#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } } + + + /* Cleanup buffers */ diff --git a/glibc-upstream-2.39-189.patch b/glibc-upstream-2.39-189.patch new file mode 100644 index 0000000..0111578 --- /dev/null +++ b/glibc-upstream-2.39-189.patch @@ -0,0 +1,92 @@ +commit 582c99b2c04d6da95743b36bf8e5c54dec178274 +Author: Malte Skarupke +Date: Wed Dec 4 08:03:44 2024 -0500 + + nptl: Use a single loop in pthread_cond_wait instaed of a nested loop + + The loop was a little more complicated than necessary. There was only one + break statement out of the inner loop, and the outer loop was nearly empty. + So just remove the outer loop, moving its code to the one break statement in + the inner loop. This allows us to replace all gotos with break statements. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit 929a4764ac90382616b6a21f099192b2475da674) + +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 411fc0380b78f482..683cb2b133f2163f 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -382,17 +382,15 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + return err; + } + +- /* Now wait until a signal is available in our group or it is closed. +- Acquire MO so that if we observe (signals == lowseq) after group +- switching in __condvar_quiesce_and_switch_g1, we synchronize with that +- store and will see the prior update of __g1_start done while switching +- groups too. */ +- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); +- +- do +- { ++ + while (1) + { ++ /* Now wait until a signal is available in our group or it is closed. ++ Acquire MO so that if we observe (signals == lowseq) after group ++ switching in __condvar_quiesce_and_switch_g1, we synchronize with that ++ store and will see the prior update of __g1_start done while switching ++ groups too. */ ++ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + +@@ -401,7 +399,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + /* If the group is closed already, + then this waiter originally had enough extra signals to + consume, up until the time its group was closed. */ +- goto done; ++ break; + } + + /* If there is an available signal, don't block. +@@ -410,7 +408,16 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + G2, but in either case we're allowed to consume the available + signal and should not block anymore. */ + if ((int)(signals - lowseq) >= 2) +- break; ++ { ++ /* Try to grab a signal. See above for MO. (if we do another loop ++ iteration we need to see the correct value of g1_start) */ ++ if (atomic_compare_exchange_weak_acquire ( ++ cond->__data.__g_signals + g, ++ &signals, signals - 2)) ++ break; ++ else ++ continue; ++ } + + // Now block. 
+ struct _pthread_cleanup_buffer buffer; +@@ -431,19 +438,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + /* If we timed out, we effectively cancel waiting. */ + __condvar_cancel_waiting (cond, seq, g, private); + result = err; +- goto done; ++ break; + } +- +- /* Reload signals. See above for MO. */ +- signals = atomic_load_acquire (cond->__data.__g_signals + g); + } +- } +- /* Try to grab a signal. See above for MO. (if we do another loop +- iteration we need to see the correct value of g1_start) */ +- while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, +- &signals, signals - 2)); +- +- done: + + /* Confirm that we have been woken. We do that before acquiring the mutex + to allow for execution of pthread_cond_destroy while having acquired the diff --git a/glibc-upstream-2.39-190.patch b/glibc-upstream-2.39-190.patch new file mode 100644 index 0000000..ee0e91d --- /dev/null +++ b/glibc-upstream-2.39-190.patch @@ -0,0 +1,139 @@ +commit 2fdc0afd0763377dc51870449b476f77baeb8aa0 +Author: Malte Skarupke +Date: Wed Dec 4 08:04:10 2024 -0500 + + nptl: Fix indentation + + In my previous change I turned a nested loop into a simple loop. I'm doing + the resulting indentation changes in a separate commit to make the diff on + the previous commit easier to review. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit ee6c14ed59d480720721aaacc5fb03213dc153da) + +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 683cb2b133f2163f..7fc9dadf15aa9bc6 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -383,65 +383,65 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + } + + +- while (1) +- { +- /* Now wait until a signal is available in our group or it is closed. +- Acquire MO so that if we observe (signals == lowseq) after group +- switching in __condvar_quiesce_and_switch_g1, we synchronize with that +- store and will see the prior update of __g1_start done while switching +- groups too. */ +- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); +- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); +- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; +- +- if (seq < (g1_start >> 1)) +- { +- /* If the group is closed already, +- then this waiter originally had enough extra signals to +- consume, up until the time its group was closed. */ +- break; +- } +- +- /* If there is an available signal, don't block. +- If __g1_start has advanced at all, then we must be in G1 +- by now, perhaps in the process of switching back to an older +- G2, but in either case we're allowed to consume the available +- signal and should not block anymore. */ +- if ((int)(signals - lowseq) >= 2) +- { +- /* Try to grab a signal. See above for MO. (if we do another loop +- iteration we need to see the correct value of g1_start) */ +- if (atomic_compare_exchange_weak_acquire ( +- cond->__data.__g_signals + g, ++ while (1) ++ { ++ /* Now wait until a signal is available in our group or it is closed. ++ Acquire MO so that if we observe (signals == lowseq) after group ++ switching in __condvar_quiesce_and_switch_g1, we synchronize with that ++ store and will see the prior update of __g1_start done while switching ++ groups too. */ ++ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); ++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); ++ unsigned int lowseq = (g1_start & 1) == g ? 
signals : g1_start & ~1U; ++ ++ if (seq < (g1_start >> 1)) ++ { ++ /* If the group is closed already, ++ then this waiter originally had enough extra signals to ++ consume, up until the time its group was closed. */ ++ break; ++ } ++ ++ /* If there is an available signal, don't block. ++ If __g1_start has advanced at all, then we must be in G1 ++ by now, perhaps in the process of switching back to an older ++ G2, but in either case we're allowed to consume the available ++ signal and should not block anymore. */ ++ if ((int)(signals - lowseq) >= 2) ++ { ++ /* Try to grab a signal. See above for MO. (if we do another loop ++ iteration we need to see the correct value of g1_start) */ ++ if (atomic_compare_exchange_weak_acquire ( ++ cond->__data.__g_signals + g, + &signals, signals - 2)) +- break; +- else +- continue; +- } +- +- // Now block. +- struct _pthread_cleanup_buffer buffer; +- struct _condvar_cleanup_buffer cbuffer; +- cbuffer.wseq = wseq; +- cbuffer.cond = cond; +- cbuffer.mutex = mutex; +- cbuffer.private = private; +- __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); +- +- err = __futex_abstimed_wait_cancelable64 ( +- cond->__data.__g_signals + g, signals, clockid, abstime, private); +- +- __pthread_cleanup_pop (&buffer, 0); +- +- if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) +- { +- /* If we timed out, we effectively cancel waiting. */ +- __condvar_cancel_waiting (cond, seq, g, private); +- result = err; + break; +- } ++ else ++ continue; + } + ++ // Now block. ++ struct _pthread_cleanup_buffer buffer; ++ struct _condvar_cleanup_buffer cbuffer; ++ cbuffer.wseq = wseq; ++ cbuffer.cond = cond; ++ cbuffer.mutex = mutex; ++ cbuffer.private = private; ++ __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); ++ ++ err = __futex_abstimed_wait_cancelable64 ( ++ cond->__data.__g_signals + g, signals, clockid, abstime, private); ++ ++ __pthread_cleanup_pop (&buffer, 0); ++ ++ if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) ++ { ++ /* If we timed out, we effectively cancel waiting. */ ++ __condvar_cancel_waiting (cond, seq, g, private); ++ result = err; ++ break; ++ } ++ } ++ + /* Confirm that we have been woken. We do that before acquiring the mutex + to allow for execution of pthread_cond_destroy while having acquired the + mutex. */ diff --git a/glibc-upstream-2.39-191.patch b/glibc-upstream-2.39-191.patch new file mode 100644 index 0000000..6a4cdea --- /dev/null +++ b/glibc-upstream-2.39-191.patch @@ -0,0 +1,148 @@ +commit ac5da3c0e4ed9cbdbb88928c5c9886d02a6dd7ed +Author: Malte Skarupke +Date: Wed Dec 4 08:04:54 2024 -0500 + + nptl: rename __condvar_quiesce_and_switch_g1 + + This function no longer waits for threads to leave g1, so rename it to + __condvar_switch_g1 + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit 4b79e27a5073c02f6bff9aa8f4791230a0ab1867) + +diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c +index aada91639a346f19..38bba17bfc8a0083 100644 +--- a/nptl/pthread_cond_broadcast.c ++++ b/nptl/pthread_cond_broadcast.c +@@ -60,7 +60,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) + cond->__data.__g_size[g1] << 1); + cond->__data.__g_size[g1] = 0; + +- /* We need to wake G1 waiters before we quiesce G1 below. */ ++ /* We need to wake G1 waiters before we switch G1 below. */ + /* TODO Only set it if there are indeed futex waiters. 
We could + also try to move this out of the critical section in cases when + G2 is empty (and we don't need to quiesce). */ +@@ -69,7 +69,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) + + /* G1 is complete. Step (2) is next unless there are no waiters in G2, in + which case we can stop. */ +- if (__condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private)) ++ if (__condvar_switch_g1 (cond, wseq, &g1, private)) + { + /* Step (3): Send signals to all waiters in the old G2 / new G1. */ + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, +diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c +index 30b8eee149cee195..5044273cc265ce94 100644 +--- a/nptl/pthread_cond_common.c ++++ b/nptl/pthread_cond_common.c +@@ -189,16 +189,15 @@ __condvar_get_private (int flags) + return FUTEX_SHARED; + } + +-/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to +- leave G1, converts G1 into a fresh G2, and then switches group roles so that +- the former G2 becomes the new G1 ending at the current __wseq value when we +- eventually make the switch (WSEQ is just an observation of __wseq by the +- signaler). ++/* This closes G1 (whose index is in G1INDEX), converts G1 into a fresh G2, ++ and then switches group roles so that the former G2 becomes the new G1 ++ ending at the current __wseq value when we eventually make the switch ++ (WSEQ is just an observation of __wseq by the signaler). + If G2 is empty, it will not switch groups because then it would create an + empty G1 which would require switching groups again on the next signal. + Returns false iff groups were not switched because G2 was empty. */ + static bool __attribute__ ((unused)) +-__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, ++__condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + unsigned int *g1index, int private) + { + unsigned int g1 = *g1index; +@@ -214,8 +213,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + + cond->__data.__g_size[g1 ^ 1]) == 0) + return false; + +- /* Now try to close and quiesce G1. We have to consider the following kinds +- of waiters: ++ /* We have to consider the following kinds of waiters: + * Waiters from less recent groups than G1 are not affected because + nothing will change for them apart from __g1_start getting larger. + * New waiters arriving concurrently with the group switching will all go +@@ -223,12 +221,12 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + are not affected. + * Waiters in G1 have already received a signal and been woken. */ + +- /* Update __g1_start, which finishes closing this group. The value we add +- will never be negative because old_orig_size can only be zero when we +- switch groups the first time after a condvar was initialized, in which +- case G1 will be at index 1 and we will add a value of 1. +- Relaxed MO is fine because the change comes with no additional +- constraints that others would have to observe. */ ++ /* Update __g1_start, which closes this group. The value we add will never ++ be negative because old_orig_size can only be zero when we switch groups ++ the first time after a condvar was initialized, in which case G1 will be ++ at index 1 and we will add a value of 1. Relaxed MO is fine because the ++ change comes with no additional constraints that others would have to ++ observe. */ + __condvar_add_g1_start_relaxed (cond, + (old_orig_size << 1) + (g1 == 1 ? 
1 : - 1)); + +diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c +index 43d6286ecdf63f51..f09549714299c370 100644 +--- a/nptl/pthread_cond_signal.c ++++ b/nptl/pthread_cond_signal.c +@@ -69,18 +69,17 @@ ___pthread_cond_signal (pthread_cond_t *cond) + bool do_futex_wake = false; + + /* If G1 is still receiving signals, we put the signal there. If not, we +- check if G2 has waiters, and if so, quiesce and switch G1 to the former +- G2; if this results in a new G1 with waiters (G2 might have cancellations +- already, see __condvar_quiesce_and_switch_g1), we put the signal in the +- new G1. */ ++ check if G2 has waiters, and if so, switch G1 to the former G2; if this ++ results in a new G1 with waiters (G2 might have cancellations already, ++ see __condvar_switch_g1), we put the signal in the new G1. */ + if ((cond->__data.__g_size[g1] != 0) +- || __condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private)) ++ || __condvar_switch_g1 (cond, wseq, &g1, private)) + { + /* Add a signal. Relaxed MO is fine because signaling does not need to +- establish a happens-before relation (see above). We do not mask the +- release-MO store when initializing a group in +- __condvar_quiesce_and_switch_g1 because we use an atomic +- read-modify-write and thus extend that store's release sequence. */ ++ establish a happens-before relation (see above). We do not mask the ++ release-MO store when initializing a group in __condvar_switch_g1 ++ because we use an atomic read-modify-write and thus extend that ++ store's release sequence. */ + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2); + cond->__data.__g_size[g1]--; + /* TODO Only set it if there are indeed futex waiters. */ +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 7fc9dadf15aa9bc6..80bb7282118775b8 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -354,8 +354,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + because we do not need to establish any happens-before relation with + signalers (see __pthread_cond_signal); modification order alone + establishes a total order of waiters/signals. We do need acquire MO +- to synchronize with group reinitialization in +- __condvar_quiesce_and_switch_g1. */ ++ to synchronize with group reinitialization in __condvar_switch_g1. */ + uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2); + /* Find our group's index. We always go into what was G2 when we acquired + our position. */ +@@ -387,9 +386,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + { + /* Now wait until a signal is available in our group or it is closed. + Acquire MO so that if we observe (signals == lowseq) after group +- switching in __condvar_quiesce_and_switch_g1, we synchronize with that +- store and will see the prior update of __g1_start done while switching +- groups too. */ ++ switching in __condvar_switch_g1, we synchronize with that store and ++ will see the prior update of __g1_start done while switching groups ++ too. */ + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + unsigned int lowseq = (g1_start & 1) == g ? 
signals : g1_start & ~1U; diff --git a/glibc-upstream-2.39-192.patch b/glibc-upstream-2.39-192.patch new file mode 100644 index 0000000..1a69b86 --- /dev/null +++ b/glibc-upstream-2.39-192.patch @@ -0,0 +1,180 @@ +commit b1eb369aee9cafefdbe5a65375310a918ef0c3ec +Author: Malte Skarupke +Date: Wed Dec 4 08:05:40 2024 -0500 + + nptl: Use all of g1_start and g_signals + + The LSB of g_signals was unused. The LSB of g1_start was used to indicate + which group is G2. This was used to always go to sleep in pthread_cond_wait + if a waiter is in G2. A comment earlier in the file says that this is not + correct to do: + + "Waiters cannot determine whether they are currently in G2 or G1 -- but they + do not have to because all they are interested in is whether there are + available signals" + + I either would have had to update the comment, or get rid of the check. I + chose to get rid of the check. In fact I don't quite know why it was there. + There will never be available signals for group G2, so we didn't need the + special case. Even if there were, this would just be a spurious wake. This + might have caught some cases where the count has wrapped around, but it + wouldn't reliably do that, (and even if it did, why would you want to force a + sleep in that case?) and we don't support that many concurrent waiters + anyway. Getting rid of it allows us to use one more bit, making us more + robust to wraparound. + + Signed-off-by: Malte Skarupke + Reviewed-by: Carlos O'Donell + (cherry picked from commit 91bb902f58264a2fd50fbce8f39a9a290dd23706) + +diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c +index 38bba17bfc8a0083..51afa62adf7da4c1 100644 +--- a/nptl/pthread_cond_broadcast.c ++++ b/nptl/pthread_cond_broadcast.c +@@ -57,7 +57,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) + { + /* Add as many signals as the remaining size of the group. */ + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, +- cond->__data.__g_size[g1] << 1); ++ cond->__data.__g_size[g1]); + cond->__data.__g_size[g1] = 0; + + /* We need to wake G1 waiters before we switch G1 below. */ +@@ -73,7 +73,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) + { + /* Step (3): Send signals to all waiters in the old G2 / new G1. */ + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, +- cond->__data.__g_size[g1] << 1); ++ cond->__data.__g_size[g1]); + cond->__data.__g_size[g1] = 0; + /* TODO Only set it if there are indeed futex waiters. */ + do_futex_wake = true; +diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c +index 5044273cc265ce94..389402913c7b7714 100644 +--- a/nptl/pthread_cond_common.c ++++ b/nptl/pthread_cond_common.c +@@ -208,9 +208,9 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + behavior. + Note that this works correctly for a zero-initialized condvar too. */ + unsigned int old_orig_size = __condvar_get_orig_size (cond); +- uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; +- if (((unsigned) (wseq - old_g1_start - old_orig_size) +- + cond->__data.__g_size[g1 ^ 1]) == 0) ++ uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond); ++ uint64_t new_g1_start = old_g1_start + old_orig_size; ++ if (((unsigned) (wseq - new_g1_start) + cond->__data.__g_size[g1 ^ 1]) == 0) + return false; + + /* We have to consider the following kinds of waiters: +@@ -221,16 +221,10 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + are not affected. + * Waiters in G1 have already received a signal and been woken. 
*/ + +- /* Update __g1_start, which closes this group. The value we add will never +- be negative because old_orig_size can only be zero when we switch groups +- the first time after a condvar was initialized, in which case G1 will be +- at index 1 and we will add a value of 1. Relaxed MO is fine because the +- change comes with no additional constraints that others would have to +- observe. */ +- __condvar_add_g1_start_relaxed (cond, +- (old_orig_size << 1) + (g1 == 1 ? 1 : - 1)); +- +- unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U; ++ /* Update __g1_start, which closes this group. Relaxed MO is fine because ++ the change comes with no additional constraints that others would have ++ to observe. */ ++ __condvar_add_g1_start_relaxed (cond, old_orig_size); + + /* At this point, the old G1 is now a valid new G2 (but not in use yet). + No old waiter can neither grab a signal nor acquire a reference without +@@ -242,13 +236,13 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, + g1 ^= 1; + *g1index ^= 1; + +- /* Now advance the new G1 g_signals to the new lowseq, giving it ++ /* Now advance the new G1 g_signals to the new g1_start, giving it + an effective signal count of 0 to start. */ +- atomic_store_release (cond->__data.__g_signals + g1, lowseq); ++ atomic_store_release (cond->__data.__g_signals + g1, (unsigned)new_g1_start); + + /* These values are just observed by signalers, and thus protected by the + lock. */ +- unsigned int orig_size = wseq - (old_g1_start + old_orig_size); ++ unsigned int orig_size = wseq - new_g1_start; + __condvar_set_orig_size (cond, orig_size); + /* Use and addition to not loose track of cancellations in what was + previously G2. */ +diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c +index f09549714299c370..fa3a5c3d8f731687 100644 +--- a/nptl/pthread_cond_signal.c ++++ b/nptl/pthread_cond_signal.c +@@ -80,7 +80,7 @@ ___pthread_cond_signal (pthread_cond_t *cond) + release-MO store when initializing a group in __condvar_switch_g1 + because we use an atomic read-modify-write and thus extend that + store's release sequence. */ +- atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2); ++ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 1); + cond->__data.__g_size[g1]--; + /* TODO Only set it if there are indeed futex waiters. */ + do_futex_wake = true; +diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c +index 80bb7282118775b8..0f1dfcb595941eba 100644 +--- a/nptl/pthread_cond_wait.c ++++ b/nptl/pthread_cond_wait.c +@@ -84,7 +84,7 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, + not hold a reference on the group. */ + __condvar_acquire_lock (cond, private); + +- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; ++ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + if (g1_start > seq) + { + /* Our group is closed, so someone provided enough signals for it. +@@ -259,7 +259,6 @@ __condvar_cleanup_waiting (void *arg) + * Waiters fetch-add while having acquire the mutex associated with the + condvar. Signalers load it and fetch-xor it concurrently. + __g1_start: Starting position of G1 (inclusive) +- * LSB is index of current G2. + * Modified by signalers while having acquired the condvar-internal lock + and observed concurrently by waiters. 
+ __g1_orig_size: Initial size of G1 +@@ -280,11 +279,9 @@ __condvar_cleanup_waiting (void *arg) + * Reference count used by waiters concurrently with signalers that have + acquired the condvar-internal lock. + __g_signals: The number of signals that can still be consumed, relative to +- the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits +- 31 to 1 of g1_start with the signal count added) ++ the current g1_start. (i.e. g1_start with the signal count added) + * Used as a futex word by waiters. Used concurrently by waiters and + signalers. +- * LSB is currently reserved and 0. + __g_size: Waiters remaining in this group (i.e., which have not been + signaled yet. + * Accessed by signalers and waiters that cancel waiting (both do so only +@@ -391,9 +388,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + too. */ + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); +- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U; + +- if (seq < (g1_start >> 1)) ++ if (seq < g1_start) + { + /* If the group is closed already, + then this waiter originally had enough extra signals to +@@ -406,13 +402,13 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, + by now, perhaps in the process of switching back to an older + G2, but in either case we're allowed to consume the available + signal and should not block anymore. */ +- if ((int)(signals - lowseq) >= 2) ++ if ((int)(signals - (unsigned int)g1_start) > 0) + { + /* Try to grab a signal. See above for MO. (if we do another loop + iteration we need to see the correct value of g1_start) */ + if (atomic_compare_exchange_weak_acquire ( + cond->__data.__g_signals + g, +- &signals, signals - 2)) ++ &signals, signals - 1)) + break; + else + continue; diff --git a/glibc-upstream-2.39-193.patch b/glibc-upstream-2.39-193.patch new file mode 100644 index 0000000..2bda9e9 --- /dev/null +++ b/glibc-upstream-2.39-193.patch @@ -0,0 +1,41 @@ +commit d33d10642fb24091e8fc8b9115f0a17d9f78491d +Author: Florian Weimer +Date: Thu Mar 13 06:07:07 2025 +0100 + + nptl: PTHREAD_COND_INITIALIZER compatibility with pre-2.41 versions (bug 32786) + + The new initializer and struct layout does not initialize the + __g_signals field in the old struct layout before the change in + commit c36fc50781995e6758cae2b6927839d0157f213c ("nptl: Remove + g_refs from condition variables"). Bring back fields at the end + of struct __pthread_cond_s, so that they are again zero-initialized. + + Reviewed-by: Sam James + (cherry picked from commit dbc5a50d12eff4cb3f782129029d04b8a76f58e7) + +diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h +index a3d482f80f7d0d35..bccc2003ec6dea5c 100644 +--- a/sysdeps/nptl/bits/thread-shared-types.h ++++ b/sysdeps/nptl/bits/thread-shared-types.h +@@ -99,6 +99,8 @@ struct __pthread_cond_s + unsigned int __g1_orig_size; + unsigned int __wrefs; + unsigned int __g_signals[2]; ++ unsigned int __unused_initialized_1; ++ unsigned int __unused_initialized_2; + }; + + typedef unsigned int __tss_t; +diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h +index 9af75d6eae090218..e0f24418fe4233f0 100644 +--- a/sysdeps/nptl/pthread.h ++++ b/sysdeps/nptl/pthread.h +@@ -152,7 +152,7 @@ enum + + + /* Conditional variable handling. 
*/ +-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } } ++#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0}, 0, 0 } } + + + /* Cleanup buffers */ diff --git a/glibc-upstream-2.39-194.patch b/glibc-upstream-2.39-194.patch new file mode 100644 index 0000000..52ef411 --- /dev/null +++ b/glibc-upstream-2.39-194.patch @@ -0,0 +1,63 @@ +commit 68f3f1a1d08f7f3e0fb74391461699717efbb4bc +Author: Florian Weimer +Date: Sat Feb 17 09:17:04 2024 +0100 + + Linux: Switch back to assembly syscall wrapper for prctl (bug 29770) + + Commit ff026950e280bc3e9487b41b460fb31bc5b57721 ("Add a C wrapper for + prctl [BZ #25896]") replaced the assembler wrapper with a C function. + However, on powerpc64le-linux-gnu, the C variadic function + implementation requires extra work in the caller to set up the + parameter save area. Calling a function that needs a parameter save + area without one (because the prototype used indicates the function is + not variadic) corrupts the caller's stack. The Linux manual pages + project documents prctl as a non-variadic function. This has resulted + in various projects over the years using non-variadic prototypes, + including the sanitizer libraries in LLVm and GCC (GCC PR 113728). + + This commit switches back to the assembler implementation on most + targets and only keeps the C implementation for x86-64 x32. + + Also add the __prctl_time64 alias from commit + b39ffab860cd743a82c91946619f1b8158b0b65e ("Linux: Add time64 alias for + prctl") to sysdeps/unix/sysv/linux/syscalls.list; it was not yet + present in commit ff026950e280bc3e9487b41b460fb31bc5b57721. + + This restores the old ABI on powerpc64le-linux-gnu, thus fixing + bug 29770. + + Reviewed-By: Simon Chopin + (cherry picked from commit 6a04404521ac4119ae36827eeb288ea84eee7cf6) + +diff --git a/sysdeps/unix/sysv/linux/syscalls.list b/sysdeps/unix/sysv/linux/syscalls.list +index 73e941ef894cd72c..9ac42c3436dd1520 100644 +--- a/sysdeps/unix/sysv/linux/syscalls.list ++++ b/sysdeps/unix/sysv/linux/syscalls.list +@@ -46,6 +46,7 @@ open_tree EXTRA open_tree i:isU open_tree + pipe2 - pipe2 i:fi __pipe2 pipe2 + pidfd_open EXTRA pidfd_open i:iU pidfd_open + pidfd_getfd EXTRA pidfd_getfd i:iiU pidfd_getfd ++prctl EXTRA prctl i:iiiii __prctl prctl __prctl_time64 + pivot_root EXTRA pivot_root i:ss pivot_root + pidfd_send_signal EXTRA pidfd_send_signal i:iiPU pidfd_send_signal + process_madvise EXTRA process_madvise i:iPniU process_madvise +diff --git a/sysdeps/unix/sysv/linux/prctl.c b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c +similarity index 93% +rename from sysdeps/unix/sysv/linux/prctl.c +rename to sysdeps/unix/sysv/linux/x86_64/x32/prctl.c +index 52d234ea0df4cc48..4bf1b479a07c6e8f 100644 +--- a/sysdeps/unix/sysv/linux/prctl.c ++++ b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c +@@ -1,4 +1,4 @@ +-/* prctl - Linux specific syscall. ++/* prctl - Linux specific syscall. x86-64 x32 version. + Copyright (C) 2020-2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + +@@ -40,6 +40,3 @@ __prctl (int option, ...) + + libc_hidden_def (__prctl) + weak_alias (__prctl, prctl) +-#if __TIMESIZE != 64 +-weak_alias (__prctl, __prctl_time64) +-#endif diff --git a/glibc-upstream-2.39-195.patch b/glibc-upstream-2.39-195.patch new file mode 100644 index 0000000..944d1a4 --- /dev/null +++ b/glibc-upstream-2.39-195.patch @@ -0,0 +1,139 @@ +commit e31ac9a639306c8611e1ebe9fa405037337c70e0 +Author: H.J. 
Lu +Date: Tue Apr 30 09:21:16 2024 -0700 + + libio: Sort test variables in Makefile + + Sort test variables in libio/Makefile using scripts/sort-makefile-lines.py. + Reviewed-by: Sunil K Pandey + + (cherry picked from commit ddf71c550a5940deca74cc676f1cae134a891717) + +diff --git a/libio/Makefile b/libio/Makefile +index b92aeaf62634f1cb..0c1f16ee3b54c2d3 100644 +--- a/libio/Makefile ++++ b/libio/Makefile +@@ -68,22 +68,76 @@ routines_no_fortify += \ + wprintf \ + # routines_no_fortify + +-tests = tst_swprintf tst_wprintf tst_swscanf tst_wscanf tst_getwc tst_putwc \ +- tst_wprintf2 tst-widetext test-fmemopen tst-ext tst-ext2 \ +- tst-fgetws tst-ungetwc1 tst-ungetwc2 tst-swscanf tst-sscanf \ +- tst-mmap-setvbuf bug-ungetwc1 bug-ungetwc2 tst-atime tst-eof \ +- tst-freopen bug-rewind bug-rewind2 bug-ungetc bug-fseek \ +- tst-mmap-eofsync tst-mmap-fflushsync bug-mmap-fflush \ +- tst-mmap2-eofsync tst-mmap-offend bug-fopena+ bug-wfflush \ +- bug-ungetc2 bug-ftell bug-ungetc3 bug-ungetc4 tst-fopenloc2 \ +- tst-memstream1 tst-memstream2 tst-memstream3 tst-memstream4 \ +- tst-wmemstream1 tst-wmemstream2 tst-wmemstream3 tst-wmemstream4 \ +- tst-wmemstream5 bug-memstream1 bug-wmemstream1 \ +- tst-setvbuf1 tst-popen1 tst-fgetwc bug-wsetpos tst-fseek \ +- tst-fwrite-error tst-ftell-partial-wide tst-ftell-active-handler \ +- tst-ftell-append tst-fputws tst-bz22415 tst-fgetc-after-eof \ +- tst-sprintf-ub tst-sprintf-chk-ub tst-bz24051 tst-bz24153 \ +- tst-wfile-sync tst-bz28828 tst-getdelim ++tests = \ ++ bug-fopena+ \ ++ bug-fseek \ ++ bug-ftell \ ++ bug-memstream1 \ ++ bug-mmap-fflush \ ++ bug-rewind \ ++ bug-rewind2 \ ++ bug-ungetc \ ++ bug-ungetc2 \ ++ bug-ungetc3 \ ++ bug-ungetc4 \ ++ bug-ungetwc1 \ ++ bug-ungetwc2 \ ++ bug-wfflush \ ++ bug-wmemstream1 \ ++ bug-wsetpos \ ++ test-fmemopen \ ++ tst-atime \ ++ tst-bz22415 \ ++ tst-bz24051 \ ++ tst-bz24153 \ ++ tst-bz28828 \ ++ tst-eof \ ++ tst-ext \ ++ tst-ext2 \ ++ tst-fgetc-after-eof \ ++ tst-fgetwc \ ++ tst-fgetws \ ++ tst-fopenloc2 \ ++ tst-fputws \ ++ tst-freopen \ ++ tst-fseek \ ++ tst-ftell-active-handler \ ++ tst-ftell-append \ ++ tst-ftell-partial-wide \ ++ tst-fwrite-error \ ++ tst-getdelim \ ++ tst-memstream1 \ ++ tst-memstream2 \ ++ tst-memstream3 \ ++ tst-memstream4 \ ++ tst-mmap-eofsync \ ++ tst-mmap-fflushsync \ ++ tst-mmap-offend \ ++ tst-mmap-setvbuf \ ++ tst-mmap2-eofsync \ ++ tst-popen1 \ ++ tst-setvbuf1 \ ++ tst-sprintf-chk-ub \ ++ tst-sprintf-ub \ ++ tst-sscanf \ ++ tst-swscanf \ ++ tst-ungetwc1 \ ++ tst-ungetwc2 \ ++ tst-wfile-sync \ ++ tst-widetext \ ++ tst-wmemstream1 \ ++ tst-wmemstream2 \ ++ tst-wmemstream3 \ ++ tst-wmemstream4 \ ++ tst-wmemstream5 \ ++ tst_getwc \ ++ tst_putwc \ ++ tst_swprintf \ ++ tst_swscanf \ ++ tst_wprintf \ ++ tst_wprintf2 \ ++ tst_wscanf \ ++ # tests + + tests-internal = tst-vtables tst-vtables-interposed + +@@ -235,16 +289,26 @@ tests-special += $(objpfx)tst-fopenloc-cmp.out $(objpfx)tst-fopenloc-mem.out \ + $(objpfx)tst-bz24228-mem.out + endif + +-tests += tst-cleanup-default tst-cleanup-default-static ++tests += \ ++ tst-cleanup-default \ ++ tst-cleanup-default-static \ ++ # tests + tests-static += tst-cleanup-default-static + tests-special += $(objpfx)tst-cleanup-default-cmp.out $(objpfx)tst-cleanup-default-static-cmp.out + LDFLAGS-tst-cleanup-default = -Wl,--gc-sections + LDFLAGS-tst-cleanup-default-static = -Wl,--gc-sections + + ifeq ($(have-gnu-retain)$(have-z-start-stop-gc),yesyes) +-tests += tst-cleanup-start-stop-gc tst-cleanup-start-stop-gc-static \ +- tst-cleanup-nostart-stop-gc 
tst-cleanup-nostart-stop-gc-static +-tests-static += tst-cleanup-start-stop-gc-static tst-cleanup-nostart-stop-gc-static ++tests += \ ++ tst-cleanup-nostart-stop-gc \ ++ tst-cleanup-nostart-stop-gc-static \ ++ tst-cleanup-start-stop-gc \ ++ tst-cleanup-start-stop-gc-static \ ++ # tests ++tests-static += \ ++ tst-cleanup-nostart-stop-gc-static \ ++ tst-cleanup-start-stop-gc-static \ ++ # tests-static + tests-special += $(objpfx)tst-cleanup-start-stop-gc-cmp.out \ + $(objpfx)tst-cleanup-start-stop-gc-static-cmp.out \ + $(objpfx)tst-cleanup-nostart-stop-gc-cmp.out \ diff --git a/glibc-upstream-2.39-196.patch b/glibc-upstream-2.39-196.patch new file mode 100644 index 0000000..d47bb85 --- /dev/null +++ b/glibc-upstream-2.39-196.patch @@ -0,0 +1,195 @@ +commit 1dcfb9479df400160208ac3d8ab33128d8f1aae5 +Author: Arjun Shankar +Date: Fri Oct 18 16:03:25 2024 +0200 + + libio: Fix a deadlock after fork in popen + + popen modifies its file handler book-keeping under a lock that wasn't + being taken during fork. This meant that a concurrent popen and fork + could end up copying the lock in a "locked" state into the fork child, + where subsequently calling popen would lead to a deadlock due to the + already (spuriously) held lock. + + This commit fixes the deadlock by appropriately taking the lock before + fork, and releasing/resetting it in the parent/child after the fork. + + A new test for concurrent popen and fork is also added. It consistently + hangs (and therefore fails via timeout) without the fix applied. + Reviewed-by: Florian Weimer + + (cherry picked from commit 9f0d2c0ee6c728643fcf9a4879e9f20f5e45ce5f) + +diff --git a/libio/Makefile b/libio/Makefile +index 0c1f16ee3b54c2d3..d1f2342867601735 100644 +--- a/libio/Makefile ++++ b/libio/Makefile +@@ -115,6 +115,7 @@ tests = \ + tst-mmap-offend \ + tst-mmap-setvbuf \ + tst-mmap2-eofsync \ ++ tst-popen-fork \ + tst-popen1 \ + tst-setvbuf1 \ + tst-sprintf-chk-ub \ +diff --git a/libio/iopopen.c b/libio/iopopen.c +index d01cb0648e3aac54..352513a2914a9d36 100644 +--- a/libio/iopopen.c ++++ b/libio/iopopen.c +@@ -57,6 +57,26 @@ unlock (void *not_used) + } + #endif + ++/* These lock/unlock/resetlock functions are used during fork. */ ++ ++void ++_IO_proc_file_chain_lock (void) ++{ ++ _IO_lock_lock (proc_file_chain_lock); ++} ++ ++void ++_IO_proc_file_chain_unlock (void) ++{ ++ _IO_lock_unlock (proc_file_chain_lock); ++} ++ ++void ++_IO_proc_file_chain_resetlock (void) ++{ ++ _IO_lock_init (proc_file_chain_lock); ++} ++ + /* POSIX states popen shall ensure that any streams from previous popen() + calls that remain open in the parent process should be closed in the new + child process. +diff --git a/libio/libioP.h b/libio/libioP.h +index 616253fcd00f04db..a83a411fdf7d93c9 100644 +--- a/libio/libioP.h ++++ b/libio/libioP.h +@@ -429,6 +429,12 @@ libc_hidden_proto (_IO_list_resetlock) + extern void _IO_enable_locks (void) __THROW; + libc_hidden_proto (_IO_enable_locks) + ++/* Functions for operating popen's proc_file_chain_lock during fork. */ ++ ++extern void _IO_proc_file_chain_lock (void) __THROW attribute_hidden; ++extern void _IO_proc_file_chain_unlock (void) __THROW attribute_hidden; ++extern void _IO_proc_file_chain_resetlock (void) __THROW attribute_hidden; ++ + /* Default jumptable functions. 
*/ + + extern int _IO_default_underflow (FILE *) __THROW; +diff --git a/libio/tst-popen-fork.c b/libio/tst-popen-fork.c +new file mode 100644 +index 0000000000000000..1df30fc6c0a3f583 +--- /dev/null ++++ b/libio/tst-popen-fork.c +@@ -0,0 +1,80 @@ ++/* Test concurrent popen and fork. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++static void ++popen_and_pclose (void) ++{ ++ FILE *f = popen ("true", "r"); ++ TEST_VERIFY_EXIT (f != NULL); ++ pclose (f); ++ return; ++} ++ ++static atomic_bool done = ATOMIC_VAR_INIT (0); ++ ++static void * ++popen_and_pclose_forever (__attribute__ ((unused)) ++ void *arg) ++{ ++ while (!atomic_load_explicit (&done, memory_order_acquire)) ++ popen_and_pclose (); ++ return NULL; ++} ++ ++static int ++do_test (void) ++{ ++ ++ /* Repeatedly call popen in a loop during the entire test. */ ++ pthread_t t = xpthread_create (NULL, popen_and_pclose_forever, NULL); ++ ++ /* Repeatedly fork off and reap child processes one-by-one. ++ Each child calls popen once, then exits, leading to the possibility ++ that a child forks *during* our own popen call, thus inheriting any ++ intermediate popen state, possibly including lock state(s). */ ++ for (int i = 0; i < 100; i++) ++ { ++ int cpid = xfork (); ++ ++ if (cpid == 0) ++ { ++ popen_and_pclose (); ++ _exit (0); ++ } ++ else ++ xwaitpid (cpid, NULL, 0); ++ } ++ ++ /* Stop calling popen. */ ++ atomic_store_explicit (&done, 1, memory_order_release); ++ xpthread_join (t); ++ ++ return 0; ++} ++ ++#include +diff --git a/posix/fork.c b/posix/fork.c +index 298765a1ffd08b75..cf9b80e7c059e748 100644 +--- a/posix/fork.c ++++ b/posix/fork.c +@@ -62,6 +62,7 @@ __libc_fork (void) + call_function_static_weak (__nss_database_fork_prepare_parent, + &nss_database_data); + ++ _IO_proc_file_chain_lock (); + _IO_list_lock (); + + /* Acquire malloc locks. This needs to come last because fork +@@ -92,6 +93,7 @@ __libc_fork (void) + + /* Reset locks in the I/O code. */ + _IO_list_resetlock (); ++ _IO_proc_file_chain_resetlock (); + + call_function_static_weak (__nss_database_fork_subprocess, + &nss_database_data); +@@ -121,6 +123,7 @@ __libc_fork (void) + + /* We execute this even if the 'fork' call failed. */ + _IO_list_unlock (); ++ _IO_proc_file_chain_unlock (); + } + + /* Run the handlers registered for the parent. 
*/ diff --git a/glibc-upstream-2.39-197.patch b/glibc-upstream-2.39-197.patch new file mode 100644 index 0000000..a413578 --- /dev/null +++ b/glibc-upstream-2.39-197.patch @@ -0,0 +1,27 @@ +commit 14ec225d859091c048ec54e5c4ddf6738498aee7 +Author: Arjun Shankar +Date: Fri Oct 25 09:33:45 2024 +0200 + + libio: Correctly link tst-popen-fork against libpthread + + tst-popen-fork failed to build for Hurd due to not being linked with + libpthread. This commit fixes that. + + Tested with build-many-glibcs.py for i686-gnu. + + Reviewed-by: Florian Weimer + (cherry picked from commit 6a290b2895b77be839fcb7c44a6a9879560097ad) + +diff --git a/libio/Makefile b/libio/Makefile +index d1f2342867601735..92d6c6bcab1818d0 100644 +--- a/libio/Makefile ++++ b/libio/Makefile +@@ -140,6 +140,8 @@ tests = \ + tst_wscanf \ + # tests + ++$(objpfx)tst-popen-fork: $(shared-thread-library) ++ + tests-internal = tst-vtables tst-vtables-interposed + + ifeq (yes,$(build-shared)) diff --git a/glibc-upstream-2.39-198.patch b/glibc-upstream-2.39-198.patch new file mode 100644 index 0000000..a0c9d1a --- /dev/null +++ b/glibc-upstream-2.39-198.patch @@ -0,0 +1,34 @@ +commit 9fe51d34bbce71d186e7adee74e523ccc64a9727 +Author: H.J. Lu +Date: Thu Feb 15 03:22:55 2024 -0800 + + sort-makefile-lines.py: Allow '_' in name and "^# name" + + '_' is used in Makefile variable names and many variables end with + "^# name". Relax sort-makefile-lines.py to allow '_' in name and + "^# name" as variable end. This fixes BZ #31385. + + (cherry picked from commit 6a2512bf1605a4208dd94ef67408488d8acb2409) + +diff --git a/scripts/sort-makefile-lines.py b/scripts/sort-makefile-lines.py +index f65ee40e27fb85ff..b2249aef6d028cf7 100755 +--- a/scripts/sort-makefile-lines.py ++++ b/scripts/sort-makefile-lines.py +@@ -129,7 +129,7 @@ def sort_makefile_lines(): + for i in range(len(lines)): + # Look for things like "var = \", "var := \" or "var += \" + # to start the sorted list. +- var = re.search(r'^([a-zA-Z0-9-]*) [\+:]?\= \\$', lines[i]) ++ var = re.search(r'^([-_a-zA-Z0-9]*) [\+:]?\= \\$', lines[i]) + if var: + # Remember the index and the name. + startmarks.append((i, var.group(1))) +@@ -140,7 +140,7 @@ def sort_makefile_lines(): + rangemarks = [] + for sm in startmarks: + # Look for things like " # var" to end the sorted list. +- reg = r'^ # ' + sm[1] + r'$' ++ reg = r'^ *# ' + sm[1] + r'$' + for j in range(sm[0] + 1, len(lines)): + if re.search(reg, lines[j]): + # Remember the block to sort (inclusive). diff --git a/glibc-upstream-2.39-199.patch b/glibc-upstream-2.39-199.patch new file mode 100644 index 0000000..b5b2458 --- /dev/null +++ b/glibc-upstream-2.39-199.patch @@ -0,0 +1,32 @@ +commit 37b30b6a685c5facccdff61663eb3adf0dd253cd +Author: H.J. 
Lu +Date: Thu Feb 15 11:12:13 2024 -0800 + + sysdeps/x86_64/Makefile (tests): Add the end marker + + (cherry picked from commit 71d133c500b0d23f6b6a7c6e3595e3fc447bfe91) + +diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile +index 0ede447405d549b5..08ec882159990e97 100644 +--- a/sysdeps/x86_64/Makefile ++++ b/sysdeps/x86_64/Makefile +@@ -32,7 +32,8 @@ sysdep_routines += \ + # sysdep_routines + gen-as-const-headers += locale-defines.sym + tests += \ +- tst-rsi-strlen ++ tst-rsi-strlen \ ++# tests + endif + + ifeq ($(subdir),elf) +@@ -232,7 +233,8 @@ sysdep_routines += \ + # sysdep_routines + + tests += \ +- tst-rsi-wcslen ++ tst-rsi-wcslen \ ++# tests + endif + + diff --git a/glibc-upstream-2.39-200.patch b/glibc-upstream-2.39-200.patch new file mode 100644 index 0000000..bda619f --- /dev/null +++ b/glibc-upstream-2.39-200.patch @@ -0,0 +1,39 @@ +commit 4e5ee49a432b8569137bdacc302fc696ed37b1bd +Author: H.J. Lu +Date: Wed Feb 28 05:46:40 2024 -0800 + + sysdeps/unix/sysv/linux/x86_64/Makefile: Add the end marker + + Add the end marker to tests, tests-container and modules-names. + + (cherry picked from commit e6350be7e9cae8f71c96c1f06eab61b9acb227c8) + +diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile b/sysdeps/unix/sysv/linux/x86_64/Makefile +index 9a1e7aa6461725af..fcbffd81cbaa031d 100644 +--- a/sysdeps/unix/sysv/linux/x86_64/Makefile ++++ b/sysdeps/unix/sysv/linux/x86_64/Makefile +@@ -17,18 +17,21 @@ endif + ifeq ($(subdir),elf) + ifeq (yes,$(enable-x86-isa-level)) + tests += \ +- tst-glibc-hwcaps-2 ++ tst-glibc-hwcaps-2 \ ++# tests + ifeq (no,$(build-hardcoded-path-in-tests)) + # This is an ld.so.cache test, and RPATH/RUNPATH in the executable + # interferes with its test objectives. + tests-container += \ +- tst-glibc-hwcaps-2-cache ++ tst-glibc-hwcaps-2-cache \ ++# tests-container + endif + modules-names += \ + libx86-64-isa-level-1 \ + libx86-64-isa-level-2 \ + libx86-64-isa-level-3 \ +- libx86-64-isa-level-4 ++ libx86-64-isa-level-4 \ ++# modules-names + + $(objpfx)tst-glibc-hwcaps-2: $(objpfx)libx86-64-isa-level.so + diff --git a/glibc-upstream-2.39-201.patch b/glibc-upstream-2.39-201.patch new file mode 100644 index 0000000..0e6a9a2 --- /dev/null +++ b/glibc-upstream-2.39-201.patch @@ -0,0 +1,182 @@ +commit 147bed0a71a6c5cbf83d05f4081e923d74a6847e +Author: Florian Weimer +Date: Thu Feb 13 21:56:52 2025 +0100 + + elf: Keep using minimal malloc after early DTV resize (bug 32412) + + If an auditor loads many TLS-using modules during startup, it is + possible to trigger DTV resizing. Previously, the DTV was marked + as allocated by the main malloc afterwards, even if the minimal + malloc was still in use. With this change, _dl_resize_dtv marks + the resized DTV as allocated with the minimal malloc. + + The new test reuses TLS-using modules from other auditing tests. 
+ + Reviewed-by: DJ Delorie + (cherry picked from commit aa3d7bd5299b33bffc118aa618b59bfa66059bcb) + +diff --git a/elf/Makefile b/elf/Makefile +index 8a5678aa63736812..f2e9cb1075adc8a5 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -376,6 +376,7 @@ tests += \ + tst-align3 \ + tst-audit-tlsdesc \ + tst-audit-tlsdesc-dlopen \ ++ tst-audit-tlsdesc-dlopen2 \ + tst-audit1 \ + tst-audit2 \ + tst-audit8 \ +@@ -802,6 +803,7 @@ modules-names += \ + tst-auditmanymod8 \ + tst-auditmanymod9 \ + tst-auditmod-tlsdesc \ ++ tst-auditmod-tlsdesc2 \ + tst-auditmod1 \ + tst-auditmod11 \ + tst-auditmod12 \ +@@ -3012,6 +3014,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so + tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so + $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so + tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so ++$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \ ++ $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules)) ++tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so + + $(objpfx)tst-dlmopen-twice.out: \ + $(objpfx)tst-dlmopen-twice-mod1.so \ +diff --git a/elf/dl-tls.c b/elf/dl-tls.c +index 3d529b722cb271d9..b13e752358a059a4 100644 +--- a/elf/dl-tls.c ++++ b/elf/dl-tls.c +@@ -528,6 +528,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) + if (newp == NULL) + oom (); + memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t)); ++#ifdef SHARED ++ /* Auditors can trigger a DTV resize event while the full malloc ++ is not yet in use. Mark the new DTV allocation as the ++ initial allocation. */ ++ if (!__rtld_malloc_is_complete ()) ++ GL(dl_initial_dtv) = &newp[1]; ++#endif + } + else + { +diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c +new file mode 100644 +index 0000000000000000..7ba2c4129a9bcc53 +--- /dev/null ++++ b/elf/tst-audit-tlsdesc-dlopen2.c +@@ -0,0 +1,46 @@ ++/* Loading TLS-using modules from auditors (bug 32412). Main program. ++ Copyright (C) 2021-2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++static int ++do_test (void) ++{ ++ puts ("info: start of main program"); ++ ++ /* Load TLS-using modules, to trigger DTV resizing. The dynamic ++ linker will load them again (requiring their own TLS) because the ++ dlopen calls from the auditor were in the auditing namespace. */ ++ for (int i = 1; i <= 19; ++i) ++ { ++ char dso[30]; ++ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); ++ char sym[30]; ++ snprintf (sym, sizeof(sym), "tlsmod17a%d", i); ++ ++ void *handle = xdlopen (dso, RTLD_LAZY); ++ int (*func) (void) = xdlsym (handle, sym); ++ /* Trigger TLS allocation. 
*/ ++ func (); ++ } ++ ++ return 0; ++} ++ ++#include +diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c +new file mode 100644 +index 0000000000000000..50275cd34d1219c6 +--- /dev/null ++++ b/elf/tst-auditmod-tlsdesc2.c +@@ -0,0 +1,59 @@ ++/* Loading TLS-using modules from auditors (bug 32412). Audit module. ++ Copyright (C) 2021-2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++unsigned int ++la_version (unsigned int version) ++{ ++ /* Open some modules, to trigger DTV resizing before the switch to ++ the main malloc. */ ++ for (int i = 1; i <= 19; ++i) ++ { ++ char dso[30]; ++ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); ++ char sym[30]; ++ snprintf (sym, sizeof(sym), "tlsmod17a%d", i); ++ ++ void *handle = dlopen (dso, RTLD_LAZY); ++ if (handle == NULL) ++ { ++ printf ("error: dlmopen from auditor: %s\n", dlerror ()); ++ fflush (stdout); ++ _exit (1); ++ } ++ int (*func) (void) = dlsym (handle, sym); ++ if (func == NULL) ++ { ++ printf ("error: dlsym from auditor: %s\n", dlerror ()); ++ fflush (stdout); ++ _exit (1); ++ } ++ /* Trigger TLS allocation. */ ++ func (); ++ } ++ ++ puts ("info: TLS-using modules loaded from auditor"); ++ fflush (stdout); ++ ++ return LAV_CURRENT; ++} diff --git a/glibc-upstream-2.39-202.patch b/glibc-upstream-2.39-202.patch new file mode 100644 index 0000000..dd9ebe0 --- /dev/null +++ b/glibc-upstream-2.39-202.patch @@ -0,0 +1,57 @@ +commit abdeb4b5200e0afb05e6a7863c52d2fbe7029b47 +Author: Florian Weimer +Date: Tue May 20 19:36:02 2025 +0200 + + support: Use const char * argument in support_capture_subprogram_self_sgid + + The function does not modify the passed-in string, so make this clear + via the prototype. + + Reviewed-by: Carlos O'Donell + (cherry picked from commit f0c09fe61678df6f7f18fe1ebff074e62fa5ca7a) + +diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h +index 1ecbdfe4fc4aa123..f2765278d920839d 100644 +--- a/support/capture_subprocess.h ++++ b/support/capture_subprocess.h +@@ -44,8 +44,7 @@ struct support_capture_subprocess support_capture_subprogram + /* Copy the running program into a setgid binary and run it with CHILD_ID + argument. If execution is successful, return the exit status of the child + program, otherwise return a non-zero failure exit code. */ +-int support_capture_subprogram_self_sgid +- (char *child_id); ++int support_capture_subprogram_self_sgid (const char *child_id); + + /* Deallocate the subprocess data captured by + support_capture_subprocess. 
*/ +diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c +index ffced8a89fca37a5..eb72a2c21cf99ee2 100644 +--- a/support/support_capture_subprocess.c ++++ b/support/support_capture_subprocess.c +@@ -109,7 +109,7 @@ support_capture_subprogram (const char *file, char *const argv[]) + safely make it SGID with the TARGET group ID. Then runs the + executable. */ + static int +-copy_and_spawn_sgid (char *child_id, gid_t gid) ++copy_and_spawn_sgid (const char *child_id, gid_t gid) + { + char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", + test_dir, (intmax_t) getpid ()); +@@ -181,7 +181,7 @@ copy_and_spawn_sgid (char *child_id, gid_t gid) + ret = 0; + infd = outfd = -1; + +- char * const args[] = {execname, child_id, NULL}; ++ char * const args[] = {execname, (char *) child_id, NULL}; + + status = support_subprogram_wait (args[0], args); + +@@ -210,7 +210,7 @@ err: + } + + int +-support_capture_subprogram_self_sgid (char *child_id) ++support_capture_subprogram_self_sgid (const char *child_id) + { + gid_t target = 0; + const int count = 64; diff --git a/glibc-upstream-2.39-203.patch b/glibc-upstream-2.39-203.patch new file mode 100644 index 0000000..140f69f --- /dev/null +++ b/glibc-upstream-2.39-203.patch @@ -0,0 +1,43 @@ +commit 71ddb11ccd76843cec6e793977218e227fe51c07 +Author: Florian Weimer +Date: Mon Dec 23 13:57:55 2024 +0100 + + support: Add support_record_failure_barrier + + This can be used to stop execution after a TEST_COMPARE_BLOB + failure, for example. + + (cherry picked from commit d0b8aa6de4529231fadfe604ac2c434e559c2d9e) + +diff --git a/support/check.h b/support/check.h +index 7ea22c7a2cba5cfd..8f41e5b99fc17472 100644 +--- a/support/check.h ++++ b/support/check.h +@@ -207,6 +207,9 @@ void support_record_failure_reset (void); + failures or not. */ + int support_record_failure_is_failed (void); + ++/* Terminate the process if any failures have been encountered so far. */ ++void support_record_failure_barrier (void); ++ + __END_DECLS + + #endif /* SUPPORT_CHECK_H */ +diff --git a/support/support_record_failure.c b/support/support_record_failure.c +index 978123701d128795..72ee2b232fb2b08c 100644 +--- a/support/support_record_failure.c ++++ b/support/support_record_failure.c +@@ -112,3 +112,13 @@ support_record_failure_is_failed (void) + synchronization for reliable test error reporting anyway. */ + return __atomic_load_n (&state->failed, __ATOMIC_RELAXED); + } ++ ++void ++support_record_failure_barrier (void) ++{ ++ if (__atomic_load_n (&state->failed, __ATOMIC_RELAXED)) ++ { ++ puts ("error: exiting due to previous errors"); ++ exit (1); ++ } ++} diff --git a/glibc-upstream-2.39-204.patch b/glibc-upstream-2.39-204.patch new file mode 100644 index 0000000..2658349 --- /dev/null +++ b/glibc-upstream-2.39-204.patch @@ -0,0 +1,155 @@ +commit ca99d55315b80277a7b189f5a9630f5b08ccaa6d +Author: Florian Weimer +Date: Tue May 20 19:45:06 2025 +0200 + + elf: Test case for bug 32976 (CVE-2025-4802) + + Check that LD_LIBRARY_PATH is ignored for AT_SECURE statically + linked binaries, using support_capture_subprogram_self_sgid. 
+ + Reviewed-by: Carlos O'Donell + (cherry picked from commit d8f7a79335b0d861c12c42aec94c04cd5bb181e2) + +diff --git a/elf/Makefile b/elf/Makefile +index f2e9cb1075adc8a5..51d52b57876fc5ba 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -266,6 +266,7 @@ tests-static-normal := \ + tst-array1-static \ + tst-array5-static \ + tst-dl-iter-static \ ++ tst-dlopen-sgid \ + tst-dst-static \ + tst-env-setuid-static \ + tst-getauxval-static \ +@@ -844,6 +845,7 @@ modules-names += \ + tst-dlmopen-twice-mod1 \ + tst-dlmopen-twice-mod2 \ + tst-dlmopen1mod \ ++ tst-dlopen-sgid-mod \ + tst-dlopen-tlsreinitmod1 \ + tst-dlopen-tlsreinitmod2 \ + tst-dlopen-tlsreinitmod3 \ +@@ -3125,3 +3127,5 @@ $(objpfx)tst-dlopen-tlsreinit3.out: $(objpfx)tst-auditmod1.so + tst-dlopen-tlsreinit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so + $(objpfx)tst-dlopen-tlsreinit4.out: $(objpfx)tst-auditmod1.so + tst-dlopen-tlsreinit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so ++ ++$(objpfx)tst-dlopen-sgid.out: $(objpfx)tst-dlopen-sgid-mod.so +diff --git a/elf/tst-dlopen-sgid-mod.c b/elf/tst-dlopen-sgid-mod.c +new file mode 100644 +index 0000000000000000..5eb79eef485da4c9 +--- /dev/null ++++ b/elf/tst-dlopen-sgid-mod.c +@@ -0,0 +1 @@ ++/* Opening this object should not succeed. */ +diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c +new file mode 100644 +index 0000000000000000..47829a405e90b6b9 +--- /dev/null ++++ b/elf/tst-dlopen-sgid.c +@@ -0,0 +1,104 @@ ++/* Test case for ignored LD_LIBRARY_PATH in static startug (bug 32976). ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* This is the name of our test object. Use a custom module for ++ testing, so that this object does not get picked up from the system ++ path. */ ++static const char dso_name[] = "tst-dlopen-sgid-mod.so"; ++ ++/* Used to mark the recursive invocation. */ ++static const char magic_argument[] = "run-actual-test"; ++ ++static int ++do_test (void) ++{ ++/* Pathname of the directory that receives the shared objects this ++ test attempts to load. */ ++ char *libdir = support_create_temp_directory ("tst-dlopen-sgid-"); ++ ++ /* This is supposed to be ignored and stripped. */ ++ TEST_COMPARE (setenv ("LD_LIBRARY_PATH", libdir, 1), 0); ++ ++ /* Copy of libc.so.6. */ ++ { ++ char *from = xasprintf ("%s/%s", support_objdir_root, LIBC_SO); ++ char *to = xasprintf ("%s/%s", libdir, LIBC_SO); ++ add_temp_file (to); ++ support_copy_file (from, to); ++ free (to); ++ free (from); ++ } ++ ++ /* Copy of the test object. 
*/ ++ { ++ char *from = xasprintf ("%s/elf/%s", support_objdir_root, dso_name); ++ char *to = xasprintf ("%s/%s", libdir, dso_name); ++ add_temp_file (to); ++ support_copy_file (from, to); ++ free (to); ++ free (from); ++ } ++ ++ TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0); ++ ++ free (libdir); ++ ++ return 0; ++} ++ ++static void ++alternative_main (int argc, char **argv) ++{ ++ if (argc == 2 && strcmp (argv[1], magic_argument) == 0) ++ { ++ if (getgid () == getegid ()) ++ /* This can happen if the file system is mounted nosuid. */ ++ FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", ++ (intmax_t) getgid ()); ++ ++ /* Should be removed due to SGID. */ ++ TEST_COMPARE_STRING (getenv ("LD_LIBRARY_PATH"), NULL); ++ ++ TEST_VERIFY (dlopen (dso_name, RTLD_NOW) == NULL); ++ { ++ const char *message = dlerror (); ++ TEST_COMPARE_STRING (message, ++ "tst-dlopen-sgid-mod.so:" ++ " cannot open shared object file:" ++ " No such file or directory"); ++ } ++ ++ support_record_failure_barrier (); ++ exit (EXIT_SUCCESS); ++ } ++} ++ ++#define PREPARE alternative_main ++#include diff --git a/glibc-upstream-2.39-205.patch b/glibc-upstream-2.39-205.patch new file mode 100644 index 0000000..637fec3 --- /dev/null +++ b/glibc-upstream-2.39-205.patch @@ -0,0 +1,33 @@ +commit 9e25c0f445606e809996329b8a21d3342529474d +Author: Sunil K Pandey +Date: Tue May 20 10:07:27 2025 -0700 + + x86_64: Fix typo in ifunc-impl-list.c. + + Fix wcsncpy and wcpncpy typo in ifunc-impl-list.c. + + Reviewed-by: H.J. Lu + (cherry picked from commit f2aeb6ff941dccc4c777b5621e77addea6cc076c) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c4a21d4b7ca8f01a..c34c94cb58394b56 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -928,7 +928,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), + __wcsncpy_avx2) +- X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, ++ X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, + 1, + __wcsncpy_generic)) + +@@ -958,7 +958,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), + __wcpncpy_avx2) +- X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, ++ X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, + 1, + __wcpncpy_generic)) + diff --git a/glibc-upstream-2.39-206.patch b/glibc-upstream-2.39-206.patch new file mode 100644 index 0000000..59b1a10 --- /dev/null +++ b/glibc-upstream-2.39-206.patch @@ -0,0 +1,43 @@ +commit 2caef2827f76af88d495eb382da174896d08900a +Author: Florian Weimer +Date: Wed May 21 08:43:32 2025 +0200 + + elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987) + + This should really move into support_capture_subprogram_self_sgid. + + Reviewed-by: Sam James + (cherry picked from commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2) + +diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c +index 47829a405e90b6b9..5688b79f2e870b1d 100644 +--- a/elf/tst-dlopen-sgid.c ++++ b/elf/tst-dlopen-sgid.c +@@ -26,6 +26,8 @@ + #include + #include + #include ++#include ++#include + #include + + /* This is the name of our test object. 
Use a custom module for +@@ -66,10 +68,16 @@ do_test (void) + free (from); + } + +- TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0); +- + free (libdir); + ++ int status = support_capture_subprogram_self_sgid (magic_argument); ++ ++ if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) ++ return EXIT_UNSUPPORTED; ++ ++ if (!WIFEXITED (status)) ++ FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); ++ + return 0; + } + diff --git a/glibc-upstream-2.39-207.patch b/glibc-upstream-2.39-207.patch new file mode 100644 index 0000000..8edb2ab --- /dev/null +++ b/glibc-upstream-2.39-207.patch @@ -0,0 +1,215 @@ +commit c6240a11f7325031651e634309ca1a43a7484bd4 +Author: Carlos O'Donell +Date: Wed Jun 11 09:43:50 2025 -0400 + + ppc64le: Revert "powerpc: Fix performance issues of strcmp power10" (CVE-2025-5702) + + This reverts commit 90bcc8721ef82b7378d2b080141228660e862d56 + + This change is in the chain of the final revert that fixes the CVE + i.e. 3367d8e180848030d1646f088759f02b8dfe0d6f + + Reason for revert: Power10 strcmp clobbers non-volatile vector + registers (Bug 33056) + + Tested on ppc64le with no regressions. + + (cherry picked from commit c22de63588df7a8a0edceea9bb02534064c9d201) + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S +index f0d6732a25efc63b..00f1e9c1707f5dd1 100644 +--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S ++++ b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S +@@ -62,7 +62,7 @@ + lxvl 32+v5,reg2,r0; \ + add reg1,reg1,len_reg; \ + add reg2,reg2,len_reg; \ +- vcmpnezb v7,v4,v5; \ ++ vcmpnezb. v7,v4,v5; \ + vctzlsbb r6,v7; \ + cmpld cr7,r6,len_reg; \ + blt cr7,L(different); \ +@@ -72,110 +72,70 @@ + + .machine power9 + ENTRY_TOCLESS (STRCMP, 4) +- andi. r7,r3,4095 +- andi. r8,r4,4095 +- cmpldi cr0,r7,4096-16 +- cmpldi cr1,r8,4096-16 +- bgt cr0,L(crosses) +- bgt cr1,L(crosses) +- COMPARE_16(v4,v5,0) +- +-L(crosses): +- andi. r7,r3,15 +- subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */ +- andi. r9,r4,15 +- subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). */ +- cmpld cr7,r7,r5 +- beq cr7,L(same_aligned) +- blt cr7,L(nalign1_min) ++ li r11,16 ++ /* eq bit of cr1 used as swap status flag to indicate if ++ source pointers were swapped. */ ++ crclr 4*cr1+eq ++ vspltisb v19,-1 ++ andi. r7,r3,15 ++ sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */ ++ andi. r9,r4,15 ++ sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */ ++ cmpld cr7,r7,r5 ++ beq cr7,L(same_aligned) ++ blt cr7,L(nalign1_min) ++ /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the ++ pointer which is closer to the next 16B boundary so that only ++ one CHECK_N_BYTES is needed before entering the loop below. */ ++ mr r8,r4 ++ mr r4,r3 ++ mr r3,r8 ++ mr r12,r7 ++ mr r7,r5 ++ mr r5,r12 ++ crset 4*cr1+eq /* Set bit on swapping source pointers. */ + +- /* nalign2 is minimum and s2 pointer is aligned. */ +- CHECK_N_BYTES(r3,r4,r5) +- /* Are we on the 64B hunk which crosses a page? */ +- andi. r10,r3,63 /* Determine offset into 64B hunk. */ +- andi. r8,r3,15 /* The offset into the 16B hunk. */ +- neg r7,r3 +- andi. r9,r7,15 /* Number of bytes after a 16B cross. */ +- rlwinm. r7,r7,26,0x3F /* ((r3-4096))>>6&63. */ +- beq L(compare_64_pagecross) +- mtctr r7 +- b L(compare_64B_unaligned) +- +- /* nalign1 is minimum and s1 pointer is aligned. */ ++ .p2align 5 + L(nalign1_min): + CHECK_N_BYTES(r3,r4,r7) +- /* Are we on the 64B hunk which crosses a page? */ +- andi. r10,r4,63 /* Determine offset into 64B hunk. 
*/ +- andi. r8,r4,15 /* The offset into the 16B hunk. */ +- neg r7,r4 +- andi. r9,r7,15 /* Number of bytes after a 16B cross. */ +- rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ +- beq L(compare_64_pagecross) +- mtctr r7 + + .p2align 5 +-L(compare_64B_unaligned): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- COMPARE_16(v4,v5,48) +- addi r3,r3,64 +- addi r4,r4,64 +- bdnz L(compare_64B_unaligned) ++L(s1_aligned): ++ /* r9 and r5 is number of bytes to be read after and before ++ page boundary correspondingly. */ ++ sub r5,r5,r7 ++ subfic r9,r5,16 ++ /* Now let r7 hold the count of quadwords which can be ++ checked without crossing a page boundary. quadword offset is ++ (str2>>4)&0xFF. */ ++ rlwinm r7,r4,28,0xFF ++ /* Below check is required only for first iteration. For second ++ iteration and beyond, the new loop counter is always 255. */ ++ cmpldi r7,255 ++ beq L(L3) ++ /* Get the initial loop count by 255-((str2>>4)&0xFF). */ ++ subfic r11,r7,255 + +- /* Cross the page boundary of s2, carefully. Only for first +- iteration we have to get the count of 64B blocks to be checked. +- From second iteration and beyond, loop counter is always 63. */ +-L(compare_64_pagecross): +- li r11, 63 ++ .p2align 5 ++L(L1): + mtctr r11 +- cmpldi r10,16 +- ble L(cross_4) +- cmpldi r10,32 +- ble L(cross_3) +- cmpldi r10,48 +- ble L(cross_2) +-L(cross_1): +- CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- addi r3,r3,48 +- addi r4,r4,48 +- b L(compare_64B_unaligned) +-L(cross_2): +- COMPARE_16(v4,v5,0) +- addi r3,r3,16 +- addi r4,r4,16 +- CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- addi r3,r3,32 +- addi r4,r4,32 +- b L(compare_64B_unaligned) +-L(cross_3): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- addi r3,r3,32 +- addi r4,r4,32 +- CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- COMPARE_16(v4,v5,0) ++ ++ .p2align 5 ++L(L2): ++ COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */ + addi r3,r3,16 + addi r4,r4,16 +- b L(compare_64B_unaligned) +-L(cross_4): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- addi r3,r3,48 +- addi r4,r4,48 ++ bdnz L(L2) ++ /* Cross the page boundary of s2, carefully. */ ++ ++ .p2align 5 ++L(L3): ++ CHECK_N_BYTES(r3,r4,r5) + CHECK_N_BYTES(r3,r4,r9) +- CHECK_N_BYTES(r3,r4,r8) +- b L(compare_64B_unaligned) ++ li r11,255 /* Load the new loop counter. */ ++ b L(L1) + ++ .p2align 5 + L(same_aligned): + CHECK_N_BYTES(r3,r4,r7) + /* Align s1 to 32B and adjust s2 address. +@@ -208,7 +168,18 @@ L(16B_aligned_loop): + + /* Calculate and return the difference. */ + L(different): +- TAIL(v4,v5) ++ vctzlsbb r6,v7 ++ vextubrx r5,r6,v4 ++ vextubrx r4,r6,v5 ++ bt 4*cr1+eq,L(swapped) ++ subf r3,r4,r5 ++ blr ++ ++ /* If src pointers were swapped, then swap the ++ indices and calculate the return value. 
*/ ++L(swapped): ++ subf r3,r5,r4 ++ blr + + .p2align 5 + L(32B_aligned_loop): diff --git a/glibc-upstream-2.39-208.patch b/glibc-upstream-2.39-208.patch new file mode 100644 index 0000000..debabf9 --- /dev/null +++ b/glibc-upstream-2.39-208.patch @@ -0,0 +1,443 @@ +commit 3875045da55e3df9b2a05392504888b88cd68edb +Author: Carlos O'Donell +Date: Wed Jun 11 09:33:45 2025 -0400 + + ppc64le: Revert "powerpc : Add optimized memchr for POWER10" (Bug 33059) + + This reverts commit b9182c793caa05df5d697427c0538936e6396d4b + + Reason for revert: Power10 memchr clobbers v20 vector register + (Bug 33059) + + This is not a security issue, unlike CVE-2025-5745 and + CVE-2025-5702. + + Tested on ppc64le without regression. + + (cherry picked from commit a7877bb6685300f159fa095c9f50b22b112cddb8) + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S +deleted file mode 100644 +index 53e5716d72e133d5..0000000000000000 +--- a/sysdeps/powerpc/powerpc64/le/power10/memchr.S ++++ /dev/null +@@ -1,315 +0,0 @@ +-/* Optimized memchr implementation for POWER10 LE. +- Copyright (C) 2021-2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +- +-# ifndef MEMCHR +-# define MEMCHR __memchr +-# endif +-# define M_VREG_ZERO v20 +-# define M_OFF_START_LOOP 256 +-# define MEMCHR_SUBTRACT_VECTORS \ +- vsububm v4,v4,v18; \ +- vsububm v5,v5,v18; \ +- vsububm v6,v6,v18; \ +- vsububm v7,v7,v18; +-# define M_TAIL(vreg,increment) \ +- vctzlsbb r4,vreg; \ +- cmpld r5,r4; \ +- ble L(null); \ +- addi r4,r4,increment; \ +- add r3,r6,r4; \ +- blr +- +-/* TODO: Replace macros by the actual instructions when minimum binutils becomes +- >= 2.35. This is used to keep compatibility with older versions. */ +-#define M_VEXTRACTBM(rt,vrb) \ +- .long(((4)<<(32-6)) \ +- | ((rt)<<(32-11)) \ +- | ((8)<<(32-16)) \ +- | ((vrb)<<(32-21)) \ +- | 1602) +- +-#define M_LXVP(xtp,dq,ra) \ +- .long(((6)<<(32-6)) \ +- | ((((xtp)-32)>>1)<<(32-10)) \ +- | ((1)<<(32-11)) \ +- | ((ra)<<(32-16)) \ +- | dq) +- +-#define CHECK16B(vreg,offset,addr,label) \ +- lxv vreg+32,offset(addr); \ +- vcmpequb. vreg,vreg,v18; \ +- bne cr6,L(label); \ +- cmpldi r5,16; \ +- ble L(null); \ +- addi r5,r5,-16; +- +-/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has # +- of bytes already checked. */ +-#define CHECK64B(offset,addr,label) \ +- M_LXVP(v4+32,offset,addr); \ +- M_LXVP(v6+32,offset+32,addr); \ +- MEMCHR_SUBTRACT_VECTORS; \ +- vminub v14,v4,v5; \ +- vminub v15,v6,v7; \ +- vminub v16,v14,v15; \ +- vcmpequb. v0,v16,M_VREG_ZERO; \ +- beq cr6,$+12; \ +- li r7,offset; \ +- b L(label); \ +- cmpldi r5,64; \ +- ble L(null); \ +- addi r5,r5,-64 +- +-/* Implements the function +- void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). 
*/ +- +- .machine power9 +- +-ENTRY_TOCLESS (MEMCHR) +- CALL_MCOUNT 3 +- +- cmpldi r5,0 +- beq L(null) +- mr r0,r5 +- xori r6,r4,0xff +- +- mtvsrd v18+32,r4 /* matching char in v18 */ +- mtvsrd v19+32,r6 /* non matching char in v19 */ +- +- vspltb v18,v18,7 /* replicate */ +- vspltb v19,v19,7 /* replicate */ +- vspltisb M_VREG_ZERO,0 +- +- /* Next 16B-aligned address. Prepare address for L(aligned). */ +- addi r6,r3,16 +- clrrdi r6,r6,4 +- +- /* Align data and fill bytes not loaded with non matching char. */ +- lvx v0,0,r3 +- lvsr v1,0,r3 +- vperm v0,v19,v0,v1 +- +- vcmpequb. v6,v0,v18 +- bne cr6,L(found) +- sub r4,r6,r3 +- cmpld r5,r4 +- ble L(null) +- sub r5,r5,r4 +- +- /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is +- optimized for longer strings, so checking the first bytes in 16B +- chunks benefits a lot small strings. */ +- .p2align 5 +-L(aligned): +- cmpldi r5,0 +- beq L(null) +- +- CHECK16B(v0,0,r6,tail1) +- CHECK16B(v1,16,r6,tail2) +- CHECK16B(v2,32,r6,tail3) +- CHECK16B(v3,48,r6,tail4) +- CHECK16B(v4,64,r6,tail5) +- CHECK16B(v5,80,r6,tail6) +- CHECK16B(v6,96,r6,tail7) +- CHECK16B(v7,112,r6,tail8) +- CHECK16B(v8,128,r6,tail9) +- CHECK16B(v9,144,r6,tail10) +- CHECK16B(v10,160,r6,tail11) +- CHECK16B(v0,176,r6,tail12) +- CHECK16B(v1,192,r6,tail13) +- CHECK16B(v2,208,r6,tail14) +- CHECK16B(v3,224,r6,tail15) +- +- cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to +- choose how we will perform the main loop. */ +- +- /* Prepare address for the loop. */ +- addi r4,r3,M_OFF_START_LOOP +- clrrdi r4,r4,6 +- sub r6,r4,r3 +- sub r5,r0,r6 +- addi r6,r4,128 +- +- /* If c == 0, use the loop without the vsububm. */ +- beq cr5,L(loop) +- +- /* This is very similar to the block after L(loop), the difference is +- that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract +- each byte loaded by the char we are looking for, this way we can keep +- using vminub to merge the results and checking for nulls. */ +- .p2align 5 +-L(memchr_loop): +- CHECK64B(0,r4,pre_tail_64b) +- CHECK64B(64,r4,pre_tail_64b) +- addi r4,r4,256 +- +- CHECK64B(0,r6,tail_64b) +- CHECK64B(64,r6,tail_64b) +- addi r6,r6,256 +- +- CHECK64B(0,r4,pre_tail_64b) +- CHECK64B(64,r4,pre_tail_64b) +- addi r4,r4,256 +- +- CHECK64B(0,r6,tail_64b) +- CHECK64B(64,r6,tail_64b) +- addi r6,r6,256 +- +- b L(memchr_loop) +- /* Switch to a more aggressive approach checking 64B each time. Use 2 +- pointers 128B apart and unroll the loop once to make the pointer +- updates and usages separated enough to avoid stalls waiting for +- address calculation. */ +- .p2align 5 +-L(loop): +-#undef MEMCHR_SUBTRACT_VECTORS +-#define MEMCHR_SUBTRACT_VECTORS /* nothing */ +- CHECK64B(0,r4,pre_tail_64b) +- CHECK64B(64,r4,pre_tail_64b) +- addi r4,r4,256 +- +- CHECK64B(0,r6,tail_64b) +- CHECK64B(64,r6,tail_64b) +- addi r6,r6,256 +- +- CHECK64B(0,r4,pre_tail_64b) +- CHECK64B(64,r4,pre_tail_64b) +- addi r4,r4,256 +- +- CHECK64B(0,r6,tail_64b) +- CHECK64B(64,r6,tail_64b) +- addi r6,r6,256 +- +- b L(loop) +- +- .p2align 5 +-L(pre_tail_64b): +- mr r6,r4 +-L(tail_64b): +- /* OK, we found a null byte. Let's look for it in the current 64-byte +- block and mark it in its corresponding VR. lxvp vx,0(ry) puts the +- low 16B bytes into vx+1, and the high into vx, so the order here is +- v5, v4, v7, v6. */ +- vcmpequb v1,v5,M_VREG_ZERO +- vcmpequb v2,v4,M_VREG_ZERO +- vcmpequb v3,v7,M_VREG_ZERO +- vcmpequb v4,v6,M_VREG_ZERO +- +- /* Take into account the other 64B blocks we had already checked. 
*/ +- add r6,r6,r7 +- /* Extract first bit of each byte. */ +- M_VEXTRACTBM(r8,v1) +- M_VEXTRACTBM(r9,v2) +- M_VEXTRACTBM(r10,v3) +- M_VEXTRACTBM(r11,v4) +- +- /* Shift each value into their corresponding position. */ +- sldi r9,r9,16 +- sldi r10,r10,32 +- sldi r11,r11,48 +- +- /* Merge the results. */ +- or r8,r8,r9 +- or r9,r10,r11 +- or r11,r9,r8 +- +- cnttzd r0,r11 /* Count trailing zeros before the match. */ +- cmpld r5,r0 +- ble L(null) +- add r3,r6,r0 /* Compute final address. */ +- blr +- +- .p2align 5 +-L(tail1): +- M_TAIL(v0,0) +- +- .p2align 5 +-L(tail2): +- M_TAIL(v1,16) +- +- .p2align 5 +-L(tail3): +- M_TAIL(v2,32) +- +- .p2align 5 +-L(tail4): +- M_TAIL(v3,48) +- +- .p2align 5 +-L(tail5): +- M_TAIL(v4,64) +- +- .p2align 5 +-L(tail6): +- M_TAIL(v5,80) +- +- .p2align 5 +-L(tail7): +- M_TAIL(v6,96) +- +- .p2align 5 +-L(tail8): +- M_TAIL(v7,112) +- +- .p2align 5 +-L(tail9): +- M_TAIL(v8,128) +- +- .p2align 5 +-L(tail10): +- M_TAIL(v9,144) +- +- .p2align 5 +-L(tail11): +- M_TAIL(v10,160) +- +- .p2align 5 +-L(tail12): +- M_TAIL(v0,176) +- +- .p2align 5 +-L(tail13): +- M_TAIL(v1,192) +- +- .p2align 5 +-L(tail14): +- M_TAIL(v2,208) +- +- .p2align 5 +-L(tail15): +- M_TAIL(v3,224) +- +- .p2align 5 +-L(found): +- vctzlsbb r7,v6 +- cmpld r5,r7 +- ble L(null) +- add r3,r3,r7 +- blr +- +- .p2align 5 +-L(null): +- li r3,0 +- blr +- +-END (MEMCHR) +- +-weak_alias (__memchr, memchr) +-libc_hidden_builtin_def (memchr) +diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile +index 594fbb8058569d95..d7824a922b0de470 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile ++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile +@@ -31,10 +31,10 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ + strncase-power8 + + ifneq (,$(filter %le,$(config-machine))) +-sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ +- memmove-power10 memset-power10 rawmemchr-power9 \ +- rawmemchr-power10 strcmp-power9 strcmp-power10 \ +- strncmp-power9 strcpy-power9 stpcpy-power9 \ ++sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ ++ rawmemchr-power9 rawmemchr-power10 \ ++ strcmp-power9 strcmp-power10 strncmp-power9 \ ++ strcpy-power9 stpcpy-power9 \ + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 + endif + CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops +diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +index 5b2d6a90ab59e561..e2f733eb82fa6199 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +@@ -226,12 +226,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. */ + IFUNC_IMPL (i, name, memchr, +-#ifdef __LITTLE_ENDIAN__ +- IFUNC_IMPL_ADD (array, i, memchr, +- hwcap2 & PPC_FEATURE2_ARCH_3_1 +- && hwcap & PPC_FEATURE_HAS_VSX, +- __memchr_power10) +-#endif + IFUNC_IMPL_ADD (array, i, memchr, + hwcap2 & PPC_FEATURE2_ARCH_2_07 + && hwcap & PPC_FEATURE_HAS_ALTIVEC, +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S +deleted file mode 100644 +index 7d35ef28a91255ba..0000000000000000 +--- a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S ++++ /dev/null +@@ -1,28 +0,0 @@ +-/* Optimized memchr implementation for POWER10/PPC64. 
+- Copyright (C) 2016-2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +-#define MEMCHR __memchr_power10 +- +-#undef libc_hidden_builtin_def +-#define libc_hidden_builtin_def(name) +-#undef weak_alias +-#define weak_alias(name,alias) +- +-#include +-#endif +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c +index 57d23e7b18587e82..b4655dfcaa482774 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c +@@ -25,23 +25,15 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden; + extern __typeof (__memchr) __memchr_power7 attribute_hidden; + extern __typeof (__memchr) __memchr_power8 attribute_hidden; + +-# ifdef __LITTLE_ENDIAN__ +-extern __typeof (__memchr) __memchr_power10 attribute_hidden; +-# endif + /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ + libc_ifunc (__memchr, +-# ifdef __LITTLE_ENDIAN__ +- (hwcap2 & PPC_FEATURE2_ARCH_3_1 +- && hwcap & PPC_FEATURE_HAS_VSX) +- ? __memchr_power10 : +-# endif +- (hwcap2 & PPC_FEATURE2_ARCH_2_07 +- && hwcap & PPC_FEATURE_HAS_ALTIVEC) +- ? __memchr_power8 : +- (hwcap & PPC_FEATURE_ARCH_2_06) +- ? __memchr_power7 +- : __memchr_ppc); ++ (hwcap2 & PPC_FEATURE2_ARCH_2_07 ++ && hwcap & PPC_FEATURE_HAS_ALTIVEC) ++ ? __memchr_power8 : ++ (hwcap & PPC_FEATURE_ARCH_2_06) ++ ? __memchr_power7 ++ : __memchr_ppc); + + weak_alias (__memchr, memchr) + libc_hidden_builtin_def (memchr) diff --git a/glibc-upstream-2.39-209.patch b/glibc-upstream-2.39-209.patch new file mode 100644 index 0000000..230468f --- /dev/null +++ b/glibc-upstream-2.39-209.patch @@ -0,0 +1,307 @@ +commit 06a70769fd0b2e1f2a3085ad50ab620282bd77b3 +Author: Carlos O'Donell +Date: Mon Jun 16 13:09:57 2025 -0400 + + ppc64le: Revert "powerpc: Optimized strcmp for power10" (CVE-2025-5702) + + This reverts commit 3367d8e180848030d1646f088759f02b8dfe0d6f + + Reason for revert: Power10 strcmp clobbers non-volatile vector + registers (Bug 33056) + + Tested on ppc64le without regression. + + (cherry picked from commit 15808c77b35319e67ee0dc8f984a9a1a434701bc) + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S +deleted file mode 100644 +index 00f1e9c1707f5dd1..0000000000000000 +--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S ++++ /dev/null +@@ -1,204 +0,0 @@ +-/* Optimized strcmp implementation for PowerPC64/POWER10. +- Copyright (C) 2021-2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. 
+- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +-#include +- +-#ifndef STRCMP +-# define STRCMP strcmp +-#endif +- +-/* Implements the function +- int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). */ +- +-/* TODO: Change this to actual instructions when minimum binutils is upgraded +- to 2.27. Macros are defined below for these newer instructions in order +- to maintain compatibility. */ +- +-#define LXVP(xtp,dq,ra) \ +- .long(((6)<<(32-6)) \ +- | ((((xtp)-32)>>1)<<(32-10)) \ +- | ((1)<<(32-11)) \ +- | ((ra)<<(32-16)) \ +- | dq) +- +-#define COMPARE_16(vreg1,vreg2,offset) \ +- lxv vreg1+32,offset(r3); \ +- lxv vreg2+32,offset(r4); \ +- vcmpnezb. v7,vreg1,vreg2; \ +- bne cr6,L(different); \ +- +-#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ +- LXVP(vreg1+32,offset,r3); \ +- LXVP(vreg2+32,offset,r4); \ +- vcmpnezb. v7,vreg1+1,vreg2+1; \ +- bne cr6,L(label1); \ +- vcmpnezb. v7,vreg1,vreg2; \ +- bne cr6,L(label2); \ +- +-#define TAIL(vreg1,vreg2) \ +- vctzlsbb r6,v7; \ +- vextubrx r5,r6,vreg1; \ +- vextubrx r4,r6,vreg2; \ +- subf r3,r4,r5; \ +- blr; \ +- +-#define CHECK_N_BYTES(reg1,reg2,len_reg) \ +- sldi r0,len_reg,56; \ +- lxvl 32+v4,reg1,r0; \ +- lxvl 32+v5,reg2,r0; \ +- add reg1,reg1,len_reg; \ +- add reg2,reg2,len_reg; \ +- vcmpnezb. v7,v4,v5; \ +- vctzlsbb r6,v7; \ +- cmpld cr7,r6,len_reg; \ +- blt cr7,L(different); \ +- +- /* TODO: change this to .machine power10 when the minimum required +- binutils allows it. */ +- +- .machine power9 +-ENTRY_TOCLESS (STRCMP, 4) +- li r11,16 +- /* eq bit of cr1 used as swap status flag to indicate if +- source pointers were swapped. */ +- crclr 4*cr1+eq +- vspltisb v19,-1 +- andi. r7,r3,15 +- sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */ +- andi. r9,r4,15 +- sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */ +- cmpld cr7,r7,r5 +- beq cr7,L(same_aligned) +- blt cr7,L(nalign1_min) +- /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the +- pointer which is closer to the next 16B boundary so that only +- one CHECK_N_BYTES is needed before entering the loop below. */ +- mr r8,r4 +- mr r4,r3 +- mr r3,r8 +- mr r12,r7 +- mr r7,r5 +- mr r5,r12 +- crset 4*cr1+eq /* Set bit on swapping source pointers. */ +- +- .p2align 5 +-L(nalign1_min): +- CHECK_N_BYTES(r3,r4,r7) +- +- .p2align 5 +-L(s1_aligned): +- /* r9 and r5 is number of bytes to be read after and before +- page boundary correspondingly. */ +- sub r5,r5,r7 +- subfic r9,r5,16 +- /* Now let r7 hold the count of quadwords which can be +- checked without crossing a page boundary. quadword offset is +- (str2>>4)&0xFF. */ +- rlwinm r7,r4,28,0xFF +- /* Below check is required only for first iteration. For second +- iteration and beyond, the new loop counter is always 255. */ +- cmpldi r7,255 +- beq L(L3) +- /* Get the initial loop count by 255-((str2>>4)&0xFF). 
*/ +- subfic r11,r7,255 +- +- .p2align 5 +-L(L1): +- mtctr r11 +- +- .p2align 5 +-L(L2): +- COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */ +- addi r3,r3,16 +- addi r4,r4,16 +- bdnz L(L2) +- /* Cross the page boundary of s2, carefully. */ +- +- .p2align 5 +-L(L3): +- CHECK_N_BYTES(r3,r4,r5) +- CHECK_N_BYTES(r3,r4,r9) +- li r11,255 /* Load the new loop counter. */ +- b L(L1) +- +- .p2align 5 +-L(same_aligned): +- CHECK_N_BYTES(r3,r4,r7) +- /* Align s1 to 32B and adjust s2 address. +- Use lxvp only if both s1 and s2 are 32B aligned. */ +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- COMPARE_16(v4,v5,48) +- addi r3,r3,64 +- addi r4,r4,64 +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- +- clrldi r6,r3,59 +- subfic r5,r6,32 +- add r3,r3,r5 +- add r4,r4,r5 +- andi. r5,r4,0x1F +- beq cr0,L(32B_aligned_loop) +- +- .p2align 5 +-L(16B_aligned_loop): +- COMPARE_16(v4,v5,0) +- COMPARE_16(v4,v5,16) +- COMPARE_16(v4,v5,32) +- COMPARE_16(v4,v5,48) +- addi r3,r3,64 +- addi r4,r4,64 +- b L(16B_aligned_loop) +- +- /* Calculate and return the difference. */ +-L(different): +- vctzlsbb r6,v7 +- vextubrx r5,r6,v4 +- vextubrx r4,r6,v5 +- bt 4*cr1+eq,L(swapped) +- subf r3,r4,r5 +- blr +- +- /* If src pointers were swapped, then swap the +- indices and calculate the return value. */ +-L(swapped): +- subf r3,r5,r4 +- blr +- +- .p2align 5 +-L(32B_aligned_loop): +- COMPARE_32(v14,v16,0,tail1,tail2) +- COMPARE_32(v18,v20,32,tail3,tail4) +- COMPARE_32(v22,v24,64,tail5,tail6) +- COMPARE_32(v26,v28,96,tail7,tail8) +- addi r3,r3,128 +- addi r4,r4,128 +- b L(32B_aligned_loop) +- +-L(tail1): TAIL(v15,v17) +-L(tail2): TAIL(v14,v16) +-L(tail3): TAIL(v19,v21) +-L(tail4): TAIL(v18,v20) +-L(tail5): TAIL(v23,v25) +-L(tail6): TAIL(v22,v24) +-L(tail7): TAIL(v27,v29) +-L(tail8): TAIL(v26,v28) +- +-END (STRCMP) +-libc_hidden_builtin_def (strcmp) +diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile +index d7824a922b0de470..27d8495503a5a1fe 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile ++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile +@@ -33,8 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ + ifneq (,$(filter %le,$(config-machine))) + sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ + rawmemchr-power9 rawmemchr-power10 \ +- strcmp-power9 strcmp-power10 strncmp-power9 \ +- strcpy-power9 stpcpy-power9 \ ++ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 + endif + CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops +diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +index e2f733eb82fa6199..ad6080f1991f4080 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +@@ -377,10 +377,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. 
*/ + IFUNC_IMPL (i, name, strcmp, + #ifdef __LITTLE_ENDIAN__ +- IFUNC_IMPL_ADD (array, i, strcmp, +- (hwcap2 & PPC_FEATURE2_ARCH_3_1) +- && (hwcap & PPC_FEATURE_HAS_VSX), +- __strcmp_power10) + IFUNC_IMPL_ADD (array, i, strcmp, + hwcap2 & PPC_FEATURE2_ARCH_3_00 + && hwcap & PPC_FEATURE_HAS_ALTIVEC, +diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S +deleted file mode 100644 +index 1a9f6069f589a95c..0000000000000000 +--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S ++++ /dev/null +@@ -1,26 +0,0 @@ +-/* Optimized strcmp implementation for POWER10/PPC64. +- Copyright (C) 2021-2024 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#if defined __LITTLE_ENDIAN__ && IS_IN (libc) +-#define STRCMP __strcmp_power10 +- +-#undef libc_hidden_builtin_def +-#define libc_hidden_builtin_def(name) +- +-#include +-#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */ +diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +index ff32496fabba2e47..06b9b4090ff23ee1 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +@@ -29,16 +29,12 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden; + extern __typeof (strcmp) __strcmp_power8 attribute_hidden; + # ifdef __LITTLE_ENDIAN__ + extern __typeof (strcmp) __strcmp_power9 attribute_hidden; +-extern __typeof (strcmp) __strcmp_power10 attribute_hidden; + # endif + + # undef strcmp + + libc_ifunc_redirected (__redirect_strcmp, strcmp, + # ifdef __LITTLE_ENDIAN__ +- (hwcap2 & PPC_FEATURE2_ARCH_3_1 +- && hwcap & PPC_FEATURE_HAS_VSX) +- ? __strcmp_power10 : + (hwcap2 & PPC_FEATURE2_ARCH_3_00 + && hwcap & PPC_FEATURE_HAS_ALTIVEC) + ? __strcmp_power9 : diff --git a/glibc-upstream-2.39-210.patch b/glibc-upstream-2.39-210.patch new file mode 100644 index 0000000..6267922 --- /dev/null +++ b/glibc-upstream-2.39-210.patch @@ -0,0 +1,106 @@ +commit 1924d341c0acbb9bf9ec77f1971fdb109933d12f +Author: Florian Weimer +Date: Wed May 21 16:47:34 2025 +0200 + + support: Pick group in support_capture_subprogram_self_sgid if UID == 0 + + When running as root, it is likely that we can run under any group. + Pick a harmless group from /etc/group in this case. 
+ + Reviewed-by: Carlos O'Donell + (cherry picked from commit 2f769cec448d84a62b7dd0d4ff56978fe22c0cd6) + +diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c +index eb72a2c21cf99ee2..1a30ae3f31f041d4 100644 +--- a/support/support_capture_subprocess.c ++++ b/support/support_capture_subprocess.c +@@ -21,7 +21,11 @@ + + #include + #include ++#include ++#include ++#include + #include ++#include + #include + #include + #include +@@ -209,10 +213,48 @@ err: + return status; + } + ++/* Returns true if a group with NAME has been found, and writes its ++ GID to *TARGET. */ ++static bool ++find_sgid_group (gid_t *target, const char *name) ++{ ++ /* Do not use getgrname_r because it does not work in statically ++ linked binaries if the system libc is different. */ ++ FILE *fp = fopen ("/etc/group", "rce"); ++ if (fp == NULL) ++ return false; ++ __fsetlocking (fp, FSETLOCKING_BYCALLER); ++ ++ bool ok = false; ++ struct scratch_buffer buf; ++ scratch_buffer_init (&buf); ++ while (true) ++ { ++ struct group grp; ++ struct group *result = NULL; ++ int status = fgetgrent_r (fp, &grp, buf.data, buf.length, &result); ++ if (status == 0 && result != NULL) ++ { ++ if (strcmp (result->gr_name, name) == 0) ++ { ++ *target = result->gr_gid; ++ ok = true; ++ break; ++ } ++ } ++ else if (errno != ERANGE) ++ break; ++ else if (!scratch_buffer_grow (&buf)) ++ break; ++ } ++ scratch_buffer_free (&buf); ++ fclose (fp); ++ return ok; ++} ++ + int + support_capture_subprogram_self_sgid (const char *child_id) + { +- gid_t target = 0; + const int count = 64; + gid_t groups[count]; + +@@ -224,6 +266,7 @@ support_capture_subprogram_self_sgid (const char *child_id) + (intmax_t) getuid ()); + + gid_t current = getgid (); ++ gid_t target = current; + for (int i = 0; i < ret; ++i) + { + if (groups[i] != current) +@@ -233,9 +276,16 @@ support_capture_subprogram_self_sgid (const char *child_id) + } + } + +- if (target == 0) +- FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", +- (intmax_t) getuid ()); ++ if (target == current) ++ { ++ /* If running as root, try to find a harmless group for SGID. */ ++ if (getuid () != 0 ++ || (!find_sgid_group (&target, "nogroup") ++ && !find_sgid_group (&target, "bin") ++ && !find_sgid_group (&target, "daemon"))) ++ FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", ++ (intmax_t) getuid ()); ++ } + + return copy_and_spawn_sgid (child_id, target); + } diff --git a/glibc-upstream-2.39-211.patch b/glibc-upstream-2.39-211.patch new file mode 100644 index 0000000..fd21834 --- /dev/null +++ b/glibc-upstream-2.39-211.patch @@ -0,0 +1,318 @@ +commit cff1042cceec3502269947e96cf7023451af22f3 +Author: Florian Weimer +Date: Thu May 22 14:36:37 2025 +0200 + + Fix error reporting (false negatives) in SGID tests + + And simplify the interface of support_capture_subprogram_self_sgid. + + Use the existing framework for temporary directories (now with + mode 0700) and directory/file deletion. Handle all execution + errors within support_capture_subprogram_self_sgid. In particular, + this includes test failures because the invoked program did not + exit with exit status zero. Existing tests that expect exit + status 42 are adjusted to use zero instead. + + In addition, fix callers not to call exit (0) with test failures + pending (which may mask them, especially when running with --direct). + + Fixes commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2 + ("elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987)"). 
+ + Reviewed-by: Carlos O'Donell + (cherry picked from commit 3a3fb2ed83f79100c116c824454095ecfb335ad7) + +diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c +index 5688b79f2e870b1d..8aec52e19fc56aba 100644 +--- a/elf/tst-dlopen-sgid.c ++++ b/elf/tst-dlopen-sgid.c +@@ -70,13 +70,7 @@ do_test (void) + + free (libdir); + +- int status = support_capture_subprogram_self_sgid (magic_argument); +- +- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) +- return EXIT_UNSUPPORTED; +- +- if (!WIFEXITED (status)) +- FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); ++ support_capture_subprogram_self_sgid (magic_argument); + + return 0; + } +diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c +index a47219047f0f602d..233eec7631ed837c 100644 +--- a/elf/tst-env-setuid-tunables.c ++++ b/elf/tst-env-setuid-tunables.c +@@ -105,10 +105,7 @@ do_test (int argc, char **argv) + + if (ret != 0) + exit (1); +- +- /* Special return code to make sure that the child executed all the way +- through. */ +- exit (42); ++ return 0; + } + else + { +@@ -127,18 +124,7 @@ do_test (int argc, char **argv) + continue; + } + +- int status = support_capture_subprogram_self_sgid (buf); +- +- /* Bail out early if unsupported. */ +- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) +- return EXIT_UNSUPPORTED; +- +- if (WEXITSTATUS (status) != 42) +- { +- printf (" [%d] child failed with status %d\n", i, +- WEXITSTATUS (status)); +- support_record_failure (); +- } ++ support_capture_subprogram_self_sgid (buf); + } + return 0; + } +diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c +index 43047c48f3ecd555..c084aa4c1a382152 100644 +--- a/elf/tst-env-setuid.c ++++ b/elf/tst-env-setuid.c +@@ -148,10 +148,7 @@ do_test (int argc, char **argv) + + if (ret != 0) + exit (1); +- +- /* Special return code to make sure that the child executed all the way +- through. 
*/ +- exit (42); ++ return 0; + } + else + { +@@ -175,17 +172,7 @@ do_test (int argc, char **argv) + free (profilepath); + } + +- int status = support_capture_subprogram_self_sgid (SETGID_CHILD); +- +- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) +- exit (EXIT_UNSUPPORTED); +- +- if (WEXITSTATUS (status) != 42) +- { +- printf (" child failed with status %d\n", +- WEXITSTATUS (status)); +- support_record_failure (); +- } ++ support_capture_subprogram_self_sgid (SETGID_CHILD); + + return 0; + } +diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c +index cc26ed6d15803c99..cefee58d46f25ebb 100644 +--- a/stdlib/tst-secure-getenv.c ++++ b/stdlib/tst-secure-getenv.c +@@ -57,13 +57,7 @@ do_test (void) + exit (1); + } + +- int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); +- +- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) +- return EXIT_UNSUPPORTED; +- +- if (!WIFEXITED (status)) +- FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); ++ support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); + + return 0; + } +@@ -82,6 +76,7 @@ alternative_main (int argc, char **argv) + if (secure_getenv ("PATH") != NULL) + FAIL_EXIT (4, "PATH variable not filtered out\n"); + ++ support_record_failure_barrier (); + exit (EXIT_SUCCESS); + } + } +diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h +index f2765278d920839d..8cbdca3b9dfb41ba 100644 +--- a/support/capture_subprocess.h ++++ b/support/capture_subprocess.h +@@ -41,10 +41,12 @@ struct support_capture_subprocess support_capture_subprocess + struct support_capture_subprocess support_capture_subprogram + (const char *file, char *const argv[]); + +-/* Copy the running program into a setgid binary and run it with CHILD_ID +- argument. If execution is successful, return the exit status of the child +- program, otherwise return a non-zero failure exit code. */ +-int support_capture_subprogram_self_sgid (const char *child_id); ++/* Copy the running program into a setgid binary and run it with ++ CHILD_ID argument. If the program exits with a non-zero status, ++ exit with that exit status (or status 1 if the program did not exit ++ normally). If the test cannot be performed, exit with ++ EXIT_UNSUPPORTED. */ ++void support_capture_subprogram_self_sgid (const char *child_id); + + /* Deallocate the subprocess data captured by + support_capture_subprocess. */ +diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c +index 1a30ae3f31f041d4..8dc95f8aa723b6bc 100644 +--- a/support/support_capture_subprocess.c ++++ b/support/support_capture_subprocess.c +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + #include + + static void +@@ -112,105 +113,44 @@ support_capture_subprogram (const char *file, char *const argv[]) + /* Copies the executable into a restricted directory, so that we can + safely make it SGID with the TARGET group ID. Then runs the + executable. 
*/ +-static int ++static void + copy_and_spawn_sgid (const char *child_id, gid_t gid) + { +- char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", +- test_dir, (intmax_t) getpid ()); ++ char *dirname = support_create_temp_directory ("tst-glibc-sgid-"); + char *execname = xasprintf ("%s/bin", dirname); +- int infd = -1; +- int outfd = -1; +- int ret = 1, status = 1; +- +- TEST_VERIFY (mkdir (dirname, 0700) == 0); +- if (support_record_failure_is_failed ()) +- goto err; ++ add_temp_file (execname); + +- infd = open ("/proc/self/exe", O_RDONLY); +- if (infd < 0) ++ if (access ("/proc/self/exe", R_OK) != 0) + FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n"); + +- outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); +- TEST_VERIFY (outfd >= 0); +- if (support_record_failure_is_failed ()) +- goto err; +- +- char buf[4096]; +- for (;;) +- { +- ssize_t rdcount = read (infd, buf, sizeof (buf)); +- TEST_VERIFY (rdcount >= 0); +- if (support_record_failure_is_failed ()) +- goto err; +- if (rdcount == 0) +- break; +- char *p = buf; +- char *end = buf + rdcount; +- while (p != end) +- { +- ssize_t wrcount = write (outfd, buf, end - p); +- if (wrcount == 0) +- errno = ENOSPC; +- TEST_VERIFY (wrcount > 0); +- if (support_record_failure_is_failed ()) +- goto err; +- p += wrcount; +- } +- } ++ support_copy_file ("/proc/self/exe", execname); + +- bool chowned = false; +- TEST_VERIFY ((chowned = fchown (outfd, getuid (), gid) == 0) +- || errno == EPERM); +- if (support_record_failure_is_failed ()) +- goto err; +- else if (!chowned) +- { +- ret = 77; +- goto err; +- } ++ if (chown (execname, getuid (), gid) != 0) ++ FAIL_UNSUPPORTED ("cannot change group of \"%s\" to %jd: %m", ++ execname, (intmax_t) gid); + +- TEST_VERIFY (fchmod (outfd, 02750) == 0); +- if (support_record_failure_is_failed ()) +- goto err; +- TEST_VERIFY (close (outfd) == 0); +- if (support_record_failure_is_failed ()) +- goto err; +- TEST_VERIFY (close (infd) == 0); +- if (support_record_failure_is_failed ()) +- goto err; ++ if (chmod (execname, 02750) != 0) ++ FAIL_UNSUPPORTED ("cannot make \"%s\" SGID: %m ", execname); + + /* We have the binary, now spawn the subprocess. Avoid using + support_subprogram because we only want the program exit status, not the + contents. 
*/
+- ret = 0;
+- infd = outfd = -1;
+-
+- char * const args[] = {execname, (char *) child_id, NULL};
++ int status = support_subprogram_wait (args[0], args);
+
+- status = support_subprogram_wait (args[0], args);
++ free (execname);
++ free (dirname);
+
+-err:
+- if (outfd >= 0)
+- close (outfd);
+- if (infd >= 0)
+- close (infd);
+- if (execname != NULL)
+- {
+- unlink (execname);
+- free (execname);
+- }
+- if (dirname != NULL)
++ if (WIFEXITED (status))
+ {
+- rmdir (dirname);
+- free (dirname);
++ if (WEXITSTATUS (status) == 0)
++ return;
++ else
++ exit (WEXITSTATUS (status));
+ }
+-
+- if (ret == 77)
+- FAIL_UNSUPPORTED ("Failed to make sgid executable for test\n");
+- if (ret != 0)
+- FAIL_EXIT1 ("Failed to make sgid executable for test\n");
+-
+- return status;
++ else
++ FAIL_EXIT1 ("subprogram failed with status %d", status);
+ }
+
+ /* Returns true if a group with NAME has been found, and writes its
+@@ -252,7 +192,7 @@ find_sgid_group (gid_t *target, const char *name)
+ return ok;
+ }
+
+-int
++void
+ support_capture_subprogram_self_sgid (const char *child_id)
+ {
+ const int count = 64;
+@@ -287,7 +227,7 @@ support_capture_subprogram_self_sgid (const char *child_id)
+ (intmax_t) getuid ());
+ }
+
+- return copy_and_spawn_sgid (child_id, target);
++ copy_and_spawn_sgid (child_id, target);
+ }
+
+ void
diff --git a/glibc.spec b/glibc.spec
index d3babc8..921a729 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -145,7 +145,7 @@ Version: %{glibcversion}
 # - It allows using the Release number without the %%dist tag in the dependency
 #   generator to make the generated requires interchangeable between Rawhide
 #   and ELN (.elnYY < .fcXX).
-%global baserelease 39
+%global baserelease 40
 Release: %{baserelease}%{?dist}
 
 # Licenses:
@@ -508,6 +508,71 @@ Patch190: glibc-RHEL-75809.patch
 Patch191: glibc-RHEL-75555.patch
 Patch192: glibc-RHEL-75809-2.patch
 Patch193: glibc-RHEL-75809-3.patch
+Patch194: glibc-upstream-2.39-147.patch
+Patch195: glibc-upstream-2.39-148.patch
+Patch196: glibc-upstream-2.39-149.patch
+Patch197: glibc-upstream-2.39-150.patch
+Patch198: glibc-upstream-2.39-151.patch
+Patch199: glibc-upstream-2.39-152.patch
+Patch200: glibc-upstream-2.39-153.patch
+Patch201: glibc-upstream-2.39-154.patch
+Patch202: glibc-upstream-2.39-155.patch
+Patch203: glibc-upstream-2.39-156.patch
+Patch204: glibc-upstream-2.39-157.patch
+Patch205: glibc-upstream-2.39-158.patch
+Patch206: glibc-upstream-2.39-159.patch
+Patch207: glibc-upstream-2.39-160.patch
+Patch208: glibc-upstream-2.39-161.patch
+Patch209: glibc-upstream-2.39-162.patch
+Patch210: glibc-upstream-2.39-163.patch
+Patch211: glibc-upstream-2.39-164.patch
+Patch212: glibc-upstream-2.39-165.patch
+Patch213: glibc-upstream-2.39-166.patch
+Patch214: glibc-upstream-2.39-167.patch
+Patch215: glibc-upstream-2.39-168.patch
+Patch216: glibc-upstream-2.39-169.patch
+Patch217: glibc-upstream-2.39-170.patch
+Patch218: glibc-upstream-2.39-171.patch
+Patch219: glibc-upstream-2.39-172.patch
+Patch220: glibc-upstream-2.39-173.patch
+Patch221: glibc-upstream-2.39-174.patch
+Patch222: glibc-upstream-2.39-175.patch
+Patch223: glibc-upstream-2.39-176.patch
+Patch224: glibc-upstream-2.39-177.patch
+Patch225: glibc-upstream-2.39-178.patch
+Patch226: glibc-upstream-2.39-179.patch
+Patch227: glibc-upstream-2.39-180.patch
+Patch228: glibc-upstream-2.39-181.patch
+Patch229: glibc-upstream-2.39-182.patch
+Patch230: glibc-upstream-2.39-183.patch
+Patch231: glibc-upstream-2.39-184.patch
+Patch232: glibc-upstream-2.39-185.patch
+Patch233: glibc-upstream-2.39-186.patch
+Patch234: glibc-upstream-2.39-187.patch
+Patch235: glibc-upstream-2.39-188.patch
+Patch236: glibc-upstream-2.39-189.patch
+Patch237: glibc-upstream-2.39-190.patch
+Patch238: glibc-upstream-2.39-191.patch
+Patch239: glibc-upstream-2.39-192.patch
+Patch240: glibc-upstream-2.39-193.patch
+Patch241: glibc-upstream-2.39-194.patch
+Patch242: glibc-upstream-2.39-195.patch
+Patch243: glibc-upstream-2.39-196.patch
+Patch244: glibc-upstream-2.39-197.patch
+Patch245: glibc-upstream-2.39-198.patch
+Patch246: glibc-upstream-2.39-199.patch
+Patch247: glibc-upstream-2.39-200.patch
+Patch248: glibc-upstream-2.39-201.patch
+Patch249: glibc-upstream-2.39-202.patch
+Patch250: glibc-upstream-2.39-203.patch
+Patch251: glibc-upstream-2.39-204.patch
+Patch252: glibc-upstream-2.39-205.patch
+Patch253: glibc-upstream-2.39-206.patch
+Patch254: glibc-upstream-2.39-207.patch
+Patch255: glibc-upstream-2.39-208.patch
+Patch256: glibc-upstream-2.39-209.patch
+Patch257: glibc-upstream-2.39-210.patch
+Patch258: glibc-upstream-2.39-211.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2505,6 +2570,35 @@ update_gconv_modules_cache ()
 %endif
 
 %changelog
+* Tue Jun 17 2025 Arjun Shankar - 2.39-40
+- Sync with upstream branch release/2.39/master (RHEL-87416)
+- Upstream commit: cff1042cceec3502269947e96cf7023451af22f3
+- CVE-2025-5702: Vector register overwrite bug in glibc (RHEL-95485)
+- elf: Keep using minimal malloc after early DTV resize (RHEL-71923)
+- libio: Fix a deadlock after fork in popen (RHEL-86433)
+- Linux: Switch back to assembly syscall wrapper for prctl (RHEL-82286)
+- Fix missed wakeup in POSIX thread condition variables (RHEL-82285)
+- x86: Detect Intel Diamond Rapids
+- x86: Handle unknown Intel processor with default tuning
+- x86: Add ARL/PTL/CWF model detection support
+- x86: Optimize xstate size calculation
+- x86: Support and fixes for separate non-temporal tunable for memset
+- x86: Fix a crash when running with XSAVEC disabled via tunables (RHEL-84837)
+- x86_64: Add tanh, sinh, and atanh with FMA
+- x86-64: Exclude FMA4 IFUNC functions for -mapxf
+- nptl: clear the whole rseq area before registration
+- math: Improve layout of exp/exp10 data
+- AArch64: Add SVE memset
+- math: Improve layout of expf data
+- AArch64: Remove zva_128 from memset
+- AArch64: Optimize memset
+- AArch64: Improve generic strlen
+- AArch64: Improve codegen for SVE tans and logs
+- AArch64: Improve codegen in AdvSIMD logs, logf function family, and atan(2)(f)
+- AArch64: Simplify rounding-multiply pattern in several AdvSIMD routines
+- aarch64: Avoid redundant MOVs in AdvSIMD F32 logs
+- aarch64: Fix AdvSIMD libmvec routines for big-endian
+
 * Tue Jun 17 2025 Florian Weimer - 2.39-39
 - langpacks: Use symlinks for LC_NAME, LC_NUMERIC files if possible (RHEL-97433)