Sync with upstream branch release/2.39/master (RHEL-87416)

Upstream commit: cff1042cceec3502269947e96cf7023451af22f3

Resolves: RHEL-87416

Resolves: RHEL-71923
Resolves: RHEL-82285
Resolves: RHEL-82286
Resolves: RHEL-84837
Resolves: RHEL-86433
Resolves: RHEL-95485
Author: Arjun Shankar
Date: 2025-07-01 11:39:21 +02:00
Parent: 55a8279a3b
Commit: 70ebc1f0c6
66 changed files with 9452 additions and 1 deletion


@@ -0,0 +1,136 @@
commit c1f7bfbe081ebf807b6374a497ad5d5a9f499574
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Dec 17 18:41:45 2024 +0800
Hide all malloc functions from compiler [BZ #32366]
Since -1 isn't a power of two, the compiler may reject it; hide memalign from
Clang 19, which issues an error:
tst-memalign.c:86:31: error: requested alignment is not a power of 2 [-Werror,-Wnon-power-of-two-alignment]
86 | p = memalign (-1, pagesize);
| ^~
tst-memalign.c:86:31: error: requested alignment must be 4294967296 bytes or smaller; maximum alignment assumed [-Werror,-Wbuiltin-assume-aligned-alignment]
86 | p = memalign (-1, pagesize);
| ^~
Update tst-malloc-aux.h to hide all malloc functions and include it in
all malloc tests to prevent the compiler from optimizing out any malloc
functions.
Tested with Clang 19.1.5 and GCC 15 20241206 for BZ #32366.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: Sam James <sam@gentoo.org>
(cherry picked from commit f9493a15ea9cfb63a815c00c23142369ec09d8ce)
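As a standalone illustration of the indirection technique the patch applies below, here is a minimal sketch (not part of the patch; the program and its names are illustrative). Calling malloc through a volatile function pointer keeps the compiler from treating the call as the known builtin and folding or discarding it:

/* Illustrative sketch, not part of the patch below.  */
#include <stdlib.h>

/* The compiler cannot assume anything about a call made through a
   volatile function pointer, so malloc builtins do not apply to it.  */
static void *(*volatile malloc_indirect) (size_t) = malloc;

int
main (void)
{
  void *p = malloc_indirect (16);  /* Opaque to the optimizer.  */
  free (p);
  return p == NULL;
}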
diff --git a/malloc/tst-mallinfo2.c b/malloc/tst-mallinfo2.c
index 2c02f5f700f5051e..f072b9f24b575792 100644
--- a/malloc/tst-mallinfo2.c
+++ b/malloc/tst-mallinfo2.c
@@ -23,6 +23,8 @@
#include <stdlib.h>
#include <support/check.h>
+#include "tst-malloc-aux.h"
+
/* This is not specifically needed for the test, but (1) does
something to the data so gcc doesn't optimize it away, and (2) may
help when developing future tests. */
diff --git a/malloc/tst-malloc-aux.h b/malloc/tst-malloc-aux.h
index 54908b4a2464d510..3e1b61ce3414dad4 100644
--- a/malloc/tst-malloc-aux.h
+++ b/malloc/tst-malloc-aux.h
@@ -22,20 +22,35 @@
#include <stddef.h>
#include <stdlib.h>
-
-static void *(*volatile aligned_alloc_indirect)(size_t, size_t) = aligned_alloc;
-static void *(*volatile calloc_indirect)(size_t, size_t) = calloc;
-static void *(*volatile malloc_indirect)(size_t) = malloc;
-static void *(*volatile realloc_indirect)(void*, size_t) = realloc;
+#include <malloc.h>
+
+static __typeof (aligned_alloc) * volatile aligned_alloc_indirect
+ = aligned_alloc;
+static __typeof (calloc) * volatile calloc_indirect = calloc;
+static __typeof (malloc) * volatile malloc_indirect = malloc;
+static __typeof (memalign) * volatile memalign_indirect = memalign;
+static __typeof (posix_memalign) * volatile posix_memalign_indirect
+ = posix_memalign;
+static __typeof (pvalloc) * volatile pvalloc_indirect = pvalloc;
+static __typeof (realloc) * volatile realloc_indirect = realloc;
+static __typeof (valloc) * volatile valloc_indirect = valloc;
#undef aligned_alloc
#undef calloc
#undef malloc
+#undef memalign
+#undef posix_memalign
+#undef pvalloc
#undef realloc
+#undef valloc
#define aligned_alloc aligned_alloc_indirect
#define calloc calloc_indirect
#define malloc malloc_indirect
+#define memalign memalign_indirect
+#define posix_memalign posix_memalign_indirect
+#define pvalloc pvalloc_indirect
#define realloc realloc_indirect
+#define valloc valloc_indirect
#endif /* TST_MALLOC_AUX_H */
diff --git a/malloc/tst-malloc-backtrace.c b/malloc/tst-malloc-backtrace.c
index c7b1d65e5c95c437..65fa91f6fdbdce91 100644
--- a/malloc/tst-malloc-backtrace.c
+++ b/malloc/tst-malloc-backtrace.c
@@ -22,6 +22,8 @@
#include <support/support.h>
#include <libc-diag.h>
+#include "tst-malloc-aux.h"
+
#define SIZE 4096
/* Wrap free with a function to prevent gcc from optimizing it out. */
diff --git a/malloc/tst-memalign.c b/malloc/tst-memalign.c
index 563f6413d2da506b..ac9770d3f96313a7 100644
--- a/malloc/tst-memalign.c
+++ b/malloc/tst-memalign.c
@@ -23,6 +23,8 @@
#include <unistd.h>
#include <libc-diag.h>
+#include "tst-malloc-aux.h"
+
static int errors = 0;
static void
diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c
index 01dd07004d65a767..63a7e2bc8e8ff536 100644
--- a/malloc/tst-safe-linking.c
+++ b/malloc/tst-safe-linking.c
@@ -26,6 +26,8 @@
#include <support/capture_subprocess.h>
#include <support/check.h>
+#include "tst-malloc-aux.h"
+
/* Run CALLBACK and check that the data on standard error equals
EXPECTED. */
static void
diff --git a/malloc/tst-valloc.c b/malloc/tst-valloc.c
index 9bab8c6470d4fd95..0243d3dfd494d329 100644
--- a/malloc/tst-valloc.c
+++ b/malloc/tst-valloc.c
@@ -23,6 +23,8 @@
#include <unistd.h>
#include <libc-diag.h>
+#include "tst-malloc-aux.h"
+
static int errors = 0;
static void


@@ -0,0 +1,55 @@
commit 1432850ad8fbef6dea82d137e491b53840dc7f4d
Author: Sam James <sam@gentoo.org>
Date: Fri Jan 10 03:03:47 2025 +0000
malloc: obscure calloc use in tst-calloc
Similar to a9944a52c967ce76a5894c30d0274b824df43c7a and
f9493a15ea9cfb63a815c00c23142369ec09d8ce, we need to hide calloc use from
the compiler to accommodate GCC's r15-6566-g804e9d55d9e54c change.
First, include tst-malloc-aux.h, but then use `volatile` variables
for size.
The test passes without the tst-malloc-aux.h change but IMO we want
it there for consistency and to avoid future problems (possibly silent).
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit c3d1dac96bdd10250aa37bb367d5ef8334a093a1)
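The same idea, applied to the arguments rather than the callee: reading the sizes from volatile objects keeps them out of reach of constant folding, so the compiler can neither warn about an obviously excessive allocation nor prove the call redundant. A minimal sketch, not part of the patch below:

/* Illustrative sketch, not part of the patch below.  */
#include <limits.h>
#include <stdlib.h>

int
main (void)
{
  /* The load from max_size cannot be constant-folded by the compiler.  */
  volatile size_t max_size = UINT_MAX;
  void *p = calloc (0, max_size);
  free (p);
  return 0;
}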
diff --git a/malloc/tst-calloc.c b/malloc/tst-calloc.c
index 01f17f9e65591659..5a8c7ab121ef2d00 100644
--- a/malloc/tst-calloc.c
+++ b/malloc/tst-calloc.c
@@ -23,6 +23,7 @@
#include <stdio.h>
#include <libc-diag.h>
+#include "tst-malloc-aux.h"
/* Number of samples per size. */
#define N 50000
@@ -94,16 +95,19 @@ random_test (void)
static void
null_test (void)
{
+ /* Obscure allocation size from the compiler. */
+ volatile size_t max_size = UINT_MAX;
+ volatile size_t zero_size = 0;
/* If the size is 0 the result is implementation defined. Just make
sure the program doesn't crash. The result of calloc is
deliberately ignored, so do not warn about that. */
DIAG_PUSH_NEEDS_COMMENT;
DIAG_IGNORE_NEEDS_COMMENT (10, "-Wunused-result");
calloc (0, 0);
- calloc (0, UINT_MAX);
- calloc (UINT_MAX, 0);
- calloc (0, ~((size_t) 0));
- calloc (~((size_t) 0), 0);
+ calloc (0, max_size);
+ calloc (max_size, 0);
+ calloc (0, ~((size_t) zero_size));
+ calloc (~((size_t) zero_size), 0);
DIAG_POP_NEEDS_COMMENT;
}


@@ -0,0 +1,67 @@
commit 662516aca8b6bf6aa6555f471055d5eb512b1ddc
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Jan 24 18:53:13 2025 +0800
stdlib: Test using setenv with updated environ [BZ #32588]
Add a test for setenv with updated environ. Verify that BZ #32588 is
fixed.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: Florian Weimer <fweimer@redhat.com>
(cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da)
diff --git a/stdlib/Makefile b/stdlib/Makefile
index f4dec9be46a573b9..12f8820fd0668039 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -316,6 +316,7 @@ tests := \
tst-setcontext9 \
tst-setcontext10 \
tst-setcontext11 \
+ tst-setenv-environ \
tst-stdbit-Wconversion \
tst-stdbit-builtins \
tst-stdc_bit_ceil \
diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c
new file mode 100644
index 0000000000000000..02fcef96d098f1b7
--- /dev/null
+++ b/stdlib/tst-setenv-environ.c
@@ -0,0 +1,36 @@
+/* Test using setenv with updated environ.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include <support/check.h>
+
+extern char **environ;
+
+int
+do_test (void)
+{
+ char *valp;
+ static char *dummy_environ[] = { NULL };
+ environ = dummy_environ;
+ setenv ("A", "1", 0);
+ valp = getenv ("A");
+ TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0');
+ return 0;
+}
+
+#include <support/test-driver.c>


@@ -0,0 +1,125 @@
commit f6d48470aef9264d2d56f4c4533eb76db7f9c2e4
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri Jan 31 12:16:30 2025 -0500
assert: Add test for CVE-2025-0395
Use the __progname symbol to override the program name to induce the
failure that CVE-2025-0395 describes.
This is related to BZ #32582
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2)
diff --git a/assert/Makefile b/assert/Makefile
index 35dc908ddb8a14f2..c0fe660bd69f9ec8 100644
--- a/assert/Makefile
+++ b/assert/Makefile
@@ -38,6 +38,7 @@ tests := \
test-assert-perr \
tst-assert-c++ \
tst-assert-g++ \
+ tst-assert-sa-2025-0001 \
# tests
ifeq ($(have-cxx-thread_local),yes)
diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c
new file mode 100644
index 0000000000000000..102cb0078dafa9c1
--- /dev/null
+++ b/assert/tst-assert-sa-2025-0001.c
@@ -0,0 +1,92 @@
+/* Test for CVE-2025-0395.
+ Copyright The GNU Toolchain Authors.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Test that a large enough __progname does not result in a buffer overflow
+ when printing an assertion failure. This was CVE-2025-0395. */
+#include <assert.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xstdio.h>
+#include <support/xunistd.h>
+
+extern const char *__progname;
+
+int
+do_test (int argc, char **argv)
+{
+
+ support_need_proc ("Reads /proc/self/maps to add guards to writable maps.");
+ ignore_stderr ();
+
+ /* XXX assumes that the assert is on a 2 digit line number. */
+ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n";
+
+ int ret = fprintf (stderr, prompt, __FILE__);
+ if (ret < 0)
+ FAIL_EXIT1 ("fprintf failed: %m\n");
+
+ size_t pagesize = getpagesize ();
+ size_t namesize = pagesize - 1 - ret;
+
+ /* Alter the progname so that the assert message fills the entire page. */
+ char progname[namesize];
+ memset (progname, 'A', namesize - 1);
+ progname[namesize - 1] = '\0';
+ __progname = progname;
+
+ FILE *f = xfopen ("/proc/self/maps", "r");
+ char *line = NULL;
+ size_t len = 0;
+ uintptr_t prev_to = 0;
+
+ /* Pad the beginning of every writable mapping with a PROT_NONE map. This
+ ensures that the mmap in the assert_fail path never ends up below a
+ writable map and will terminate immediately in case of a buffer
+ overflow. */
+ while (xgetline (&line, &len, f))
+ {
+ uintptr_t from, to;
+ char perm[4];
+
+ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ",
+ &from, &to,
+ &perm[0], &perm[1], &perm[2], &perm[3]);
+
+ bool writable = (memchr (perm, 'w', 4) != NULL);
+
+ if (prev_to != 0 && from - prev_to > pagesize && writable)
+ xmmap ((void *) from - pagesize, pagesize, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+
+ prev_to = to;
+ }
+
+ xfclose (f);
+
+ assert (argc < 1);
+ return 0;
+}
+
+#define EXPECTED_SIGNAL SIGABRT
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>


@@ -0,0 +1,230 @@
commit d591876303e368fde0b03e1536efb69b64d9d483
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu May 2 16:43:13 2024 +0100
aarch64: Fix AdvSIMD libmvec routines for big-endian
Previously many routines used * to load from vector types stored
in the data table. This is emitted as ldr, which byte-swaps the
entire vector register, and causes bugs for big-endian when not
all lanes contain the same value. When a vector is to be used
this way, it has been replaced with an array and the load with an
explicit ld1 intrinsic, which byte-swaps only within lanes.
As well, many routines previously used non-standard GCC syntax
for vector operations such as indexing into vectors types with []
and assembling vectors using {}. This syntax should not be mixed
with ACLE, as the former does not respect endianness whereas the
latter does. Such examples have been replaced with, for instance,
vcombine_* and vgetq_lane* intrinsics. Helpers which only use the
GCC syntax, such as the v_call helpers, do not need changing as
they do not use intrinsics.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
(cherry picked from commit 90a6ca8b28bf34e361e577e526e1b0f4c39a32a5)
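Reduced to a minimal sketch (struct, values and function are illustrative, not from the patch): per-lane constants are stored as plain arrays and loaded with an explicit ld1, and lanes are consumed through ACLE intrinsics, so element i of the array lands in lane i on both endiannesses:

/* Illustrative sketch, not part of the patch below.  */
#include <arm_neon.h>

static const struct data
{
  /* A plain array rather than a float32x4_t: loaded with LD1, which
     byte-swaps within lanes, instead of LDR, which swaps the whole
     register on big-endian.  */
  float consts[4];
} data = { .consts = { 1.0f, 2.0f, 3.0f, 4.0f } };

float32x4_t
use_lanes (float32x4_t x, float32x4_t acc)
{
  float32x4_t consts = vld1q_f32 (data.consts);
  /* Lane indices refer to array elements consistently; no [] indexing.  */
  acc = vfmaq_laneq_f32 (acc, x, consts, 0);
  return vfmaq_laneq_f32 (acc, x, consts, 1);
}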
diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
index ab117b69da23e5f3..cf53e73290fcedb6 100644
--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
@@ -25,7 +25,8 @@
static const struct data
{
float32x4_t poly[5];
- float32x4_t log10_2_and_inv, shift;
+ float log10_2_and_inv[4];
+ float32x4_t shift;
#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh;
@@ -111,10 +112,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
- float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
+ float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv);
+ float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0);
float32x4_t n = vsubq_f32 (z, d->shift);
- float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
- r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
+ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1);
+ r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
index 3628398674468131..3db3b80c49292947 100644
--- a/sysdeps/aarch64/fpu/expm1_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
@@ -23,7 +23,9 @@
static const struct data
{
float64x2_t poly[11];
- float64x2_t invln2, ln2, shift;
+ float64x2_t invln2;
+ double ln2[2];
+ float64x2_t shift;
int64x2_t exponent_bias;
#if WANT_SIMD_EXCEPT
uint64x2_t thresh, tiny_bound;
@@ -92,8 +94,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
where 2^i is exact because i is an integer. */
float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
int64x2_t i = vcvtq_s64_f64 (n);
- float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
- f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+ float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
+ f = vfmsq_laneq_f64 (f, n, ln2, 1);
/* Approximate expm1(f) using polynomial.
Taylor expansion for expm1(x) has the form:
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
index 93db200f618379be..a0616ec7542cbfce 100644
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@@ -23,7 +23,7 @@
static const struct data
{
float32x4_t poly[5];
- float32x4_t invln2_and_ln2;
+ float invln2_and_ln2[4];
float32x4_t shift;
int32x4_t exponent_bias;
#if WANT_SIMD_EXCEPT
@@ -88,11 +88,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- float32x4_t j = vsubq_f32 (
- vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+ float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
+ float32x4_t j
+ = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
+ float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
+ f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
/* Approximate expm1(f) using polynomial.
Taylor expansion for expm1(x) has the form:
diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c
index 1e5ef99e8907068b..c065aaebae8600fb 100644
--- a/sysdeps/aarch64/fpu/log10_advsimd.c
+++ b/sysdeps/aarch64/fpu/log10_advsimd.c
@@ -58,8 +58,10 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
- uint64_t i0 = (i[0] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
- uint64_t i1 = (i[1] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+ uint64_t i0
+ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+ uint64_t i1
+ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
e.invc = vuzp1q_f64 (e0, e1);
diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c
index a34978f6cf1cdb44..4057c552d8dfc0bb 100644
--- a/sysdeps/aarch64/fpu/log2_advsimd.c
+++ b/sysdeps/aarch64/fpu/log2_advsimd.c
@@ -55,8 +55,10 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
- uint64_t i0 = (i[0] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
- uint64_t i1 = (i[1] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+ uint64_t i0
+ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+ uint64_t i1
+ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc);
e.invc = vuzp1q_f64 (e0, e1);
diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
index 21df61728ca87374..015a6da7d7fd693e 100644
--- a/sysdeps/aarch64/fpu/log_advsimd.c
+++ b/sysdeps/aarch64/fpu/log_advsimd.c
@@ -54,17 +54,12 @@ lookup (uint64x2_t i)
{
/* Since N is a power of 2, n % N = n & (N - 1). */
struct entry e;
- uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
- uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
-#if __BYTE_ORDER == __LITTLE_ENDIAN
e.invc = vuzp1q_f64 (e0, e1);
e.logc = vuzp2q_f64 (e0, e1);
-#else
- e.invc = vuzp1q_f64 (e1, e0);
- e.logc = vuzp2q_f64 (e1, e0);
-#endif
return e;
}
diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index 0459821ab25487a8..d56a102dd17a3463 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -23,7 +23,8 @@
static const struct data
{
float64x2_t poly[9];
- float64x2_t half_pi, two_over_pi, shift;
+ double half_pi[2];
+ float64x2_t two_over_pi, shift;
#if !WANT_SIMD_EXCEPT
float64x2_t range_val;
#endif
@@ -81,8 +82,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
/* Use q to reduce x to r in [-pi/4, pi/4], by:
r = x - q * pi/2, in extended precision. */
float64x2_t r = x;
- r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
- r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
+ float64x2_t half_pi = vld1q_f64 (dat->half_pi);
+ r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+ r = vfmsq_laneq_f64 (r, q, half_pi, 1);
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
formula. */
r = vmulq_n_f64 (r, 0.5);
diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c
index 5a7489390a9692c6..705586f0c0b664c1 100644
--- a/sysdeps/aarch64/fpu/tanf_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanf_advsimd.c
@@ -23,7 +23,7 @@
static const struct data
{
float32x4_t poly[6];
- float32x4_t pi_consts;
+ float pi_consts[4];
float32x4_t shift;
#if !WANT_SIMD_EXCEPT
float32x4_t range_val;
@@ -95,16 +95,17 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
#endif
/* n = rint(x/(pi/2)). */
- float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
+ float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
+ float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3);
float32x4_t n = vsubq_f32 (q, d->shift);
/* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
/* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */
float32x4_t r;
- r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
- r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
- r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
+ r = vfmaq_laneq_f32 (x, n, pi_consts, 0);
+ r = vfmaq_laneq_f32 (r, n, pi_consts, 1);
+ r = vfmaq_laneq_f32 (r, n, pi_consts, 2);
/* If x lives in an interval, where |tan(x)|
- is finite, then use a polynomial approximation of the form


@@ -0,0 +1,268 @@
commit 80df456112d67e27660563b9540cbc1bb5475c84
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 9 13:00:01 2024 +0100
aarch64: Avoid redundant MOVs in AdvSIMD F32 logs
Since the last operation is destructive, the first argument to the FMA
also has to be the first argument to the special-case in order to
avoid unnecessary MOVs. Reorder arguments and adjust special-case
bounds to facilitate this.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
(cherry picked from commit 8b09af572b208bfde4d31c6abbae047dcc217675)
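The "adjust special-case bounds" part can be checked with scalar arithmetic. A sketch of the wrapped-bound test the new code performs with vsubhn_u32/vcge_u16, using the single-precision constants quoted in the diff below (the helper name is illustrative): subnormals, zero, negative inputs, infinities and NaNs all land above the bound once the offset has already been subtracted, so the original bit pattern never needs to stay live.

/* Illustrative scalar sketch, not part of the patch below.  */
#include <stdbool.h>
#include <stdint.h>

#define OFF   0x3f2aaaabu            /* asuint32 (2/3), the table offset.  */
#define LOWER (0x00800000u - OFF)    /* Smallest normal minus OFF (wraps).  */
#define BOUND 0x7f00u                /* top16 (asuint32 (inf) - 0x00800000).  */

/* U_OFF is asuint32 (x) - OFF, exactly what the vector code keeps around.  */
bool
is_special (uint32_t u_off)
{
  /* vsubhn_u32 keeps the high 16 bits of the difference; vcge_u16 compares.  */
  return (uint16_t) ((u_off - LOWER) >> 16) >= BOUND;
}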
diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
index 9347422a771e3d4e..82228b599a5c061b 100644
--- a/sysdeps/aarch64/fpu/log10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
@@ -22,11 +22,11 @@
static const struct data
{
- uint32x4_t min_norm;
+ uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
float32x4_t poly[8];
float32x4_t inv_ln10, ln2;
- uint32x4_t off, mantissa_mask;
} data = {
/* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
[-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
@@ -35,18 +35,22 @@ static const struct data
V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
.ln2 = V4 (0x1.62e43p-1f),
.inv_ln10 = V4 (0x1.bcb7b2p-2f),
- .min_norm = V4 (0x00800000),
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
.off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff),
};
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
- uint16x4_t cmp)
+special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
+ uint16x4_t cmp, const struct data *d)
{
/* Fall back to scalar code. */
- return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
+ return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
}
/* Fast implementation of AdvSIMD log10f,
@@ -58,15 +62,21 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t u = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
- vget_low_u16 (d->special_bound));
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = vsubq_u32 (u, d->off);
+ u_off = vsubq_u32 (u_off, d->off);
float32x4_t n = vcvtq_f32_s32 (
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log10(1+r) + n * log10(2). */
@@ -77,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
y = vmulq_f32 (y, d->inv_ln10);
if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, y, poly, r2, special);
+ return special_case (y, u_off, poly, r2, special, d);
return vfmaq_f32 (y, poly, r2);
}
libmvec_hidden_def (V_NAME_F1 (log10))
diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
index db218367495dc567..84effe4fe9492d08 100644
--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
@@ -22,9 +22,9 @@
static const struct data
{
- uint32x4_t min_norm;
+ uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
- uint32x4_t off, mantissa_mask;
+ uint32x4_t mantissa_mask;
float32x4_t poly[9];
} data = {
/* Coefficients generated using Remez algorithm approximate
@@ -34,18 +34,22 @@ static const struct data
V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
- .min_norm = V4 (0x00800000),
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
.off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff),
};
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
- uint16x4_t cmp)
+special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
+ uint16x4_t cmp, const struct data *d)
{
/* Fall back to scalar code. */
- return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
+ return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
}
/* Fast implementation for single precision AdvSIMD log2,
@@ -56,15 +60,21 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t u = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
- vget_low_u16 (d->special_bound));
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = vsubq_u32 (u, d->off);
+ u_off = vsubq_u32 (u_off, d->off);
float32x4_t n = vcvtq_f32_s32 (
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log2(1+r) + n. */
@@ -72,7 +82,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, n, p, r, special);
+ return special_case (n, u_off, p, r, special, d);
return vfmaq_f32 (n, p, r);
}
libmvec_hidden_def (V_NAME_F1 (log2))
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
index 3c0d0fcdc76f1004..c20dbfd6c088c0af 100644
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
@@ -21,20 +21,22 @@
static const struct data
{
- uint32x4_t min_norm;
+ uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
float32x4_t poly[7];
- float32x4_t ln2, tiny_bound;
- uint32x4_t off, mantissa_mask;
+ float32x4_t ln2;
} data = {
/* 3.34 ulp error. */
.poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
V4 (-0x1.ffffc8p-2f) },
.ln2 = V4 (0x1.62e43p-1f),
- .tiny_bound = V4 (0x1p-126),
- .min_norm = V4 (0x00800000),
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
.off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff)
};
@@ -42,32 +44,37 @@ static const struct data
#define P(i) d->poly[7 - i]
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
- uint16x4_t cmp)
+special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
+ uint16x4_t cmp, const struct data *d)
{
/* Fall back to scalar code. */
- return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+ return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
}
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, p, q, r, r2, y;
- uint32x4_t u;
+ uint32x4_t u, u_off;
uint16x4_t cmp;
- u = vreinterpretq_u32_f32 (x);
- cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
- vget_low_u16 (d->special_bound));
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ u_off = vreinterpretq_u32_f32 (x);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = vsubq_u32 (u, d->off);
+ u_off = vsubq_u32 (u_off, d->off);
n = vcvtq_f32_s32 (
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
- u = vandq_u32 (u, d->mantissa_mask);
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+ u = vandq_u32 (u_off, d->mantissa_mask);
u = vaddq_u32 (u, d->off);
r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+ cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
/* y = log(1+r) + n*ln2. */
r2 = vmulq_f32 (r, r);
/* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
@@ -80,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
p = vfmaq_f32 (r, d->ln2, n);
if (__glibc_unlikely (v_any_u16h (cmp)))
- return special_case (x, y, r2, p, cmp);
+ return special_case (p, u_off, y, r2, cmp, d);
return vfmaq_f32 (p, y, r2);
}
libmvec_hidden_def (V_NAME_F1 (log))


@@ -0,0 +1,240 @@
commit 5e354bf4e20ca3ccc15bda63c7b56ea0e97efa81
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:33:31 2024 +0100
AArch64: Simplify rounding-multiply pattern in several AdvSIMD routines
This operation can be simplified to a multiply-round-convert
sequence, which uses fewer instructions and constants.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
(cherry picked from commit 16a59571e4e9fd019d3fc23a2e7d73c1df8bb5cb)
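In isolation, the change swaps the classic add-and-subtract rounding constant (0x1.8p52 for double, 0x1.8p23f for float) for a round-to-nearest-away intrinsic, with an ordinary convert supplying the integer bits where the parity of n is needed. A sketch of the old and new reductions for the double case (names are illustrative; vrndaq rounds ties away from zero while the shift trick uses the current rounding mode, a difference confined to exact ties):

/* Illustrative sketch, not part of the patch below.  */
#include <arm_neon.h>

/* Old pattern: n = (r * inv_pi + 0x1.8p52) - 0x1.8p52.  */
static inline float64x2_t
round_via_shift (float64x2_t r, float64x2_t inv_pi)
{
  float64x2_t shift = vdupq_n_f64 (0x1.8p52);
  float64x2_t n = vfmaq_f64 (shift, inv_pi, r);  /* shift + inv_pi * r.  */
  return vsubq_f64 (n, shift);
}

/* New pattern: round directly; vcvtq_s64_f64 on the result gives the
   integer bits when they are needed, as in the sin/cos routines below.  */
static inline float64x2_t
round_via_intrinsic (float64x2_t r, float64x2_t inv_pi)
{
  return vrndaq_f64 (vmulq_f64 (r, inv_pi));
}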
diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
index 3924c9ce44c30d4d..11a89b1530825b5f 100644
--- a/sysdeps/aarch64/fpu/cos_advsimd.c
+++ b/sysdeps/aarch64/fpu/cos_advsimd.c
@@ -22,7 +22,7 @@
static const struct data
{
float64x2_t poly[7];
- float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
@@ -30,11 +30,9 @@ static const struct data
V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
V2 (-0x1.9e9540300a1p-41) },
.inv_pi = V2 (0x1.45f306dc9c883p-2),
- .half_pi = V2 (0x1.921fb54442d18p+0),
.pi_1 = V2 (0x1.921fb54442d18p+1),
.pi_2 = V2 (0x1.1a62633145c06p-53),
.pi_3 = V2 (0x1.c1cd129024e09p-106),
- .shift = V2 (0x1.8p52),
.range_val = V2 (0x1p23)
};
@@ -68,10 +66,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
- n = vsubq_f64 (n, d->shift);
- n = vsubq_f64 (n, v_f64 (0.5));
+ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
+ n = vsubq_f64 (n, v_f64 (0.5f));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f64 (r, d->pi_1, n);
diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
index d0c285b03a8bfe22..85a1b373733123fa 100644
--- a/sysdeps/aarch64/fpu/cosf_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
@@ -22,7 +22,7 @@
static const struct data
{
float32x4_t poly[4];
- float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* 1.886 ulp error. */
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@@ -33,8 +33,6 @@ static const struct data
.pi_3 = V4 (-0x1.ee59dap-49f),
.inv_pi = V4 (0x1.45f306p-2f),
- .shift = V4 (0x1.8p+23f),
- .half_pi = V4 (0x1.921fb6p0f),
.range_val = V4 (0x1p20f)
};
@@ -69,9 +67,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
- n = vsubq_f32 (n, d->shift);
+ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
n = vsubq_f32 (n, v_f32 (0.5f));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
index 99d2e647aab70260..5c9cb726205ece6e 100644
--- a/sysdeps/aarch64/fpu/expf_advsimd.c
+++ b/sysdeps/aarch64/fpu/expf_advsimd.c
@@ -22,7 +22,7 @@
static const struct data
{
float32x4_t poly[5];
- float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
+ float32x4_t inv_ln2, ln2_hi, ln2_lo;
uint32x4_t exponent_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
@@ -31,7 +31,6 @@ static const struct data
/* maxerr: 1.45358 +0.5 ulp. */
.poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
- .shift = V4 (0x1.8p23f),
.inv_ln2 = V4 (0x1.715476p+0f),
.ln2_hi = V4 (0x1.62e4p-1f),
.ln2_lo = V4 (0x1.7f7d1cp-20f),
@@ -85,7 +84,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly, z;
+ float32x4_t n, r, r2, scale, p, q, poly;
uint32x4_t cmp, e;
#if WANT_SIMD_EXCEPT
@@ -104,11 +103,10 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- z = vfmaq_f32 (d->shift, x, d->inv_ln2);
- n = vsubq_f32 (z, d->shift);
+ n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
r = vfmsq_f32 (x, n, d->ln2_hi);
r = vfmsq_f32 (r, n, d->ln2_lo);
- e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c
index a0d9d3b81965db76..718125cbad81db41 100644
--- a/sysdeps/aarch64/fpu/sin_advsimd.c
+++ b/sysdeps/aarch64/fpu/sin_advsimd.c
@@ -22,7 +22,7 @@
static const struct data
{
float64x2_t poly[7];
- float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
@@ -34,12 +34,13 @@ static const struct data
.pi_1 = V2 (0x1.921fb54442d18p+1),
.pi_2 = V2 (0x1.1a62633145c06p-53),
.pi_3 = V2 (0x1.c1cd129024e09p-106),
- .shift = V2 (0x1.8p52),
};
#if WANT_SIMD_EXCEPT
-# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
-# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
+/* asuint64(0x1p-253)), below which multiply by inv_pi underflows. */
+# define TinyBound v_u64 (0x3020000000000000)
+/* RangeVal - TinyBound. */
+# define Thresh v_u64 (0x1160000000000000)
#endif
#define C(i) d->poly[i]
@@ -72,16 +73,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
fenv). These lanes will be fixed by special-case handler later. */
uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
- r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
+ r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
#else
r = x;
cmp = vcageq_f64 (x, d->range_val);
#endif
/* n = rint(|x|/pi). */
- n = vfmaq_f64 (d->shift, d->inv_pi, r);
- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
- n = vsubq_f64 (n, d->shift);
+ n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f64 (r, d->pi_1, n);
diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
index 375dfc3331fa6a9c..6ee9a23d5b7fd13f 100644
--- a/sysdeps/aarch64/fpu/sinf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
@@ -22,7 +22,7 @@
static const struct data
{
float32x4_t poly[4];
- float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* 1.886 ulp error. */
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@@ -33,13 +33,14 @@ static const struct data
.pi_3 = V4 (-0x1.ee59dap-49f),
.inv_pi = V4 (0x1.45f306p-2f),
- .shift = V4 (0x1.8p+23f),
.range_val = V4 (0x1p20f)
};
#if WANT_SIMD_EXCEPT
-# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
-# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
+/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */
+# define TinyBound v_u32 (0x22000000)
+/* RangeVal - TinyBound. */
+# define Thresh v_u32 (0x27800000)
#endif
#define C(i) d->poly[i]
@@ -64,23 +65,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
/* If fenv exceptions are to be triggered correctly, set any special lanes
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
special-case handler later. */
- r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
+ r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
#else
r = x;
cmp = vcageq_f32 (x, d->range_val);
#endif
- /* n = rint(|x|/pi) */
- n = vfmaq_f32 (d->shift, d->inv_pi, r);
- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
- n = vsubq_f32 (n, d->shift);
+ /* n = rint(|x|/pi). */
+ n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f32 (r, d->pi_1, n);
r = vfmsq_f32 (r, d->pi_2, n);
r = vfmsq_f32 (r, d->pi_3, n);
- /* y = sin(r) */
+ /* y = sin(r). */
r2 = vmulq_f32 (r, r);
y = vfmaq_f32 (C (2), C (3), r2);
y = vfmaq_f32 (C (1), y, r2);


@@ -0,0 +1,404 @@
commit 72156cb90bb845eddf3acd59dd1599cec365942e
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Mon Dec 9 15:54:34 2024 +0000
AArch64: Improve codegen in AdvSIMD logs
Remove spurious ADRP and a few MOVs.
Reduce memory access by using more indexed MLAs in polynomial.
Align notation so that algorithms are easier to compare.
Speedup on Neoverse V1 for log10 (8%), log (8.5%), and log2 (10%).
Update error threshold in AdvSIMD log (now matches SVE log).
(cherry picked from commit 8eb5ad2ebc94cc5bedbac57c226c02ec254479c7)
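The data-layout half of the change can be sketched on its own (struct, function and coefficient values are illustrative, and the polynomial is shortened to degree 3): even-order coefficients stay as broadcast vectors, while odd-order ones are stored as two adjacent scalars so a single ld1 produces a vector whose lanes feed indexed FMLAs, saving loads and register moves:

/* Illustrative sketch, not part of the patch below.  */
#include <arm_neon.h>

static const struct data
{
  float64x2_t c0, c2;   /* Even coefficients as broadcast vectors.  */
  double c1, c3;        /* Odd coefficients: adjacent scalars, one LD1.  */
} data = {
  .c0 = { -0.5, -0.5 }, .c1 = 0.25,
  .c2 = { -0.125, -0.125 }, .c3 = 0.0625,
};

float64x2_t
poly_sketch (float64x2_t r, float64x2_t r2)
{
  const struct data *d = &data;
  float64x2_t odd_coeffs = vld1q_f64 (&d->c1);                /* { c1, c3 }.  */
  float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);  /* c2 + r*c3.  */
  float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);  /* c0 + r*c1.  */
  return vfmaq_f64 (p, r2, y);                                /* p + r2*y.  */
}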
diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c
index c065aaebae8600fb..f69ed21c3938d9a9 100644
--- a/sysdeps/aarch64/fpu/log10_advsimd.c
+++ b/sysdeps/aarch64/fpu/log10_advsimd.c
@@ -18,36 +18,36 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f64.h"
-
-#define N (1 << V_LOG10_TABLE_BITS)
static const struct data
{
- uint64x2_t min_norm;
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
uint32x4_t special_bound;
- float64x2_t poly[5];
- float64x2_t invln10, log10_2, ln2;
- uint64x2_t sign_exp_mask;
+ double invln10, log10_2;
+ double c1, c3;
+ float64x2_t c0, c2, c4;
} data = {
/* Computed from log coefficients divided by log(10) then rounded to double
precision. */
- .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3),
- V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4),
- V2 (-0x1.287461742fee4p-4) },
- .ln2 = V2 (0x1.62e42fefa39efp-1),
- .invln10 = V2 (0x1.bcb7b1526e50ep-2),
- .log10_2 = V2 (0x1.34413509f79ffp-2),
- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */
- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
+ .c0 = V2 (-0x1.bcb7b1526e506p-3),
+ .c1 = 0x1.287a7636be1d1p-3,
+ .c2 = V2 (-0x1.bcb7b158af938p-4),
+ .c3 = 0x1.63c78734e6d07p-4,
+ .c4 = V2 (-0x1.287461742fee4p-4),
+ .invln10 = 0x1.bcb7b1526e50ep-2,
+ .log10_2 = 0x1.34413509f79ffp-2,
+ .off = V2 (0x3fe6900900000000),
.sign_exp_mask = V2 (0xfff0000000000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */
};
-#define Off v_u64 (0x3fe6900900000000)
+#define N (1 << V_LOG10_TABLE_BITS)
#define IndexMask (N - 1)
-#define T(s, i) __v_log10_data.s[i]
-
struct entry
{
float64x2_t invc;
@@ -70,10 +70,11 @@ lookup (uint64x2_t i)
}
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
- uint32x2_t special)
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
{
- return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special));
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
}
/* Fast implementation of double-precision vector log10
@@ -85,19 +86,24 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
- vget_low_u32 (d->special_bound));
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- uint64x2_t tmp = vsubq_u64 (ix, Off);
- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
- struct entry e = lookup (tmp);
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
/* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
@@ -105,17 +111,22 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
/* hi = r / log(10) + log10(c) + k*log10(2).
Constants in v_log10_data.c are computed (in extended precision) as
- e.log10c := e.logc * ivln10. */
- float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10);
+ e.log10c := e.logc * invln10. */
+ float64x2_t cte = vld1q_f64 (&d->invln10);
+ float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0);
/* y = log10(1+r) + n * log10(2). */
- float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2);
+ hi = vfmaq_laneq_f64 (hi, kd, cte, 1);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly);
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_f64 (y, d->c4, r2);
+ y = vfmaq_f64 (p, y, r2);
if (__glibc_unlikely (v_any_u32h (special)))
- return special_case (x, y, hi, r2, special);
- return vfmaq_f64 (hi, r2, y);
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
}
diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c
index 4057c552d8dfc0bb..1eea1f86ebdeab34 100644
--- a/sysdeps/aarch64/fpu/log2_advsimd.c
+++ b/sysdeps/aarch64/fpu/log2_advsimd.c
@@ -18,31 +18,33 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f64.h"
-
-#define N (1 << V_LOG2_TABLE_BITS)
static const struct data
{
- uint64x2_t min_norm;
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
uint32x4_t special_bound;
- float64x2_t poly[5];
- float64x2_t invln2;
- uint64x2_t sign_exp_mask;
+ float64x2_t c0, c2;
+ double c1, c3, invln2, c4;
} data = {
/* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9
and N = 128, then scaled by log2(e) in extended precision and rounded back
to double precision. */
- .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2),
- V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2),
- V2 (-0x1.ec738d616fe26p-3) },
- .invln2 = V2 (0x1.71547652b82fep0),
- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */
- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
+ .c0 = V2 (-0x1.71547652b8300p-1),
+ .c1 = 0x1.ec709dc340953p-2,
+ .c2 = V2 (-0x1.71547651c8f35p-2),
+ .c3 = 0x1.2777ebe12dda5p-2,
+ .c4 = -0x1.ec738d616fe26p-3,
+ .invln2 = 0x1.71547652b82fep0,
+ .off = V2 (0x3fe6900900000000),
.sign_exp_mask = V2 (0xfff0000000000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
};
-#define Off v_u64 (0x3fe6900900000000)
+#define N (1 << V_LOG2_TABLE_BITS)
#define IndexMask (N - 1)
struct entry
@@ -67,10 +69,11 @@ lookup (uint64x2_t i)
}
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2,
- uint32x2_t special)
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
{
- return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special));
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
}
/* Double-precision vector log2 routine. Implements the same algorithm as
@@ -81,31 +84,41 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2,
float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
- vget_low_u32 (d->special_bound));
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- uint64x2_t tmp = vsubq_u64 (ix, Off);
- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
- struct entry e = lookup (tmp);
+ struct entry e = lookup (u_off);
- /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
+ /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
float64x2_t kd = vcvtq_f64_s64 (k);
- float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2);
+
+ float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2);
+ float64x2_t hi
+ = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0);
float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly);
- w = vaddq_f64 (kd, w);
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1);
+ y = vfmaq_f64 (p, r2, y);
if (__glibc_unlikely (v_any_u32h (special)))
- return special_case (x, y, w, r2, special);
- return vfmaq_f64 (w, r2, y);
+ return special_case (hi, u_off, y, r2, special, d);
+ return vfmaq_f64 (hi, y, r2);
}
diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
index 015a6da7d7fd693e..b1a27fbc290d918c 100644
--- a/sysdeps/aarch64/fpu/log_advsimd.c
+++ b/sysdeps/aarch64/fpu/log_advsimd.c
@@ -21,27 +21,29 @@
static const struct data
{
- uint64x2_t min_norm;
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
uint32x4_t special_bound;
- float64x2_t poly[5];
- float64x2_t ln2;
- uint64x2_t sign_exp_mask;
+ float64x2_t c0, c2;
+ double c1, c3, ln2, c4;
} data = {
- /* Worst-case error: 1.17 + 0.5 ulp.
- Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
- .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
- V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
- V2 (-0x1.554e550bd501ep-3) },
- .ln2 = V2 (0x1.62e42fefa39efp-1),
- .min_norm = V2 (0x0010000000000000),
- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
- .sign_exp_mask = V2 (0xfff0000000000000)
+ /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
+ .c0 = V2 (-0x1.ffffffffffff7p-2),
+ .c1 = 0x1.55555555170d4p-2,
+ .c2 = V2 (-0x1.0000000399c27p-2),
+ .c3 = 0x1.999b2e90e94cap-3,
+ .c4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+ .sign_exp_mask = V2 (0xfff0000000000000),
+ .off = V2 (0x3fe6900900000000),
+ /* Lower bound is 0x0010000000000000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound - offset (which wraps around). */
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-126). */
};
-#define A(i) d->poly[i]
#define N (1 << V_LOG_TABLE_BITS)
#define IndexMask (N - 1)
-#define Off v_u64 (0x3fe6900900000000)
struct entry
{
@@ -64,48 +66,56 @@ lookup (uint64x2_t i)
}
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
- uint32x2_t cmp)
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
+ uint32x2_t special, const struct data *d)
{
- return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
}
+/* Double-precision vector log routine.
+ The maximum observed error is 2.17 ULP:
+ _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
+ want 0x1.ffffff1cca045p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
- float64x2_t z, r, r2, p, y, kd, hi;
- uint64x2_t ix, iz, tmp;
- uint32x2_t cmp;
- int64x2_t k;
- struct entry e;
- ix = vreinterpretq_u64_f64 (x);
- cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
- vget_low_u32 (d->special_bound));
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- tmp = vsubq_u64 (ix, Off);
- k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
- iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
- z = vreinterpretq_f64_u64 (iz);
- e = lookup (tmp);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (u_off);
+
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+ vget_low_u32 (d->special_bound));
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
- r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
- kd = vcvtq_f64_s64 (k);
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
/* hi = r + log(c) + k*Ln2. */
- hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+ float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2);
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0);
+
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- r2 = vmulq_f64 (r, r);
- y = vfmaq_f64 (A (2), A (3), r);
- p = vfmaq_f64 (A (0), A (1), r);
- y = vfmaq_f64 (y, A (4), r2);
- y = vfmaq_f64 (p, y, r2);
-
- if (__glibc_unlikely (v_any_u32h (cmp)))
- return special_case (x, y, hi, r2, cmp);
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1);
+ y = vfmaq_f64 (p, r2, y);
+
+ if (__glibc_unlikely (v_any_u32h (special)))
+ return special_case (hi, u_off, y, r2, special, d);
return vfmaq_f64 (hi, y, r2);
}


@@ -0,0 +1,224 @@
commit dcd1229e5bbc8c899cb35b22aaf89d8babc5af5a
Author: Joana Cruz <Joana.Cruz@arm.com>
Date: Tue Dec 17 14:47:31 2024 +0000
AArch64: Improve codegen of AdvSIMD logf function family
Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
8% improvement in throughput microbenchmark on Neoverse V1 for log2 and log,
and 2% for log10.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
(cherry picked from commit d6e034f5b222a9ed1aeb5de0c0c7d0dda8b63da3)
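This is the single-precision version of the same layout, matching the shape of the log10f change below (coefficient values here are illustrative): the four odd coefficients share one vector, so the eight-term polynomial needs four lanewise FMLAs plus three ordinary ones.

/* Illustrative sketch, not part of the patch below.  */
#include <arm_neon.h>

static const struct data
{
  float32x4_t c0, c2, c4, c6;   /* Even coefficients as broadcast vectors.  */
  float c1, c3, c5, c7;         /* Odd coefficients: one LD1 loads all four.  */
} data = {
  .c0 = { 1.0f, 1.0f, 1.0f, 1.0f }, .c1 = 2.0f,
  .c2 = { 3.0f, 3.0f, 3.0f, 3.0f }, .c3 = 4.0f,
  .c4 = { 5.0f, 5.0f, 5.0f, 5.0f }, .c5 = 6.0f,
  .c6 = { 7.0f, 7.0f, 7.0f, 7.0f }, .c7 = 8.0f,
};

float32x4_t
poly_sketch (float32x4_t r, float32x4_t r2)
{
  const struct data *d = &data;
  float32x4_t c1357 = vld1q_f32 (&d->c1);           /* { c1, c3, c5, c7 }.  */
  float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
  float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
  float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
  float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
  float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
  float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
  return vfmaq_f32 (c01, r2, p27);                  /* Combine even-odd pairs.  */
}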
diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
index 82228b599a5c061b..0d792c3df9a7216e 100644
--- a/sysdeps/aarch64/fpu/log10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
@@ -18,21 +18,25 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
static const struct data
{
+ float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
uint32x4_t mantissa_mask;
- float32x4_t poly[8];
- float32x4_t inv_ln10, ln2;
+ float c1, c3, c5, c7;
} data = {
/* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
[-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
- .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f),
- V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f),
- V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
+ .c0 = V4 (-0x1.bcb79cp-3f),
+ .c1 = 0x1.2879c8p-3f,
+ .c2 = V4 (-0x1.bcd472p-4f),
+ .c3 = 0x1.6408f8p-4f,
+ .c4 = V4 (-0x1.246f8p-4f),
+ .c5 = 0x1.f0e514p-5f,
+ .c6 = V4 (-0x1.0fc92cp-4f),
+ .c7 = 0x1.f5f76ap-5f,
.ln2 = V4 (0x1.62e43p-1f),
.inv_ln10 = V4 (0x1.bcb7b2p-2f),
/* Lower bound is the smallest positive normal float 0x00800000. For
@@ -62,7 +66,7 @@ special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
@@ -81,7 +85,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
/* y = log10(1+r) + n * log10(2). */
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly);
+
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+
+ float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
+ float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
+ float32x4_t poly = vfmaq_f32 (c01, r2, p27);
+
/* y = Log10(2) * n + poly * InvLn(10). */
float32x4_t y = vfmaq_f32 (r, d->ln2, n);
y = vmulq_f32 (y, d->inv_ln10);
diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
index 84effe4fe9492d08..116c36c8e2889f99 100644
--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
@@ -18,22 +18,27 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
static const struct data
{
+ float32x4_t c0, c2, c4, c6, c8;
uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
uint32x4_t mantissa_mask;
- float32x4_t poly[9];
+ float c1, c3, c5, c7;
} data = {
/* Coefficients generated using Remez algorithm approximate
log2(1+r)/r for r in [ -1/3, 1/3 ].
rel error: 0x1.c4c4b0cp-26. */
- .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
- V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
- V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
- V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
+ .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
+ .c1 = -0x1.715458p-1f,
+ .c2 = V4 (0x1.ec701cp-2f),
+ .c3 = -0x1.7171a4p-2f,
+ .c4 = V4 (0x1.27a0b8p-2f),
+ .c5 = -0x1.e5143ep-3f,
+ .c6 = V4 (0x1.9d8ecap-3f),
+ .c7 = -0x1.c675bp-3f,
+ .c8 = V4 (0x1.9e495p-3f),
/* Lower bound is the smallest positive normal float 0x00800000. For
optimised register use subnormals are detected after offset has been
subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
@@ -79,11 +84,21 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
/* y = log2(1+r) + n. */
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
+
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+ float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
+ float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
+ float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
+ float32x4_t p = vfmaq_f32 (c01, r2, p28);
if (__glibc_unlikely (v_any_u16h (special)))
return special_case (n, u_off, p, r, special, d);
return vfmaq_f32 (n, p, r);
}
+
libmvec_hidden_def (V_NAME_F1 (log2))
HALF_WIDTH_ALIAS_F1 (log2)
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
index c20dbfd6c088c0af..d9e64c732d7d8d28 100644
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
@@ -21,16 +21,19 @@
static const struct data
{
- uint32x4_t off, offset_lower_bound;
+ float32x4_t c2, c4, c6, ln2;
+ uint32x4_t off, offset_lower_bound, mantissa_mask;
uint16x8_t special_bound;
- uint32x4_t mantissa_mask;
- float32x4_t poly[7];
- float32x4_t ln2;
+ float c1, c3, c5, c0;
} data = {
/* 3.34 ulp error. */
- .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
- V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
- V4 (-0x1.ffffc8p-2f) },
+ .c0 = -0x1.3e737cp-3f,
+ .c1 = 0x1.5a9aa2p-3f,
+ .c2 = V4 (-0x1.4f9934p-3f),
+ .c3 = 0x1.961348p-3f,
+ .c4 = V4 (-0x1.00187cp-2f),
+ .c5 = 0x1.555d7cp-2f,
+ .c6 = V4 (-0x1.ffffc8p-2f),
.ln2 = V4 (0x1.62e43p-1f),
/* Lower bound is the smallest positive normal float 0x00800000. For
optimised register use subnormals are detected after offset has been
@@ -41,8 +44,6 @@ static const struct data
.mantissa_mask = V4 (0x007fffff)
};
-#define P(i) d->poly[7 - i]
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
uint16x4_t cmp, const struct data *d)
@@ -55,33 +56,30 @@ special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, p, q, r, r2, y;
- uint32x4_t u, u_off;
- uint16x4_t cmp;
+ float32x4_t c1350 = vld1q_f32 (&d->c1);
/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
- u_off = vreinterpretq_u32_f32 (x);
+ uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u_off = vsubq_u32 (u_off, d->off);
- n = vcvtq_f32_s32 (
+ float32x4_t n = vcvtq_f32_s32 (
vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
- u = vandq_u32 (u_off, d->mantissa_mask);
- u = vaddq_u32 (u, d->off);
- r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+ uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
- cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
- vget_low_u16 (d->special_bound));
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log(1+r) + n*ln2. */
- r2 = vmulq_f32 (r, r);
+ float32x4_t r2 = vmulq_f32 (r, r);
/* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
- p = vfmaq_f32 (P (5), P (6), r);
- q = vfmaq_f32 (P (3), P (4), r);
- y = vfmaq_f32 (P (1), P (2), r);
- p = vfmaq_f32 (p, P (7), r2);
+ float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
+ float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
+ p = vfmaq_laneq_f32 (p, r2, c1350, 3);
+
q = vfmaq_f32 (q, p, r2);
y = vfmaq_f32 (y, q, r2);
p = vfmaq_f32 (r, d->ln2, n);


@ -0,0 +1,404 @@
commit a10183b6338baf4b2643b92cce1b0fba0e3ab62f
Author: Joana Cruz <Joana.Cruz@arm.com>
Date: Tue Dec 17 14:49:30 2024 +0000
AArch64: Improve codegen of AdvSIMD atan(2)(f)
Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
8% improvement in throughput microbenchmark on Neoverse V1.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
(cherry picked from commit 6914774b9d3460876d9ad4482782213ec01a752e)
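
Both precisions below keep the split Estrin evaluation of P(z^2); the change is that the odd coefficients are now consumed through lanewise FMAs from packed vectors. For reference, a plain scalar version of the degree-7 Estrin tree that the double-precision intrinsics mirror two lanes at a time (c[] holds placeholder coefficients; the full degree-19 evaluation glues this to a degree-11 tree via x8):

/* Scalar sketch of the degree-7 Estrin tree used, vectorised, in the hunks
   below.  c[] are placeholder values.  */
static double
estrin7 (double z2, const double c[8])
{
  double x2 = z2 * z2;
  double x4 = x2 * x2;
  double p01 = c[0] + z2 * c[1];
  double p23 = c[2] + z2 * c[3];
  double p45 = c[4] + z2 * c[5];
  double p67 = c[6] + z2 * c[7];
  double p03 = p01 + x2 * p23;
  double p47 = p45 + x2 * p67;
  return p03 + x4 * p47;   /* shorter dependency chain than plain Horner */
}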
diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c
index 2fd61641340c0315..5df4b99ff4277c6a 100644
--- a/sysdeps/aarch64/fpu/atan2_advsimd.c
+++ b/sysdeps/aarch64/fpu/atan2_advsimd.c
@@ -22,40 +22,57 @@
static const struct data
{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
float64x2_t pi_over_2;
- float64x2_t poly[20];
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
+ uint64x2_t zeroinfnan, minustwo;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
- the interval [2**-1022, 1.0]. */
- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+ [2**-1022, 1.0]. */
+ .c0 = V2 (-0x1.5555555555555p-2),
+ .c1 = 0x1.99999999996c1p-3,
+ .c2 = V2 (-0x1.2492492478f88p-3),
+ .c3 = 0x1.c71c71bc3951cp-4,
+ .c4 = V2 (-0x1.745d160a7e368p-4),
+ .c5 = 0x1.3b139b6a88ba1p-4,
+ .c6 = V2 (-0x1.11100ee084227p-4),
+ .c7 = 0x1.e1d0f9696f63bp-5,
+ .c8 = V2 (-0x1.aebfe7b418581p-5),
+ .c9 = 0x1.842dbe9b0d916p-5,
+ .c10 = V2 (-0x1.5d30140ae5e99p-5),
+ .c11 = 0x1.338e31eb2fbbcp-5,
+ .c12 = V2 (-0x1.00e6eece7de8p-5),
+ .c13 = 0x1.860897b29e5efp-6,
+ .c14 = V2 (-0x1.0051381722a59p-6),
+ .c15 = 0x1.14e9dc19a4a4ep-7,
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9),
+ .c17 = 0x1.17739e210171ap-10,
+ .c18 = V2 (-0x1.ab24da7be7402p-13),
+ .c19 = 0x1.358851160a528p-16,
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
+ .minustwo = V2 (0xc000000000000000),
};
#define SignMask v_u64 (0x8000000000000000)
/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp)
+special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
+ uint64x2_t sign_xy, uint64x2_t cmp)
{
+ /* Account for the sign of x and y. */
+ ret = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
return v_call2_f64 (atan2, y, x, ret, cmp);
}
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline uint64x2_t
-zeroinfnan (uint64x2_t i)
+zeroinfnan (uint64x2_t i, const struct data *d)
{
/* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
- return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)),
- v_u64 (2 * asuint64 (INFINITY) - 1));
+ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
}
/* Fast implementation of vector atan2.
@@ -65,12 +82,13 @@ zeroinfnan (uint64x2_t i)
want 0x1.92d628ab678cfp-1. */
float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
{
- const struct data *data_ptr = ptr_barrier (&data);
+ const struct data *d = ptr_barrier (&data);
uint64x2_t ix = vreinterpretq_u64_f64 (x);
uint64x2_t iy = vreinterpretq_u64_f64 (y);
- uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy));
+ uint64x2_t special_cases
+ = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));
uint64x2_t sign_x = vandq_u64 (ix, SignMask);
uint64x2_t sign_y = vandq_u64 (iy, SignMask);
@@ -80,18 +98,18 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
float64x2_t ay = vabsq_f64 (y);
uint64x2_t pred_xlt0 = vcltzq_f64 (x);
- uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax);
+ uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
/* Set up z for call to atan. */
float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
- float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax);
- float64x2_t z = vdivq_f64 (n, d);
+ float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
+ float64x2_t z = vdivq_f64 (n, q);
/* Work out the correct shift. */
- float64x2_t shift = vreinterpretq_f64_u64 (
- vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0))));
+ float64x2_t shift
+ = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
- shift = vmulq_f64 (shift, data_ptr->pi_over_2);
+ shift = vmulq_f64 (shift, d->pi_over_2);
/* Calculate the polynomial approximation.
Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
@@ -102,20 +120,52 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
float64x2_t x2 = vmulq_f64 (z2, z2);
float64x2_t x4 = vmulq_f64 (x2, x2);
float64x2_t x8 = vmulq_f64 (x4, x4);
- float64x2_t ret
- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly),
- v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8);
+
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
+
+ /* estrin_7. */
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+ /* estrin_11. */
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+ float64x2_t ret = vfmaq_f64 (p07, p819, x8);
/* Finalize. y = shift + z + z^3 * P(z^2). */
ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
ret = vaddq_f64 (ret, shift);
+ if (__glibc_unlikely (v_any_u64 (special_cases)))
+ return special_case (y, x, ret, sign_xy, special_cases);
+
/* Account for the sign of x and y. */
ret = vreinterpretq_f64_u64 (
veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
- if (__glibc_unlikely (v_any_u64 (special_cases)))
- return special_case (y, x, ret, special_cases);
-
return ret;
}
diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c
index 56e610caf18f6d77..88daacd76cdd3998 100644
--- a/sysdeps/aarch64/fpu/atan2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c
@@ -22,34 +22,39 @@
static const struct data
{
- float32x4_t poly[8];
- float32x4_t pi_over_2;
+ float32x4_t c0, pi_over_2, c4, c6, c2;
+ float c1, c3, c5, c7;
+ uint32x4_t comp_const;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
[2**-128, 1.0].
Generated using fpminimax between FLT_MIN and 1. */
- .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
- V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
- V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
- .pi_over_2 = V4 (0x1.921fb6p+0f),
+ .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f,
+ .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f,
+ .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f,
+ .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f,
+ .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
};
#define SignMask v_u32 (0x80000000)
/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
+special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
+ uint32x4_t sign_xy, uint32x4_t cmp)
{
+ /* Account for the sign of y. */
+ ret = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
return v_call2_f32 (atan2f, y, x, ret, cmp);
}
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline uint32x4_t
-zeroinfnan (uint32x4_t i)
+zeroinfnan (uint32x4_t i, const struct data *d)
{
/* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
- return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
- v_u32 (2 * 0x7f800000lu - 1));
+ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
}
/* Fast implementation of vector atan2f. Maximum observed error is
@@ -58,12 +63,13 @@ zeroinfnan (uint32x4_t i)
want 0x1.967f00p-1. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
{
- const struct data *data_ptr = ptr_barrier (&data);
+ const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t iy = vreinterpretq_u32_f32 (y);
- uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
+ uint32x4_t special_cases
+ = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));
uint32x4_t sign_x = vandq_u32 (ix, SignMask);
uint32x4_t sign_y = vandq_u32 (iy, SignMask);
@@ -77,14 +83,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
/* Set up z for call to atanf. */
float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
- float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
- float32x4_t z = vdivq_f32 (n, d);
+ float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
+ float32x4_t z = vdivq_f32 (n, q);
/* Work out the correct shift. */
float32x4_t shift = vreinterpretq_f32_u32 (
vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
- shift = vmulq_f32 (shift, data_ptr->pi_over_2);
+ shift = vmulq_f32 (shift, d->pi_over_2);
/* Calculate the polynomial approximation.
Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
@@ -96,23 +102,27 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
float32x4_t z2 = vmulq_f32 (z, z);
float32x4_t z4 = vmulq_f32 (z2, z2);
- float32x4_t ret = vfmaq_f32 (
- v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
- vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
+ float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
+ float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
+
+ float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));
/* y = shift + z * P(z^2). */
ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
- /* Account for the sign of y. */
- ret = vreinterpretq_f32_u32 (
- veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
-
if (__glibc_unlikely (v_any_u32 (special_cases)))
{
- return special_case (y, x, ret, special_cases);
+ return special_case (y, x, ret, sign_xy, special_cases);
}
- return ret;
+ /* Account for the sign of y. */
+ return vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
}
libmvec_hidden_def (V_NAME_F2 (atan2))
HALF_WIDTH_ALIAS_F2(atan2)
diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c
index a962be0f78e4a9c7..14f1809796f05246 100644
--- a/sysdeps/aarch64/fpu/atan_advsimd.c
+++ b/sysdeps/aarch64/fpu/atan_advsimd.c
@@ -22,21 +22,22 @@
static const struct data
{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
float64x2_t pi_over_2;
- float64x2_t poly[20];
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
[2**-1022, 1.0]. */
- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+ .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3,
+ .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4,
+ .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4,
+ .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5,
+ .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5,
+ .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5,
+ .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6,
+ .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7,
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10,
+ .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16,
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
};
@@ -52,6 +53,11 @@ static const struct data
float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
/* Small cases, infs and nans are supported by our approximation technique,
but do not set fenv flags correctly. Only trigger special case if we need
@@ -90,9 +96,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
float64x2_t x2 = vmulq_f64 (z2, z2);
float64x2_t x4 = vmulq_f64 (x2, x2);
float64x2_t x8 = vmulq_f64 (x4, x4);
- float64x2_t y
- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
- v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
+
+ /* estrin_7. */
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+ /* estrin_11. */
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+ float64x2_t y = vfmaq_f64 (p07, p819, x8);
/* Finalize. y = shift + z + z^3 * P(z^2). */
y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));


@ -0,0 +1,211 @@
commit 78abd3ef6e607853def82a97bf34a3c632db04e2
Author: Luna Lamb <luna.lamb@arm.com>
Date: Fri Jan 3 19:02:52 2025 +0000
AArch64: Improve codegen in SVE tans
Improves memory access.
Tan: MOVPRFX 7 -> 2, LD1RD 12 -> 5, move MOV away from return.
Tanf: MOV 2 -> 1, MOVPRFX 6 -> 3, LD1RW 5 -> 4, move MOV away from return.
(cherry picked from commit aa6609feb20ebf8653db639dabe2a6afc77b02cc)
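
The reconstruction that is now duplicated into special_case uses the same identities as before: the double-angle formula and reciprocity around pi/2, selected on the parity of q. A scalar sketch of that final step (p approximates tan(r) for the halved reduced argument, q is the nearest integer to 2*x/pi):

#include <stdint.h>

/* Scalar sketch of the reconstruction shared by the fast path and
   special_case below.  q is assumed to already hold an integer value.  */
static double
reconstruct_tan (double p, double q)
{
  if (((int64_t) q) & 1)
    return (p * p - 1.0) / (2.0 * p);   /* tan(2r + pi/2) = -1/tan(2r) */
  return (2.0 * p) / (1.0 - p * p);     /* double angle: tan(2r) */
}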
diff --git a/sysdeps/aarch64/fpu/tan_sve.c b/sysdeps/aarch64/fpu/tan_sve.c
index b2e44473166845d0..a7318fd417dc7064 100644
--- a/sysdeps/aarch64/fpu/tan_sve.c
+++ b/sysdeps/aarch64/fpu/tan_sve.c
@@ -22,24 +22,38 @@
static const struct data
{
- double poly[9];
- double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift;
+ double c2, c4, c6, c8;
+ double poly_1357[4];
+ double c0, inv_half_pi;
+ double half_pi_hi, half_pi_lo, range_val;
} data = {
/* Polynomial generated with FPMinimax. */
- .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5,
- 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9,
- 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11,
- 0x1.4e4fd14147622p-12, },
+ .c2 = 0x1.ba1ba1bb46414p-5,
+ .c4 = 0x1.226e5e5ecdfa3p-7,
+ .c6 = 0x1.7ea75d05b583ep-10,
+ .c8 = 0x1.4e4fd14147622p-12,
+ .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6,
+ 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 },
+ .c0 = 0x1.5555555555556p-2,
+ .inv_half_pi = 0x1.45f306dc9c883p-1,
.half_pi_hi = 0x1.921fb54442d18p0,
.half_pi_lo = 0x1.1a62633145c07p-54,
- .inv_half_pi = 0x1.45f306dc9c883p-1,
.range_val = 0x1p23,
- .shift = 0x1.8p52,
};
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg,
+ svbool_t special)
{
+ svbool_t use_recip = svcmpeq (
+ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0);
+
+ svfloat64_t n = svmad_x (pg, p, p, -1);
+ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2);
+ svfloat64_t swap = n;
+ n = svneg_m (n, use_recip, d);
+ d = svsel (use_recip, swap, d);
+ svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d);
return sv_call_f64 (tan, x, y, special);
}
@@ -50,15 +64,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
{
const struct data *dat = ptr_barrier (&data);
-
- /* Invert condition to catch NaNs and Infs as well as large values. */
- svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
-
+ svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0);
/* q = nearest integer to 2 * x / pi. */
- svfloat64_t shift = sv_f64 (dat->shift);
- svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi);
- q = svsub_x (pg, q, shift);
- svint64_t qi = svcvt_s64_x (pg, q);
+ svfloat64_t q = svmul_lane (x, half_pi_c0, 1);
+ q = svrinta_x (pg, q);
/* Use q to reduce x to r in [-pi/4, pi/4], by:
r = x - q * pi/2, in extended precision. */
@@ -68,7 +77,7 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
r = svmls_lane (r, q, half_pi, 1);
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
formula. */
- r = svmul_x (pg, r, 0.5);
+ r = svmul_x (svptrue_b64 (), r, 0.5);
/* Approximate tan(r) using order 8 polynomial.
tan(x) is odd, so polynomial has the form:
@@ -76,29 +85,51 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
Then compute the approximation by:
tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t r4 = svmul_x (pg, r2, r2);
- svfloat64_t r8 = svmul_x (pg, r4, r4);
+
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2);
+ svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4);
/* Use offset version coeff array by 1 to evaluate from C1 onwards. */
- svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1);
- p = svmad_x (pg, p, r2, dat->poly[0]);
- p = svmla_x (pg, r, r2, svmul_x (pg, p, r));
+ svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2);
+ svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6);
+
+ /* Use offset version coeff array by 1 to evaluate from C1 onwards. */
+ svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0);
+ svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1);
+ svfloat64_t p03 = svmla_x (pg, p01, p23, r4);
+
+ svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0);
+ svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1);
+ svfloat64_t p47 = svmla_x (pg, p45, p67, r4);
+
+ svfloat64_t p = svmla_x (pg, p03, p47, r8);
+
+ svfloat64_t z = svmul_x (svptrue_b64 (), p, r);
+ z = svmul_x (svptrue_b64 (), r2, z);
+ z = svmla_lane (z, r, half_pi_c0, 0);
+ p = svmla_x (pg, r, r2, z);
/* Recombination uses double-angle formula:
tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
and reciprocity around pi/2:
tan(x) = 1 / (tan(pi/2 - x))
to assemble result using change-of-sign and conditional selection of
- numerator/denominator dependent on odd/even-ness of q (hence quadrant). */
- svbool_t use_recip
- = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0);
+ numerator/denominator dependent on odd/even-ness of q (quadrant). */
+
+ /* Invert condition to catch NaNs and Infs as well as large values. */
+ svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ {
+ return special_case (x, p, q, pg, special);
+ }
+ svbool_t use_recip = svcmpeq (
+ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0);
svfloat64_t n = svmad_x (pg, p, p, -1);
- svfloat64_t d = svmul_x (pg, p, 2);
+ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2);
svfloat64_t swap = n;
n = svneg_m (n, use_recip, d);
d = svsel (use_recip, swap, d);
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special);
return svdiv_x (pg, n, d);
}
diff --git a/sysdeps/aarch64/fpu/tanf_sve.c b/sysdeps/aarch64/fpu/tanf_sve.c
index f34258324114a360..e850fb4882e88380 100644
--- a/sysdeps/aarch64/fpu/tanf_sve.c
+++ b/sysdeps/aarch64/fpu/tanf_sve.c
@@ -60,21 +60,16 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- /* Determine whether input is too large to perform fast regression. */
- svbool_t cmp = svacge (pg, x, d->range_val);
-
svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1);
svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1);
/* n = rint(x/(pi/2)). */
- svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3);
- svfloat32_t n = svsub_x (pg, q, d->shift);
+ svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3));
/* n is already a signed integer, simply convert it. */
svint32_t in = svcvt_s32_x (pg, n);
/* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
svint32_t alt = svand_x (pg, in, 1);
svbool_t pred_alt = svcmpne (pg, alt, 0);
-
/* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */
svfloat32_t r;
r = svmls_lane (x, n, pi_vals, 0);
@@ -93,7 +88,7 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
/* Evaluate polynomial approximation of tangent on [-pi/4, pi/4],
using Estrin on z^2. */
- svfloat32_t z2 = svmul_x (pg, z, z);
+ svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r);
svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0);
svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1);
svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2);
@@ -106,13 +101,14 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2));
- /* Transform result back, if necessary. */
- svfloat32_t inv_y = svdivr_x (pg, y, 1.0f);
-
/* No need to pass pg to specialcase here since cmp is a strict subset,
guaranteed by the cmpge above. */
+
+ /* Determine whether input is too large to perform fast regression. */
+ svbool_t cmp = svacge (pg, x, d->range_val);
if (__glibc_unlikely (svptest_any (pg, cmp)))
- return special_case (x, svsel (pred_alt, inv_y, y), cmp);
+ return special_case (x, svdivr_x (pg, y, 1.0f), cmp);
+ svfloat32_t inv_y = svdivr_x (pg, y, 1.0f);
return svsel (pred_alt, inv_y, y);
}


@ -0,0 +1,301 @@
commit 4073e4ee2c68de89b7220afba8d0780f86d9c60e
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Fri Jan 3 19:07:30 2025 +0000
AArch64: Improve codegen for SVE logs
Reduce memory access by using lanewise MLA and moving constants to struct
and reduce number of MOVPRFXs.
Update maximum ULP error for double log_sve from 1 to 2.
Speedup on Neoverse V1 for log (3%), log2 (5%), and log10 (4%).
(cherry picked from commit 32d193a372feb28f9da247bb7283d404b84429c6)
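
All three SVE routines below share the same reduction: split x into 2^k * z with z in a fixed interval around 1, so log(x) = log(z) + k*ln2, with log(z) refined via a table of (1/c, log(c)) pairs and a short polynomial. A scalar, table-free sketch of just the bit-level reduction (assumes positive normal x; OFF is taken to match the #define in the hunks, but treat the sketch as illustrative; link with -lm):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define OFF 0x3fe6900900000000ULL

static double
log_sketch (double x)
{
  uint64_t ix, tmp, iz;
  memcpy (&ix, &x, sizeof ix);
  tmp = ix - OFF;
  int64_t k = (int64_t) tmp >> 52;           /* arithmetic shift: exponent */
  iz = ix - (tmp & (0xfffULL << 52));        /* z = x * 2^-k               */
  double z;
  memcpy (&z, &iz, sizeof z);                /* z in roughly [0.70, 1.41)  */
  /* The real routines use log(c) + poly(z/c - 1) here instead of log1p.  */
  return log1p (z - 1.0) + (double) k * 0x1.62e42fefa39efp-1;   /* + k*ln2 */
}

int
main (void)
{
  printf ("%a\n%a\n", log_sketch (123.456), log (123.456));
  return 0;
}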
diff --git a/sysdeps/aarch64/fpu/log10_sve.c b/sysdeps/aarch64/fpu/log10_sve.c
index ab7362128d9b3ffb..f1cad2759a31a178 100644
--- a/sysdeps/aarch64/fpu/log10_sve.c
+++ b/sysdeps/aarch64/fpu/log10_sve.c
@@ -23,28 +23,49 @@
#define Min 0x0010000000000000
#define Max 0x7ff0000000000000
#define Thres 0x7fe0000000000000 /* Max - Min. */
-#define Off 0x3fe6900900000000
#define N (1 << V_LOG10_TABLE_BITS)
+static const struct data
+{
+ double c0, c2;
+ double c1, c3;
+ double invln10, log10_2;
+ double c4;
+ uint64_t off;
+} data = {
+ .c0 = -0x1.bcb7b1526e506p-3,
+ .c1 = 0x1.287a7636be1d1p-3,
+ .c2 = -0x1.bcb7b158af938p-4,
+ .c3 = 0x1.63c78734e6d07p-4,
+ .c4 = -0x1.287461742fee4p-4,
+ .invln10 = 0x1.bcb7b1526e50ep-2,
+ .log10_2 = 0x1.34413509f79ffp-2,
+ .off = 0x3fe6900900000000,
+};
+
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
+ svbool_t special, const struct data *d)
{
- return sv_call_f64 (log10, x, y, special);
+ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
+ return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special);
}
-/* SVE log10 algorithm.
+/* Double-precision SVE log10 routine.
Maximum measured error is 2.46 ulps.
SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6
want 0x1.fffbdf6eaa667p-6. */
svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
svuint64_t ix = svreinterpret_u64 (x);
svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_x (pg, ix, Off);
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS);
i = svand_x (pg, i, (N - 1) << 1);
svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
@@ -62,15 +83,19 @@ svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
svfloat64_t r = svmad_x (pg, invc, z, -1.0);
/* hi = log(c) + k*log(2). */
- svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10);
- svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2);
+ svfloat64_t invln10_log10_2 = svld1rq_f64 (svptrue_b64 (), &d->invln10);
+ svfloat64_t w = svmla_lane_f64 (logc, r, invln10_log10_2, 0);
+ svfloat64_t hi = svmla_lane_f64 (w, k, invln10_log10_2, 1);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly);
+ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
+ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
+ y = svmla_x (pg, y, r2, d->c4);
+ y = svmla_x (pg, p, r2, y);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
- special);
+ return special_case (hi, tmp, y, r2, special, d);
return svmla_x (pg, hi, r2, y);
}
diff --git a/sysdeps/aarch64/fpu/log2_sve.c b/sysdeps/aarch64/fpu/log2_sve.c
index 743fa2a91392093b..908e638246abc13d 100644
--- a/sysdeps/aarch64/fpu/log2_sve.c
+++ b/sysdeps/aarch64/fpu/log2_sve.c
@@ -21,15 +21,32 @@
#include "poly_sve_f64.h"
#define N (1 << V_LOG2_TABLE_BITS)
-#define Off 0x3fe6900900000000
#define Max (0x7ff0000000000000)
#define Min (0x0010000000000000)
#define Thresh (0x7fe0000000000000) /* Max - Min. */
+static const struct data
+{
+ double c0, c2;
+ double c1, c3;
+ double invln2, c4;
+ uint64_t off;
+} data = {
+ .c0 = -0x1.71547652b83p-1,
+ .c1 = 0x1.ec709dc340953p-2,
+ .c2 = -0x1.71547651c8f35p-2,
+ .c3 = 0x1.2777ebe12dda5p-2,
+ .c4 = -0x1.ec738d616fe26p-3,
+ .invln2 = 0x1.71547652b82fep0,
+ .off = 0x3fe6900900000000,
+};
+
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
+special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
+ svbool_t special, const struct data *d)
{
- return sv_call_f64 (log2, x, y, cmp);
+ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
+ return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special);
}
/* Double-precision SVE log2 routine.
@@ -40,13 +57,15 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
want 0x1.fffb34198d9ddp-5. */
svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
svuint64_t ix = svreinterpret_u64 (x);
svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_x (pg, ix, Off);
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS);
i = svand_x (pg, i, (N - 1) << 1);
svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
@@ -59,15 +78,19 @@ svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
/* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
+ svfloat64_t invln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->invln2);
svfloat64_t r = svmad_x (pg, invc, z, -1.0);
- svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2);
-
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly);
+ svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0);
w = svadd_x (pg, k, w);
+ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
+ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
+ y = svmla_lane_f64 (y, r2, invln2_and_c4, 1);
+ y = svmla_x (pg, p, r2, y);
+
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y),
- special);
+ return special_case (w, tmp, y, r2, special, d);
return svmla_x (pg, w, r2, y);
}
diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c
index 9b689f2ec7190338..044223400ba2463b 100644
--- a/sysdeps/aarch64/fpu/log_sve.c
+++ b/sysdeps/aarch64/fpu/log_sve.c
@@ -19,39 +19,54 @@
#include "sv_math.h"
-#define P(i) sv_f64 (__v_log_data.poly[i])
#define N (1 << V_LOG_TABLE_BITS)
-#define Off (0x3fe6900900000000)
-#define MaxTop (0x7ff)
-#define MinTop (0x001)
-#define ThreshTop (0x7fe) /* MaxTop - MinTop. */
+#define Max (0x7ff0000000000000)
+#define Min (0x0010000000000000)
+#define Thresh (0x7fe0000000000000) /* Max - Min. */
+
+static const struct data
+{
+ double c0, c2;
+ double c1, c3;
+ double ln2, c4;
+ uint64_t off;
+} data = {
+ .c0 = -0x1.ffffffffffff7p-2,
+ .c1 = 0x1.55555555170d4p-2,
+ .c2 = -0x1.0000000399c27p-2,
+ .c3 = 0x1.999b2e90e94cap-3,
+ .c4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+ .off = 0x3fe6900900000000,
+};
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
+special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
+ svbool_t special, const struct data *d)
{
- return sv_call_f64 (log, x, y, cmp);
+ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
+ return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special);
}
-/* SVE port of AdvSIMD log algorithm.
- Maximum measured error is 2.17 ulp:
- SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
- want 0x1.ffffff1cca045p-2. */
+/* Double-precision SVE log routine.
+ Maximum measured error is 2.64 ulp:
+ SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6
+ want 0x1.fffffffe88cafp+6. */
svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
svuint64_t ix = svreinterpret_u64 (x);
- svuint64_t top = svlsr_x (pg, ix, 52);
- svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop));
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_x (pg, ix, Off);
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
/* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N.
The actual value of i is double this due to table layout. */
svuint64_t i
= svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
- svint64_t k
- = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */
svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
svfloat64_t z = svreinterpret_f64 (iz);
/* Lookup in 2 global lists (length N). */
@@ -59,18 +74,22 @@ svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
- svfloat64_t r = svmad_x (pg, invc, z, -1);
- svfloat64_t kd = svcvt_f64_x (pg, k);
+ svfloat64_t kd = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
/* hi = r + log(c) + k*Ln2. */
- svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2);
+ svfloat64_t ln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->ln2);
+ svfloat64_t r = svmad_x (pg, invc, z, -1);
+ svfloat64_t hi = svmla_lane_f64 (logc, kd, ln2_and_c4, 0);
+ hi = svadd_x (pg, r, hi);
+
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = svmla_x (pg, P (2), r, P (3));
- svfloat64_t p = svmla_x (pg, P (0), r, P (1));
- y = svmla_x (pg, y, r2, P (4));
+ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
+ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
+ y = svmla_lane_f64 (y, r2, ln2_and_c4, 1);
y = svmla_x (pg, p, r2, y);
- if (__glibc_unlikely (svptest_any (pg, cmp)))
- return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp);
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (hi, tmp, y, r2, special, d);
return svmla_x (pg, hi, r2, y);
}
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 1d52bf9ebf534f1a..10788b790a963918 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1316,7 +1316,7 @@ float: 2
ldouble: 1
Function: "log_sve":
-double: 1
+double: 2
float: 3
Function: "log_towardzero":


@ -0,0 +1,87 @@
commit 65a96a6f2bb9f6f6f896394662279d263d59cdd2
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed Aug 7 14:43:47 2024 +0100
AArch64: Improve generic strlen
Improve performance by handling another 16 bytes before entering the loop.
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final
size computation to avoid increasing latency. On Neoverse V1 performance
of the random strlen benchmark improves by 4.6%.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7)
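
The entry blocks below build a 64-bit syndrome from a 16-byte compare by narrowing with SHRN by 4, so every input byte contributes one nibble and the first NUL index is ctz(syndrome)/4 (RBIT+CLZ in the assembly); the rewritten loop switches to ADDHN, which is cheaper to test, with a fixed-up final position computation. A C intrinsics sketch of the SHRN-based check for a single 16-byte block (little-endian only, no alignment or page-crossing handling, not the full strlen):

#include <arm_neon.h>
#include <stdint.h>

/* Return the index of the first NUL in a 16-byte block, or 16 if none.  */
static inline unsigned
first_nul_16 (const unsigned char *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t is_nul = vceqzq_u8 (data);                /* 0xff per NUL byte */
  /* Narrow 16 bytes -> 64 bits: one syndrome nibble per input byte.  */
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (is_nul), 4);
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
  if (synd == 0)
    return 16;
  return (unsigned) (__builtin_ctzll (synd) >> 2);     /* 4 bits per byte */
}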
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index ab2a576cdb5665e5..352fb40d3abbb44b 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Generic optimized strlen using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -56,36 +57,50 @@ ENTRY (STRLEN)
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
- cbz synd, L(loop)
+ cbz synd, L(next16)
rbit synd, synd
clz result, synd
lsr result, result, 2
ret
+L(next16):
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop)
+ add src, src, 16
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub result, src, srcin
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
.p2align 5
L(loop):
- ldr data, [src, 16]
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbnz synd, L(loop_end)
- ldr data, [src, 32]!
+ ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbz synd, L(loop)
- sub src, src, 16
+ add src, src, 16
L(loop_end):
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
- sub result, src, srcin
- fmov synd, dend
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
#ifndef __AARCH64EB__
rbit synd, synd
+ sub result, result, 3
#endif
- add result, result, 16
clz tmp, synd
- add result, result, tmp, lsr 2
+ sub result, tmp, result
+ lsr result, result, 2
ret
END (STRLEN)


@ -0,0 +1,282 @@
commit dd1e63ab580d801926265007796f290b84747ec8
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Mon Sep 9 15:26:47 2024 +0100
AArch64: Optimize memset
Improve small memsets by avoiding branches and use overlapping stores.
Use DC ZVA for copies over 128 bytes. Remove unnecessary code for ZVA sizes
other than 64 and 128. Performance of random memset benchmark improves by 24%
on Neoverse N1.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8)
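
The small-size paths below avoid branching on individual size bits by issuing a fixed number of possibly overlapping stores whose offsets are derived from the length. A portable C sketch of the 16..63-byte case (the assembly does the same with q-register stores):

#include <stddef.h>
#include <string.h>

/* Overlapping-store memset for 16 <= count <= 63: four 16-byte stores
   always cover the buffer, some bytes simply get written twice.  */
static void
memset_16_63 (void *dstin, int c, size_t count)
{
  unsigned char v[16];
  memset (v, c, sizeof v);                 /* stand-in for "dup v0.16b, valw" */

  unsigned char *dst = dstin;
  unsigned char *dstend = dst + count;
  size_t off = 16 & (count >> 1);          /* 0 if count < 32, else 16 */

  memcpy (dst, v, 16);                     /* str q0, [dstin]        */
  memcpy (dst + off, v, 16);               /* str q0, [dstin, off]   */
  memcpy (dstend - off - 16, v, 16);       /* str q0, [dstend2, -16] */
  memcpy (dstend - 16, v, 16);             /* str q0, [dstend, -16]  */
}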
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 7ef77ee8c926de21..caafb019e2b6217b 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Generic optimized memset using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include "memset-reg.h"
#ifndef MEMSET
# define MEMSET memset
@@ -25,130 +25,132 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
-ENTRY (MEMSET)
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define off x3
+#define dstend2 x5
+ENTRY (MEMSET)
PTR_ARG (0)
SIZE_ARG (2)
dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_small)
+
add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
- cmp count, 96
- b.hi L(set_long)
- cmp count, 16
- b.hs L(set_medium)
- mov val, v0.D[0]
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+ .p2align 4
/* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
- ret
- nop
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
+L(set_small):
+ add dstend, dstin, count
+ cmp count, 4
+ b.lo 2f
+ lsr off, count, 3
+ sub dstend2, dstend, off, lsl 2
+ str s0, [dstin]
+ str s0, [dstin, off, lsl 2]
+ str s0, [dstend2, -4]
+ str s0, [dstend, -4]
ret
+
+ /* Set 0..3 bytes. */
2: cbz count, 3f
+ lsr off, count, 1
strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
+ strb valw, [dstin, off]
+ strb valw, [dstend, -1]
3: ret
- /* Set 17..96 bytes. */
-L(set_medium):
- str q0, [dstin]
- tbnz count, 6, L(set96)
- str q0, [dstend, -16]
- tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
-1: ret
-
.p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- str q0, [dstin, 16]
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
- and valw, valw, 255
- bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
+ str q0, [dst, 16]
+ tst valw, 255
+ b.ne L(no_zva)
+#ifndef ZVA64_ONLY
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+#endif
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
+
+ /* Write last bytes before ZVA loop. */
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
ret
-L(try_zva):
-#ifndef ZVA64_ONLY
.p2align 3
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
- nop
-#endif
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
- .p2align 4
-L(zva_64):
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 32 too large. */
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- bic dst, dst, 63
stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
add dst, dst, 64
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
#ifndef ZVA64_ONLY
- .p2align 3
+ .p2align 4
L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
+ cmp zva_val, 5 /* ZVA size is 128 bytes. */
+ b.ne L(no_zva)
- str q0, [dst, 16]
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]
stp q0, q0, [dst, 96]
bic dst, dst, 127
sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
+ sub count, count, 128 + 128 /* Adjust count and bias for loop. */
+1: add dst, dst, 128
+ dc zva, dst
subs count, count, 128
b.hi 1b
stp q0, q0, [dstend, -128]
@@ -156,35 +158,6 @@ L(zva_128):
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- sub dst, dst, 32 /* Bias dst for tail loop. */
- b L(tail64)
#endif
END (MEMSET)


@ -0,0 +1,60 @@
commit 0cd10047bf046a658f32e12833ccc42304b3b152
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Mon Nov 25 18:43:08 2024 +0000
AArch64: Remove zva_128 from memset
Remove ZVA 128 support from memset - the new memset no longer
guarantees count >= 256, which can result in underflow and a
crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA
size of 128 and its memcpy implementation was removed in commit
e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special
case too.
[1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html
Reviewed-by: Andrew Pinski <quic_apinski@quicinc.com>
(cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d)
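
The deleted block biases the remaining length by 128 + 128 on the assumption that at least that many bytes remain, but after the memset rework the long path only guarantees count > 128, so the subtraction can wrap and the b.hi loop then runs with an enormous count. A tiny C illustration of the wrap (values chosen for illustration, e.g. memset (p, 0, 150) with p already 128-byte aligned):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t dstin = 0x10000, count = 150;
  uint64_t dstend = dstin + count;
  uint64_t dst = dstin & ~127ULL;                      /* bic dst, dst, 127 */
  uint64_t loop_count = (dstend - dst) - (128 + 128);  /* wraps below zero  */
  printf ("biased count = %llu (should have been small)\n",
          (unsigned long long) loop_count);
  return 0;
}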
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index caafb019e2b6217b..71814d0b2f6dd3a7 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -104,7 +104,7 @@ L(set_long):
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
+ b.ne L(no_zva)
#endif
stp q0, q0, [dst, 32]
bic dst, dstin, 63
@@ -137,28 +137,5 @@ L(no_zva_loop):
stp q0, q0, [dstend, -32]
ret
-#ifndef ZVA64_ONLY
- .p2align 4
-L(zva_128):
- cmp zva_val, 5 /* ZVA size is 128 bytes. */
- b.ne L(no_zva)
-
- stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128 + 128 /* Adjust count and bias for loop. */
-1: add dst, dst, 128
- dc zva, dst
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-#endif
-
END (MEMSET)
libc_hidden_builtin_def (MEMSET)


@ -0,0 +1,29 @@
commit 0cc12d9c47eb97d82c8f5af3724b4a4bc01df74a
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed Jul 24 15:17:47 2024 +0100
math: Improve layout of expf data
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
changes the exp2f_data struct slightly so that the fields are better aligned.
As a result on targets that support them, load-pair instructions accessing
poly_scaled and invln2_scaled are now 16-byte aligned.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a)
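
Because, as the message notes, GCC gives a global of this size 16-byte alignment, moving shift to the end leaves invln2_scaled (and the following poly_scaled pair) at 16-byte-aligned offsets, which is what lets the load-pair accesses be aligned. A small compile-time check of that layout property (the table/order constants are assumptions mirroring upstream, so treat the exact offsets as illustrative):

#include <stddef.h>
#include <stdint.h>

/* Assumed constants; adjust if the real header differs.  */
#define EXP2F_TABLE_BITS 5
#define EXP2F_POLY_ORDER 3

struct exp2f_data_sketch
{
  uint64_t tab[1 << EXP2F_TABLE_BITS];
  double shift_scaled;
  double poly[EXP2F_POLY_ORDER];
  double invln2_scaled;                  /* no longer preceded by shift */
  double poly_scaled[EXP2F_POLY_ORDER];
  double shift;                          /* moved to the end */
};

/* With the global itself 16-byte aligned, a 16-byte-aligned offset means the
   LDP of invln2_scaled and poly_scaled[0] starts on a 16-byte boundary.  */
_Static_assert (offsetof (struct exp2f_data_sketch, invln2_scaled) % 16 == 0,
                "invln2_scaled/poly_scaled[0] sit on a 16-byte boundary");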
diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
index 729f22cd4f7dd9e4..dc07ebd45977e511 100644
--- a/sysdeps/ieee754/flt-32/math_config.h
+++ b/sysdeps/ieee754/flt-32/math_config.h
@@ -166,9 +166,9 @@ extern const struct exp2f_data
uint64_t tab[1 << EXP2F_TABLE_BITS];
double shift_scaled;
double poly[EXP2F_POLY_ORDER];
- double shift;
double invln2_scaled;
double poly_scaled[EXP2F_POLY_ORDER];
+ double shift;
} __exp2f_data attribute_hidden;
#define LOGF_TABLE_BITS 4


@ -0,0 +1,189 @@
commit d0e2133470d848e80eb4ba79ecd5d8c8b11fd2bb
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Tue Dec 24 18:01:59 2024 +0000
AArch64: Add SVE memset
Add SVE memset based on the generic memset with predicated load for sizes < 16.
Unaligned memsets of 128-1024 are improved by ~20% on average by using aligned
stores for the last 64 bytes. Performance of random memset benchmark improves
by ~2% on Neoverse V1.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
(cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548)
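
The part that is new relative to the generic memset is the sub-16-byte path: WHILELO builds a predicate covering exactly count lanes, so a single predicated store handles every size from 0 to 15 without branches. An ACLE intrinsics sketch of that path (assumes count is at most the vector length, which is at least 16 bytes on any SVE implementation; compile with SVE enabled, e.g. -march=armv8.2-a+sve):

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* One predicated store, no size branches; mirrors the L(set_16) block.  */
static inline void
memset_under_16_sve (void *dst, int c, size_t count)
{
  svbool_t pg = svwhilelt_b8_u64 (0, (uint64_t) count);  /* first count lanes */
  svst1_u8 (pg, (uint8_t *) dst, svdup_n_u8 ((uint8_t) c));
}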
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index e4720b746859f515..214b6137b0bc63a2 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -14,6 +14,7 @@ sysdep_routines += \
memset_generic \
memset_kunpeng \
memset_mops \
+ memset_sve_zva64 \
memset_zva64 \
strlen_asimd \
strlen_generic \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index ecd0f87de6a5b254..f8544fe3b525f775 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -57,6 +57,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
#if HAVE_AARCH64_SVE_ASM
IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx)
+ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64)
#endif
IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 34bce045dd64ba9b..9d98664e6bc32212 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -34,6 +34,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
+extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden;
static inline __typeof (__redirect_memset) *
select_memset_ifunc (void)
@@ -47,6 +48,9 @@ select_memset_ifunc (void)
{
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
+
+ if (zva_size == 64)
+ return __memset_sve_zva64;
}
if (IS_KUNPENG920 (midr))
diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
new file mode 100644
index 0000000000000000..7fb40fdd9e927bb3
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
@@ -0,0 +1,123 @@
+/* Optimized memset for SVE.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ * ZVA size is 64.
+ */
+
+#if HAVE_AARCH64_SVE_ASM
+
+.arch armv8.2-a+sve
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define vlen x5
+#define off x3
+#define dstend2 x5
+
+ENTRY (__memset_sve_zva64)
+ dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_16)
+
+ add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
+
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+
+ .p2align 4
+L(set_16):
+ whilelo p0.b, xzr, count
+ st1b z0.b, p0, [dstin]
+ ret
+
+ .p2align 4
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ cmp count, 256
+ b.lo L(no_zva)
+ tst valw, 255
+ b.ne L(no_zva)
+
+ str q0, [dstin]
+ str q0, [dst, 16]
+ bic dst, dstin, 31
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */
+ bic x8, x8, 15
+ stp q0, q0, [x8, -48]
+ str q0, [x8, -16]
+ str q0, [dstend, -16]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
+ ret
+
+L(no_zva):
+ str q0, [dstin]
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 16]
+ stp q0, q0, [dst, 48]
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (__memset_sve_zva64)
+#endif


@ -0,0 +1,24 @@
commit a1b09e59e2de9a5634a864e1a915f9f46e2cdd3a
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu Feb 27 16:28:52 2025 +0000
AArch64: Use prefer_sve_ifuncs for SVE memset
Use prefer_sve_ifuncs for SVE memset just like memcpy.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
(cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4)
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index 9d98664e6bc32212..161624fe6028d9e9 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -49,7 +49,7 @@ select_memset_ifunc (void)
if (IS_A64FX (midr) && zva_size == 256)
return __memset_a64fx;
- if (zva_size == 64)
+ if (prefer_sve_ifuncs && zva_size == 64)
return __memset_sve_zva64;
}


@ -0,0 +1,43 @@
commit dd8c0c3bbd4e22e00a7275c75dc0d40f24bb0d68
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Fri Dec 13 15:43:07 2024 +0000
math: Improve layout of exp/exp10 data
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
changes the exp_data struct slightly so that the fields are better aligned
and without gaps. As a result, on targets that support them, more load-pair
instructions are used in exp. Exp10 is improved by moving invlog10_2N later
so that neglog10_2hiN and neglog10_2loN can be loaded using a load-pair.
The exp benchmark improves by 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on
Neoverse V2. Exp10 improves by 1.5%.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7)
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index ef87cfa6be9860e3..05515fd95ad15d52 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -195,16 +195,18 @@ check_uflow (double x)
extern const struct exp_data
{
double invln2N;
- double shift;
double negln2hiN;
double negln2loN;
double poly[4]; /* Last four coefficients. */
+ double shift;
+
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
- double invlog10_2N;
+
double neglog10_2hiN;
double neglog10_2loN;
double exp10_poly[5];
+ double invlog10_2N;
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
} __exp_data attribute_hidden;


@ -0,0 +1,54 @@
commit e1fe22368e4fbc13ce300d89802b7fcc0d5cfb38
Author: Michael Jeanson <mjeanson@efficios.com>
Date: Fri Feb 14 13:54:22 2025 -0500
nptl: clear the whole rseq area before registration
Due to the extensible nature of the rseq area, we can't explicitly
initialize fields that are not part of the ABI yet. It was agreed with
upstream that all new fields will be documented as zero-initialized by
userspace. Future kernels configured with CONFIG_DEBUG_RSEQ will
validate the content of all fields during registration.
Replace the explicit field initialization with a memset of the whole
rseq area, which will cover new fields as they are added in future kernels.
Signed-off-by: Michael Jeanson <mjeanson@efficios.com>
Reviewed-by: Florian Weimer <fweimer@redhat.com>
(cherry picked from commit 689a62a4217fae78b9ce0db781dc2a421f2b1ab4)
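The new sequence is short enough to sketch outside glibc. This is an illustrative sketch only: the real code uses THREAD_SETMEM and INTERNAL_SYSCALL_CALL, the area is assumed to be suitably aligned and at least RSEQ_AREA_SIZE_INITIAL (32) bytes, and a program whose libc has already registered rseq for the thread will simply get an error back from the kernel.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>     /* SYS_rseq, assuming recent kernel headers.  */
#include <linux/rseq.h>      /* struct rseq, RSEQ_CPU_ID_UNINITIALIZED.  */

static long
toy_register_rseq (struct rseq *area, unsigned int size, unsigned int sig)
{
  /* Zero the whole area first: fields appended by future kernels are
     documented as zero-initialized by userspace, and kernels built with
     CONFIG_DEBUG_RSEQ may validate them at registration time.  */
  memset (area, 0, size);

  /* cpu_id must start out as RSEQ_CPU_ID_UNINITIALIZED.  */
  area->cpu_id = RSEQ_CPU_ID_UNINITIALIZED;

  return syscall (SYS_rseq, area, size, 0, sig);
}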
diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c
index 7803e19fd16ad803..ed10185e3708f4b6 100644
--- a/sysdeps/nptl/dl-tls_init_tp.c
+++ b/sysdeps/nptl/dl-tls_init_tp.c
@@ -23,6 +23,7 @@
#include <tls.h>
#include <rseq-internal.h>
#include <thread_pointer.h>
+#include <dl-symbol-redir-ifunc.h>
#define TUNABLE_NAMESPACE pthread
#include <dl-tunables.h>
diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h
index ef3eab1fefd4d90d..76de2b7ff079eb0f 100644
--- a/sysdeps/unix/sysv/linux/rseq-internal.h
+++ b/sysdeps/unix/sysv/linux/rseq-internal.h
@@ -52,13 +52,12 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq)
but still expected size 32. */
size = RSEQ_AREA_SIZE_INITIAL;
- /* Initialize the rseq fields that are read by the kernel on
- registration, there is no guarantee that struct pthread is
- cleared on all architectures. */
+ /* Initialize the whole rseq area to zero prior to registration. */
+ memset (&self->rseq_area, 0, size);
+
+ /* Set the cpu_id field to RSEQ_CPU_ID_UNINITIALIZED, this is checked by
+ the kernel at registration when CONFIG_DEBUG_RSEQ is enabled. */
THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED);
- THREAD_SETMEM (self, rseq_area.cpu_id_start, 0);
- THREAD_SETMEM (self, rseq_area.rseq_cs, 0);
- THREAD_SETMEM (self, rseq_area.flags, 0);
int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area,
size, 0, RSEQ_SIG);


@ -0,0 +1,193 @@
commit 7ecf0d3bde54e4f9e6f025d2f43eff565ed97414
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Apr 4 15:43:50 2024 -0700
x86-64: Exclude FMA4 IFUNC functions for -mapxf
When -mapxf is used to build glibc, the resulting glibc will never run
on FMA4 machines. Exclude FMA4 IFUNC functions when -mapxf is used.
This requires a GCC that defines __APX_F__ for -mapxf, i.e. one containing commit:
1df56719bd8 x86: Define __APX_F__ for -mapxf
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
(cherry picked from commit 9e1f4aef865ddeffeb4b5f6578fefab606783120)
diff --git a/config.h.in b/config.h.in
index 1e647de58580bc2d..a5fdea0c3c7b070e 100644
--- a/config.h.in
+++ b/config.h.in
@@ -295,4 +295,7 @@
/* Define if -mmovbe is enabled by default on x86. */
#undef HAVE_X86_MOVBE
+/* Define if -mapxf is enabled by default on x86. */
+#undef HAVE_X86_APX
+
#endif
diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
index 04a534fa126a7bf7..07bdd40a37247c7b 100755
--- a/sysdeps/x86_64/configure
+++ b/sysdeps/x86_64/configure
@@ -162,6 +162,38 @@ printf "%s\n" "$libc_cv_x86_have_amx_tile" >&6; }
config_vars="$config_vars
have-mamx-tile = $libc_cv_x86_have_amx_tile"
+# Check if -mapxf is enabled.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mapxf is enabled" >&5
+printf %s "checking whether -mapxf is enabled... " >&6; }
+if test ${libc_cv_x86_have_apx+y}
+then :
+ printf %s "(cached) " >&6
+else $as_nop
+ cat > conftest.c <<EOF
+#ifndef __APX_F__
+# error APX isn't enabled
+#endif
+EOF
+ libc_cv_x86_have_apx=no
+ if { ac_try='${CC-cc} -c $CFLAGS conftest.c -o conftest.o 1>&5'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }; then
+ libc_cv_x86_have_apx=yes
+ fi
+ rm -rf conftest*
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x86_have_apx" >&5
+printf "%s\n" "$libc_cv_x86_have_apx" >&6; }
+if test $libc_cv_x86_have_apx = yes; then
+ printf "%s\n" "#define HAVE_X86_APX 1" >>confdefs.h
+
+fi
+config_vars="$config_vars
+have-x86-apx = $libc_cv_x86_have_apx"
+
test -n "$critic_missing" && as_fn_error $? "
*** $critic_missing" "$LINENO" 5
diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
index c714c47351e70390..c7b68544a2c79ae5 100644
--- a/sysdeps/x86_64/configure.ac
+++ b/sysdeps/x86_64/configure.ac
@@ -76,5 +76,23 @@ EOF
rm -rf conftest*])
LIBC_CONFIG_VAR([have-mamx-tile], [$libc_cv_x86_have_amx_tile])
+# Check if -mapxf is enabled.
+AC_CACHE_CHECK(whether -mapxf is enabled,
+ libc_cv_x86_have_apx, [dnl
+cat > conftest.c <<EOF
+#ifndef __APX_F__
+# error APX isn't enabled
+#endif
+EOF
+ libc_cv_x86_have_apx=no
+ if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS conftest.c -o conftest.o 1>&AS_MESSAGE_LOG_FD); then
+ libc_cv_x86_have_apx=yes
+ fi
+ rm -rf conftest*])
+if test $libc_cv_x86_have_apx = yes; then
+ AC_DEFINE(HAVE_X86_APX)
+fi
+LIBC_CONFIG_VAR([have-x86-apx], [$libc_cv_x86_have_apx])
+
test -n "$critic_missing" && AC_MSG_ERROR([
*** $critic_missing])
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 6ddd50240ce33d22..cbe09d49f49581f1 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -38,29 +38,36 @@ libm-sysdep_routines += \
s_truncf-avx \
# libm-sysdep_routines
else
+ifeq (no,$(have-x86-apx))
libm-sysdep_routines += \
- e_asin-fma \
e_asin-fma4 \
+ e_atan2-fma4 \
+ e_exp-fma4 \
+ e_log-fma4 \
+ e_pow-fma4 \
+ s_atan-fma4 \
+ s_sin-fma4 \
+ s_sincos-fma4 \
+ s_tan-fma4 \
+# libm-sysdep_routines
+endif
+libm-sysdep_routines += \
+ e_asin-fma \
e_atan2-avx \
e_atan2-fma \
- e_atan2-fma4 \
e_exp-avx \
e_exp-fma \
- e_exp-fma4 \
e_exp2f-fma \
e_expf-fma \
e_log-avx \
e_log-fma \
- e_log-fma4 \
e_log2-fma \
e_log2f-fma \
e_logf-fma \
e_pow-fma \
- e_pow-fma4 \
e_powf-fma \
s_atan-avx \
s_atan-fma \
- s_atan-fma4 \
s_ceil-sse4_1 \
s_ceilf-sse4_1 \
s_cosf-fma \
@@ -77,17 +84,14 @@ libm-sysdep_routines += \
s_roundevenf-sse4_1 \
s_sin-avx \
s_sin-fma \
- s_sin-fma4 \
s_sincos-avx \
s_sincos-fma \
- s_sincos-fma4 \
s_sincosf-fma \
s_sincosf-sse2 \
s_sinf-fma \
s_sinf-sse2 \
s_tan-avx \
s_tan-fma \
- s_tan-fma4 \
s_trunc-sse4_1 \
s_truncf-sse4_1 \
# libm-sysdep_routines
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h
index 7719188888fbec38..d126cf9cd5ae55e4 100644
--- a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h
@@ -33,8 +33,10 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2))
return OPTIMIZE (fma);
+#ifndef HAVE_X86_APX
if (CPU_FEATURE_USABLE_P (cpu_features, FMA4))
return OPTIMIZE (fma4);
+#endif
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
return OPTIMIZE (avx);
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h
index c35ba13845b7914b..18d372d25cb598f2 100644
--- a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h
@@ -32,8 +32,10 @@ IFUNC_SELECTOR (void)
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2))
return OPTIMIZE (fma);
+#ifndef HAVE_X86_APX
if (CPU_FEATURE_USABLE_P (cpu_features, FMA4))
return OPTIMIZE (fma4);
+#endif
return OPTIMIZE (sse2);
}


@ -0,0 +1,106 @@
commit 0edcc77fe7e13b29d99e7f4d7fe3373b3666468e
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Mon Mar 10 10:24:07 2025 -0700
x86_64: Add tanh with FMA
On Skylake, it improves tanh bench performance by:
       Before    After     Improvement
max    110.89    95.826    14%
min    20.966    20.157    4%
mean   30.9601   29.8431   4%
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit c6352111c72a20b3588ae304dd99b63e25dd6d85)
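The mechanism behind the one-line s_tanh-fma.c is a general glibc pattern: the generic source gains a SECTION hook that defaults to nothing, and the FMA translation unit renames the symbol, pins it into .text.fma and re-includes the generic code so it is rebuilt with -mfma -mavx2. Below is a hypothetical, self-contained sketch of the same pattern with invented names (my_func is not a real glibc symbol).

/* Generic implementation, parameterised the same way s_tanh.c is
   after this patch.  */
#ifndef SECTION
# define SECTION
#endif

SECTION
double
my_func (double x)
{
  return 2.0 * x + 1.0;        /* stand-in for the real math kernel.  */
}

/* A separate my_func-fma.c, compiled with -mfma -mavx2, would contain
   only:

     #define my_func __my_func_fma
     #define SECTION __attribute__ ((section (".text.fma")))
     #include "my_func.c"

   and an IFUNC wrapper (like the s_tanh.c added in the diff below)
   then resolves between __my_func_fma and the baseline at load time.  */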
diff --git a/sysdeps/ieee754/dbl-64/s_tanh.c b/sysdeps/ieee754/dbl-64/s_tanh.c
index 673a97102de292fd..13063db04ebb198c 100644
--- a/sysdeps/ieee754/dbl-64/s_tanh.c
+++ b/sysdeps/ieee754/dbl-64/s_tanh.c
@@ -46,6 +46,11 @@ static char rcsid[] = "$NetBSD: s_tanh.c,v 1.7 1995/05/10 20:48:22 jtc Exp $";
static const double one = 1.0, two = 2.0, tiny = 1.0e-300;
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__tanh (double x)
{
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index cbe09d49f49581f1..0f69f7089c06af73 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -10,6 +10,7 @@ CFLAGS-s_expm1-fma.c = -mfma -mavx2
CFLAGS-s_log1p-fma.c = -mfma -mavx2
CFLAGS-s_sin-fma.c = -mfma -mavx2
CFLAGS-s_tan-fma.c = -mfma -mavx2
+CFLAGS-s_tanh-fma.c = -mfma -mavx2
CFLAGS-s_sincos-fma.c = -mfma -mavx2
CFLAGS-e_exp2f-fma.c = -mfma -mavx2
@@ -92,6 +93,7 @@ libm-sysdep_routines += \
s_sinf-sse2 \
s_tan-avx \
s_tan-fma \
+ s_tanh-fma \
s_trunc-sse4_1 \
s_truncf-sse4_1 \
# libm-sysdep_routines
diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
new file mode 100644
index 0000000000000000..1b808b1227f50cf5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
@@ -0,0 +1,11 @@
+#define __tanh __tanh_fma
+#define __expm1 __expm1_fma
+
+/* NB: __expm1 may be expanded to __expm1_fma in the following
+ prototypes. */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/s_tanh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh.c b/sysdeps/x86_64/fpu/multiarch/s_tanh.c
new file mode 100644
index 0000000000000000..5539b6c61c63548d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_tanh.c
@@ -0,0 +1,31 @@
+/* Multiple versions of tanh.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdeps/x86/isa-level.h>
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
+
+extern double __redirect_tanh (double);
+
+# define SYMBOL_NAME tanh
+# include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_tanh, __tanh, IFUNC_SELECTOR ());
+
+# define __tanh __tanh_sse2
+#endif
+#include <sysdeps/ieee754/dbl-64/s_tanh.c>


@ -0,0 +1,130 @@
commit 01ed435e2ee8df18f107ac9d999e1c4db922f564
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Sat Mar 8 08:51:10 2025 -0800
x86_64: Add sinh with FMA
On SPR, it improves sinh bench performance by:
                        Before    After     Improvement
reciprocal-throughput   14.2017   11.815    17%
latency                 36.4917   35.2114   4%
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit dded0d20f67ba1925ccbcb9cf28f0c75febe0dbe)
diff --git a/benchtests/sinh-inputs b/benchtests/sinh-inputs
index 7b1ac46a39c0a0b0..2fcb2fabf82ce778 100644
--- a/benchtests/sinh-inputs
+++ b/benchtests/sinh-inputs
@@ -1,6 +1,7 @@
## args: double
## ret: double
## includes: math.h
+## name: workload-random
0x1.bcb6129b5ff2bp8
-0x1.63057386325ebp9
0x1.62f1d7dc4e8bfp9
diff --git a/sysdeps/ieee754/dbl-64/e_sinh.c b/sysdeps/ieee754/dbl-64/e_sinh.c
index b4b5857dddf90f7a..3f787967f93d72f0 100644
--- a/sysdeps/ieee754/dbl-64/e_sinh.c
+++ b/sysdeps/ieee754/dbl-64/e_sinh.c
@@ -41,6 +41,11 @@ static char rcsid[] = "$NetBSD: e_sinh.c,v 1.7 1995/05/10 20:46:13 jtc Exp $";
static const double one = 1.0, shuge = 1.0e307;
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__ieee754_sinh (double x)
{
@@ -90,4 +95,7 @@ __ieee754_sinh (double x)
/* |x| > overflowthresold, sinh(x) overflow */
return math_narrow_eval (x * shuge);
}
+
+#ifndef __ieee754_sinh
libm_alias_finite (__ieee754_sinh, __sinh)
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 0f69f7089c06af73..b527cab8d134be21 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -5,6 +5,7 @@ CFLAGS-e_exp-fma.c = -mfma -mavx2
CFLAGS-e_log-fma.c = -mfma -mavx2
CFLAGS-e_log2-fma.c = -mfma -mavx2
CFLAGS-e_pow-fma.c = -mfma -mavx2
+CFLAGS-e_sinh-fma.c = -mfma -mavx2
CFLAGS-s_atan-fma.c = -mfma -mavx2
CFLAGS-s_expm1-fma.c = -mfma -mavx2
CFLAGS-s_log1p-fma.c = -mfma -mavx2
@@ -67,6 +68,7 @@ libm-sysdep_routines += \
e_logf-fma \
e_pow-fma \
e_powf-fma \
+ e_sinh-fma \
s_atan-avx \
s_atan-fma \
s_ceil-sse4_1 \
diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
new file mode 100644
index 0000000000000000..e0e1e39a7a606dc8
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
@@ -0,0 +1,12 @@
+#define __ieee754_sinh __ieee754_sinh_fma
+#define __ieee754_exp __ieee754_exp_fma
+#define __expm1 __expm1_fma
+
+/* NB: __expm1 may be expanded to __expm1_fma in the following
+ prototypes. */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/e_sinh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh.c b/sysdeps/x86_64/fpu/multiarch/e_sinh.c
new file mode 100644
index 0000000000000000..3d3c18ccdf1d437a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_sinh.c
@@ -0,0 +1,35 @@
+/* Multiple versions of sinh.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdeps/x86/isa-level.h>
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
+# include <libm-alias-finite.h>
+
+extern double __redirect_ieee754_sinh (double);
+
+# define SYMBOL_NAME ieee754_sinh
+# include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_ieee754_sinh, __ieee754_sinh,
+ IFUNC_SELECTOR ());
+
+libm_alias_finite (__ieee754_sinh, __sinh)
+
+# define __ieee754_sinh __ieee754_sinh_sse2
+#endif
+#include <sysdeps/ieee754/dbl-64/e_sinh.c>


@ -0,0 +1,123 @@
commit 4cf3f9df544a6f3dc27ea097b43bd2fb73113c3f
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Wed Mar 5 16:13:38 2025 -0800
x86_64: Add atanh with FMA
On SPR, it improves atanh bench performance by:
                        Before    After     Improvement
reciprocal-throughput   15.1715   14.8628   2%
latency                 57.1941   56.1883   2%
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit c7c4a5906f326f1290b1c2413a83c530564ec4b8)
diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs
index 455aa65b6500bccb..498529325436d48f 100644
--- a/benchtests/atanh-inputs
+++ b/benchtests/atanh-inputs
@@ -1,6 +1,7 @@
## args: double
## ret: double
## includes: math.h
+## name: workload-random
0x1.5a2730bacd94ap-1
-0x1.b57eb40fc048ep-21
-0x1.c0b185fb450e2p-17
diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c
index 11a2a45799d09f63..05ac0a1b30c164a7 100644
--- a/sysdeps/ieee754/dbl-64/e_atanh.c
+++ b/sysdeps/ieee754/dbl-64/e_atanh.c
@@ -44,6 +44,11 @@
static const double huge = 1e300;
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
double
__ieee754_atanh (double x)
{
@@ -73,4 +78,7 @@ __ieee754_atanh (double x)
return copysign (t, x);
}
+
+#ifndef __ieee754_atanh
libm_alias_finite (__ieee754_atanh, __atanh)
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index b527cab8d134be21..bc479b42d279825b 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -1,6 +1,7 @@
ifeq ($(subdir),math)
CFLAGS-e_asin-fma.c = -mfma -mavx2
CFLAGS-e_atan2-fma.c = -mfma -mavx2
+CFLAGS-e_atanh-fma.c = -mfma -mavx2
CFLAGS-e_exp-fma.c = -mfma -mavx2
CFLAGS-e_log-fma.c = -mfma -mavx2
CFLAGS-e_log2-fma.c = -mfma -mavx2
@@ -57,6 +58,7 @@ libm-sysdep_routines += \
e_asin-fma \
e_atan2-avx \
e_atan2-fma \
+ e_atanh-fma \
e_exp-avx \
e_exp-fma \
e_exp2f-fma \
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
new file mode 100644
index 0000000000000000..c3f2f9e5506ae363
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
@@ -0,0 +1,6 @@
+#define __ieee754_atanh __ieee754_atanh_fma
+#define __log1p __log1p_fma
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/e_atanh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh.c b/sysdeps/x86_64/fpu/multiarch/e_atanh.c
new file mode 100644
index 0000000000000000..d2b785dfc0268df8
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c
@@ -0,0 +1,34 @@
+/* Multiple versions of atanh.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdeps/x86/isa-level.h>
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
+# include <libm-alias-finite.h>
+
+extern double __redirect_ieee754_atanh (double);
+
+# define SYMBOL_NAME ieee754_atanh
+# include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ());
+
+libm_alias_finite (__ieee754_atanh, __atanh)
+
+# define __ieee754_atanh __ieee754_atanh_sse2
+#endif
+#include <sysdeps/ieee754/dbl-64/e_atanh.c>


@ -0,0 +1,47 @@
commit 60cd7123a6c4441a509c22cc1d5da60df2c1dfeb
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Mar 28 09:26:06 2025 +0100
x86: Skip XSAVE state size reset if ISA level requires XSAVE
If we have to use XSAVE or XSAVEC trampolines, do not adjust the size
information they need. Technically, it is an operator error to try to
run with -XSAVE,-XSAVEC on such builds, but this change here disables
some unnecessary code with higher ISA levels and simplifies testing.
Related to commit befe2d3c4dec8be2cdd01a47132e47bdb7020922
("x86-64: Don't use SSE resolvers for ISA level 3 or above").
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 59585ddaa2d44f22af04bb4b8bd4ad1e302c4c02)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 3d7c2819d7cc6643..4c535970d10a2d67 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -24,6 +24,7 @@
#include <dl-cacheinfo.h>
#include <dl-minsigstacksize.h>
#include <dl-hwcap2.h>
+#include <gcc-macros.h>
extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
attribute_hidden;
@@ -1092,6 +1093,9 @@ no_cpuid:
TUNABLE_CALLBACK (set_prefer_map_32bit_exec));
#endif
+ /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build
+ requires AVX and therefore XSAVE or XSAVEC support. */
+#ifndef GCCMACRO__AVX__
bool disable_xsave_features = false;
if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE))
@@ -1145,6 +1149,7 @@ no_cpuid:
CPU_FEATURE_UNSET (cpu_features, FMA4);
}
+#endif
#ifdef __x86_64__
GLRO(dl_hwcap) = HWCAP_X86_64;


@ -0,0 +1,183 @@
commit 87ab0c7f7f7c4bc16cda782c703b61cd28f383a3
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Mar 28 09:26:59 2025 +0100
x86: Use separate variable for TLSDESC XSAVE/XSAVEC state size (bug 32810)
Previously, the initialization code reused the xsave_state_full_size
member of struct cpu_features for the TLSDESC state size. However,
the tunable processing code assumes that this member has the
original XSAVE (non-compact) state size, so that it can use its
value if XSAVEC is disabled via tunable.
This change uses a separate variable and not a struct member because
the value is only needed in ld.so and the static libc, but not in
libc.so. As a result, struct cpu_features layout does not change,
helping a future backport of this change.
Fixes commit 9b7091415af47082664717210ac49d51551456ab ("x86-64:
Update _dl_tlsdesc_dynamic to preserve AMX registers").
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 145097dff170507fe73190e8e41194f5b5f7e6bf)
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 5311b594aff62f7c..8819fba1b7164f45 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -21,6 +21,9 @@ tests += \
tst-cpu-features-supports-static \
tst-get-cpu-features \
tst-get-cpu-features-static \
+ tst-gnu2-tls2-x86-noxsave \
+ tst-gnu2-tls2-x86-noxsavec \
+ tst-gnu2-tls2-x86-noxsavexsavec \
tst-hwcap-tunables \
# tests
tests-static += \
@@ -91,6 +94,22 @@ CFLAGS-tst-gnu2-tls2.c += -msse
CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
+
+LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy
+LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy
+LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy
+
+# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled
+# via tunable.
+tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE
+tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
+tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC
+$(objpfx)tst-gnu2-tls2-x86-noxsave.out \
+$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \
+$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \
+ $(objpfx)tst-gnu2-tls2mod0.so \
+ $(objpfx)tst-gnu2-tls2mod1.so \
+ $(objpfx)tst-gnu2-tls2mod2.so
endif
ifeq ($(subdir),math)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 4c535970d10a2d67..3be69558a4c3aa2d 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -84,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *)
# include <dl-cet.h>
#endif
+unsigned long int _dl_x86_features_tlsdesc_state_size;
+
static void
update_active (struct cpu_features *cpu_features)
{
@@ -318,6 +320,7 @@ update_active (struct cpu_features *cpu_features)
= xsave_state_full_size;
cpu_features->xsave_state_full_size
= xsave_state_full_size;
+ _dl_x86_features_tlsdesc_state_size = xsave_state_full_size;
/* Check if XSAVEC is available. */
if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC))
@@ -406,11 +409,9 @@ update_active (struct cpu_features *cpu_features)
= ALIGN_UP ((amx_size
+ TLSDESC_CALL_REGISTER_SAVE_AREA),
64);
- /* Set xsave_state_full_size to the compact AMX
- state size for XSAVEC. NB: xsave_state_full_size
- is only used in _dl_tlsdesc_dynamic_xsave and
- _dl_tlsdesc_dynamic_xsavec. */
- cpu_features->xsave_state_full_size = amx_size;
+ /* Set TLSDESC state size to the compact AMX
+ state size for XSAVEC. */
+ _dl_x86_features_tlsdesc_state_size = amx_size;
#endif
cpu_features->xsave_state_size
= ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 89da7a03daa665f6..a72ba61d837c6383 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
/* Update xsave_state_size to XSAVE state size. */
cpu_features->xsave_state_size
= cpu_features->xsave_state_full_size;
+ _dl_x86_features_tlsdesc_state_size
+ = cpu_features->xsave_state_full_size;
CPU_FEATURE_UNSET (cpu_features, XSAVEC);
}
}
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index c76ea3be16f6bead..9f10645ee9778741 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -78,6 +78,8 @@ _dl_diagnostics_cpu (void)
cpu_features->xsave_state_size);
print_cpu_features_value ("xsave_state_full_size",
cpu_features->xsave_state_full_size);
+ print_cpu_features_value ("tlsdesc_state_full_size",
+ _dl_x86_features_tlsdesc_state_size);
print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size);
print_cpu_features_value ("shared_cache_size",
cpu_features->shared_cache_size);
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index cd7bd27cf35959fd..a11d4be30b696ac3 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -934,8 +934,6 @@ struct cpu_features
/* The full state size for XSAVE when XSAVEC is disabled by
GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
-
- and the AMX state size when XSAVEC is available.
*/
unsigned int xsave_state_full_size;
/* Data cache size for use in memory and string routines, typically
@@ -987,6 +985,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void)
#define __get_cpu_features() _dl_x86_get_cpu_features()
+#if IS_IN (rtld) || IS_IN (libc)
+/* XSAVE/XSAVEC state size used by TLS descriptors. Compared to
+ xsave_state_size from struct cpu_features, this includes additional
+ registers. */
+extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden;
+#endif
+
#if defined (_LIBC) && !IS_IN (nonlib)
/* Unused for x86. */
# define INIT_ARCH()
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
new file mode 100644
index 0000000000000000..f0024c143d1a1df5
--- /dev/null
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
@@ -0,0 +1 @@
+#include <elf/tst-gnu2-tls2.c>
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
new file mode 100644
index 0000000000000000..f0024c143d1a1df5
--- /dev/null
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
@@ -0,0 +1 @@
+#include <elf/tst-gnu2-tls2.c>
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
new file mode 100644
index 0000000000000000..f0024c143d1a1df5
--- /dev/null
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
@@ -0,0 +1 @@
+#include <elf/tst-gnu2-tls2.c>
diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
index 9f02cfc3eb297ed2..44d948696fbe44af 100644
--- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h
+++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
@@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic:
# endif
#else
/* Allocate stack space of the required size to save the state. */
- sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP
+ sub _dl_x86_features_tlsdesc_state_size(%rip), %RSP_LP
#endif
/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
r10 and r11. */


@ -0,0 +1,28 @@
commit 837a36c371f18a3152d032e8060f4e5120c25e2b
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Mar 31 21:33:18 2025 +0200
x86: Link tst-gnu2-tls2-x86-noxsave{,c,xsavec} with libpthread
This fixes a test build failure on Hurd.
Fixes commit 145097dff170507fe73190e8e41194f5b5f7e6bf ("x86: Use separate
variable for TLSDESC XSAVE/XSAVEC state size (bug 32810)").
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit c6e2895695118ab59c7b17feb0fcb75a53e3478c)
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 8819fba1b7164f45..01b0192ddf5e23ca 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -104,6 +104,9 @@ LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy
tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE
tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC
+$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library)
+$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library)
+$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library)
$(objpfx)tst-gnu2-tls2-x86-noxsave.out \
$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \
$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \


@ -0,0 +1,202 @@
commit 0da58e8be087ca7011ec918977c2ffac9034d1d4
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri May 24 12:38:51 2024 -0500
x86: Add seperate non-temporal tunable for memset
The tuning for non-temporal stores for memset vs memcpy is not always
the same. This includes both the exact value and whether non-temporal
stores are profitable at all for a given arch.
This patch adds `x86_memset_non_temporal_threshold`. Currently we
disable non-temporal stores for non-Intel vendors, as the only
benchmarks showing their benefit have been on Intel hardware.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 46b5e98ef6f1b9f4b53851f152ecb8209064b26c)
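Where the new threshold sits in the strategy choice can be modelled in a few lines of C. This is a hedged sketch, not the real dispatch (which is the assembly in memset-vec-unaligned-erms.S, see the hunk below); it ignores the small-size steps 1-5, and the default values are the ones quoted in the manual/tunables.texi hunk.

#include <stddef.h>

/* Stand-ins for the glibc-internal __x86_rep_stosb_threshold and
   __x86_memset_non_temporal_threshold globals; defaults taken from the
   tunables documentation in this patch.  */
static size_t rep_stosb_threshold = 0x800;
static size_t memset_non_temporal_threshold = 0xc0000;

enum memset_path { VEC_STORES, REP_STOSB, NON_TEMPORAL_STORES };

static enum memset_path
pick_memset_path (size_t size)
{
  if (size >= memset_non_temporal_threshold)
    return NON_TEMPORAL_STORES;   /* stream past the cache (step 7).  */
  if (size >= rep_stosb_threshold)
    return REP_STOSB;             /* microcoded "rep stosb" (step 6).  */
  return VEC_STORES;              /* ordinary vector store loop.  */
}

At run time the cutover can then be moved with GLIBC_TUNABLES=glibc.cpu.x86_memset_non_temporal_threshold=<bytes>, exactly like the existing memmove tunable.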
diff --git a/manual/tunables.texi b/manual/tunables.texi
index be97190d67b1c82e..b255a149d10aecf6 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
+glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
glibc.cpu.x86_shstk:
glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
@@ -485,7 +486,8 @@ thread stack originally backup by Huge Pages to default pages.
@cindex shared_cache_size tunables
@cindex tunables, shared_cache_size
@cindex non_temporal_threshold tunables
-@cindex tunables, non_temporal_threshold
+@cindex memset_non_temporal_threshold tunables
+@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
@deftp {Tunable namespace} glibc.cpu
Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
@@ -561,6 +563,18 @@ like memmove and memcpy.
This tunable is specific to i386 and x86-64.
@end deftp
+@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
+The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
+the user to set threshold in bytes for non temporal store in
+memset. Non temporal stores give a hint to the hardware to move data
+directly to memory without displacing other data from the cache. This
+tunable is used by some platforms to determine when to use non
+temporal stores memset.
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
+
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
set threshold in bytes to start using "rep movsb". The value must be
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index ab73556772209402..83491607c761ccc6 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
-/* Threshold to use non temporal store. */
+/* Threshold to use non temporal store in memmove. */
long int __x86_shared_non_temporal_threshold attribute_hidden;
+/* Threshold to use non temporal store in memset. */
+long int __x86_memset_non_temporal_threshold attribute_hidden;
+
/* Threshold to use Enhanced REP MOVSB. */
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
@@ -77,6 +80,9 @@ init_cacheinfo (void)
__x86_shared_non_temporal_threshold
= cpu_features->non_temporal_threshold;
+ __x86_memset_non_temporal_threshold
+ = cpu_features->memset_non_temporal_threshold;
+
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
__x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 1f68968a9a457586..0e7c1e0415d4137b 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* Non-temporal stores in memset have only been tested on Intel hardware.
+ Until we benchmark data on other x86 processor, disable non-temporal
+ stores in memset. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (cpu_features->basic.kind == arch_kind_intel)
+ memset_non_temporal_threshold = non_temporal_threshold;
+
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
&& tunable_size <= maximum_non_temporal_threshold)
non_temporal_threshold = tunable_size;
+ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+ if (tunable_size > minimum_non_temporal_threshold
+ && tunable_size <= maximum_non_temporal_threshold)
+ memset_non_temporal_threshold = tunable_size;
+
tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
if (tunable_size > minimum_rep_movsb_threshold)
rep_movsb_threshold = tunable_size;
@@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
+ TUNABLE_SET_WITH_BOUNDS (
+ x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, maximum_non_temporal_threshold);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
@@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
cpu_features->non_temporal_threshold = non_temporal_threshold;
+ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index 9f10645ee9778741..8113a93883cfe7a2 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -85,6 +85,8 @@ _dl_diagnostics_cpu (void)
cpu_features->shared_cache_size);
print_cpu_features_value ("non_temporal_threshold",
cpu_features->non_temporal_threshold);
+ print_cpu_features_value ("memset_non_temporal_threshold",
+ cpu_features->memset_non_temporal_threshold);
print_cpu_features_value ("rep_movsb_threshold",
cpu_features->rep_movsb_threshold);
print_cpu_features_value ("rep_movsb_stop_threshold",
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index 7d82da0dece49c45..a0a12995927dc4f1 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -30,6 +30,9 @@ glibc {
x86_non_temporal_threshold {
type: SIZE_T
}
+ x86_memset_non_temporal_threshold {
+ type: SIZE_T
+ }
x86_rep_movsb_threshold {
type: SIZE_T
# Since there is overhead to set up REP MOVSB operation, REP
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index a11d4be30b696ac3..03c71387dd08982b 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -942,8 +942,10 @@ struct cpu_features
/* Shared cache size for use in memory and string routines, typically
L2 or L3 size. */
unsigned long int shared_cache_size;
- /* Threshold to use non temporal store. */
+ /* Threshold to use non temporal store in memmove. */
unsigned long int non_temporal_threshold;
+ /* Threshold to use non temporal store in memset. */
+ unsigned long int memset_non_temporal_threshold;
/* Threshold to use "rep movsb". */
unsigned long int rep_movsb_threshold;
/* Threshold to stop using "rep movsb". */
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 637caadb406b2544..88bf08e4f4a2260e 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -24,9 +24,9 @@
5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
4 VEC stores and store 4 * VEC at a time until done.
6. On machines ERMS feature, if size is range
- [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
then REP STOSB will be used.
- 7. If size >= __x86_shared_non_temporal_threshold, use a
+ 7. If size >= __x86_memset_non_temporal_threshold, use a
non-temporal stores. */
#include <sysdep.h>
@@ -318,7 +318,7 @@ L(return_vzeroupper):
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
range for 2-byte jump encoding. */
L(stosb_local):
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
jae L(nt_memset)
movzbl %sil, %eax
mov %RDX_LP, %RCX_LP


@ -0,0 +1,43 @@
commit cc59fa5dbc4db7c6d1fb792c55a5d83c54ee72bf
Author: Joe Damato <jdamato@fastly.com>
Date: Fri Jun 7 23:04:47 2024 +0000
x86: Enable non-temporal memset tunable for AMD
In commit 46b5e98ef6f1 ("x86: Add seperate non-temporal tunable for
memset") a tunable threshold for enabling non-temporal memset was added,
but only for Intel hardware.
Since that commit, new benchmark results suggest that non-temporal
memset is beneficial on AMD, as well, so allow this tunable to be set
for AMD.
See:
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
which has been updated to include data using different strategies for
large memset on AMD Zen2, Zen3, and Zen4.
Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
(cherry picked from commit bef2a827a55fc759693ccc5b0f614353b8ad712d)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 0e7c1e0415d4137b..9916c5d951361c90 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -986,11 +986,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- /* Non-temporal stores in memset have only been tested on Intel hardware.
- Until we benchmark data on other x86 processor, disable non-temporal
- stores in memset. */
+ /* Non-temporal stores are more performant on Intel and AMD hardware above
+ non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (cpu_features->basic.kind == arch_kind_intel)
+ if (cpu_features->basic.kind == arch_kind_intel
+ || cpu_features->basic.kind == arch_kind_amd)
memset_non_temporal_threshold = non_temporal_threshold;
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of


@ -0,0 +1,37 @@
commit 38a7632f2d1ec86445904b356c54129591e8519b
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Jun 14 13:01:58 2024 -0500
x86: Fix value for `x86_memset_non_temporal_threshold` when it is undesirable
When we don't want to use non-temporal stores for memset, we set
`x86_memset_non_temporal_threshold` to SIZE_MAX.
The current code, however, was using `maximum_non_temporal_threshold`
as the upper bound, which is `SIZE_MAX >> 4`, so we ended up with a
value of `0`.
The fix is to just use `SIZE_MAX` as the upper bound when setting the
tunable.
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 5b54a33435e5533653a9956728f2de9d16a3b4ee)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 9916c5d951361c90..9b6f68e46de4bdaa 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1044,9 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
- TUNABLE_SET_WITH_BOUNDS (
- x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
- minimum_non_temporal_threshold, maximum_non_temporal_threshold);
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
+ memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,


@ -0,0 +1,121 @@
commit bde201e92c1e64934f8ffe3e5b7d769100677037
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jul 15 16:19:17 2024 +0800
x86: Disable non-temporal memset on Skylake Server
The original commit enabling non-temporal memset on Skylake Server had
erroneous benchmarks (actually done on ICX).
Further benchmarks indicate non-temporal stores may in fact be a
regression on Skylake Server.
This commit may be over-cautious in some cases, but should avoid any
regressions for 2.40.
Tested using qemu on all x86_64 CPU archs supported by both qemu and
glibc.
Reviewed-by: DJ Delorie <dj@redhat.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 5bcf6265f215326d14dfacdce8532792c2c7f8f8)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 3be69558a4c3aa2d..77b5638daafe9a1e 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -872,11 +872,18 @@ init_cpu_features (struct cpu_features *cpu_features)
/* Newer Bigcore microarch (larger non-temporal store
threshold). */
- case INTEL_BIGCORE_SKYLAKE:
- case INTEL_BIGCORE_KABYLAKE:
- case INTEL_BIGCORE_COMETLAKE:
case INTEL_BIGCORE_SKYLAKE_AVX512:
case INTEL_BIGCORE_CANNONLAKE:
+ /* Benchmarks indicate non-temporal memset is not
+ necessarily profitable on SKX (and in some cases much
+ worse). This is likely unique to SKX due its it unique
+ mesh interconnect (not present on ICX or BWD). Disable
+ non-temporal on all Skylake servers. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+ case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE:
+ case INTEL_BIGCORE_KABYLAKE:
case INTEL_BIGCORE_ICELAKE:
case INTEL_BIGCORE_TIGERLAKE:
case INTEL_BIGCORE_ROCKETLAKE:
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index a72ba61d837c6383..a71772c9c07d01d7 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -245,6 +245,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
(n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
}
break;
+ case 25:
+ {
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ Avoid_Non_Temporal_Memset, 25);
+ }
case 26:
{
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 9b6f68e46de4bdaa..66e2b83fea0dc744 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -989,13 +989,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
/* Non-temporal stores are more performant on Intel and AMD hardware above
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (cpu_features->basic.kind == arch_kind_intel
- || cpu_features->basic.kind == arch_kind_amd)
- memset_non_temporal_threshold = non_temporal_threshold;
-
- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
- cases slower than the vectorized path (and for some alignments,
- it is really slow, check BZ #30994). */
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
+ && (cpu_features->basic.kind == arch_kind_intel
+ || cpu_features->basic.kind == arch_kind_amd))
+ memset_non_temporal_threshold = non_temporal_threshold;
+
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
if (cpu_features->basic.kind == arch_kind_amd)
rep_movsb_threshold = non_temporal_threshold;
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 85e7f54ec8204328..61bbbc2e8983482e 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
BIT (Avoid_Short_Distance_REP_MOVSB)
+BIT (Avoid_Non_Temporal_Memset)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
index f6a65b88dea6d9dc..bc573c7435130dee 100644
--- a/sysdeps/x86/tst-hwcap-tunables.c
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -60,7 +60,7 @@ static const struct test_t
/* Disable everything. */
"-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
- "-AVX_Fast_Unaligned_Load",
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
test_1,
array_length (test_1)
},
@@ -68,7 +68,7 @@ static const struct test_t
/* Same as before, but with some empty suboptions. */
",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
- "-ERMS,-AVX_Fast_Unaligned_Load,-,",
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
test_1,
array_length (test_1)
}


@ -0,0 +1,24 @@
commit 2be36448c46e9ef712e5f3d5381f38bf3138efdf
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Aug 2 15:22:14 2024 +0200
x86: Tunables may incorrectly set Prefer_PMINUB_for_stringop (bug 32047)
Fixes commit 5bcf6265f215326d14dfacdce8532792c2c7f8f8 ("x86:
Disable non-temporal memset on Skylake Server").
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
(cherry picked from commit 7a630f7d3392ca391a399486ce2846f9e4b4ee63)
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index a71772c9c07d01d7..a0b31d80f64127c5 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -250,6 +250,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
Avoid_Non_Temporal_Memset, 25);
}
+ break;
case 26:
{
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH


@ -0,0 +1,90 @@
commit 65ae73be01604699493d387d8ea6bba41df004ab
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Aug 14 14:37:30 2024 +0800
x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path
This is just a refactor and there should be no behavioral change from
this commit.
The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
for controlling whether we use non-temporal memset rather than having
extra logic based on vendor.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit b93dddfaf440aa12f45d7c356f6ffe9f27d35577)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 77b5638daafe9a1e..4490c0a782e25d8d 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -758,6 +758,12 @@ init_cpu_features (struct cpu_features *cpu_features)
unsigned int stepping = 0;
enum cpu_features_kind kind;
+ /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
+ as of writing this, we only have benchmarks indicatings it profitability
+ on Intel/AMD. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+
cpu_features->cachesize_non_temporal_divisor = 4;
#if !HAS_CPUID
if (__get_cpuid_max (0, 0) == 0)
@@ -783,6 +789,11 @@ init_cpu_features (struct cpu_features *cpu_features)
update_active (cpu_features);
+ /* Benchmarks indicate non-temporal memset can be profitable on Intel
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
if (family == 0x06)
{
model += extended_model;
@@ -993,6 +1004,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
+ /* Benchmarks indicate non-temporal memset can be profitable on AMD
+ hardware. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
{
/* Since the FMA4 bit is in CPUID_INDEX_80000001 and
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 66e2b83fea0dc744..10ad18061a1b47af 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -986,14 +986,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- /* Non-temporal stores are more performant on Intel and AMD hardware above
- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
- && (cpu_features->basic.kind == arch_kind_intel
- || cpu_features->basic.kind == arch_kind_amd))
- memset_non_temporal_threshold = non_temporal_threshold;
-
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1015,6 +1007,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (tunable_size != 0)
shared = tunable_size;
+ /* Non-temporal stores are more performant on some hardware above
+ non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both
+ Intel and AMD hardware. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+ memset_non_temporal_threshold = non_temporal_threshold;
+
tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
if (tunable_size > minimum_non_temporal_threshold
&& tunable_size <= maximum_non_temporal_threshold)


@ -0,0 +1,160 @@
commit 765ff3d0d49f039575dd20961e745fb2876339a7
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Thu Apr 3 13:00:45 2025 -0700
x86: Optimize xstate size calculation
Scan xstate IDs up to the maximum supported xstate ID. Remove the
separate AMX xstate calculation. Instead, exclude the AMX space from
the start of TILECFG to the end of TILEDATA in xsave_state_size.
Completed validation on SKL/SKX/SPR/SDE and compared xsave state size
with "ld.so --list-diagnostics" option, no regression.
Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
(cherry picked from commit 70b648855185e967e54668b101d24704c3fb869d)
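The calculation the patch moves to can be sketched as a standalone program:
walk CPUID leaf 0xD sub-leaves up to a maximum component ID, accumulate the
compact (XSAVEC) offsets with the 64-byte alignment rule, and subtract the
TILECFG..TILEDATA span from the non-TLSDESC size. The constants below
(TILECFG = 17, TILEDATA = 18, MAX_ID = 19 for APX) and the ALIGN_UP macro
are assumptions made for this sketch, not glibc's definitions.

    #include <cpuid.h>
    #include <stdio.h>

    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((unsigned int) (a) - 1))
    #define TILECFG_ID  17
    #define TILEDATA_ID 18
    #define MAX_ID      19

    int
    main (void)
    {
      unsigned int off[MAX_ID + 1] = { 0, 160, 576 };
      unsigned int sz[MAX_ID + 1] = { 160, 256 };
      unsigned int eax, ebx, ecx, edx;

      for (unsigned int i = 2; i <= MAX_ID; i++)
        {
          __cpuid_count (0xd, i, eax, ebx, ecx, edx);
          sz[i] = eax;                /* 0 if the component is unsupported */
          if (i > 2)
            {
              off[i] = off[i - 1] + sz[i - 1];
              if (ecx & 2)            /* component needs 64-byte alignment */
                off[i] = ALIGN_UP (off[i], 64);
            }
        }

      unsigned int size = off[MAX_ID] + sz[MAX_ID];
      unsigned int amx = off[TILEDATA_ID + 1] - off[TILECFG_ID];
      printf ("compact xsave size %u, without AMX %u\n", size, size - amx);
      return 0;
    }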
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 4490c0a782e25d8d..dc5cd01d489851b8 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -325,13 +325,8 @@ update_active (struct cpu_features *cpu_features)
/* Check if XSAVEC is available. */
if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC))
{
- unsigned int xstate_comp_offsets[32];
- unsigned int xstate_comp_sizes[32];
-#ifdef __x86_64__
- unsigned int xstate_amx_comp_offsets[32];
- unsigned int xstate_amx_comp_sizes[32];
- unsigned int amx_ecx;
-#endif
+ unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1];
+ unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1];
unsigned int i;
xstate_comp_offsets[0] = 0;
@@ -339,39 +334,16 @@ update_active (struct cpu_features *cpu_features)
xstate_comp_offsets[2] = 576;
xstate_comp_sizes[0] = 160;
xstate_comp_sizes[1] = 256;
-#ifdef __x86_64__
- xstate_amx_comp_offsets[0] = 0;
- xstate_amx_comp_offsets[1] = 160;
- xstate_amx_comp_offsets[2] = 576;
- xstate_amx_comp_sizes[0] = 160;
- xstate_amx_comp_sizes[1] = 256;
-#endif
- for (i = 2; i < 32; i++)
+ for (i = 2; i <= X86_XSTATE_MAX_ID; i++)
{
if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0)
{
__cpuid_count (0xd, i, eax, ebx, ecx, edx);
-#ifdef __x86_64__
- /* Include this in xsave_state_full_size. */
- amx_ecx = ecx;
- xstate_amx_comp_sizes[i] = eax;
- if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0)
- {
- /* Exclude this from xsave_state_size. */
- ecx = 0;
- xstate_comp_sizes[i] = 0;
- }
- else
-#endif
- xstate_comp_sizes[i] = eax;
+ xstate_comp_sizes[i] = eax;
}
else
{
-#ifdef __x86_64__
- amx_ecx = 0;
- xstate_amx_comp_sizes[i] = 0;
-#endif
ecx = 0;
xstate_comp_sizes[i] = 0;
}
@@ -380,42 +352,32 @@ update_active (struct cpu_features *cpu_features)
{
xstate_comp_offsets[i]
= (xstate_comp_offsets[i - 1]
- + xstate_comp_sizes[i -1]);
+ + xstate_comp_sizes[i - 1]);
if ((ecx & (1 << 1)) != 0)
xstate_comp_offsets[i]
= ALIGN_UP (xstate_comp_offsets[i], 64);
-#ifdef __x86_64__
- xstate_amx_comp_offsets[i]
- = (xstate_amx_comp_offsets[i - 1]
- + xstate_amx_comp_sizes[i - 1]);
- if ((amx_ecx & (1 << 1)) != 0)
- xstate_amx_comp_offsets[i]
- = ALIGN_UP (xstate_amx_comp_offsets[i],
- 64);
-#endif
}
}
/* Use XSAVEC. */
unsigned int size
- = xstate_comp_offsets[31] + xstate_comp_sizes[31];
+ = (xstate_comp_offsets[X86_XSTATE_MAX_ID]
+ + xstate_comp_sizes[X86_XSTATE_MAX_ID]);
if (size)
{
+ size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
+ 64);
#ifdef __x86_64__
- unsigned int amx_size
- = (xstate_amx_comp_offsets[31]
- + xstate_amx_comp_sizes[31]);
- amx_size
- = ALIGN_UP ((amx_size
- + TLSDESC_CALL_REGISTER_SAVE_AREA),
- 64);
- /* Set TLSDESC state size to the compact AMX
- state size for XSAVEC. */
- _dl_x86_features_tlsdesc_state_size = amx_size;
+ _dl_x86_features_tlsdesc_state_size = size;
+ /* Exclude the AMX space from the start of TILECFG
+ space to the end of TILEDATA space. If CPU
+ doesn't support AMX, TILECFG offset is the same
+ as TILEDATA + 1 offset. Otherwise, they are
+ multiples of 64. */
+ size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1]
+ - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]);
#endif
- cpu_features->xsave_state_size
- = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
- 64);
+ cpu_features->xsave_state_size = size;
CPU_FEATURE_SET (cpu_features, XSAVEC);
}
}
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 7359149e17ccf341..1d6cabd816bf84cc 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -102,6 +102,9 @@
| (1 << X86_XSTATE_ZMM_ID) \
| (1 << X86_XSTATE_APX_F_ID))
+/* The maximum supported xstate ID. */
+# define X86_XSTATE_MAX_ID X86_XSTATE_APX_F_ID
+
/* AMX state mask. */
# define AMX_STATE_SAVE_MASK \
((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID))
@@ -123,6 +126,9 @@
| (1 << X86_XSTATE_K_ID) \
| (1 << X86_XSTATE_ZMM_H_ID))
+/* The maximum supported xstate ID. */
+# define X86_XSTATE_MAX_ID X86_XSTATE_ZMM_H_ID
+
/* States to be included in xsave_state_size. */
# define FULL_STATE_SAVE_MASK STATE_SAVE_MASK
#endif


@@ -0,0 +1,76 @@
commit 7620d98186fc23e216773dbec5dc5da1fd8daf0f
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Thu Apr 3 18:14:20 2025 -0700
x86: Add ARL/PTL/CWF model detection support
- Add ARROWLAKE model detection.
- Add PANTHERLAKE model detection.
- Add CLEARWATERFOREST model detection.
Intel® Architecture Instruction Set Extensions Programming Reference
https://cdrdv2.intel.com/v1/dl/getContent/671368 Section 1.2.
No regression, validated model detection on SDE.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit e53eb952b970ac94c97d74fb447418fb327ca096)
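The decoding that feeds this detection can be sketched as a small standalone
program. The family/model extraction follows the usual CPUID leaf 1 layout;
only the three model IDs this patch adds are named here, everything else
falls through as unknown.

    #include <cpuid.h>
    #include <stdio.h>

    int
    main (void)
    {
      unsigned int eax, ebx, ecx, edx;
      __cpuid (1, eax, ebx, ecx, edx);

      unsigned int family = (eax >> 8) & 0xf;
      unsigned int model = (eax >> 4) & 0xf;
      if (family == 0x6 || family == 0xf)
        model += ((eax >> 16) & 0xf) << 4;      /* extended model */
      if (family == 0xf)
        family += (eax >> 20) & 0xff;           /* extended family */

      const char *name = "other/unknown";
      if (family == 0x6)
        switch (model)
          {
          case 0xb5: case 0xc5: case 0xc6: name = "Arrow Lake"; break;
          case 0xcc: name = "Panther Lake"; break;
          case 0xdd: name = "Clearwater Forest"; break;
          }
      printf ("family 0x%x model 0x%x: %s\n", family, model, name);
      return 0;
    }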
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index dc5cd01d489851b8..fb94477dad08ab02 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -512,6 +512,7 @@ enum
INTEL_ATOM_GOLDMONT,
INTEL_ATOM_GOLDMONT_PLUS,
INTEL_ATOM_SIERRAFOREST,
+ INTEL_ATOM_CLEARWATERFOREST,
INTEL_ATOM_GRANDRIDGE,
INTEL_ATOM_TREMONT,
@@ -539,6 +540,7 @@ enum
INTEL_BIGCORE_METEORLAKE,
INTEL_BIGCORE_LUNARLAKE,
INTEL_BIGCORE_ARROWLAKE,
+ INTEL_BIGCORE_PANTHERLAKE,
INTEL_BIGCORE_GRANITERAPIDS,
/* Mixed (bigcore + atom SOC). */
@@ -584,6 +586,8 @@ intel_get_fam6_microarch (unsigned int model,
return INTEL_ATOM_GOLDMONT_PLUS;
case 0xAF:
return INTEL_ATOM_SIERRAFOREST;
+ case 0xDD:
+ return INTEL_ATOM_CLEARWATERFOREST;
case 0xB6:
return INTEL_ATOM_GRANDRIDGE;
case 0x86:
@@ -691,8 +695,12 @@ intel_get_fam6_microarch (unsigned int model,
return INTEL_BIGCORE_METEORLAKE;
case 0xbd:
return INTEL_BIGCORE_LUNARLAKE;
+ case 0xb5:
+ case 0xc5:
case 0xc6:
return INTEL_BIGCORE_ARROWLAKE;
+ case 0xCC:
+ return INTEL_BIGCORE_PANTHERLAKE;
case 0xAD:
case 0xAE:
return INTEL_BIGCORE_GRANITERAPIDS;
@@ -808,6 +816,7 @@ init_cpu_features (struct cpu_features *cpu_features)
Default tuned atom microarch.
case INTEL_ATOM_SIERRAFOREST:
case INTEL_ATOM_GRANDRIDGE:
+ case INTEL_ATOM_CLEARWATERFOREST:
*/
/* Bigcore/Default Tuning. */
@@ -864,6 +873,7 @@ init_cpu_features (struct cpu_features *cpu_features)
case INTEL_BIGCORE_METEORLAKE:
case INTEL_BIGCORE_LUNARLAKE:
case INTEL_BIGCORE_ARROWLAKE:
+ case INTEL_BIGCORE_PANTHERLAKE:
case INTEL_BIGCORE_SAPPHIRERAPIDS:
case INTEL_BIGCORE_EMERALDRAPIDS:
case INTEL_BIGCORE_GRANITERAPIDS:


@@ -0,0 +1,352 @@
commit e09436c2cb5b6453d922c5af6a30e2de0255cd61
Author: Sunil K Pandey <sunil.k.pandey@intel.com>
Date: Fri Apr 11 08:52:52 2025 -0700
x86: Handle unknown Intel processor with default tuning
Enable default tuning for unknown Intel processor.
Tested on x86, no regression.
Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 9f0deff558d1d6b08c425c157f50de85013ada9c)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index fb94477dad08ab02..6d2e660b4b20ff06 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -502,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load
"Incorrect index_arch_Fast_Unaligned_Load");
-/* Intel Family-6 microarch list. */
-enum
+/* Intel microarch list. */
+enum intel_microarch
{
/* Atom processors. */
INTEL_ATOM_BONNELL,
@@ -555,7 +555,7 @@ enum
INTEL_UNKNOWN,
};
-static unsigned int
+static enum intel_microarch
intel_get_fam6_microarch (unsigned int model,
__attribute__ ((unused)) unsigned int stepping)
{
@@ -764,134 +764,20 @@ init_cpu_features (struct cpu_features *cpu_features)
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
&= ~bit_arch_Avoid_Non_Temporal_Memset;
+ enum intel_microarch microarch = INTEL_UNKNOWN;
if (family == 0x06)
{
model += extended_model;
- unsigned int microarch
- = intel_get_fam6_microarch (model, stepping);
+ microarch = intel_get_fam6_microarch (model, stepping);
+ /* Disable TSX on some processors to avoid TSX on kernels that
+ weren't updated with the latest microcode package (which
+ disables broken feature by default). */
switch (microarch)
{
- /* Atom / KNL tuning. */
- case INTEL_ATOM_BONNELL:
- /* BSF is slow on Bonnell. */
- cpu_features->preferred[index_arch_Slow_BSF]
- |= bit_arch_Slow_BSF;
- break;
-
- /* Unaligned load versions are faster than SSSE3
- on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
- case INTEL_ATOM_AIRMONT:
- case INTEL_ATOM_SILVERMONT:
- case INTEL_ATOM_GOLDMONT:
- case INTEL_ATOM_GOLDMONT_PLUS:
-
- /* Knights Landing. Enable Silvermont optimizations. */
- case INTEL_KNIGHTS_LANDING:
-
- cpu_features->preferred[index_arch_Fast_Unaligned_Load]
- |= (bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop
- | bit_arch_Slow_SSE4_2);
- break;
-
- case INTEL_ATOM_TREMONT:
- /* Enable rep string instructions, unaligned load, unaligned
- copy, pminub and avoid SSE 4.2 on Tremont. */
- cpu_features->preferred[index_arch_Fast_Rep_String]
- |= (bit_arch_Fast_Rep_String
- | bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop
- | bit_arch_Slow_SSE4_2);
- break;
-
- /*
- Default tuned Knights microarch.
- case INTEL_KNIGHTS_MILL:
- */
-
- /*
- Default tuned atom microarch.
- case INTEL_ATOM_SIERRAFOREST:
- case INTEL_ATOM_GRANDRIDGE:
- case INTEL_ATOM_CLEARWATERFOREST:
- */
-
- /* Bigcore/Default Tuning. */
default:
- default_tuning:
- /* Unknown family 0x06 processors. Assuming this is one
- of Core i3/i5/i7 processors if AVX is available. */
- if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
- break;
-
- enable_modern_features:
- /* Rep string instructions, unaligned load, unaligned copy,
- and pminub are fast on Intel Core i3, i5 and i7. */
- cpu_features->preferred[index_arch_Fast_Rep_String]
- |= (bit_arch_Fast_Rep_String
- | bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop);
break;
- case INTEL_BIGCORE_NEHALEM:
- case INTEL_BIGCORE_WESTMERE:
- /* Older CPUs prefer non-temporal stores at lower threshold. */
- cpu_features->cachesize_non_temporal_divisor = 8;
- goto enable_modern_features;
-
- /* Older Bigcore microarch (smaller non-temporal store
- threshold). */
- case INTEL_BIGCORE_SANDYBRIDGE:
- case INTEL_BIGCORE_IVYBRIDGE:
- case INTEL_BIGCORE_HASWELL:
- case INTEL_BIGCORE_BROADWELL:
- cpu_features->cachesize_non_temporal_divisor = 8;
- goto default_tuning;
-
- /* Newer Bigcore microarch (larger non-temporal store
- threshold). */
- case INTEL_BIGCORE_SKYLAKE_AVX512:
- case INTEL_BIGCORE_CANNONLAKE:
- /* Benchmarks indicate non-temporal memset is not
- necessarily profitable on SKX (and in some cases much
- worse). This is likely unique to SKX due its it unique
- mesh interconnect (not present on ICX or BWD). Disable
- non-temporal on all Skylake servers. */
- cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
- |= bit_arch_Avoid_Non_Temporal_Memset;
- case INTEL_BIGCORE_COMETLAKE:
- case INTEL_BIGCORE_SKYLAKE:
- case INTEL_BIGCORE_KABYLAKE:
- case INTEL_BIGCORE_ICELAKE:
- case INTEL_BIGCORE_TIGERLAKE:
- case INTEL_BIGCORE_ROCKETLAKE:
- case INTEL_BIGCORE_RAPTORLAKE:
- case INTEL_BIGCORE_METEORLAKE:
- case INTEL_BIGCORE_LUNARLAKE:
- case INTEL_BIGCORE_ARROWLAKE:
- case INTEL_BIGCORE_PANTHERLAKE:
- case INTEL_BIGCORE_SAPPHIRERAPIDS:
- case INTEL_BIGCORE_EMERALDRAPIDS:
- case INTEL_BIGCORE_GRANITERAPIDS:
- cpu_features->cachesize_non_temporal_divisor = 2;
- goto default_tuning;
-
- /* Default tuned Mixed (bigcore + atom SOC). */
- case INTEL_MIXED_LAKEFIELD:
- case INTEL_MIXED_ALDERLAKE:
- cpu_features->cachesize_non_temporal_divisor = 2;
- goto default_tuning;
- }
-
- /* Disable TSX on some processors to avoid TSX on kernels that
- weren't updated with the latest microcode package (which
- disables broken feature by default). */
- switch (microarch)
- {
case INTEL_BIGCORE_SKYLAKE_AVX512:
/* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */
if (stepping <= 5)
@@ -900,38 +786,152 @@ init_cpu_features (struct cpu_features *cpu_features)
case INTEL_BIGCORE_KABYLAKE:
/* NB: Although the errata documents that for model == 0x8e
- (kabylake skylake client), only 0xb stepping or lower are
- impacted, the intention of the errata was to disable TSX on
- all client processors on all steppings. Include 0xc
- stepping which is an Intel Core i7-8665U, a client mobile
- processor. */
+ (kabylake skylake client), only 0xb stepping or lower are
+ impacted, the intention of the errata was to disable TSX on
+ all client processors on all steppings. Include 0xc
+ stepping which is an Intel Core i7-8665U, a client mobile
+ processor. */
if (stepping > 0xc)
break;
/* Fall through. */
case INTEL_BIGCORE_SKYLAKE:
- /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
- processors listed in:
-
-https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
- */
- disable_tsx:
- CPU_FEATURE_UNSET (cpu_features, HLE);
- CPU_FEATURE_UNSET (cpu_features, RTM);
- CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
- break;
+ /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
+ processors listed in:
+
+ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+ */
+disable_tsx:
+ CPU_FEATURE_UNSET (cpu_features, HLE);
+ CPU_FEATURE_UNSET (cpu_features, RTM);
+ CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
+ break;
case INTEL_BIGCORE_HASWELL:
- /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
- TSX. Haswell also include other model numbers that have
- working TSX. */
- if (model == 0x3f && stepping >= 4)
+ /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
+ TSX. Haswell also includes other model numbers that have
+ working TSX. */
+ if (model == 0x3f && stepping >= 4)
break;
- CPU_FEATURE_UNSET (cpu_features, RTM);
- break;
+ CPU_FEATURE_UNSET (cpu_features, RTM);
+ break;
}
}
+ switch (microarch)
+ {
+ /* Atom / KNL tuning. */
+ case INTEL_ATOM_BONNELL:
+ /* BSF is slow on Bonnell. */
+ cpu_features->preferred[index_arch_Slow_BSF]
+ |= bit_arch_Slow_BSF;
+ break;
+
+ /* Unaligned load versions are faster than SSSE3
+ on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
+ case INTEL_ATOM_AIRMONT:
+ case INTEL_ATOM_SILVERMONT:
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+
+ /* Knights Landing. Enable Silvermont optimizations. */
+ case INTEL_KNIGHTS_LANDING:
+
+ cpu_features->preferred[index_arch_Fast_Unaligned_Load]
+ |= (bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop
+ | bit_arch_Slow_SSE4_2);
+ break;
+
+ case INTEL_ATOM_TREMONT:
+ /* Enable rep string instructions, unaligned load, unaligned
+ copy, pminub and avoid SSE 4.2 on Tremont. */
+ cpu_features->preferred[index_arch_Fast_Rep_String]
+ |= (bit_arch_Fast_Rep_String
+ | bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop
+ | bit_arch_Slow_SSE4_2);
+ break;
+
+ /*
+ Default tuned Knights microarch.
+ case INTEL_KNIGHTS_MILL:
+ */
+
+ /*
+ Default tuned atom microarch.
+ case INTEL_ATOM_SIERRAFOREST:
+ case INTEL_ATOM_GRANDRIDGE:
+ case INTEL_ATOM_CLEARWATERFOREST:
+ */
+
+ /* Bigcore/Default Tuning. */
+ default:
+ default_tuning:
+ /* Unknown Intel processors. Assuming this is one of Core
+ i3/i5/i7 processors if AVX is available. */
+ if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
+ break;
+
+ enable_modern_features:
+ /* Rep string instructions, unaligned load, unaligned copy,
+ and pminub are fast on Intel Core i3, i5 and i7. */
+ cpu_features->preferred[index_arch_Fast_Rep_String]
+ |= (bit_arch_Fast_Rep_String
+ | bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop);
+ break;
+
+ case INTEL_BIGCORE_NEHALEM:
+ case INTEL_BIGCORE_WESTMERE:
+ /* Older CPUs prefer non-temporal stores at lower threshold. */
+ cpu_features->cachesize_non_temporal_divisor = 8;
+ goto enable_modern_features;
+
+ /* Older Bigcore microarch (smaller non-temporal store
+ threshold). */
+ case INTEL_BIGCORE_SANDYBRIDGE:
+ case INTEL_BIGCORE_IVYBRIDGE:
+ case INTEL_BIGCORE_HASWELL:
+ case INTEL_BIGCORE_BROADWELL:
+ cpu_features->cachesize_non_temporal_divisor = 8;
+ goto default_tuning;
+
+ /* Newer Bigcore microarch (larger non-temporal store
+ threshold). */
+ case INTEL_BIGCORE_SKYLAKE_AVX512:
+ case INTEL_BIGCORE_CANNONLAKE:
+ /* Benchmarks indicate non-temporal memset is not
+ necessarily profitable on SKX (and in some cases much
+ worse). This is likely unique to SKX due to its unique
+ mesh interconnect (not present on ICX or BWD). Disable
+ non-temporal on all Skylake servers. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+ /* fallthrough */
+ case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE:
+ case INTEL_BIGCORE_KABYLAKE:
+ case INTEL_BIGCORE_ICELAKE:
+ case INTEL_BIGCORE_TIGERLAKE:
+ case INTEL_BIGCORE_ROCKETLAKE:
+ case INTEL_BIGCORE_RAPTORLAKE:
+ case INTEL_BIGCORE_METEORLAKE:
+ case INTEL_BIGCORE_LUNARLAKE:
+ case INTEL_BIGCORE_ARROWLAKE:
+ case INTEL_BIGCORE_PANTHERLAKE:
+ case INTEL_BIGCORE_SAPPHIRERAPIDS:
+ case INTEL_BIGCORE_EMERALDRAPIDS:
+ case INTEL_BIGCORE_GRANITERAPIDS:
+ /* Default tuned Mixed (bigcore + atom SOC). */
+ case INTEL_MIXED_LAKEFIELD:
+ case INTEL_MIXED_ALDERLAKE:
+ cpu_features->cachesize_non_temporal_divisor = 2;
+ goto default_tuning;
+ }
/* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
if AVX512ER is available. Don't use AVX512 to avoid lower CPU


@@ -0,0 +1,49 @@
commit 3463100f2d47f2897a24ba8023a5c7aaf2d26550
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sat Apr 12 08:37:29 2025 -0700
x86: Detect Intel Diamond Rapids
Detect Intel Diamond Rapids and tune it similar to Intel Granite Rapids.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
(cherry picked from commit de14f1959ee5f9b845a7cae43bee03068b8136f0)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 6d2e660b4b20ff06..47dc3b1510a68fc9 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -542,6 +542,7 @@ enum intel_microarch
INTEL_BIGCORE_ARROWLAKE,
INTEL_BIGCORE_PANTHERLAKE,
INTEL_BIGCORE_GRANITERAPIDS,
+ INTEL_BIGCORE_DIAMONDRAPIDS,
/* Mixed (bigcore + atom SOC). */
INTEL_MIXED_LAKEFIELD,
@@ -817,6 +818,16 @@ disable_tsx:
break;
}
}
+ else if (family == 19)
+ switch (model)
+ {
+ case 0x01:
+ microarch = INTEL_BIGCORE_DIAMONDRAPIDS;
+ break;
+
+ default:
+ break;
+ }
switch (microarch)
{
@@ -926,6 +937,7 @@ disable_tsx:
case INTEL_BIGCORE_SAPPHIRERAPIDS:
case INTEL_BIGCORE_EMERALDRAPIDS:
case INTEL_BIGCORE_GRANITERAPIDS:
+ case INTEL_BIGCORE_DIAMONDRAPIDS:
/* Default tuned Mixed (bigcore + atom SOC). */
case INTEL_MIXED_LAKEFIELD:
case INTEL_MIXED_ALDERLAKE:


@@ -0,0 +1,444 @@
commit 2451ef5c4a92e774c56111b3708eede7f98fe940
Author: Frank Barrus <frankbarrus_sw@shaggy.cc>
Date: Wed Dec 4 07:55:02 2024 -0500
pthreads NPTL: lost wakeup fix 2
This fixes the lost wakeup (from a bug in signal stealing) with a change
in the usage of g_signals[] in the condition variable internal state.
It also completely eliminates the concept and handling of signal stealing,
as well as the need for signalers to block to wait for waiters to wake
up every time there is a G1/G2 switch. This greatly reduces the average
and maximum latency for pthread_cond_signal.
The g_signals[] field now contains a signal count that is relative to
the current g1_start value. Since it is a 32-bit field, and the LSB is
still reserved (though not currently used anymore), it has a 31-bit value
that corresponds to the low 31 bits of the sequence number in g1_start.
(since g1_start also has an LSB flag, this means bits 31:1 in g_signals
correspond to bits 31:1 in g1_start, plus the current signal count)
By making the signal count relative to g1_start, there is no longer
any ambiguity or A/B/A issue, and thus any checks before blocking,
including the futex call itself, are guaranteed not to block if the G1/G2
switch occurs, even if the signal count remains the same. This allows
initially safely blocking in G2 until the switch to G1 occurs, and
then transitioning from G1 to a new G1 or G2, and always being able to
distinguish the state change. This removes the race condition and A/B/A
problems that otherwise occurred if a late (pre-empted) waiter were to
resume just as the futex call attempted to block on g_signal since
otherwise there was no last opportunity to re-check things like whether
the current G1 group was already closed.
By fixing these issues, the signal stealing code can be eliminated,
since there is no concept of signal stealing anymore. The code to block
for all waiters to exit g_refs can also be removed, since any waiters
that are still in the g_refs region can be guaranteed to safely wake
up and exit. If there are still any left at this time, they are all
sent one final futex wakeup to ensure that they are not blocked any
longer, but there is no need for the signaller to block and wait for
them to wake up and exit the g_refs region.
The signal count is then effectively "zeroed" but since it is now
relative to g1_start, this is done by advancing it to a new value that
can be observed by any pending blocking waiters. Any late waiters can
always tell the difference, and can thus just cleanly exit if they are
in a stale G1 or G2. They can never steal a signal from the current
G1 if they are not in the current G1, since the signal value that has
to match in the cmpxchg has the low 31 bits of the g1_start value
contained in it, and that's first checked, and then it won't match if
there's a G1/G2 change.
Note: the 31-bit sequence number used in g_signals is designed to
handle wrap-around when checking the signal count, but if the entire
31-bit wraparound (2 billion signals) occurs while there is still a
late waiter that has not yet resumed, and it happens to then match
the current g1_start low bits, and the pre-emption occurs after the
normal "closed group" checks (which are 64-bit) but then hits the
futex syscall and signal consuming code, then an A/B/A issue could
still result and cause an incorrect assumption about whether it
should block. This particular scenario seems unlikely in practice.
Note that once awake from the futex, the waiter would notice the
closed group before consuming the signal (since that's still a 64-bit
check that would not be aliased in the wrap-around in g_signals),
so the biggest impact would be blocking on the futex until the next
full wakeup from a G1/G2 switch.
Signed-off-by: Frank Barrus <frankbarrus_sw@shaggy.cc>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit 1db84775f831a1494993ce9c118deaf9537cc50a)
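The relative-count check described above can be illustrated with a small
standalone sketch. signal_available is a hypothetical helper that mirrors
the lowseq computation added to pthread_cond_wait, not a glibc function; one
signal corresponds to +2 because the LSB of __g_signals stays reserved.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical helper: may a waiter in group slot g consume a signal,
       given the current __g_signals word and __g1_start value?  */
    static bool
    signal_available (unsigned int signals, uint64_t g1_start, unsigned int g)
    {
      /* The LSB of g1_start holds the slot index of the current G2.  If our
         slot is still G2, no signals can be pending, so use the current
         value as the baseline; otherwise the baseline is the low 31 bits of
         g1_start (LSB cleared), which __g_signals was advanced to when this
         slot became G1.  The signed difference is intended to stay valid
         across wrap-around, modulo the corner case noted above.  */
      unsigned int lowseq = ((g1_start & 1) == g)
                            ? signals : (unsigned int) g1_start & ~1U;
      return (int) (signals - lowseq) >= 2;
    }

    int
    main (void)
    {
      /* Slot 0 is G1 (g1_start's LSB says G2 is slot 1), base 0x200.  */
      printf ("%d\n", signal_available (0x200, 0x201, 0));  /* 0: no signal */
      printf ("%d\n", signal_available (0x202, 0x201, 0));  /* 1: one signal */
      /* Slot 0 is still G2, so nothing can be consumed there.  */
      printf ("%d\n", signal_available (0x200, 0x200, 0));  /* 0 */
      return 0;
    }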
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index 3487557bb86c8186..4855b8899f887ad0 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -201,7 +201,6 @@ static bool __attribute__ ((unused))
__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
unsigned int *g1index, int private)
{
- const unsigned int maxspin = 0;
unsigned int g1 = *g1index;
/* If there is no waiter in G2, we don't do anything. The expression may
@@ -222,84 +221,46 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
* New waiters arriving concurrently with the group switching will all go
into G2 until we atomically make the switch. Waiters existing in G2
are not affected.
- * Waiters in G1 will be closed out immediately by setting a flag in
- __g_signals, which will prevent waiters from blocking using a futex on
- __g_signals and also notifies them that the group is closed. As a
- result, they will eventually remove their group reference, allowing us
- to close switch group roles. */
-
- /* First, set the closed flag on __g_signals. This tells waiters that are
- about to wait that they shouldn't do that anymore. This basically
- serves as an advance notification of the upcoming change to __g1_start;
- waiters interpret it as if __g1_start was larger than their waiter
- sequence position. This allows us to change __g1_start after waiting
- for all existing waiters with group references to leave, which in turn
- makes recovery after stealing a signal simpler because it then can be
- skipped if __g1_start indicates that the group is closed (otherwise,
- we would have to recover always because waiters don't know how big their
- groups are). Relaxed MO is fine. */
- atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1);
-
- /* Wait until there are no group references anymore. The fetch-or operation
- injects us into the modification order of __g_refs; release MO ensures
- that waiters incrementing __g_refs after our fetch-or see the previous
- changes to __g_signals and to __g1_start that had to happen before we can
- switch this G1 and alias with an older group (we have two groups, so
- aliasing requires switching group roles twice). Note that nobody else
- can have set the wake-request flag, so we do not have to act upon it.
-
- Also note that it is harmless if older waiters or waiters from this G1
- get a group reference after we have quiesced the group because it will
- remain closed for them either because of the closed flag in __g_signals
- or the later update to __g1_start. New waiters will never arrive here
- but instead continue to go into the still current G2. */
- unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0);
- while ((r >> 1) > 0)
- {
- for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--)
- {
- /* TODO Back off. */
- r = atomic_load_relaxed (cond->__data.__g_refs + g1);
- }
- if ((r >> 1) > 0)
- {
- /* There is still a waiter after spinning. Set the wake-request
- flag and block. Relaxed MO is fine because this is just about
- this futex word.
-
- Update r to include the set wake-request flag so that the upcoming
- futex_wait only blocks if the flag is still set (otherwise, we'd
- violate the basic client-side futex protocol). */
- r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1) | 1;
-
- if ((r >> 1) > 0)
- futex_wait_simple (cond->__data.__g_refs + g1, r, private);
- /* Reload here so we eventually see the most recent value even if we
- do not spin. */
- r = atomic_load_relaxed (cond->__data.__g_refs + g1);
- }
- }
- /* Acquire MO so that we synchronize with the release operation that waiters
- use to decrement __g_refs and thus happen after the waiters we waited
- for. */
- atomic_thread_fence_acquire ();
+ * Waiters in G1 will be closed out immediately by the advancing of
+ __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
+ which will prevent waiters from blocking using a futex on
+ __g_signals since it provides enough signals for all possible
+ remaining waiters. As a result, they can each consume a signal
+ and they will eventually remove their group reference. */
/* Update __g1_start, which finishes closing this group. The value we add
will never be negative because old_orig_size can only be zero when we
switch groups the first time after a condvar was initialized, in which
- case G1 will be at index 1 and we will add a value of 1. See above for
- why this takes place after waiting for quiescence of the group.
+ case G1 will be at index 1 and we will add a value of 1.
Relaxed MO is fine because the change comes with no additional
constraints that others would have to observe. */
__condvar_add_g1_start_relaxed (cond,
(old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
- /* Now reopen the group, thus enabling waiters to again block using the
- futex controlled by __g_signals. Release MO so that observers that see
- no signals (and thus can block) also see the write __g1_start and thus
- that this is now a new group (see __pthread_cond_wait_common for the
- matching acquire MO loads). */
- atomic_store_release (cond->__data.__g_signals + g1, 0);
+ unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
+
+ /* If any waiters still hold group references (and thus could be blocked),
+ then wake them all up now and prevent any running ones from blocking.
+ This is effectively a catch-all for any possible current or future
+ bugs that can allow the group size to reach 0 before all G1 waiters
+ have been awakened or at least given signals to consume, or any
+ other case that can leave blocked (or about to block) older waiters.. */
+ if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0)
+ {
+ /* First advance signals to the end of the group (i.e. enough signals
+ for the entire G1 group) to ensure that waiters which have not
+ yet blocked in the futex will not block.
+ Note that in the vast majority of cases, this should never
+ actually be necessary, since __g_signals will have enough
+ signals for the remaining g_refs waiters. As an optimization,
+ we could check this first before proceeding, although that
+ could still leave the potential for futex lost wakeup bugs
+ if the signal count was non-zero but the futex wakeup
+ was somehow lost. */
+ atomic_store_release (cond->__data.__g_signals + g1, lowseq);
+
+ futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
+ }
/* At this point, the old G1 is now a valid new G2 (but not in use yet).
No old waiter can neither grab a signal nor acquire a reference without
@@ -311,6 +272,10 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
g1 ^= 1;
*g1index ^= 1;
+ /* Now advance the new G1 g_signals to the new lowseq, giving it
+ an effective signal count of 0 to start. */
+ atomic_store_release (cond->__data.__g_signals + g1, lowseq);
+
/* These values are just observed by signalers, and thus protected by the
lock. */
unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 66786c7b9022b26c..3d290e39c8ccebb7 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -238,9 +238,7 @@ __condvar_cleanup_waiting (void *arg)
signaled), and a reference count.
The group reference count is used to maintain the number of waiters that
- are using the group's futex. Before a group can change its role, the
- reference count must show that no waiters are using the futex anymore; this
- prevents ABA issues on the futex word.
+ are using the group's futex.
To represent which intervals in the waiter sequence the groups cover (and
thus also which group slot contains G1 or G2), we use a 64b counter to
@@ -300,11 +298,12 @@ __condvar_cleanup_waiting (void *arg)
last reference.
* Reference count used by waiters concurrently with signalers that have
acquired the condvar-internal lock.
- __g_signals: The number of signals that can still be consumed.
+ __g_signals: The number of signals that can still be consumed, relative to
+ the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits
+ 31 to 1 of g1_start with the signal count added)
* Used as a futex word by waiters. Used concurrently by waiters and
signalers.
- * LSB is true iff this group has been completely signaled (i.e., it is
- closed).
+ * LSB is currently reserved and 0.
__g_size: Waiters remaining in this group (i.e., which have not been
signaled yet.
* Accessed by signalers and waiters that cancel waiting (both do so only
@@ -328,18 +327,6 @@ __condvar_cleanup_waiting (void *arg)
sufficient because if a waiter can see a sufficiently large value, it could
have also consume a signal in the waiters group.
- Waiters try to grab a signal from __g_signals without holding a reference
- count, which can lead to stealing a signal from a more recent group after
- their own group was already closed. They cannot always detect whether they
- in fact did because they do not know when they stole, but they can
- conservatively add a signal back to the group they stole from; if they
- did so unnecessarily, all that happens is a spurious wake-up. To make this
- even less likely, __g1_start contains the index of the current g2 too,
- which allows waiters to check if there aliasing on the group slots; if
- there wasn't, they didn't steal from the current G1, which means that the
- G1 they stole from must have been already closed and they do not need to
- fix anything.
-
It is essential that the last field in pthread_cond_t is __g_signals[1]:
The previous condvar used a pointer-sized field in pthread_cond_t, so a
PTHREAD_COND_INITIALIZER from that condvar implementation might only
@@ -435,6 +422,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
{
while (1)
{
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+
/* Spin-wait first.
Note that spinning first without checking whether a timeout
passed might lead to what looks like a spurious wake-up even
@@ -446,35 +436,45 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
having to compare against the current time seems to be the right
choice from a performance perspective for most use cases. */
unsigned int spin = maxspin;
- while (signals == 0 && spin > 0)
+ while (spin > 0 && ((int)(signals - lowseq) < 2))
{
/* Check that we are not spinning on a group that's already
closed. */
- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
- goto done;
+ if (seq < (g1_start >> 1))
+ break;
/* TODO Back off. */
/* Reload signals. See above for MO. */
signals = atomic_load_acquire (cond->__data.__g_signals + g);
+ g1_start = __condvar_load_g1_start_relaxed (cond);
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
spin--;
}
- /* If our group will be closed as indicated by the flag on signals,
- don't bother grabbing a signal. */
- if (signals & 1)
- goto done;
-
- /* If there is an available signal, don't block. */
- if (signals != 0)
+ if (seq < (g1_start >> 1))
+ {
+ /* If the group is closed already,
+ then this waiter originally had enough extra signals to
+ consume, up until the time its group was closed. */
+ goto done;
+ }
+
+ /* If there is an available signal, don't block.
+ If __g1_start has advanced at all, then we must be in G1
+ by now, perhaps in the process of switching back to an older
+ G2, but in either case we're allowed to consume the available
+ signal and should not block anymore. */
+ if ((int)(signals - lowseq) >= 2)
break;
/* No signals available after spinning, so prepare to block.
We first acquire a group reference and use acquire MO for that so
that we synchronize with the dummy read-modify-write in
__condvar_quiesce_and_switch_g1 if we read from that. In turn,
- in this case this will make us see the closed flag on __g_signals
- that designates a concurrent attempt to reuse the group's slot.
+ in this case this will make us see the advancement of __g_signals
+ to the upcoming new g1_start that occurs with a concurrent
+ attempt to reuse the group's slot.
We use acquire MO for the __g_signals check to make the
__g1_start check work (see spinning above).
Note that the group reference acquisition will not mask the
@@ -482,15 +482,24 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
an atomic read-modify-write operation and thus extend the release
sequence. */
atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
- if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
- || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
+ signals = atomic_load_acquire (cond->__data.__g_signals + g);
+ g1_start = __condvar_load_g1_start_relaxed (cond);
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+
+ if (seq < (g1_start >> 1))
{
- /* Our group is closed. Wake up any signalers that might be
- waiting. */
+ /* group is closed already, so don't block */
__condvar_dec_grefs (cond, g, private);
goto done;
}
+ if ((int)(signals - lowseq) >= 2)
+ {
+ /* a signal showed up or G1/G2 switched after we grabbed the refcount */
+ __condvar_dec_grefs (cond, g, private);
+ break;
+ }
+
// Now block.
struct _pthread_cleanup_buffer buffer;
struct _condvar_cleanup_buffer cbuffer;
@@ -501,7 +510,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
__pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
err = __futex_abstimed_wait_cancelable64 (
- cond->__data.__g_signals + g, 0, clockid, abstime, private);
+ cond->__data.__g_signals + g, signals, clockid, abstime, private);
__pthread_cleanup_pop (&buffer, 0);
@@ -524,6 +533,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
signals = atomic_load_acquire (cond->__data.__g_signals + g);
}
+ if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
+ goto done;
}
/* Try to grab a signal. Use acquire MO so that we see an up-to-date value
of __g1_start below (see spinning above for a similar case). In
@@ -532,69 +543,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
&signals, signals - 2));
- /* We consumed a signal but we could have consumed from a more recent group
- that aliased with ours due to being in the same group slot. If this
- might be the case our group must be closed as visible through
- __g1_start. */
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
- if (seq < (g1_start >> 1))
- {
- /* We potentially stole a signal from a more recent group but we do not
- know which group we really consumed from.
- We do not care about groups older than current G1 because they are
- closed; we could have stolen from these, but then we just add a
- spurious wake-up for the current groups.
- We will never steal a signal from current G2 that was really intended
- for G2 because G2 never receives signals (until it becomes G1). We
- could have stolen a signal from G2 that was conservatively added by a
- previous waiter that also thought it stole a signal -- but given that
- that signal was added unnecessarily, it's not a problem if we steal
- it.
- Thus, the remaining case is that we could have stolen from the current
- G1, where "current" means the __g1_start value we observed. However,
- if the current G1 does not have the same slot index as we do, we did
- not steal from it and do not need to undo that. This is the reason
- for putting a bit with G2's index into__g1_start as well. */
- if (((g1_start & 1) ^ 1) == g)
- {
- /* We have to conservatively undo our potential mistake of stealing
- a signal. We can stop trying to do that when the current G1
- changes because other spinning waiters will notice this too and
- __condvar_quiesce_and_switch_g1 has checked that there are no
- futex waiters anymore before switching G1.
- Relaxed MO is fine for the __g1_start load because we need to
- merely be able to observe this fact and not have to observe
- something else as well.
- ??? Would it help to spin for a little while to see whether the
- current G1 gets closed? This might be worthwhile if the group is
- small or close to being closed. */
- unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
- while (__condvar_load_g1_start_relaxed (cond) == g1_start)
- {
- /* Try to add a signal. We don't need to acquire the lock
- because at worst we can cause a spurious wake-up. If the
- group is in the process of being closed (LSB is true), this
- has an effect similar to us adding a signal. */
- if (((s & 1) != 0)
- || atomic_compare_exchange_weak_relaxed
- (cond->__data.__g_signals + g, &s, s + 2))
- {
- /* If we added a signal, we also need to add a wake-up on
- the futex. We also need to do that if we skipped adding
- a signal because the group is being closed because
- while __condvar_quiesce_and_switch_g1 could have closed
- the group, it might still be waiting for futex waiters to
- leave (and one of those waiters might be the one we stole
- the signal from, which cause it to block using the
- futex). */
- futex_wake (cond->__data.__g_signals + g, 1, private);
- break;
- }
- /* TODO Back off. */
- }
- }
- }
-
done:
/* Confirm that we have been woken. We do that before acquiring the mutex


@@ -0,0 +1,134 @@
commit ea13a35e37932cabeef7d7b018aaef1136287a5e
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 07:55:22 2024 -0500
nptl: Update comments and indentation for new condvar implementation
Some comments were wrong after the most recent commit. This fixes that.
Also fixing indentation where it was using spaces instead of tabs.
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit 0cc973160c23bb67f895bc887dd6942d29f8fee3)
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index 4855b8899f887ad0..3475d1512354be3c 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -221,8 +221,9 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
* New waiters arriving concurrently with the group switching will all go
into G2 until we atomically make the switch. Waiters existing in G2
are not affected.
- * Waiters in G1 will be closed out immediately by the advancing of
- __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
+ * Waiters in G1 have already received a signal and been woken. If they
+ haven't woken yet, they will be closed out immediately by the advancing
+ of __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
which will prevent waiters from blocking using a futex on
__g_signals since it provides enough signals for all possible
remaining waiters. As a result, they can each consume a signal
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 3d290e39c8ccebb7..ad2cee7d59ddc093 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -249,7 +249,7 @@ __condvar_cleanup_waiting (void *arg)
figure out whether they are in a group that has already been completely
signaled (i.e., if the current G1 starts at a later position that the
waiter's position). Waiters cannot determine whether they are currently
- in G2 or G1 -- but they do not have too because all they are interested in
+ in G2 or G1 -- but they do not have to because all they are interested in
is whether there are available signals, and they always start in G2 (whose
group slot they know because of the bit in the waiter sequence. Signalers
will simply fill the right group until it is completely signaled and can
@@ -412,7 +412,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
}
/* Now wait until a signal is available in our group or it is closed.
- Acquire MO so that if we observe a value of zero written after group
+ Acquire MO so that if we observe (signals == lowseq) after group
switching in __condvar_quiesce_and_switch_g1, we synchronize with that
store and will see the prior update of __g1_start done while switching
groups too. */
@@ -422,8 +422,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
{
while (1)
{
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
/* Spin-wait first.
Note that spinning first without checking whether a timeout
@@ -447,21 +447,21 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
/* Reload signals. See above for MO. */
signals = atomic_load_acquire (cond->__data.__g_signals + g);
- g1_start = __condvar_load_g1_start_relaxed (cond);
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+ g1_start = __condvar_load_g1_start_relaxed (cond);
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
spin--;
}
- if (seq < (g1_start >> 1))
+ if (seq < (g1_start >> 1))
{
- /* If the group is closed already,
+ /* If the group is closed already,
then this waiter originally had enough extra signals to
consume, up until the time its group was closed. */
goto done;
- }
+ }
/* If there is an available signal, don't block.
- If __g1_start has advanced at all, then we must be in G1
+ If __g1_start has advanced at all, then we must be in G1
by now, perhaps in the process of switching back to an older
G2, but in either case we're allowed to consume the available
signal and should not block anymore. */
@@ -483,22 +483,23 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
sequence. */
atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
signals = atomic_load_acquire (cond->__data.__g_signals + g);
- g1_start = __condvar_load_g1_start_relaxed (cond);
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+ g1_start = __condvar_load_g1_start_relaxed (cond);
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
- if (seq < (g1_start >> 1))
+ if (seq < (g1_start >> 1))
{
- /* group is closed already, so don't block */
+ /* group is closed already, so don't block */
__condvar_dec_grefs (cond, g, private);
goto done;
}
if ((int)(signals - lowseq) >= 2)
{
- /* a signal showed up or G1/G2 switched after we grabbed the refcount */
+ /* a signal showed up or G1/G2 switched after we grabbed the
+ refcount */
__condvar_dec_grefs (cond, g, private);
break;
- }
+ }
// Now block.
struct _pthread_cleanup_buffer buffer;
@@ -536,10 +537,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
goto done;
}
- /* Try to grab a signal. Use acquire MO so that we see an up-to-date value
- of __g1_start below (see spinning above for a similar case). In
- particular, if we steal from a more recent group, we will also see a
- more recent __g1_start below. */
+ /* Try to grab a signal. See above for MO. (if we do another loop
+ iteration we need to see the correct value of g1_start) */
while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
&signals, signals - 2));


@@ -0,0 +1,68 @@
commit d0da34ad302df61c4e4c3030845cbe9b986196bf
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 07:55:50 2024 -0500
nptl: Remove unnecessary catch-all-wake in condvar group switch
This wake is unnecessary. We only switch groups after every sleeper in a group
has been woken. Sure, they may take a while to actually wake up and may still
hold a reference, but waking them a second time doesn't speed that up. Instead
this just makes the code more complicated and may hide problems.
In particular this safety wake wouldn't even have helped with the bug that was
fixed by Barrus' patch: The bug there was that pthread_cond_signal would not
switch g1 when it should, so we wouldn't even have entered this code path.
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit b42cc6af11062c260c7dfa91f1c89891366fed3e)
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index 3475d1512354be3c..30b8eee149cee195 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -221,13 +221,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
* New waiters arriving concurrently with the group switching will all go
into G2 until we atomically make the switch. Waiters existing in G2
are not affected.
- * Waiters in G1 have already received a signal and been woken. If they
- haven't woken yet, they will be closed out immediately by the advancing
- of __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
- which will prevent waiters from blocking using a futex on
- __g_signals since it provides enough signals for all possible
- remaining waiters. As a result, they can each consume a signal
- and they will eventually remove their group reference. */
+ * Waiters in G1 have already received a signal and been woken. */
/* Update __g1_start, which finishes closing this group. The value we add
will never be negative because old_orig_size can only be zero when we
@@ -240,29 +234,6 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
- /* If any waiters still hold group references (and thus could be blocked),
- then wake them all up now and prevent any running ones from blocking.
- This is effectively a catch-all for any possible current or future
- bugs that can allow the group size to reach 0 before all G1 waiters
- have been awakened or at least given signals to consume, or any
- other case that can leave blocked (or about to block) older waiters.. */
- if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0)
- {
- /* First advance signals to the end of the group (i.e. enough signals
- for the entire G1 group) to ensure that waiters which have not
- yet blocked in the futex will not block.
- Note that in the vast majority of cases, this should never
- actually be necessary, since __g_signals will have enough
- signals for the remaining g_refs waiters. As an optimization,
- we could check this first before proceeding, although that
- could still leave the potential for futex lost wakeup bugs
- if the signal count was non-zero but the futex wakeup
- was somehow lost. */
- atomic_store_release (cond->__data.__g_signals + g1, lowseq);
-
- futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
- }
-
/* At this point, the old G1 is now a valid new G2 (but not in use yet).
No old waiter can neither grab a signal nor acquire a reference without
noticing that __g1_start is larger.


@@ -0,0 +1,108 @@
commit 6f5ba03968339122e11d5185fed5ff6f99ee4f28
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 07:56:13 2024 -0500
nptl: Remove unnecessary quadruple check in pthread_cond_wait
pthread_cond_wait was checking whether it was in a closed group no less than
four times. Checking once is enough. Here are the four checks:
1. While spin-waiting. This was dead code: maxspin is set to 0 and has been
for years.
2. Before deciding to go to sleep, and before incrementing grefs: I kept this
3. After incrementing grefs. There is no reason to think that the group would
close while we do an atomic increment. Obviously it could close at any
point, but that doesn't mean we have to recheck after every step. This
check was equally good as check 2, except it has to do more work.
4. When we find ourselves in a group that has a signal. We only get here after
we check that we're not in a closed group. There is no need to check again.
The check would only have helped in cases where the compare_exchange in the
next line would also have failed. Relying on the compare_exchange is fine.
Removing the duplicate checks clarifies the code.
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit 4f7b051f8ee3feff1b53b27a906f245afaa9cee1)
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index ad2cee7d59ddc093..cfdd13bb87c72fa5 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -366,7 +366,6 @@ static __always_inline int
__pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
clockid_t clockid, const struct __timespec64 *abstime)
{
- const int maxspin = 0;
int err;
int result = 0;
@@ -425,33 +424,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
- /* Spin-wait first.
- Note that spinning first without checking whether a timeout
- passed might lead to what looks like a spurious wake-up even
- though we should return ETIMEDOUT (e.g., if the caller provides
- an absolute timeout that is clearly in the past). However,
- (1) spurious wake-ups are allowed, (2) it seems unlikely that a
- user will (ab)use pthread_cond_wait as a check for whether a
- point in time is in the past, and (3) spinning first without
- having to compare against the current time seems to be the right
- choice from a performance perspective for most use cases. */
- unsigned int spin = maxspin;
- while (spin > 0 && ((int)(signals - lowseq) < 2))
- {
- /* Check that we are not spinning on a group that's already
- closed. */
- if (seq < (g1_start >> 1))
- break;
-
- /* TODO Back off. */
-
- /* Reload signals. See above for MO. */
- signals = atomic_load_acquire (cond->__data.__g_signals + g);
- g1_start = __condvar_load_g1_start_relaxed (cond);
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
- spin--;
- }
-
if (seq < (g1_start >> 1))
{
/* If the group is closed already,
@@ -482,24 +454,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
an atomic read-modify-write operation and thus extend the release
sequence. */
atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
- signals = atomic_load_acquire (cond->__data.__g_signals + g);
- g1_start = __condvar_load_g1_start_relaxed (cond);
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
-
- if (seq < (g1_start >> 1))
- {
- /* group is closed already, so don't block */
- __condvar_dec_grefs (cond, g, private);
- goto done;
- }
-
- if ((int)(signals - lowseq) >= 2)
- {
- /* a signal showed up or G1/G2 switched after we grabbed the
- refcount */
- __condvar_dec_grefs (cond, g, private);
- break;
- }
// Now block.
struct _pthread_cleanup_buffer buffer;
@@ -533,9 +487,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
/* Reload signals. See above for MO. */
signals = atomic_load_acquire (cond->__data.__g_signals + g);
}
-
- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
- goto done;
}
/* Try to grab a signal. See above for MO. (if we do another loop
iteration we need to see the correct value of g1_start) */


@@ -0,0 +1,175 @@
commit fc2a25417df71a1ef3613216269227b7721b21c8
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 07:56:38 2024 -0500
nptl: Remove g_refs from condition variables
This variable used to be needed to wait in group switching until all sleepers
have confirmed that they have woken. This is no longer needed. Nothing waits
on this variable so there is no need to track how many threads are currently
asleep in each group.
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit c36fc50781995e6758cae2b6927839d0157f213c)
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index cfdd13bb87c72fa5..411fc0380b78f482 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -143,23 +143,6 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
}
}
-/* Wake up any signalers that might be waiting. */
-static void
-__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private)
-{
- /* Release MO to synchronize-with the acquire load in
- __condvar_quiesce_and_switch_g1. */
- if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3)
- {
- /* Clear the wake-up request flag before waking up. We do not need more
- than relaxed MO and it doesn't matter if we apply this for an aliased
- group because we wake all futex waiters right after clearing the
- flag. */
- atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1);
- futex_wake (cond->__data.__g_refs + g, INT_MAX, private);
- }
-}
-
/* Clean-up for cancellation of waiters waiting for normal signals. We cancel
our registration as a waiter, confirm we have woken up, and re-acquire the
mutex. */
@@ -171,8 +154,6 @@ __condvar_cleanup_waiting (void *arg)
pthread_cond_t *cond = cbuffer->cond;
unsigned g = cbuffer->wseq & 1;
- __condvar_dec_grefs (cond, g, cbuffer->private);
-
__condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private);
/* FIXME With the current cancellation implementation, it is possible that
a thread is cancelled after it has returned from a syscall. This could
@@ -327,15 +308,6 @@ __condvar_cleanup_waiting (void *arg)
sufficient because if a waiter can see a sufficiently large value, it could
have also consume a signal in the waiters group.
- It is essential that the last field in pthread_cond_t is __g_signals[1]:
- The previous condvar used a pointer-sized field in pthread_cond_t, so a
- PTHREAD_COND_INITIALIZER from that condvar implementation might only
- initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes
- in total instead of the 48 we need). __g_signals[1] is not accessed before
- the first group switch (G2 starts at index 0), which will set its value to
- zero after a harmless fetch-or whose return value is ignored. This
- effectively completes initialization.
-
Limitations:
* This condvar isn't designed to allow for more than
@@ -440,21 +412,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
if ((int)(signals - lowseq) >= 2)
break;
- /* No signals available after spinning, so prepare to block.
- We first acquire a group reference and use acquire MO for that so
- that we synchronize with the dummy read-modify-write in
- __condvar_quiesce_and_switch_g1 if we read from that. In turn,
- in this case this will make us see the advancement of __g_signals
- to the upcoming new g1_start that occurs with a concurrent
- attempt to reuse the group's slot.
- We use acquire MO for the __g_signals check to make the
- __g1_start check work (see spinning above).
- Note that the group reference acquisition will not mask the
- release MO when decrementing the reference count because we use
- an atomic read-modify-write operation and thus extend the release
- sequence. */
- atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
-
// Now block.
struct _pthread_cleanup_buffer buffer;
struct _condvar_cleanup_buffer cbuffer;
@@ -471,18 +428,11 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
{
- __condvar_dec_grefs (cond, g, private);
- /* If we timed out, we effectively cancel waiting. Note that
- we have decremented __g_refs before cancellation, so that a
- deadlock between waiting for quiescence of our group in
- __condvar_quiesce_and_switch_g1 and us trying to acquire
- the lock during cancellation is not possible. */
+ /* If we timed out, we effectively cancel waiting. */
__condvar_cancel_waiting (cond, seq, g, private);
result = err;
goto done;
}
- else
- __condvar_dec_grefs (cond, g, private);
/* Reload signals. See above for MO. */
signals = atomic_load_acquire (cond->__data.__g_signals + g);
diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c
index 1336e9c79d97ca70..bdcb45c53674a5fd 100644
--- a/nptl/tst-cond22.c
+++ b/nptl/tst-cond22.c
@@ -106,13 +106,13 @@ do_test (void)
status = 1;
}
- printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n",
+ printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n",
c.__data.__wseq.__value32.__high,
c.__data.__wseq.__value32.__low,
c.__data.__g1_start.__value32.__high,
c.__data.__g1_start.__value32.__low,
- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
+ c.__data.__g_signals[0], c.__data.__g_size[0],
+ c.__data.__g_signals[1], c.__data.__g_size[1],
c.__data.__g1_orig_size, c.__data.__wrefs);
if (pthread_create (&th, NULL, tf, (void *) 1l) != 0)
@@ -152,13 +152,13 @@ do_test (void)
status = 1;
}
- printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n",
+ printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n",
c.__data.__wseq.__value32.__high,
c.__data.__wseq.__value32.__low,
c.__data.__g1_start.__value32.__high,
c.__data.__g1_start.__value32.__low,
- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
+ c.__data.__g_signals[0], c.__data.__g_size[0],
+ c.__data.__g_signals[1], c.__data.__g_size[1],
c.__data.__g1_orig_size, c.__data.__wrefs);
return status;
diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
index df54eef6f71f2cee..a3d482f80f7d0d35 100644
--- a/sysdeps/nptl/bits/thread-shared-types.h
+++ b/sysdeps/nptl/bits/thread-shared-types.h
@@ -95,8 +95,7 @@ struct __pthread_cond_s
{
__atomic_wide_counter __wseq;
__atomic_wide_counter __g1_start;
- unsigned int __g_refs[2] __LOCK_ALIGNMENT;
- unsigned int __g_size[2];
+ unsigned int __g_size[2] __LOCK_ALIGNMENT;
unsigned int __g1_orig_size;
unsigned int __wrefs;
unsigned int __g_signals[2];
diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h
index 3d4f4a756c66750d..9af75d6eae090218 100644
--- a/sysdeps/nptl/pthread.h
+++ b/sysdeps/nptl/pthread.h
@@ -152,7 +152,7 @@ enum
/* Conditional variable handling. */
-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, {0, 0}, 0, 0, {0, 0} } }
+#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } }
/* Cleanup buffers */
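With __g_refs gone, the condvar only tracks waiter counts and signal counts per group. As a reminder of the user-visible protocol all of this implements, here is a minimal standalone wait/signal sketch (ordinary POSIX API, not glibc-internal code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int ready;               /* predicate protected by LOCK */

static void *
waiter (void *arg)
{
  pthread_mutex_lock (&lock);
  /* Spurious wakeups are allowed, so the predicate is always re-checked;
     internally the waiter joins a group and either consumes an available
     signal or blocks on that group's __g_signals futex word.  */
  while (!ready)
    pthread_cond_wait (&cond, &lock);
  pthread_mutex_unlock (&lock);
  return NULL;
}

int
main (void)
{
  pthread_t t;
  pthread_create (&t, NULL, waiter, NULL);

  pthread_mutex_lock (&lock);
  ready = 1;
  pthread_cond_signal (&cond);  /* wakes one blocked waiter, if any */
  pthread_mutex_unlock (&lock);

  pthread_join (t, NULL);
  puts ("waiter woke up");
  return 0;
}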

View File

@ -0,0 +1,92 @@
commit 582c99b2c04d6da95743b36bf8e5c54dec178274
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 08:03:44 2024 -0500
nptl: Use a single loop in pthread_cond_wait instead of a nested loop
The loop was a little more complicated than necessary. There was only one
break statement out of the inner loop, and the outer loop was nearly empty.
So just remove the outer loop, moving its code to the one break statement in
the inner loop. This allows us to replace all gotos with break statements.
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit 929a4764ac90382616b6a21f099192b2475da674)
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 411fc0380b78f482..683cb2b133f2163f 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -382,17 +382,15 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
return err;
}
- /* Now wait until a signal is available in our group or it is closed.
- Acquire MO so that if we observe (signals == lowseq) after group
- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
- store and will see the prior update of __g1_start done while switching
- groups too. */
- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
-
- do
- {
+
while (1)
{
+ /* Now wait until a signal is available in our group or it is closed.
+ Acquire MO so that if we observe (signals == lowseq) after group
+ switching in __condvar_quiesce_and_switch_g1, we synchronize with that
+ store and will see the prior update of __g1_start done while switching
+ groups too. */
+ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
@@ -401,7 +399,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
/* If the group is closed already,
then this waiter originally had enough extra signals to
consume, up until the time its group was closed. */
- goto done;
+ break;
}
/* If there is an available signal, don't block.
@@ -410,7 +408,16 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
G2, but in either case we're allowed to consume the available
signal and should not block anymore. */
if ((int)(signals - lowseq) >= 2)
- break;
+ {
+ /* Try to grab a signal. See above for MO. (if we do another loop
+ iteration we need to see the correct value of g1_start) */
+ if (atomic_compare_exchange_weak_acquire (
+ cond->__data.__g_signals + g,
+ &signals, signals - 2))
+ break;
+ else
+ continue;
+ }
// Now block.
struct _pthread_cleanup_buffer buffer;
@@ -431,19 +438,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
/* If we timed out, we effectively cancel waiting. */
__condvar_cancel_waiting (cond, seq, g, private);
result = err;
- goto done;
+ break;
}
-
- /* Reload signals. See above for MO. */
- signals = atomic_load_acquire (cond->__data.__g_signals + g);
}
- }
- /* Try to grab a signal. See above for MO. (if we do another loop
- iteration we need to see the correct value of g1_start) */
- while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
- &signals, signals - 2));
-
- done:
/* Confirm that we have been woken. We do that before acquiring the mutex
to allow for execution of pthread_cond_destroy while having acquired the
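As a standalone illustration of the restructuring described above (a sketch with toy stand-in functions, not the patched glibc code), the same shape can be written as a single loop where every exit is a break and every retry a continue:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the real signal bookkeeping; purely illustrative.  */
static int pending = 1;

static bool signal_available (void) { return pending > 0; }
static bool try_grab_signal (void)  { return --pending >= 0; }
static void block_until_signal (void) { pending = 1; }

static void
wait_for_signal (void)
{
  while (1)
    {
      if (signal_available ())
        {
          if (try_grab_signal ())
            break;              /* consumed a signal: done */
          continue;             /* lost the race: re-read the state */
        }
      block_until_signal ();    /* no signal yet: sleep, then loop */
    }
}

int
main (void)
{
  wait_for_signal ();
  puts ("signal consumed");
  return 0;
}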

View File

@ -0,0 +1,139 @@
commit 2fdc0afd0763377dc51870449b476f77baeb8aa0
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 08:04:10 2024 -0500
nptl: Fix indentation
In my previous change I turned a nested loop into a simple loop. I'm doing
the resulting indentation changes in a separate commit to make the diff on
the previous commit easier to review.
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit ee6c14ed59d480720721aaacc5fb03213dc153da)
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 683cb2b133f2163f..7fc9dadf15aa9bc6 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -383,65 +383,65 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
}
- while (1)
- {
- /* Now wait until a signal is available in our group or it is closed.
- Acquire MO so that if we observe (signals == lowseq) after group
- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
- store and will see the prior update of __g1_start done while switching
- groups too. */
- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
-
- if (seq < (g1_start >> 1))
- {
- /* If the group is closed already,
- then this waiter originally had enough extra signals to
- consume, up until the time its group was closed. */
- break;
- }
-
- /* If there is an available signal, don't block.
- If __g1_start has advanced at all, then we must be in G1
- by now, perhaps in the process of switching back to an older
- G2, but in either case we're allowed to consume the available
- signal and should not block anymore. */
- if ((int)(signals - lowseq) >= 2)
- {
- /* Try to grab a signal. See above for MO. (if we do another loop
- iteration we need to see the correct value of g1_start) */
- if (atomic_compare_exchange_weak_acquire (
- cond->__data.__g_signals + g,
+ while (1)
+ {
+ /* Now wait until a signal is available in our group or it is closed.
+ Acquire MO so that if we observe (signals == lowseq) after group
+ switching in __condvar_quiesce_and_switch_g1, we synchronize with that
+ store and will see the prior update of __g1_start done while switching
+ groups too. */
+ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+
+ if (seq < (g1_start >> 1))
+ {
+ /* If the group is closed already,
+ then this waiter originally had enough extra signals to
+ consume, up until the time its group was closed. */
+ break;
+ }
+
+ /* If there is an available signal, don't block.
+ If __g1_start has advanced at all, then we must be in G1
+ by now, perhaps in the process of switching back to an older
+ G2, but in either case we're allowed to consume the available
+ signal and should not block anymore. */
+ if ((int)(signals - lowseq) >= 2)
+ {
+ /* Try to grab a signal. See above for MO. (if we do another loop
+ iteration we need to see the correct value of g1_start) */
+ if (atomic_compare_exchange_weak_acquire (
+ cond->__data.__g_signals + g,
&signals, signals - 2))
- break;
- else
- continue;
- }
-
- // Now block.
- struct _pthread_cleanup_buffer buffer;
- struct _condvar_cleanup_buffer cbuffer;
- cbuffer.wseq = wseq;
- cbuffer.cond = cond;
- cbuffer.mutex = mutex;
- cbuffer.private = private;
- __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
-
- err = __futex_abstimed_wait_cancelable64 (
- cond->__data.__g_signals + g, signals, clockid, abstime, private);
-
- __pthread_cleanup_pop (&buffer, 0);
-
- if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
- {
- /* If we timed out, we effectively cancel waiting. */
- __condvar_cancel_waiting (cond, seq, g, private);
- result = err;
break;
- }
+ else
+ continue;
}
+ // Now block.
+ struct _pthread_cleanup_buffer buffer;
+ struct _condvar_cleanup_buffer cbuffer;
+ cbuffer.wseq = wseq;
+ cbuffer.cond = cond;
+ cbuffer.mutex = mutex;
+ cbuffer.private = private;
+ __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
+
+ err = __futex_abstimed_wait_cancelable64 (
+ cond->__data.__g_signals + g, signals, clockid, abstime, private);
+
+ __pthread_cleanup_pop (&buffer, 0);
+
+ if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
+ {
+ /* If we timed out, we effectively cancel waiting. */
+ __condvar_cancel_waiting (cond, seq, g, private);
+ result = err;
+ break;
+ }
+ }
+
/* Confirm that we have been woken. We do that before acquiring the mutex
to allow for execution of pthread_cond_destroy while having acquired the
mutex. */

View File

@ -0,0 +1,148 @@
commit ac5da3c0e4ed9cbdbb88928c5c9886d02a6dd7ed
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 08:04:54 2024 -0500
nptl: rename __condvar_quiesce_and_switch_g1
This function no longer waits for threads to leave g1, so rename it to
__condvar_switch_g1
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit 4b79e27a5073c02f6bff9aa8f4791230a0ab1867)
diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c
index aada91639a346f19..38bba17bfc8a0083 100644
--- a/nptl/pthread_cond_broadcast.c
+++ b/nptl/pthread_cond_broadcast.c
@@ -60,7 +60,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
cond->__data.__g_size[g1] << 1);
cond->__data.__g_size[g1] = 0;
- /* We need to wake G1 waiters before we quiesce G1 below. */
+ /* We need to wake G1 waiters before we switch G1 below. */
/* TODO Only set it if there are indeed futex waiters. We could
also try to move this out of the critical section in cases when
G2 is empty (and we don't need to quiesce). */
@@ -69,7 +69,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
/* G1 is complete. Step (2) is next unless there are no waiters in G2, in
which case we can stop. */
- if (__condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private))
+ if (__condvar_switch_g1 (cond, wseq, &g1, private))
{
/* Step (3): Send signals to all waiters in the old G2 / new G1. */
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index 30b8eee149cee195..5044273cc265ce94 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -189,16 +189,15 @@ __condvar_get_private (int flags)
return FUTEX_SHARED;
}
-/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to
- leave G1, converts G1 into a fresh G2, and then switches group roles so that
- the former G2 becomes the new G1 ending at the current __wseq value when we
- eventually make the switch (WSEQ is just an observation of __wseq by the
- signaler).
+/* This closes G1 (whose index is in G1INDEX), converts G1 into a fresh G2,
+ and then switches group roles so that the former G2 becomes the new G1
+ ending at the current __wseq value when we eventually make the switch
+ (WSEQ is just an observation of __wseq by the signaler).
If G2 is empty, it will not switch groups because then it would create an
empty G1 which would require switching groups again on the next signal.
Returns false iff groups were not switched because G2 was empty. */
static bool __attribute__ ((unused))
-__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
+__condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
unsigned int *g1index, int private)
{
unsigned int g1 = *g1index;
@@ -214,8 +213,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
+ cond->__data.__g_size[g1 ^ 1]) == 0)
return false;
- /* Now try to close and quiesce G1. We have to consider the following kinds
- of waiters:
+ /* We have to consider the following kinds of waiters:
* Waiters from less recent groups than G1 are not affected because
nothing will change for them apart from __g1_start getting larger.
* New waiters arriving concurrently with the group switching will all go
@@ -223,12 +221,12 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
are not affected.
* Waiters in G1 have already received a signal and been woken. */
- /* Update __g1_start, which finishes closing this group. The value we add
- will never be negative because old_orig_size can only be zero when we
- switch groups the first time after a condvar was initialized, in which
- case G1 will be at index 1 and we will add a value of 1.
- Relaxed MO is fine because the change comes with no additional
- constraints that others would have to observe. */
+ /* Update __g1_start, which closes this group. The value we add will never
+ be negative because old_orig_size can only be zero when we switch groups
+ the first time after a condvar was initialized, in which case G1 will be
+ at index 1 and we will add a value of 1. Relaxed MO is fine because the
+ change comes with no additional constraints that others would have to
+ observe. */
__condvar_add_g1_start_relaxed (cond,
(old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c
index 43d6286ecdf63f51..f09549714299c370 100644
--- a/nptl/pthread_cond_signal.c
+++ b/nptl/pthread_cond_signal.c
@@ -69,18 +69,17 @@ ___pthread_cond_signal (pthread_cond_t *cond)
bool do_futex_wake = false;
/* If G1 is still receiving signals, we put the signal there. If not, we
- check if G2 has waiters, and if so, quiesce and switch G1 to the former
- G2; if this results in a new G1 with waiters (G2 might have cancellations
- already, see __condvar_quiesce_and_switch_g1), we put the signal in the
- new G1. */
+ check if G2 has waiters, and if so, switch G1 to the former G2; if this
+ results in a new G1 with waiters (G2 might have cancellations already,
+ see __condvar_switch_g1), we put the signal in the new G1. */
if ((cond->__data.__g_size[g1] != 0)
- || __condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private))
+ || __condvar_switch_g1 (cond, wseq, &g1, private))
{
/* Add a signal. Relaxed MO is fine because signaling does not need to
- establish a happens-before relation (see above). We do not mask the
- release-MO store when initializing a group in
- __condvar_quiesce_and_switch_g1 because we use an atomic
- read-modify-write and thus extend that store's release sequence. */
+ establish a happens-before relation (see above). We do not mask the
+ release-MO store when initializing a group in __condvar_switch_g1
+ because we use an atomic read-modify-write and thus extend that
+ store's release sequence. */
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2);
cond->__data.__g_size[g1]--;
/* TODO Only set it if there are indeed futex waiters. */
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 7fc9dadf15aa9bc6..80bb7282118775b8 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -354,8 +354,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
because we do not need to establish any happens-before relation with
signalers (see __pthread_cond_signal); modification order alone
establishes a total order of waiters/signals. We do need acquire MO
- to synchronize with group reinitialization in
- __condvar_quiesce_and_switch_g1. */
+ to synchronize with group reinitialization in __condvar_switch_g1. */
uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
/* Find our group's index. We always go into what was G2 when we acquired
our position. */
@@ -387,9 +386,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
{
/* Now wait until a signal is available in our group or it is closed.
Acquire MO so that if we observe (signals == lowseq) after group
- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
- store and will see the prior update of __g1_start done while switching
- groups too. */
+ switching in __condvar_switch_g1, we synchronize with that store and
+ will see the prior update of __g1_start done while switching groups
+ too. */
unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;

View File

@ -0,0 +1,180 @@
commit b1eb369aee9cafefdbe5a65375310a918ef0c3ec
Author: Malte Skarupke <malteskarupke@fastmail.fm>
Date: Wed Dec 4 08:05:40 2024 -0500
nptl: Use all of g1_start and g_signals
The LSB of g_signals was unused. The LSB of g1_start was used to indicate
which group is G2. This was used to always go to sleep in pthread_cond_wait
if a waiter is in G2. A comment earlier in the file says that this is not
correct to do:
"Waiters cannot determine whether they are currently in G2 or G1 -- but they
do not have to because all they are interested in is whether there are
available signals"
I either would have had to update the comment, or get rid of the check. I
chose to get rid of the check. In fact I don't quite know why it was there.
There will never be available signals for group G2, so we didn't need the
special case. Even if there were, this would just be a spurious wake. This
might have caught some cases where the count has wrapped around, but it
wouldn't reliably do that, (and even if it did, why would you want to force a
sleep in that case?) and we don't support that many concurrent waiters
anyway. Getting rid of it allows us to use one more bit, making us more
robust to wraparound.
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit 91bb902f58264a2fd50fbce8f39a9a290dd23706)
diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c
index 38bba17bfc8a0083..51afa62adf7da4c1 100644
--- a/nptl/pthread_cond_broadcast.c
+++ b/nptl/pthread_cond_broadcast.c
@@ -57,7 +57,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
{
/* Add as many signals as the remaining size of the group. */
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
- cond->__data.__g_size[g1] << 1);
+ cond->__data.__g_size[g1]);
cond->__data.__g_size[g1] = 0;
/* We need to wake G1 waiters before we switch G1 below. */
@@ -73,7 +73,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
{
/* Step (3): Send signals to all waiters in the old G2 / new G1. */
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
- cond->__data.__g_size[g1] << 1);
+ cond->__data.__g_size[g1]);
cond->__data.__g_size[g1] = 0;
/* TODO Only set it if there are indeed futex waiters. */
do_futex_wake = true;
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index 5044273cc265ce94..389402913c7b7714 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -208,9 +208,9 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
behavior.
Note that this works correctly for a zero-initialized condvar too. */
unsigned int old_orig_size = __condvar_get_orig_size (cond);
- uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
- if (((unsigned) (wseq - old_g1_start - old_orig_size)
- + cond->__data.__g_size[g1 ^ 1]) == 0)
+ uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond);
+ uint64_t new_g1_start = old_g1_start + old_orig_size;
+ if (((unsigned) (wseq - new_g1_start) + cond->__data.__g_size[g1 ^ 1]) == 0)
return false;
/* We have to consider the following kinds of waiters:
@@ -221,16 +221,10 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
are not affected.
* Waiters in G1 have already received a signal and been woken. */
- /* Update __g1_start, which closes this group. The value we add will never
- be negative because old_orig_size can only be zero when we switch groups
- the first time after a condvar was initialized, in which case G1 will be
- at index 1 and we will add a value of 1. Relaxed MO is fine because the
- change comes with no additional constraints that others would have to
- observe. */
- __condvar_add_g1_start_relaxed (cond,
- (old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
-
- unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
+ /* Update __g1_start, which closes this group. Relaxed MO is fine because
+ the change comes with no additional constraints that others would have
+ to observe. */
+ __condvar_add_g1_start_relaxed (cond, old_orig_size);
/* At this point, the old G1 is now a valid new G2 (but not in use yet).
No old waiter can neither grab a signal nor acquire a reference without
@@ -242,13 +236,13 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
g1 ^= 1;
*g1index ^= 1;
- /* Now advance the new G1 g_signals to the new lowseq, giving it
+ /* Now advance the new G1 g_signals to the new g1_start, giving it
an effective signal count of 0 to start. */
- atomic_store_release (cond->__data.__g_signals + g1, lowseq);
+ atomic_store_release (cond->__data.__g_signals + g1, (unsigned)new_g1_start);
/* These values are just observed by signalers, and thus protected by the
lock. */
- unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
+ unsigned int orig_size = wseq - new_g1_start;
__condvar_set_orig_size (cond, orig_size);
/* Use and addition to not loose track of cancellations in what was
previously G2. */
diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c
index f09549714299c370..fa3a5c3d8f731687 100644
--- a/nptl/pthread_cond_signal.c
+++ b/nptl/pthread_cond_signal.c
@@ -80,7 +80,7 @@ ___pthread_cond_signal (pthread_cond_t *cond)
release-MO store when initializing a group in __condvar_switch_g1
because we use an atomic read-modify-write and thus extend that
store's release sequence. */
- atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2);
+ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 1);
cond->__data.__g_size[g1]--;
/* TODO Only set it if there are indeed futex waiters. */
do_futex_wake = true;
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 80bb7282118775b8..0f1dfcb595941eba 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -84,7 +84,7 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
not hold a reference on the group. */
__condvar_acquire_lock (cond, private);
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
if (g1_start > seq)
{
/* Our group is closed, so someone provided enough signals for it.
@@ -259,7 +259,6 @@ __condvar_cleanup_waiting (void *arg)
* Waiters fetch-add while having acquire the mutex associated with the
condvar. Signalers load it and fetch-xor it concurrently.
__g1_start: Starting position of G1 (inclusive)
- * LSB is index of current G2.
* Modified by signalers while having acquired the condvar-internal lock
and observed concurrently by waiters.
__g1_orig_size: Initial size of G1
@@ -280,11 +279,9 @@ __condvar_cleanup_waiting (void *arg)
* Reference count used by waiters concurrently with signalers that have
acquired the condvar-internal lock.
__g_signals: The number of signals that can still be consumed, relative to
- the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits
- 31 to 1 of g1_start with the signal count added)
+ the current g1_start. (i.e. g1_start with the signal count added)
* Used as a futex word by waiters. Used concurrently by waiters and
signalers.
- * LSB is currently reserved and 0.
__g_size: Waiters remaining in this group (i.e., which have not been
signaled yet.
* Accessed by signalers and waiters that cancel waiting (both do so only
@@ -391,9 +388,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
too. */
unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
- if (seq < (g1_start >> 1))
+ if (seq < g1_start)
{
/* If the group is closed already,
then this waiter originally had enough extra signals to
@@ -406,13 +402,13 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
by now, perhaps in the process of switching back to an older
G2, but in either case we're allowed to consume the available
signal and should not block anymore. */
- if ((int)(signals - lowseq) >= 2)
+ if ((int)(signals - (unsigned int)g1_start) > 0)
{
/* Try to grab a signal. See above for MO. (if we do another loop
iteration we need to see the correct value of g1_start) */
if (atomic_compare_exchange_weak_acquire (
cond->__data.__g_signals + g,
- &signals, signals - 2))
+ &signals, signals - 1))
break;
else
continue;
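The (int)(signals - (unsigned int)g1_start) > 0 test above is the usual modular-arithmetic idiom for comparing counters that may wrap around; it relies on two's-complement wraparound the same way glibc does. A small self-contained demonstration with made-up values:

#include <stdio.h>

/* Nonzero if counter A is ahead of counter B, even across a wrap of
   UINT_MAX.  */
static int
ahead (unsigned int a, unsigned int b)
{
  return (int) (a - b) > 0;
}

int
main (void)
{
  /* Near the wrap boundary, 3 is five steps ahead of 0xfffffffe...  */
  printf ("%d\n", ahead (3u, 0xfffffffeu));   /* prints 1 */
  /* ...whereas a plain unsigned comparison gets this wrong.  */
  printf ("%d\n", 3u > 0xfffffffeu);          /* prints 0 */
  return 0;
}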

View File

@ -0,0 +1,41 @@
commit d33d10642fb24091e8fc8b9115f0a17d9f78491d
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu Mar 13 06:07:07 2025 +0100
nptl: PTHREAD_COND_INITIALIZER compatibility with pre-2.41 versions (bug 32786)
The new initializer and struct layout does not initialize the
__g_signals field in the old struct layout before the change in
commit c36fc50781995e6758cae2b6927839d0157f213c ("nptl: Remove
g_refs from condition variables"). Bring back fields at the end
of struct __pthread_cond_s, so that they are again zero-initialized.
Reviewed-by: Sam James <sam@gentoo.org>
(cherry picked from commit dbc5a50d12eff4cb3f782129029d04b8a76f58e7)
diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
index a3d482f80f7d0d35..bccc2003ec6dea5c 100644
--- a/sysdeps/nptl/bits/thread-shared-types.h
+++ b/sysdeps/nptl/bits/thread-shared-types.h
@@ -99,6 +99,8 @@ struct __pthread_cond_s
unsigned int __g1_orig_size;
unsigned int __wrefs;
unsigned int __g_signals[2];
+ unsigned int __unused_initialized_1;
+ unsigned int __unused_initialized_2;
};
typedef unsigned int __tss_t;
diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h
index 9af75d6eae090218..e0f24418fe4233f0 100644
--- a/sysdeps/nptl/pthread.h
+++ b/sysdeps/nptl/pthread.h
@@ -152,7 +152,7 @@ enum
/* Conditional variable handling. */
-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } }
+#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0}, 0, 0 } }
/* Cleanup buffers */
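Because PTHREAD_COND_INITIALIZER and the struct layout are ABI, the reshuffled fields must still fit in the same storage and remain zero-initialized. A quick sanity check that can be compiled against any installed glibc (the printed size is whatever the local headers define):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* Old binaries may contain either a static initializer or a
     zero-filled object, so the two must be equivalent.  */
  static pthread_cond_t a = PTHREAD_COND_INITIALIZER;
  static pthread_cond_t b;      /* static, hence zero-initialized */

  printf ("sizeof (pthread_cond_t) = %zu\n", sizeof (pthread_cond_t));
  printf ("initializer is all-zero: %d\n", memcmp (&a, &b, sizeof a) == 0);
  return 0;
}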

View File

@ -0,0 +1,63 @@
commit 68f3f1a1d08f7f3e0fb74391461699717efbb4bc
Author: Florian Weimer <fweimer@redhat.com>
Date: Sat Feb 17 09:17:04 2024 +0100
Linux: Switch back to assembly syscall wrapper for prctl (bug 29770)
Commit ff026950e280bc3e9487b41b460fb31bc5b57721 ("Add a C wrapper for
prctl [BZ #25896]") replaced the assembler wrapper with a C function.
However, on powerpc64le-linux-gnu, the C variadic function
implementation requires extra work in the caller to set up the
parameter save area. Calling a function that needs a parameter save
area without one (because the prototype used indicates the function is
not variadic) corrupts the caller's stack. The Linux manual pages
project documents prctl as a non-variadic function. This has resulted
in various projects over the years using non-variadic prototypes,
including the sanitizer libraries in LLVm and GCC (GCC PR 113728).
This commit switches back to the assembler implementation on most
targets and only keeps the C implementation for x86-64 x32.
Also add the __prctl_time64 alias from commit
b39ffab860cd743a82c91946619f1b8158b0b65e ("Linux: Add time64 alias for
prctl") to sysdeps/unix/sysv/linux/syscalls.list; it was not yet
present in commit ff026950e280bc3e9487b41b460fb31bc5b57721.
This restores the old ABI on powerpc64le-linux-gnu, thus fixing
bug 29770.
Reviewed-By: Simon Chopin <simon.chopin@canonical.com>
(cherry picked from commit 6a04404521ac4119ae36827eeb288ea84eee7cf6)
diff --git a/sysdeps/unix/sysv/linux/syscalls.list b/sysdeps/unix/sysv/linux/syscalls.list
index 73e941ef894cd72c..9ac42c3436dd1520 100644
--- a/sysdeps/unix/sysv/linux/syscalls.list
+++ b/sysdeps/unix/sysv/linux/syscalls.list
@@ -46,6 +46,7 @@ open_tree EXTRA open_tree i:isU open_tree
pipe2 - pipe2 i:fi __pipe2 pipe2
pidfd_open EXTRA pidfd_open i:iU pidfd_open
pidfd_getfd EXTRA pidfd_getfd i:iiU pidfd_getfd
+prctl EXTRA prctl i:iiiii __prctl prctl __prctl_time64
pivot_root EXTRA pivot_root i:ss pivot_root
pidfd_send_signal EXTRA pidfd_send_signal i:iiPU pidfd_send_signal
process_madvise EXTRA process_madvise i:iPniU process_madvise
diff --git a/sysdeps/unix/sysv/linux/prctl.c b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c
similarity index 93%
rename from sysdeps/unix/sysv/linux/prctl.c
rename to sysdeps/unix/sysv/linux/x86_64/x32/prctl.c
index 52d234ea0df4cc48..4bf1b479a07c6e8f 100644
--- a/sysdeps/unix/sysv/linux/prctl.c
+++ b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c
@@ -1,4 +1,4 @@
-/* prctl - Linux specific syscall.
+/* prctl - Linux specific syscall. x86-64 x32 version.
Copyright (C) 2020-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -40,6 +40,3 @@ __prctl (int option, ...)
libc_hidden_def (__prctl)
weak_alias (__prctl, prctl)
-#if __TIMESIZE != 64
-weak_alias (__prctl, __prctl_time64)
-#endif
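The ABI question is about how prctl itself is declared and implemented, not about its documented use; callers keep the five-argument form from the manual pages. A small usage sketch with PR_SET_NAME/PR_GET_NAME (which operate on a 16-byte name buffer):

#include <stdio.h>
#include <sys/prctl.h>

int
main (void)
{
  char name[16] = { 0 };

  /* Pass zeros for the arguments an option does not use.  */
  if (prctl (PR_SET_NAME, "prctl-demo", 0, 0, 0) != 0)
    perror ("prctl (PR_SET_NAME)");
  if (prctl (PR_GET_NAME, name, 0, 0, 0) != 0)
    perror ("prctl (PR_GET_NAME)");
  else
    printf ("thread name: %s\n", name);
  return 0;
}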

View File

@ -0,0 +1,139 @@
commit e31ac9a639306c8611e1ebe9fa405037337c70e0
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Apr 30 09:21:16 2024 -0700
libio: Sort test variables in Makefile
Sort test variables in libio/Makefile using scripts/sort-makefile-lines.py.
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
(cherry picked from commit ddf71c550a5940deca74cc676f1cae134a891717)
diff --git a/libio/Makefile b/libio/Makefile
index b92aeaf62634f1cb..0c1f16ee3b54c2d3 100644
--- a/libio/Makefile
+++ b/libio/Makefile
@@ -68,22 +68,76 @@ routines_no_fortify += \
wprintf \
# routines_no_fortify
-tests = tst_swprintf tst_wprintf tst_swscanf tst_wscanf tst_getwc tst_putwc \
- tst_wprintf2 tst-widetext test-fmemopen tst-ext tst-ext2 \
- tst-fgetws tst-ungetwc1 tst-ungetwc2 tst-swscanf tst-sscanf \
- tst-mmap-setvbuf bug-ungetwc1 bug-ungetwc2 tst-atime tst-eof \
- tst-freopen bug-rewind bug-rewind2 bug-ungetc bug-fseek \
- tst-mmap-eofsync tst-mmap-fflushsync bug-mmap-fflush \
- tst-mmap2-eofsync tst-mmap-offend bug-fopena+ bug-wfflush \
- bug-ungetc2 bug-ftell bug-ungetc3 bug-ungetc4 tst-fopenloc2 \
- tst-memstream1 tst-memstream2 tst-memstream3 tst-memstream4 \
- tst-wmemstream1 tst-wmemstream2 tst-wmemstream3 tst-wmemstream4 \
- tst-wmemstream5 bug-memstream1 bug-wmemstream1 \
- tst-setvbuf1 tst-popen1 tst-fgetwc bug-wsetpos tst-fseek \
- tst-fwrite-error tst-ftell-partial-wide tst-ftell-active-handler \
- tst-ftell-append tst-fputws tst-bz22415 tst-fgetc-after-eof \
- tst-sprintf-ub tst-sprintf-chk-ub tst-bz24051 tst-bz24153 \
- tst-wfile-sync tst-bz28828 tst-getdelim
+tests = \
+ bug-fopena+ \
+ bug-fseek \
+ bug-ftell \
+ bug-memstream1 \
+ bug-mmap-fflush \
+ bug-rewind \
+ bug-rewind2 \
+ bug-ungetc \
+ bug-ungetc2 \
+ bug-ungetc3 \
+ bug-ungetc4 \
+ bug-ungetwc1 \
+ bug-ungetwc2 \
+ bug-wfflush \
+ bug-wmemstream1 \
+ bug-wsetpos \
+ test-fmemopen \
+ tst-atime \
+ tst-bz22415 \
+ tst-bz24051 \
+ tst-bz24153 \
+ tst-bz28828 \
+ tst-eof \
+ tst-ext \
+ tst-ext2 \
+ tst-fgetc-after-eof \
+ tst-fgetwc \
+ tst-fgetws \
+ tst-fopenloc2 \
+ tst-fputws \
+ tst-freopen \
+ tst-fseek \
+ tst-ftell-active-handler \
+ tst-ftell-append \
+ tst-ftell-partial-wide \
+ tst-fwrite-error \
+ tst-getdelim \
+ tst-memstream1 \
+ tst-memstream2 \
+ tst-memstream3 \
+ tst-memstream4 \
+ tst-mmap-eofsync \
+ tst-mmap-fflushsync \
+ tst-mmap-offend \
+ tst-mmap-setvbuf \
+ tst-mmap2-eofsync \
+ tst-popen1 \
+ tst-setvbuf1 \
+ tst-sprintf-chk-ub \
+ tst-sprintf-ub \
+ tst-sscanf \
+ tst-swscanf \
+ tst-ungetwc1 \
+ tst-ungetwc2 \
+ tst-wfile-sync \
+ tst-widetext \
+ tst-wmemstream1 \
+ tst-wmemstream2 \
+ tst-wmemstream3 \
+ tst-wmemstream4 \
+ tst-wmemstream5 \
+ tst_getwc \
+ tst_putwc \
+ tst_swprintf \
+ tst_swscanf \
+ tst_wprintf \
+ tst_wprintf2 \
+ tst_wscanf \
+ # tests
tests-internal = tst-vtables tst-vtables-interposed
@@ -235,16 +289,26 @@ tests-special += $(objpfx)tst-fopenloc-cmp.out $(objpfx)tst-fopenloc-mem.out \
$(objpfx)tst-bz24228-mem.out
endif
-tests += tst-cleanup-default tst-cleanup-default-static
+tests += \
+ tst-cleanup-default \
+ tst-cleanup-default-static \
+ # tests
tests-static += tst-cleanup-default-static
tests-special += $(objpfx)tst-cleanup-default-cmp.out $(objpfx)tst-cleanup-default-static-cmp.out
LDFLAGS-tst-cleanup-default = -Wl,--gc-sections
LDFLAGS-tst-cleanup-default-static = -Wl,--gc-sections
ifeq ($(have-gnu-retain)$(have-z-start-stop-gc),yesyes)
-tests += tst-cleanup-start-stop-gc tst-cleanup-start-stop-gc-static \
- tst-cleanup-nostart-stop-gc tst-cleanup-nostart-stop-gc-static
-tests-static += tst-cleanup-start-stop-gc-static tst-cleanup-nostart-stop-gc-static
+tests += \
+ tst-cleanup-nostart-stop-gc \
+ tst-cleanup-nostart-stop-gc-static \
+ tst-cleanup-start-stop-gc \
+ tst-cleanup-start-stop-gc-static \
+ # tests
+tests-static += \
+ tst-cleanup-nostart-stop-gc-static \
+ tst-cleanup-start-stop-gc-static \
+ # tests-static
tests-special += $(objpfx)tst-cleanup-start-stop-gc-cmp.out \
$(objpfx)tst-cleanup-start-stop-gc-static-cmp.out \
$(objpfx)tst-cleanup-nostart-stop-gc-cmp.out \

View File

@ -0,0 +1,195 @@
commit 1dcfb9479df400160208ac3d8ab33128d8f1aae5
Author: Arjun Shankar <arjun@redhat.com>
Date: Fri Oct 18 16:03:25 2024 +0200
libio: Fix a deadlock after fork in popen
popen modifies its file handler book-keeping under a lock that wasn't
being taken during fork. This meant that a concurrent popen and fork
could end up copying the lock in a "locked" state into the fork child,
where subsequently calling popen would lead to a deadlock due to the
already (spuriously) held lock.
This commit fixes the deadlock by appropriately taking the lock before
fork, and releasing/resetting it in the parent/child after the fork.
A new test for concurrent popen and fork is also added. It consistently
hangs (and therefore fails via timeout) without the fix applied.
Reviewed-by: Florian Weimer <fweimer@redhat.com>
(cherry picked from commit 9f0d2c0ee6c728643fcf9a4879e9f20f5e45ce5f)
diff --git a/libio/Makefile b/libio/Makefile
index 0c1f16ee3b54c2d3..d1f2342867601735 100644
--- a/libio/Makefile
+++ b/libio/Makefile
@@ -115,6 +115,7 @@ tests = \
tst-mmap-offend \
tst-mmap-setvbuf \
tst-mmap2-eofsync \
+ tst-popen-fork \
tst-popen1 \
tst-setvbuf1 \
tst-sprintf-chk-ub \
diff --git a/libio/iopopen.c b/libio/iopopen.c
index d01cb0648e3aac54..352513a2914a9d36 100644
--- a/libio/iopopen.c
+++ b/libio/iopopen.c
@@ -57,6 +57,26 @@ unlock (void *not_used)
}
#endif
+/* These lock/unlock/resetlock functions are used during fork. */
+
+void
+_IO_proc_file_chain_lock (void)
+{
+ _IO_lock_lock (proc_file_chain_lock);
+}
+
+void
+_IO_proc_file_chain_unlock (void)
+{
+ _IO_lock_unlock (proc_file_chain_lock);
+}
+
+void
+_IO_proc_file_chain_resetlock (void)
+{
+ _IO_lock_init (proc_file_chain_lock);
+}
+
/* POSIX states popen shall ensure that any streams from previous popen()
calls that remain open in the parent process should be closed in the new
child process.
diff --git a/libio/libioP.h b/libio/libioP.h
index 616253fcd00f04db..a83a411fdf7d93c9 100644
--- a/libio/libioP.h
+++ b/libio/libioP.h
@@ -429,6 +429,12 @@ libc_hidden_proto (_IO_list_resetlock)
extern void _IO_enable_locks (void) __THROW;
libc_hidden_proto (_IO_enable_locks)
+/* Functions for operating popen's proc_file_chain_lock during fork. */
+
+extern void _IO_proc_file_chain_lock (void) __THROW attribute_hidden;
+extern void _IO_proc_file_chain_unlock (void) __THROW attribute_hidden;
+extern void _IO_proc_file_chain_resetlock (void) __THROW attribute_hidden;
+
/* Default jumptable functions. */
extern int _IO_default_underflow (FILE *) __THROW;
diff --git a/libio/tst-popen-fork.c b/libio/tst-popen-fork.c
new file mode 100644
index 0000000000000000..1df30fc6c0a3f583
--- /dev/null
+++ b/libio/tst-popen-fork.c
@@ -0,0 +1,80 @@
+/* Test concurrent popen and fork.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdio.h>
+#include <stdatomic.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/wait.h>
+
+#include <support/check.h>
+#include <support/xthread.h>
+#include <support/xunistd.h>
+
+static void
+popen_and_pclose (void)
+{
+ FILE *f = popen ("true", "r");
+ TEST_VERIFY_EXIT (f != NULL);
+ pclose (f);
+ return;
+}
+
+static atomic_bool done = ATOMIC_VAR_INIT (0);
+
+static void *
+popen_and_pclose_forever (__attribute__ ((unused))
+ void *arg)
+{
+ while (!atomic_load_explicit (&done, memory_order_acquire))
+ popen_and_pclose ();
+ return NULL;
+}
+
+static int
+do_test (void)
+{
+
+ /* Repeatedly call popen in a loop during the entire test. */
+ pthread_t t = xpthread_create (NULL, popen_and_pclose_forever, NULL);
+
+ /* Repeatedly fork off and reap child processes one-by-one.
+ Each child calls popen once, then exits, leading to the possibility
+ that a child forks *during* our own popen call, thus inheriting any
+ intermediate popen state, possibly including lock state(s). */
+ for (int i = 0; i < 100; i++)
+ {
+ int cpid = xfork ();
+
+ if (cpid == 0)
+ {
+ popen_and_pclose ();
+ _exit (0);
+ }
+ else
+ xwaitpid (cpid, NULL, 0);
+ }
+
+ /* Stop calling popen. */
+ atomic_store_explicit (&done, 1, memory_order_release);
+ xpthread_join (t);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/posix/fork.c b/posix/fork.c
index 298765a1ffd08b75..cf9b80e7c059e748 100644
--- a/posix/fork.c
+++ b/posix/fork.c
@@ -62,6 +62,7 @@ __libc_fork (void)
call_function_static_weak (__nss_database_fork_prepare_parent,
&nss_database_data);
+ _IO_proc_file_chain_lock ();
_IO_list_lock ();
/* Acquire malloc locks. This needs to come last because fork
@@ -92,6 +93,7 @@ __libc_fork (void)
/* Reset locks in the I/O code. */
_IO_list_resetlock ();
+ _IO_proc_file_chain_resetlock ();
call_function_static_weak (__nss_database_fork_subprocess,
&nss_database_data);
@@ -121,6 +123,7 @@ __libc_fork (void)
/* We execute this even if the 'fork' call failed. */
_IO_list_unlock ();
+ _IO_proc_file_chain_unlock ();
}
/* Run the handlers registered for the parent. */

View File

@ -0,0 +1,27 @@
commit 14ec225d859091c048ec54e5c4ddf6738498aee7
Author: Arjun Shankar <arjun@redhat.com>
Date: Fri Oct 25 09:33:45 2024 +0200
libio: Correctly link tst-popen-fork against libpthread
tst-popen-fork failed to build for Hurd due to not being linked with
libpthread. This commit fixes that.
Tested with build-many-glibcs.py for i686-gnu.
Reviewed-by: Florian Weimer <fweimer@redhat.com>
(cherry picked from commit 6a290b2895b77be839fcb7c44a6a9879560097ad)
diff --git a/libio/Makefile b/libio/Makefile
index d1f2342867601735..92d6c6bcab1818d0 100644
--- a/libio/Makefile
+++ b/libio/Makefile
@@ -140,6 +140,8 @@ tests = \
tst_wscanf \
# tests
+$(objpfx)tst-popen-fork: $(shared-thread-library)
+
tests-internal = tst-vtables tst-vtables-interposed
ifeq (yes,$(build-shared))

View File

@ -0,0 +1,34 @@
commit 9fe51d34bbce71d186e7adee74e523ccc64a9727
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Feb 15 03:22:55 2024 -0800
sort-makefile-lines.py: Allow '_' in name and "^# name"
'_' is used in Makefile variable names and many variables end with
"^# name". Relax sort-makefile-lines.py to allow '_' in name and
"^# name" as variable end. This fixes BZ #31385.
(cherry picked from commit 6a2512bf1605a4208dd94ef67408488d8acb2409)
diff --git a/scripts/sort-makefile-lines.py b/scripts/sort-makefile-lines.py
index f65ee40e27fb85ff..b2249aef6d028cf7 100755
--- a/scripts/sort-makefile-lines.py
+++ b/scripts/sort-makefile-lines.py
@@ -129,7 +129,7 @@ def sort_makefile_lines():
for i in range(len(lines)):
# Look for things like "var = \", "var := \" or "var += \"
# to start the sorted list.
- var = re.search(r'^([a-zA-Z0-9-]*) [\+:]?\= \\$', lines[i])
+ var = re.search(r'^([-_a-zA-Z0-9]*) [\+:]?\= \\$', lines[i])
if var:
# Remember the index and the name.
startmarks.append((i, var.group(1)))
@@ -140,7 +140,7 @@ def sort_makefile_lines():
rangemarks = []
for sm in startmarks:
# Look for things like " # var" to end the sorted list.
- reg = r'^ # ' + sm[1] + r'$'
+ reg = r'^ *# ' + sm[1] + r'$'
for j in range(sm[0] + 1, len(lines)):
if re.search(reg, lines[j]):
# Remember the block to sort (inclusive).
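The effect of the relaxed pattern can be reproduced with POSIX extended regular expressions; the sketch below is only an illustration in C (the script itself uses Python's re module) with a simplified version of the pattern:

#include <regex.h>
#include <stdio.h>

int
main (void)
{
  /* Same idea as the new start-of-block pattern: '_' and '-' are both
     allowed in the variable name.  */
  const char *pattern = "^([-_a-zA-Z0-9]*) [+:]?= \\\\$";
  regex_t re;
  if (regcomp (&re, pattern, REG_EXTENDED) != 0)
    return 1;

  const char *lines[] =
    {
      "tests += \\",                /* matched before and after the change */
      "routines_no_fortify += \\",  /* matched only once '_' is allowed */
      "CFLAGS-printf = -O2",        /* not a continued list: never matches */
    };
  for (int i = 0; i < 3; i++)
    printf ("%-28s %s\n", lines[i],
            regexec (&re, lines[i], 0, NULL, 0) == 0 ? "match" : "no match");

  regfree (&re);
  return 0;
}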

View File

@ -0,0 +1,32 @@
commit 37b30b6a685c5facccdff61663eb3adf0dd253cd
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Feb 15 11:12:13 2024 -0800
sysdeps/x86_64/Makefile (tests): Add the end marker
(cherry picked from commit 71d133c500b0d23f6b6a7c6e3595e3fc447bfe91)
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 0ede447405d549b5..08ec882159990e97 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -32,7 +32,8 @@ sysdep_routines += \
# sysdep_routines
gen-as-const-headers += locale-defines.sym
tests += \
- tst-rsi-strlen
+ tst-rsi-strlen \
+# tests
endif
ifeq ($(subdir),elf)
@@ -232,7 +233,8 @@ sysdep_routines += \
# sysdep_routines
tests += \
- tst-rsi-wcslen
+ tst-rsi-wcslen \
+# tests
endif

View File

@ -0,0 +1,39 @@
commit 4e5ee49a432b8569137bdacc302fc696ed37b1bd
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Feb 28 05:46:40 2024 -0800
sysdeps/unix/sysv/linux/x86_64/Makefile: Add the end marker
Add the end marker to tests, tests-container and modules-names.
(cherry picked from commit e6350be7e9cae8f71c96c1f06eab61b9acb227c8)
diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile b/sysdeps/unix/sysv/linux/x86_64/Makefile
index 9a1e7aa6461725af..fcbffd81cbaa031d 100644
--- a/sysdeps/unix/sysv/linux/x86_64/Makefile
+++ b/sysdeps/unix/sysv/linux/x86_64/Makefile
@@ -17,18 +17,21 @@ endif
ifeq ($(subdir),elf)
ifeq (yes,$(enable-x86-isa-level))
tests += \
- tst-glibc-hwcaps-2
+ tst-glibc-hwcaps-2 \
+# tests
ifeq (no,$(build-hardcoded-path-in-tests))
# This is an ld.so.cache test, and RPATH/RUNPATH in the executable
# interferes with its test objectives.
tests-container += \
- tst-glibc-hwcaps-2-cache
+ tst-glibc-hwcaps-2-cache \
+# tests-container
endif
modules-names += \
libx86-64-isa-level-1 \
libx86-64-isa-level-2 \
libx86-64-isa-level-3 \
- libx86-64-isa-level-4
+ libx86-64-isa-level-4 \
+# modules-names
$(objpfx)tst-glibc-hwcaps-2: $(objpfx)libx86-64-isa-level.so

View File

@ -0,0 +1,182 @@
commit 147bed0a71a6c5cbf83d05f4081e923d74a6847e
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu Feb 13 21:56:52 2025 +0100
elf: Keep using minimal malloc after early DTV resize (bug 32412)
If an auditor loads many TLS-using modules during startup, it is
possible to trigger DTV resizing. Previously, the DTV was marked
as allocated by the main malloc afterwards, even if the minimal
malloc was still in use. With this change, _dl_resize_dtv marks
the resized DTV as allocated with the minimal malloc.
The new test reuses TLS-using modules from other auditing tests.
Reviewed-by: DJ Delorie <dj@redhat.com>
(cherry picked from commit aa3d7bd5299b33bffc118aa618b59bfa66059bcb)
diff --git a/elf/Makefile b/elf/Makefile
index 8a5678aa63736812..f2e9cb1075adc8a5 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -376,6 +376,7 @@ tests += \
tst-align3 \
tst-audit-tlsdesc \
tst-audit-tlsdesc-dlopen \
+ tst-audit-tlsdesc-dlopen2 \
tst-audit1 \
tst-audit2 \
tst-audit8 \
@@ -802,6 +803,7 @@ modules-names += \
tst-auditmanymod8 \
tst-auditmanymod9 \
tst-auditmod-tlsdesc \
+ tst-auditmod-tlsdesc2 \
tst-auditmod1 \
tst-auditmod11 \
tst-auditmod12 \
@@ -3012,6 +3014,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
+$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \
+ $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules))
+tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so
$(objpfx)tst-dlmopen-twice.out: \
$(objpfx)tst-dlmopen-twice-mod1.so \
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 3d529b722cb271d9..b13e752358a059a4 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -528,6 +528,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
if (newp == NULL)
oom ();
memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t));
+#ifdef SHARED
+ /* Auditors can trigger a DTV resize event while the full malloc
+ is not yet in use. Mark the new DTV allocation as the
+ initial allocation. */
+ if (!__rtld_malloc_is_complete ())
+ GL(dl_initial_dtv) = &newp[1];
+#endif
}
else
{
diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c
new file mode 100644
index 0000000000000000..7ba2c4129a9bcc53
--- /dev/null
+++ b/elf/tst-audit-tlsdesc-dlopen2.c
@@ -0,0 +1,46 @@
+/* Loading TLS-using modules from auditors (bug 32412). Main program.
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/xdlfcn.h>
+#include <stdio.h>
+
+static int
+do_test (void)
+{
+ puts ("info: start of main program");
+
+ /* Load TLS-using modules, to trigger DTV resizing. The dynamic
+ linker will load them again (requiring their own TLS) because the
+ dlopen calls from the auditor were in the auditing namespace. */
+ for (int i = 1; i <= 19; ++i)
+ {
+ char dso[30];
+ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
+ char sym[30];
+ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
+
+ void *handle = xdlopen (dso, RTLD_LAZY);
+ int (*func) (void) = xdlsym (handle, sym);
+ /* Trigger TLS allocation. */
+ func ();
+ }
+
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c
new file mode 100644
index 0000000000000000..50275cd34d1219c6
--- /dev/null
+++ b/elf/tst-auditmod-tlsdesc2.c
@@ -0,0 +1,59 @@
+/* Loading TLS-using modules from auditors (bug 32412). Audit module.
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <dlfcn.h>
+#include <link.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <unistd.h>
+
+unsigned int
+la_version (unsigned int version)
+{
+ /* Open some modules, to trigger DTV resizing before the switch to
+ the main malloc. */
+ for (int i = 1; i <= 19; ++i)
+ {
+ char dso[30];
+ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
+ char sym[30];
+ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
+
+ void *handle = dlopen (dso, RTLD_LAZY);
+ if (handle == NULL)
+ {
+ printf ("error: dlmopen from auditor: %s\n", dlerror ());
+ fflush (stdout);
+ _exit (1);
+ }
+ int (*func) (void) = dlsym (handle, sym);
+ if (func == NULL)
+ {
+ printf ("error: dlsym from auditor: %s\n", dlerror ());
+ fflush (stdout);
+ _exit (1);
+ }
+ /* Trigger TLS allocation. */
+ func ();
+ }
+
+ puts ("info: TLS-using modules loaded from auditor");
+ fflush (stdout);
+
+ return LAV_CURRENT;
+}
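Each tst-tlsmod17a<N>.so is a tiny shared object with its own TLS variable, so loading nineteen of them from the auditor is enough to force a DTV resize before the main malloc takes over. A simplified sketch of such a module (illustrative names, not the exact test source):

/* Build as a shared object, e.g.: gcc -shared -fPIC -o tlsmod-demo.so tlsmod-demo.c  */

static __thread int call_count;   /* per-thread state: occupies a DTV slot */

int
tlsmod_demo (void)
{
  /* The first call from a thread allocates this module's TLS block and
     fills in the corresponding DTV entry.  */
  return ++call_count;
}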

View File

@ -0,0 +1,57 @@
commit abdeb4b5200e0afb05e6a7863c52d2fbe7029b47
Author: Florian Weimer <fweimer@redhat.com>
Date: Tue May 20 19:36:02 2025 +0200
support: Use const char * argument in support_capture_subprogram_self_sgid
The function does not modify the passed-in string, so make this clear
via the prototype.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit f0c09fe61678df6f7f18fe1ebff074e62fa5ca7a)
diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h
index 1ecbdfe4fc4aa123..f2765278d920839d 100644
--- a/support/capture_subprocess.h
+++ b/support/capture_subprocess.h
@@ -44,8 +44,7 @@ struct support_capture_subprocess support_capture_subprogram
/* Copy the running program into a setgid binary and run it with CHILD_ID
argument. If execution is successful, return the exit status of the child
program, otherwise return a non-zero failure exit code. */
-int support_capture_subprogram_self_sgid
- (char *child_id);
+int support_capture_subprogram_self_sgid (const char *child_id);
/* Deallocate the subprocess data captured by
support_capture_subprocess. */
diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
index ffced8a89fca37a5..eb72a2c21cf99ee2 100644
--- a/support/support_capture_subprocess.c
+++ b/support/support_capture_subprocess.c
@@ -109,7 +109,7 @@ support_capture_subprogram (const char *file, char *const argv[])
safely make it SGID with the TARGET group ID. Then runs the
executable. */
static int
-copy_and_spawn_sgid (char *child_id, gid_t gid)
+copy_and_spawn_sgid (const char *child_id, gid_t gid)
{
char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
test_dir, (intmax_t) getpid ());
@@ -181,7 +181,7 @@ copy_and_spawn_sgid (char *child_id, gid_t gid)
ret = 0;
infd = outfd = -1;
- char * const args[] = {execname, child_id, NULL};
+ char * const args[] = {execname, (char *) child_id, NULL};
status = support_subprogram_wait (args[0], args);
@@ -210,7 +210,7 @@ err:
}
int
-support_capture_subprogram_self_sgid (char *child_id)
+support_capture_subprogram_self_sgid (const char *child_id)
{
gid_t target = 0;
const int count = 64;
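The (char *) child_id cast above is the usual way to hand a const string to an execv-style char *const argv[] array; a minimal illustration outside the test harness:

#include <stdio.h>
#include <unistd.h>

/* execv-style interfaces take 'char *const argv[]' even though they do
   not modify the strings, so const pointers need an explicit cast.  */
static int
run_with_id (const char *prog, const char *child_id)
{
  char *const args[] = { (char *) prog, (char *) child_id, NULL };
  return execv (prog, args);    /* returns only on failure */
}

int
main (void)
{
  run_with_id ("/bin/echo", "run-as-child");
  perror ("execv");
  return 1;
}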

View File

@ -0,0 +1,43 @@
commit 71ddb11ccd76843cec6e793977218e227fe51c07
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Dec 23 13:57:55 2024 +0100
support: Add support_record_failure_barrier
This can be used to stop execution after a TEST_COMPARE_BLOB
failure, for example.
(cherry picked from commit d0b8aa6de4529231fadfe604ac2c434e559c2d9e)
diff --git a/support/check.h b/support/check.h
index 7ea22c7a2cba5cfd..8f41e5b99fc17472 100644
--- a/support/check.h
+++ b/support/check.h
@@ -207,6 +207,9 @@ void support_record_failure_reset (void);
failures or not. */
int support_record_failure_is_failed (void);
+/* Terminate the process if any failures have been encountered so far. */
+void support_record_failure_barrier (void);
+
__END_DECLS
#endif /* SUPPORT_CHECK_H */
diff --git a/support/support_record_failure.c b/support/support_record_failure.c
index 978123701d128795..72ee2b232fb2b08c 100644
--- a/support/support_record_failure.c
+++ b/support/support_record_failure.c
@@ -112,3 +112,13 @@ support_record_failure_is_failed (void)
synchronization for reliable test error reporting anyway. */
return __atomic_load_n (&state->failed, __ATOMIC_RELAXED);
}
+
+void
+support_record_failure_barrier (void)
+{
+ if (__atomic_load_n (&state->failed, __ATOMIC_RELAXED))
+ {
+ puts ("error: exiting due to previous errors");
+ exit (1);
+ }
+}
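Intended usage in a glibc test, sketched from the description above (these are glibc-internal support/ headers, not a public API):

#include <support/check.h>

static int
do_test (void)
{
  TEST_COMPARE (2 + 2, 4);

  /* If any check so far has failed, stop here instead of running later
     steps that depend on the state set up above.  */
  support_record_failure_barrier ();

  /* ... further checks that assume the earlier ones passed ...  */
  return 0;
}

#include <support/test-driver.c>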

View File

@ -0,0 +1,155 @@
commit ca99d55315b80277a7b189f5a9630f5b08ccaa6d
Author: Florian Weimer <fweimer@redhat.com>
Date: Tue May 20 19:45:06 2025 +0200
elf: Test case for bug 32976 (CVE-2025-4802)
Check that LD_LIBRARY_PATH is ignored for AT_SECURE statically
linked binaries, using support_capture_subprogram_self_sgid.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit d8f7a79335b0d861c12c42aec94c04cd5bb181e2)
diff --git a/elf/Makefile b/elf/Makefile
index f2e9cb1075adc8a5..51d52b57876fc5ba 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -266,6 +266,7 @@ tests-static-normal := \
tst-array1-static \
tst-array5-static \
tst-dl-iter-static \
+ tst-dlopen-sgid \
tst-dst-static \
tst-env-setuid-static \
tst-getauxval-static \
@@ -844,6 +845,7 @@ modules-names += \
tst-dlmopen-twice-mod1 \
tst-dlmopen-twice-mod2 \
tst-dlmopen1mod \
+ tst-dlopen-sgid-mod \
tst-dlopen-tlsreinitmod1 \
tst-dlopen-tlsreinitmod2 \
tst-dlopen-tlsreinitmod3 \
@@ -3125,3 +3127,5 @@ $(objpfx)tst-dlopen-tlsreinit3.out: $(objpfx)tst-auditmod1.so
tst-dlopen-tlsreinit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
$(objpfx)tst-dlopen-tlsreinit4.out: $(objpfx)tst-auditmod1.so
tst-dlopen-tlsreinit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
+
+$(objpfx)tst-dlopen-sgid.out: $(objpfx)tst-dlopen-sgid-mod.so
diff --git a/elf/tst-dlopen-sgid-mod.c b/elf/tst-dlopen-sgid-mod.c
new file mode 100644
index 0000000000000000..5eb79eef485da4c9
--- /dev/null
+++ b/elf/tst-dlopen-sgid-mod.c
@@ -0,0 +1 @@
+/* Opening this object should not succeed. */
diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c
new file mode 100644
index 0000000000000000..47829a405e90b6b9
--- /dev/null
+++ b/elf/tst-dlopen-sgid.c
@@ -0,0 +1,104 @@
+/* Test case for ignored LD_LIBRARY_PATH in static startup (bug 32976).
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <dlfcn.h>
+#include <gnu/lib-names.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/capture_subprocess.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/temp_file.h>
+#include <unistd.h>
+
+/* This is the name of our test object. Use a custom module for
+ testing, so that this object does not get picked up from the system
+ path. */
+static const char dso_name[] = "tst-dlopen-sgid-mod.so";
+
+/* Used to mark the recursive invocation. */
+static const char magic_argument[] = "run-actual-test";
+
+static int
+do_test (void)
+{
+/* Pathname of the directory that receives the shared objects this
+ test attempts to load. */
+ char *libdir = support_create_temp_directory ("tst-dlopen-sgid-");
+
+ /* This is supposed to be ignored and stripped. */
+ TEST_COMPARE (setenv ("LD_LIBRARY_PATH", libdir, 1), 0);
+
+ /* Copy of libc.so.6. */
+ {
+ char *from = xasprintf ("%s/%s", support_objdir_root, LIBC_SO);
+ char *to = xasprintf ("%s/%s", libdir, LIBC_SO);
+ add_temp_file (to);
+ support_copy_file (from, to);
+ free (to);
+ free (from);
+ }
+
+ /* Copy of the test object. */
+ {
+ char *from = xasprintf ("%s/elf/%s", support_objdir_root, dso_name);
+ char *to = xasprintf ("%s/%s", libdir, dso_name);
+ add_temp_file (to);
+ support_copy_file (from, to);
+ free (to);
+ free (from);
+ }
+
+ TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0);
+
+ free (libdir);
+
+ return 0;
+}
+
+static void
+alternative_main (int argc, char **argv)
+{
+ if (argc == 2 && strcmp (argv[1], magic_argument) == 0)
+ {
+ if (getgid () == getegid ())
+ /* This can happen if the file system is mounted nosuid. */
+ FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n",
+ (intmax_t) getgid ());
+
+ /* Should be removed due to SGID. */
+ TEST_COMPARE_STRING (getenv ("LD_LIBRARY_PATH"), NULL);
+
+ TEST_VERIFY (dlopen (dso_name, RTLD_NOW) == NULL);
+ {
+ const char *message = dlerror ();
+ TEST_COMPARE_STRING (message,
+ "tst-dlopen-sgid-mod.so:"
+ " cannot open shared object file:"
+ " No such file or directory");
+ }
+
+ support_record_failure_barrier ();
+ exit (EXIT_SUCCESS);
+ }
+}
+
+#define PREPARE alternative_main
+#include <support/test-driver.c>


@ -0,0 +1,33 @@
commit 9e25c0f445606e809996329b8a21d3342529474d
Author: Sunil K Pandey <sunil.k.pandey@intel.com>
Date: Tue May 20 10:07:27 2025 -0700
x86_64: Fix typo in ifunc-impl-list.c.
Fix wcsncpy and wcpncpy typo in ifunc-impl-list.c.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit f2aeb6ff941dccc4c777b5621e77addea6cc076c)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c4a21d4b7ca8f01a..c34c94cb58394b56 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -928,7 +928,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsncpy_avx2)
- X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
+ X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
1,
__wcsncpy_generic))
@@ -958,7 +958,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (BMI2)),
__wcpncpy_avx2)
- X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
+ X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
1,
__wcpncpy_generic))


@ -0,0 +1,43 @@
commit 2caef2827f76af88d495eb382da174896d08900a
Author: Florian Weimer <fweimer@redhat.com>
Date: Wed May 21 08:43:32 2025 +0200
elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987)
This should really move into support_capture_subprogram_self_sgid.
Reviewed-by: Sam James <sam@gentoo.org>
(cherry picked from commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2)
diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c
index 47829a405e90b6b9..5688b79f2e870b1d 100644
--- a/elf/tst-dlopen-sgid.c
+++ b/elf/tst-dlopen-sgid.c
@@ -26,6 +26,8 @@
#include <support/check.h>
#include <support/support.h>
#include <support/temp_file.h>
+#include <support/test-driver.h>
+#include <sys/wait.h>
#include <unistd.h>
/* This is the name of our test object. Use a custom module for
@@ -66,10 +68,16 @@ do_test (void)
free (from);
}
- TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0);
-
free (libdir);
+ int status = support_capture_subprogram_self_sgid (magic_argument);
+
+ if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
+ return EXIT_UNSUPPORTED;
+
+ if (!WIFEXITED (status))
+ FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
+
return 0;
}


@ -0,0 +1,215 @@
commit c6240a11f7325031651e634309ca1a43a7484bd4
Author: Carlos O'Donell <carlos@redhat.com>
Date: Wed Jun 11 09:43:50 2025 -0400
ppc64le: Revert "powerpc: Fix performance issues of strcmp power10" (CVE-2025-5702)
This reverts commit 90bcc8721ef82b7378d2b080141228660e862d56
This change is in the chain of the final revert that fixes the CVE
i.e. 3367d8e180848030d1646f088759f02b8dfe0d6f
Reason for revert: Power10 strcmp clobbers non-volatile vector
registers (Bug 33056)
Tested on ppc64le with no regressions.
(cherry picked from commit c22de63588df7a8a0edceea9bb02534064c9d201)
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
index f0d6732a25efc63b..00f1e9c1707f5dd1 100644
--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
@@ -62,7 +62,7 @@
lxvl 32+v5,reg2,r0; \
add reg1,reg1,len_reg; \
add reg2,reg2,len_reg; \
- vcmpnezb v7,v4,v5; \
+ vcmpnezb. v7,v4,v5; \
vctzlsbb r6,v7; \
cmpld cr7,r6,len_reg; \
blt cr7,L(different); \
@@ -72,110 +72,70 @@
.machine power9
ENTRY_TOCLESS (STRCMP, 4)
- andi. r7,r3,4095
- andi. r8,r4,4095
- cmpldi cr0,r7,4096-16
- cmpldi cr1,r8,4096-16
- bgt cr0,L(crosses)
- bgt cr1,L(crosses)
- COMPARE_16(v4,v5,0)
-
-L(crosses):
- andi. r7,r3,15
- subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */
- andi. r9,r4,15
- subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). */
- cmpld cr7,r7,r5
- beq cr7,L(same_aligned)
- blt cr7,L(nalign1_min)
+ li r11,16
+ /* eq bit of cr1 used as swap status flag to indicate if
+ source pointers were swapped. */
+ crclr 4*cr1+eq
+ vspltisb v19,-1
+ andi. r7,r3,15
+ sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */
+ andi. r9,r4,15
+ sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */
+ cmpld cr7,r7,r5
+ beq cr7,L(same_aligned)
+ blt cr7,L(nalign1_min)
+ /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the
+ pointer which is closer to the next 16B boundary so that only
+ one CHECK_N_BYTES is needed before entering the loop below. */
+ mr r8,r4
+ mr r4,r3
+ mr r3,r8
+ mr r12,r7
+ mr r7,r5
+ mr r5,r12
+ crset 4*cr1+eq /* Set bit on swapping source pointers. */
- /* nalign2 is minimum and s2 pointer is aligned. */
- CHECK_N_BYTES(r3,r4,r5)
- /* Are we on the 64B hunk which crosses a page? */
- andi. r10,r3,63 /* Determine offset into 64B hunk. */
- andi. r8,r3,15 /* The offset into the 16B hunk. */
- neg r7,r3
- andi. r9,r7,15 /* Number of bytes after a 16B cross. */
- rlwinm. r7,r7,26,0x3F /* ((r3-4096))>>6&63. */
- beq L(compare_64_pagecross)
- mtctr r7
- b L(compare_64B_unaligned)
-
- /* nalign1 is minimum and s1 pointer is aligned. */
+ .p2align 5
L(nalign1_min):
CHECK_N_BYTES(r3,r4,r7)
- /* Are we on the 64B hunk which crosses a page? */
- andi. r10,r4,63 /* Determine offset into 64B hunk. */
- andi. r8,r4,15 /* The offset into the 16B hunk. */
- neg r7,r4
- andi. r9,r7,15 /* Number of bytes after a 16B cross. */
- rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */
- beq L(compare_64_pagecross)
- mtctr r7
.p2align 5
-L(compare_64B_unaligned):
- COMPARE_16(v4,v5,0)
- COMPARE_16(v4,v5,16)
- COMPARE_16(v4,v5,32)
- COMPARE_16(v4,v5,48)
- addi r3,r3,64
- addi r4,r4,64
- bdnz L(compare_64B_unaligned)
+L(s1_aligned):
+ /* r9 and r5 is number of bytes to be read after and before
+ page boundary correspondingly. */
+ sub r5,r5,r7
+ subfic r9,r5,16
+ /* Now let r7 hold the count of quadwords which can be
+ checked without crossing a page boundary. quadword offset is
+ (str2>>4)&0xFF. */
+ rlwinm r7,r4,28,0xFF
+ /* Below check is required only for first iteration. For second
+ iteration and beyond, the new loop counter is always 255. */
+ cmpldi r7,255
+ beq L(L3)
+ /* Get the initial loop count by 255-((str2>>4)&0xFF). */
+ subfic r11,r7,255
- /* Cross the page boundary of s2, carefully. Only for first
- iteration we have to get the count of 64B blocks to be checked.
- From second iteration and beyond, loop counter is always 63. */
-L(compare_64_pagecross):
- li r11, 63
+ .p2align 5
+L(L1):
mtctr r11
- cmpldi r10,16
- ble L(cross_4)
- cmpldi r10,32
- ble L(cross_3)
- cmpldi r10,48
- ble L(cross_2)
-L(cross_1):
- CHECK_N_BYTES(r3,r4,r9)
- CHECK_N_BYTES(r3,r4,r8)
- COMPARE_16(v4,v5,0)
- COMPARE_16(v4,v5,16)
- COMPARE_16(v4,v5,32)
- addi r3,r3,48
- addi r4,r4,48
- b L(compare_64B_unaligned)
-L(cross_2):
- COMPARE_16(v4,v5,0)
- addi r3,r3,16
- addi r4,r4,16
- CHECK_N_BYTES(r3,r4,r9)
- CHECK_N_BYTES(r3,r4,r8)
- COMPARE_16(v4,v5,0)
- COMPARE_16(v4,v5,16)
- addi r3,r3,32
- addi r4,r4,32
- b L(compare_64B_unaligned)
-L(cross_3):
- COMPARE_16(v4,v5,0)
- COMPARE_16(v4,v5,16)
- addi r3,r3,32
- addi r4,r4,32
- CHECK_N_BYTES(r3,r4,r9)
- CHECK_N_BYTES(r3,r4,r8)
- COMPARE_16(v4,v5,0)
+
+ .p2align 5
+L(L2):
+ COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */
addi r3,r3,16
addi r4,r4,16
- b L(compare_64B_unaligned)
-L(cross_4):
- COMPARE_16(v4,v5,0)
- COMPARE_16(v4,v5,16)
- COMPARE_16(v4,v5,32)
- addi r3,r3,48
- addi r4,r4,48
+ bdnz L(L2)
+ /* Cross the page boundary of s2, carefully. */
+
+ .p2align 5
+L(L3):
+ CHECK_N_BYTES(r3,r4,r5)
CHECK_N_BYTES(r3,r4,r9)
- CHECK_N_BYTES(r3,r4,r8)
- b L(compare_64B_unaligned)
+ li r11,255 /* Load the new loop counter. */
+ b L(L1)
+ .p2align 5
L(same_aligned):
CHECK_N_BYTES(r3,r4,r7)
/* Align s1 to 32B and adjust s2 address.
@@ -208,7 +168,18 @@ L(16B_aligned_loop):
/* Calculate and return the difference. */
L(different):
- TAIL(v4,v5)
+ vctzlsbb r6,v7
+ vextubrx r5,r6,v4
+ vextubrx r4,r6,v5
+ bt 4*cr1+eq,L(swapped)
+ subf r3,r4,r5
+ blr
+
+ /* If src pointers were swapped, then swap the
+ indices and calculate the return value. */
+L(swapped):
+ subf r3,r5,r4
+ blr
.p2align 5
L(32B_aligned_loop):


@ -0,0 +1,443 @@
commit 3875045da55e3df9b2a05392504888b88cd68edb
Author: Carlos O'Donell <carlos@redhat.com>
Date: Wed Jun 11 09:33:45 2025 -0400
ppc64le: Revert "powerpc : Add optimized memchr for POWER10" (Bug 33059)
This reverts commit b9182c793caa05df5d697427c0538936e6396d4b
Reason for revert: Power10 memchr clobbers v20 vector register
(Bug 33059)
This is not a security issue, unlike CVE-2025-5745 and
CVE-2025-5702.
Tested on ppc64le without regression.
(cherry picked from commit a7877bb6685300f159fa095c9f50b22b112cddb8)
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S
deleted file mode 100644
index 53e5716d72e133d5..0000000000000000
--- a/sysdeps/powerpc/powerpc64/le/power10/memchr.S
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Optimized memchr implementation for POWER10 LE.
- Copyright (C) 2021-2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-# ifndef MEMCHR
-# define MEMCHR __memchr
-# endif
-# define M_VREG_ZERO v20
-# define M_OFF_START_LOOP 256
-# define MEMCHR_SUBTRACT_VECTORS \
- vsububm v4,v4,v18; \
- vsububm v5,v5,v18; \
- vsububm v6,v6,v18; \
- vsububm v7,v7,v18;
-# define M_TAIL(vreg,increment) \
- vctzlsbb r4,vreg; \
- cmpld r5,r4; \
- ble L(null); \
- addi r4,r4,increment; \
- add r3,r6,r4; \
- blr
-
-/* TODO: Replace macros by the actual instructions when minimum binutils becomes
- >= 2.35. This is used to keep compatibility with older versions. */
-#define M_VEXTRACTBM(rt,vrb) \
- .long(((4)<<(32-6)) \
- | ((rt)<<(32-11)) \
- | ((8)<<(32-16)) \
- | ((vrb)<<(32-21)) \
- | 1602)
-
-#define M_LXVP(xtp,dq,ra) \
- .long(((6)<<(32-6)) \
- | ((((xtp)-32)>>1)<<(32-10)) \
- | ((1)<<(32-11)) \
- | ((ra)<<(32-16)) \
- | dq)
-
-#define CHECK16B(vreg,offset,addr,label) \
- lxv vreg+32,offset(addr); \
- vcmpequb. vreg,vreg,v18; \
- bne cr6,L(label); \
- cmpldi r5,16; \
- ble L(null); \
- addi r5,r5,-16;
-
-/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has #
- of bytes already checked. */
-#define CHECK64B(offset,addr,label) \
- M_LXVP(v4+32,offset,addr); \
- M_LXVP(v6+32,offset+32,addr); \
- MEMCHR_SUBTRACT_VECTORS; \
- vminub v14,v4,v5; \
- vminub v15,v6,v7; \
- vminub v16,v14,v15; \
- vcmpequb. v0,v16,M_VREG_ZERO; \
- beq cr6,$+12; \
- li r7,offset; \
- b L(label); \
- cmpldi r5,64; \
- ble L(null); \
- addi r5,r5,-64
-
-/* Implements the function
- void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). */
-
- .machine power9
-
-ENTRY_TOCLESS (MEMCHR)
- CALL_MCOUNT 3
-
- cmpldi r5,0
- beq L(null)
- mr r0,r5
- xori r6,r4,0xff
-
- mtvsrd v18+32,r4 /* matching char in v18 */
- mtvsrd v19+32,r6 /* non matching char in v19 */
-
- vspltb v18,v18,7 /* replicate */
- vspltb v19,v19,7 /* replicate */
- vspltisb M_VREG_ZERO,0
-
- /* Next 16B-aligned address. Prepare address for L(aligned). */
- addi r6,r3,16
- clrrdi r6,r6,4
-
- /* Align data and fill bytes not loaded with non matching char. */
- lvx v0,0,r3
- lvsr v1,0,r3
- vperm v0,v19,v0,v1
-
- vcmpequb. v6,v0,v18
- bne cr6,L(found)
- sub r4,r6,r3
- cmpld r5,r4
- ble L(null)
- sub r5,r5,r4
-
- /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is
- optimized for longer strings, so checking the first bytes in 16B
- chunks benefits a lot small strings. */
- .p2align 5
-L(aligned):
- cmpldi r5,0
- beq L(null)
-
- CHECK16B(v0,0,r6,tail1)
- CHECK16B(v1,16,r6,tail2)
- CHECK16B(v2,32,r6,tail3)
- CHECK16B(v3,48,r6,tail4)
- CHECK16B(v4,64,r6,tail5)
- CHECK16B(v5,80,r6,tail6)
- CHECK16B(v6,96,r6,tail7)
- CHECK16B(v7,112,r6,tail8)
- CHECK16B(v8,128,r6,tail9)
- CHECK16B(v9,144,r6,tail10)
- CHECK16B(v10,160,r6,tail11)
- CHECK16B(v0,176,r6,tail12)
- CHECK16B(v1,192,r6,tail13)
- CHECK16B(v2,208,r6,tail14)
- CHECK16B(v3,224,r6,tail15)
-
- cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to
- choose how we will perform the main loop. */
-
- /* Prepare address for the loop. */
- addi r4,r3,M_OFF_START_LOOP
- clrrdi r4,r4,6
- sub r6,r4,r3
- sub r5,r0,r6
- addi r6,r4,128
-
- /* If c == 0, use the loop without the vsububm. */
- beq cr5,L(loop)
-
- /* This is very similar to the block after L(loop), the difference is
- that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
- each byte loaded by the char we are looking for, this way we can keep
- using vminub to merge the results and checking for nulls. */
- .p2align 5
-L(memchr_loop):
- CHECK64B(0,r4,pre_tail_64b)
- CHECK64B(64,r4,pre_tail_64b)
- addi r4,r4,256
-
- CHECK64B(0,r6,tail_64b)
- CHECK64B(64,r6,tail_64b)
- addi r6,r6,256
-
- CHECK64B(0,r4,pre_tail_64b)
- CHECK64B(64,r4,pre_tail_64b)
- addi r4,r4,256
-
- CHECK64B(0,r6,tail_64b)
- CHECK64B(64,r6,tail_64b)
- addi r6,r6,256
-
- b L(memchr_loop)
- /* Switch to a more aggressive approach checking 64B each time. Use 2
- pointers 128B apart and unroll the loop once to make the pointer
- updates and usages separated enough to avoid stalls waiting for
- address calculation. */
- .p2align 5
-L(loop):
-#undef MEMCHR_SUBTRACT_VECTORS
-#define MEMCHR_SUBTRACT_VECTORS /* nothing */
- CHECK64B(0,r4,pre_tail_64b)
- CHECK64B(64,r4,pre_tail_64b)
- addi r4,r4,256
-
- CHECK64B(0,r6,tail_64b)
- CHECK64B(64,r6,tail_64b)
- addi r6,r6,256
-
- CHECK64B(0,r4,pre_tail_64b)
- CHECK64B(64,r4,pre_tail_64b)
- addi r4,r4,256
-
- CHECK64B(0,r6,tail_64b)
- CHECK64B(64,r6,tail_64b)
- addi r6,r6,256
-
- b L(loop)
-
- .p2align 5
-L(pre_tail_64b):
- mr r6,r4
-L(tail_64b):
- /* OK, we found a null byte. Let's look for it in the current 64-byte
- block and mark it in its corresponding VR. lxvp vx,0(ry) puts the
- low 16B bytes into vx+1, and the high into vx, so the order here is
- v5, v4, v7, v6. */
- vcmpequb v1,v5,M_VREG_ZERO
- vcmpequb v2,v4,M_VREG_ZERO
- vcmpequb v3,v7,M_VREG_ZERO
- vcmpequb v4,v6,M_VREG_ZERO
-
- /* Take into account the other 64B blocks we had already checked. */
- add r6,r6,r7
- /* Extract first bit of each byte. */
- M_VEXTRACTBM(r8,v1)
- M_VEXTRACTBM(r9,v2)
- M_VEXTRACTBM(r10,v3)
- M_VEXTRACTBM(r11,v4)
-
- /* Shift each value into their corresponding position. */
- sldi r9,r9,16
- sldi r10,r10,32
- sldi r11,r11,48
-
- /* Merge the results. */
- or r8,r8,r9
- or r9,r10,r11
- or r11,r9,r8
-
- cnttzd r0,r11 /* Count trailing zeros before the match. */
- cmpld r5,r0
- ble L(null)
- add r3,r6,r0 /* Compute final address. */
- blr
-
- .p2align 5
-L(tail1):
- M_TAIL(v0,0)
-
- .p2align 5
-L(tail2):
- M_TAIL(v1,16)
-
- .p2align 5
-L(tail3):
- M_TAIL(v2,32)
-
- .p2align 5
-L(tail4):
- M_TAIL(v3,48)
-
- .p2align 5
-L(tail5):
- M_TAIL(v4,64)
-
- .p2align 5
-L(tail6):
- M_TAIL(v5,80)
-
- .p2align 5
-L(tail7):
- M_TAIL(v6,96)
-
- .p2align 5
-L(tail8):
- M_TAIL(v7,112)
-
- .p2align 5
-L(tail9):
- M_TAIL(v8,128)
-
- .p2align 5
-L(tail10):
- M_TAIL(v9,144)
-
- .p2align 5
-L(tail11):
- M_TAIL(v10,160)
-
- .p2align 5
-L(tail12):
- M_TAIL(v0,176)
-
- .p2align 5
-L(tail13):
- M_TAIL(v1,192)
-
- .p2align 5
-L(tail14):
- M_TAIL(v2,208)
-
- .p2align 5
-L(tail15):
- M_TAIL(v3,224)
-
- .p2align 5
-L(found):
- vctzlsbb r7,v6
- cmpld r5,r7
- ble L(null)
- add r3,r3,r7
- blr
-
- .p2align 5
-L(null):
- li r3,0
- blr
-
-END (MEMCHR)
-
-weak_alias (__memchr, memchr)
-libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 594fbb8058569d95..d7824a922b0de470 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -31,10 +31,10 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
strncase-power8
ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \
- memmove-power10 memset-power10 rawmemchr-power9 \
- rawmemchr-power10 strcmp-power9 strcmp-power10 \
- strncmp-power9 strcpy-power9 stpcpy-power9 \
+sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
+ rawmemchr-power9 rawmemchr-power10 \
+ strcmp-power9 strcmp-power10 strncmp-power9 \
+ strcpy-power9 stpcpy-power9 \
strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 5b2d6a90ab59e561..e2f733eb82fa6199 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -226,12 +226,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. */
IFUNC_IMPL (i, name, memchr,
-#ifdef __LITTLE_ENDIAN__
- IFUNC_IMPL_ADD (array, i, memchr,
- hwcap2 & PPC_FEATURE2_ARCH_3_1
- && hwcap & PPC_FEATURE_HAS_VSX,
- __memchr_power10)
-#endif
IFUNC_IMPL_ADD (array, i, memchr,
hwcap2 & PPC_FEATURE2_ARCH_2_07
&& hwcap & PPC_FEATURE_HAS_ALTIVEC,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S
deleted file mode 100644
index 7d35ef28a91255ba..0000000000000000
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Optimized memchr implementation for POWER10/PPC64.
- Copyright (C) 2016-2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
-#define MEMCHR __memchr_power10
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-#undef weak_alias
-#define weak_alias(name,alias)
-
-#include <sysdeps/powerpc/powerpc64/le/power10/memchr.S>
-#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
index 57d23e7b18587e82..b4655dfcaa482774 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
@@ -25,23 +25,15 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden;
extern __typeof (__memchr) __memchr_power7 attribute_hidden;
extern __typeof (__memchr) __memchr_power8 attribute_hidden;
-# ifdef __LITTLE_ENDIAN__
-extern __typeof (__memchr) __memchr_power10 attribute_hidden;
-# endif
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__memchr,
-# ifdef __LITTLE_ENDIAN__
- (hwcap2 & PPC_FEATURE2_ARCH_3_1
- && hwcap & PPC_FEATURE_HAS_VSX)
- ? __memchr_power10 :
-# endif
- (hwcap2 & PPC_FEATURE2_ARCH_2_07
- && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- ? __memchr_power8 :
- (hwcap & PPC_FEATURE_ARCH_2_06)
- ? __memchr_power7
- : __memchr_ppc);
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
+ ? __memchr_power8 :
+ (hwcap & PPC_FEATURE_ARCH_2_06)
+ ? __memchr_power7
+ : __memchr_ppc);
weak_alias (__memchr, memchr)
libc_hidden_builtin_def (memchr)


@ -0,0 +1,307 @@
commit 06a70769fd0b2e1f2a3085ad50ab620282bd77b3
Author: Carlos O'Donell <carlos@redhat.com>
Date: Mon Jun 16 13:09:57 2025 -0400
ppc64le: Revert "powerpc: Optimized strcmp for power10" (CVE-2025-5702)
This reverts commit 3367d8e180848030d1646f088759f02b8dfe0d6f
Reason for revert: Power10 strcmp clobbers non-volatile vector
registers (Bug 33056)
Tested on ppc64le without regression.
(cherry picked from commit 15808c77b35319e67ee0dc8f984a9a1a434701bc)
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
deleted file mode 100644
index 00f1e9c1707f5dd1..0000000000000000
--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Optimized strcmp implementation for PowerPC64/POWER10.
- Copyright (C) 2021-2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-
-#ifndef STRCMP
-# define STRCMP strcmp
-#endif
-
-/* Implements the function
- int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). */
-
-/* TODO: Change this to actual instructions when minimum binutils is upgraded
- to 2.27. Macros are defined below for these newer instructions in order
- to maintain compatibility. */
-
-#define LXVP(xtp,dq,ra) \
- .long(((6)<<(32-6)) \
- | ((((xtp)-32)>>1)<<(32-10)) \
- | ((1)<<(32-11)) \
- | ((ra)<<(32-16)) \
- | dq)
-
-#define COMPARE_16(vreg1,vreg2,offset) \
- lxv vreg1+32,offset(r3); \
- lxv vreg2+32,offset(r4); \
- vcmpnezb. v7,vreg1,vreg2; \
- bne cr6,L(different); \
-
-#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \
- LXVP(vreg1+32,offset,r3); \
- LXVP(vreg2+32,offset,r4); \
- vcmpnezb. v7,vreg1+1,vreg2+1; \
- bne cr6,L(label1); \
- vcmpnezb. v7,vreg1,vreg2; \
- bne cr6,L(label2); \
-
-#define TAIL(vreg1,vreg2) \
- vctzlsbb r6,v7; \
- vextubrx r5,r6,vreg1; \
- vextubrx r4,r6,vreg2; \
- subf r3,r4,r5; \
- blr; \
-
-#define CHECK_N_BYTES(reg1,reg2,len_reg) \
- sldi r0,len_reg,56; \
- lxvl 32+v4,reg1,r0; \
- lxvl 32+v5,reg2,r0; \
- add reg1,reg1,len_reg; \
- add reg2,reg2,len_reg; \
- vcmpnezb. v7,v4,v5; \
- vctzlsbb r6,v7; \
- cmpld cr7,r6,len_reg; \
- blt cr7,L(different); \
-
- /* TODO: change this to .machine power10 when the minimum required
- binutils allows it. */
-
- .machine power9
-ENTRY_TOCLESS (STRCMP, 4)
- li r11,16
- /* eq bit of cr1 used as swap status flag to indicate if
- source pointers were swapped. */
- crclr 4*cr1+eq
- vspltisb v19,-1
- andi. r7,r3,15
- sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */
- andi. r9,r4,15
- sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */
- cmpld cr7,r7,r5
- beq cr7,L(same_aligned)
- blt cr7,L(nalign1_min)
- /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the
- pointer which is closer to the next 16B boundary so that only
- one CHECK_N_BYTES is needed before entering the loop below. */
- mr r8,r4
- mr r4,r3
- mr r3,r8
- mr r12,r7
- mr r7,r5
- mr r5,r12
- crset 4*cr1+eq /* Set bit on swapping source pointers. */
-
- .p2align 5
-L(nalign1_min):
- CHECK_N_BYTES(r3,r4,r7)
-
- .p2align 5
-L(s1_aligned):
- /* r9 and r5 is number of bytes to be read after and before
- page boundary correspondingly. */
- sub r5,r5,r7
- subfic r9,r5,16
- /* Now let r7 hold the count of quadwords which can be
- checked without crossing a page boundary. quadword offset is
- (str2>>4)&0xFF. */
- rlwinm r7,r4,28,0xFF
- /* Below check is required only for first iteration. For second
- iteration and beyond, the new loop counter is always 255. */
- cmpldi r7,255
- beq L(L3)
- /* Get the initial loop count by 255-((str2>>4)&0xFF). */
- subfic r11,r7,255
-
- .p2align 5
-L(L1):
- mtctr r11
-
- .p2align 5
-L(L2):
- COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */
- addi r3,r3,16
- addi r4,r4,16
- bdnz L(L2)
- /* Cross the page boundary of s2, carefully. */
-
- .p2align 5
-L(L3):
- CHECK_N_BYTES(r3,r4,r5)
- CHECK_N_BYTES(r3,r4,r9)
- li r11,255 /* Load the new loop counter. */
- b L(L1)
-
- .p2align 5
-L(same_aligned):
- CHECK_N_BYTES(r3,r4,r7)
- /* Align s1 to 32B and adjust s2 address.
- Use lxvp only if both s1 and s2 are 32B aligned. */
- COMPARE_16(v4,v5,0)
- COMPARE_16(v4,v5,16)
- COMPARE_16(v4,v5,32)
- COMPARE_16(v4,v5,48)
- addi r3,r3,64
- addi r4,r4,64
- COMPARE_16(v4,v5,0)
- COMPARE_16(v4,v5,16)
-
- clrldi r6,r3,59
- subfic r5,r6,32
- add r3,r3,r5
- add r4,r4,r5
- andi. r5,r4,0x1F
- beq cr0,L(32B_aligned_loop)
-
- .p2align 5
-L(16B_aligned_loop):
- COMPARE_16(v4,v5,0)
- COMPARE_16(v4,v5,16)
- COMPARE_16(v4,v5,32)
- COMPARE_16(v4,v5,48)
- addi r3,r3,64
- addi r4,r4,64
- b L(16B_aligned_loop)
-
- /* Calculate and return the difference. */
-L(different):
- vctzlsbb r6,v7
- vextubrx r5,r6,v4
- vextubrx r4,r6,v5
- bt 4*cr1+eq,L(swapped)
- subf r3,r4,r5
- blr
-
- /* If src pointers were swapped, then swap the
- indices and calculate the return value. */
-L(swapped):
- subf r3,r5,r4
- blr
-
- .p2align 5
-L(32B_aligned_loop):
- COMPARE_32(v14,v16,0,tail1,tail2)
- COMPARE_32(v18,v20,32,tail3,tail4)
- COMPARE_32(v22,v24,64,tail5,tail6)
- COMPARE_32(v26,v28,96,tail7,tail8)
- addi r3,r3,128
- addi r4,r4,128
- b L(32B_aligned_loop)
-
-L(tail1): TAIL(v15,v17)
-L(tail2): TAIL(v14,v16)
-L(tail3): TAIL(v19,v21)
-L(tail4): TAIL(v18,v20)
-L(tail5): TAIL(v23,v25)
-L(tail6): TAIL(v22,v24)
-L(tail7): TAIL(v27,v29)
-L(tail8): TAIL(v26,v28)
-
-END (STRCMP)
-libc_hidden_builtin_def (strcmp)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index d7824a922b0de470..27d8495503a5a1fe 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,8 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
rawmemchr-power9 rawmemchr-power10 \
- strcmp-power9 strcmp-power10 strncmp-power9 \
- strcpy-power9 stpcpy-power9 \
+ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index e2f733eb82fa6199..ad6080f1991f4080 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -377,10 +377,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */
IFUNC_IMPL (i, name, strcmp,
#ifdef __LITTLE_ENDIAN__
- IFUNC_IMPL_ADD (array, i, strcmp,
- (hwcap2 & PPC_FEATURE2_ARCH_3_1)
- && (hwcap & PPC_FEATURE_HAS_VSX),
- __strcmp_power10)
IFUNC_IMPL_ADD (array, i, strcmp,
hwcap2 & PPC_FEATURE2_ARCH_3_00
&& hwcap & PPC_FEATURE_HAS_ALTIVEC,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
deleted file mode 100644
index 1a9f6069f589a95c..0000000000000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Optimized strcmp implementation for POWER10/PPC64.
- Copyright (C) 2021-2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
-#define STRCMP __strcmp_power10
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#include <sysdeps/powerpc/powerpc64/le/power10/strcmp.S>
-#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index ff32496fabba2e47..06b9b4090ff23ee1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -29,16 +29,12 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
# ifdef __LITTLE_ENDIAN__
extern __typeof (strcmp) __strcmp_power9 attribute_hidden;
-extern __typeof (strcmp) __strcmp_power10 attribute_hidden;
# endif
# undef strcmp
libc_ifunc_redirected (__redirect_strcmp, strcmp,
# ifdef __LITTLE_ENDIAN__
- (hwcap2 & PPC_FEATURE2_ARCH_3_1
- && hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcmp_power10 :
(hwcap2 & PPC_FEATURE2_ARCH_3_00
&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strcmp_power9 :


@ -0,0 +1,106 @@
commit 1924d341c0acbb9bf9ec77f1971fdb109933d12f
Author: Florian Weimer <fweimer@redhat.com>
Date: Wed May 21 16:47:34 2025 +0200
support: Pick group in support_capture_subprogram_self_sgid if UID == 0
When running as root, it is likely that we can run under any group.
Pick a harmless group from /etc/group in this case.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit 2f769cec448d84a62b7dd0d4ff56978fe22c0cd6)
diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
index eb72a2c21cf99ee2..1a30ae3f31f041d4 100644
--- a/support/support_capture_subprocess.c
+++ b/support/support_capture_subprocess.c
@@ -21,7 +21,11 @@
#include <errno.h>
#include <fcntl.h>
+#include <grp.h>
+#include <scratch_buffer.h>
+#include <stdio_ext.h>
#include <stdlib.h>
+#include <string.h>
#include <support/check.h>
#include <support/xunistd.h>
#include <support/xsocket.h>
@@ -209,10 +213,48 @@ err:
return status;
}
+/* Returns true if a group with NAME has been found, and writes its
+ GID to *TARGET. */
+static bool
+find_sgid_group (gid_t *target, const char *name)
+{
+ /* Do not use getgrnam_r because it does not work in statically
+ linked binaries if the system libc is different. */
+ FILE *fp = fopen ("/etc/group", "rce");
+ if (fp == NULL)
+ return false;
+ __fsetlocking (fp, FSETLOCKING_BYCALLER);
+
+ bool ok = false;
+ struct scratch_buffer buf;
+ scratch_buffer_init (&buf);
+ while (true)
+ {
+ struct group grp;
+ struct group *result = NULL;
+ int status = fgetgrent_r (fp, &grp, buf.data, buf.length, &result);
+ if (status == 0 && result != NULL)
+ {
+ if (strcmp (result->gr_name, name) == 0)
+ {
+ *target = result->gr_gid;
+ ok = true;
+ break;
+ }
+ }
+ else if (errno != ERANGE)
+ break;
+ else if (!scratch_buffer_grow (&buf))
+ break;
+ }
+ scratch_buffer_free (&buf);
+ fclose (fp);
+ return ok;
+}
+
int
support_capture_subprogram_self_sgid (const char *child_id)
{
- gid_t target = 0;
const int count = 64;
gid_t groups[count];
@@ -224,6 +266,7 @@ support_capture_subprogram_self_sgid (const char *child_id)
(intmax_t) getuid ());
gid_t current = getgid ();
+ gid_t target = current;
for (int i = 0; i < ret; ++i)
{
if (groups[i] != current)
@@ -233,9 +276,16 @@ support_capture_subprogram_self_sgid (const char *child_id)
}
}
- if (target == 0)
- FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n",
- (intmax_t) getuid ());
+ if (target == current)
+ {
+ /* If running as root, try to find a harmless group for SGID. */
+ if (getuid () != 0
+ || (!find_sgid_group (&target, "nogroup")
+ && !find_sgid_group (&target, "bin")
+ && !find_sgid_group (&target, "daemon")))
+ FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n",
+ (intmax_t) getuid ());
+ }
return copy_and_spawn_sgid (child_id, target);
}


@ -0,0 +1,318 @@
commit cff1042cceec3502269947e96cf7023451af22f3
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu May 22 14:36:37 2025 +0200
Fix error reporting (false negatives) in SGID tests
And simplify the interface of support_capture_subprogram_self_sgid.
Use the existing framework for temporary directories (now with
mode 0700) and directory/file deletion. Handle all execution
errors within support_capture_subprogram_self_sgid. In particular,
this includes test failures because the invoked program did not
exit with exit status zero. Existing tests that expect exit
status 42 are adjusted to use zero instead.
In addition, fix callers not to call exit (0) with test failures
pending (which may mask them, especially when running with --direct).
Fixes commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2
("elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987)").
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
(cherry picked from commit 3a3fb2ed83f79100c116c824454095ecfb335ad7)
diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c
index 5688b79f2e870b1d..8aec52e19fc56aba 100644
--- a/elf/tst-dlopen-sgid.c
+++ b/elf/tst-dlopen-sgid.c
@@ -70,13 +70,7 @@ do_test (void)
free (libdir);
- int status = support_capture_subprogram_self_sgid (magic_argument);
-
- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
- return EXIT_UNSUPPORTED;
-
- if (!WIFEXITED (status))
- FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
+ support_capture_subprogram_self_sgid (magic_argument);
return 0;
}
diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c
index a47219047f0f602d..233eec7631ed837c 100644
--- a/elf/tst-env-setuid-tunables.c
+++ b/elf/tst-env-setuid-tunables.c
@@ -105,10 +105,7 @@ do_test (int argc, char **argv)
if (ret != 0)
exit (1);
-
- /* Special return code to make sure that the child executed all the way
- through. */
- exit (42);
+ return 0;
}
else
{
@@ -127,18 +124,7 @@ do_test (int argc, char **argv)
continue;
}
- int status = support_capture_subprogram_self_sgid (buf);
-
- /* Bail out early if unsupported. */
- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
- return EXIT_UNSUPPORTED;
-
- if (WEXITSTATUS (status) != 42)
- {
- printf (" [%d] child failed with status %d\n", i,
- WEXITSTATUS (status));
- support_record_failure ();
- }
+ support_capture_subprogram_self_sgid (buf);
}
return 0;
}
diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c
index 43047c48f3ecd555..c084aa4c1a382152 100644
--- a/elf/tst-env-setuid.c
+++ b/elf/tst-env-setuid.c
@@ -148,10 +148,7 @@ do_test (int argc, char **argv)
if (ret != 0)
exit (1);
-
- /* Special return code to make sure that the child executed all the way
- through. */
- exit (42);
+ return 0;
}
else
{
@@ -175,17 +172,7 @@ do_test (int argc, char **argv)
free (profilepath);
}
- int status = support_capture_subprogram_self_sgid (SETGID_CHILD);
-
- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
- exit (EXIT_UNSUPPORTED);
-
- if (WEXITSTATUS (status) != 42)
- {
- printf (" child failed with status %d\n",
- WEXITSTATUS (status));
- support_record_failure ();
- }
+ support_capture_subprogram_self_sgid (SETGID_CHILD);
return 0;
}
diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c
index cc26ed6d15803c99..cefee58d46f25ebb 100644
--- a/stdlib/tst-secure-getenv.c
+++ b/stdlib/tst-secure-getenv.c
@@ -57,13 +57,7 @@ do_test (void)
exit (1);
}
- int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT);
-
- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
- return EXIT_UNSUPPORTED;
-
- if (!WIFEXITED (status))
- FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
+ support_capture_subprogram_self_sgid (MAGIC_ARGUMENT);
return 0;
}
@@ -82,6 +76,7 @@ alternative_main (int argc, char **argv)
if (secure_getenv ("PATH") != NULL)
FAIL_EXIT (4, "PATH variable not filtered out\n");
+ support_record_failure_barrier ();
exit (EXIT_SUCCESS);
}
}
diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h
index f2765278d920839d..8cbdca3b9dfb41ba 100644
--- a/support/capture_subprocess.h
+++ b/support/capture_subprocess.h
@@ -41,10 +41,12 @@ struct support_capture_subprocess support_capture_subprocess
struct support_capture_subprocess support_capture_subprogram
(const char *file, char *const argv[]);
-/* Copy the running program into a setgid binary and run it with CHILD_ID
- argument. If execution is successful, return the exit status of the child
- program, otherwise return a non-zero failure exit code. */
-int support_capture_subprogram_self_sgid (const char *child_id);
+/* Copy the running program into a setgid binary and run it with
+ CHILD_ID argument. If the program exits with a non-zero status,
+ exit with that exit status (or status 1 if the program did not exit
+ normally). If the test cannot be performed, exit with
+ EXIT_UNSUPPORTED. */
+void support_capture_subprogram_self_sgid (const char *child_id);
/* Deallocate the subprocess data captured by
support_capture_subprocess. */
diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
index 1a30ae3f31f041d4..8dc95f8aa723b6bc 100644
--- a/support/support_capture_subprocess.c
+++ b/support/support_capture_subprocess.c
@@ -31,6 +31,7 @@
#include <support/xsocket.h>
#include <support/xspawn.h>
#include <support/support.h>
+#include <support/temp_file.h>
#include <support/test-driver.h>
static void
@@ -112,105 +113,44 @@ support_capture_subprogram (const char *file, char *const argv[])
/* Copies the executable into a restricted directory, so that we can
safely make it SGID with the TARGET group ID. Then runs the
executable. */
-static int
+static void
copy_and_spawn_sgid (const char *child_id, gid_t gid)
{
- char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
- test_dir, (intmax_t) getpid ());
+ char *dirname = support_create_temp_directory ("tst-glibc-sgid-");
char *execname = xasprintf ("%s/bin", dirname);
- int infd = -1;
- int outfd = -1;
- int ret = 1, status = 1;
-
- TEST_VERIFY (mkdir (dirname, 0700) == 0);
- if (support_record_failure_is_failed ())
- goto err;
+ add_temp_file (execname);
- infd = open ("/proc/self/exe", O_RDONLY);
- if (infd < 0)
+ if (access ("/proc/self/exe", R_OK) != 0)
FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n");
- outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700);
- TEST_VERIFY (outfd >= 0);
- if (support_record_failure_is_failed ())
- goto err;
-
- char buf[4096];
- for (;;)
- {
- ssize_t rdcount = read (infd, buf, sizeof (buf));
- TEST_VERIFY (rdcount >= 0);
- if (support_record_failure_is_failed ())
- goto err;
- if (rdcount == 0)
- break;
- char *p = buf;
- char *end = buf + rdcount;
- while (p != end)
- {
- ssize_t wrcount = write (outfd, buf, end - p);
- if (wrcount == 0)
- errno = ENOSPC;
- TEST_VERIFY (wrcount > 0);
- if (support_record_failure_is_failed ())
- goto err;
- p += wrcount;
- }
- }
+ support_copy_file ("/proc/self/exe", execname);
- bool chowned = false;
- TEST_VERIFY ((chowned = fchown (outfd, getuid (), gid) == 0)
- || errno == EPERM);
- if (support_record_failure_is_failed ())
- goto err;
- else if (!chowned)
- {
- ret = 77;
- goto err;
- }
+ if (chown (execname, getuid (), gid) != 0)
+ FAIL_UNSUPPORTED ("cannot change group of \"%s\" to %jd: %m",
+ execname, (intmax_t) gid);
- TEST_VERIFY (fchmod (outfd, 02750) == 0);
- if (support_record_failure_is_failed ())
- goto err;
- TEST_VERIFY (close (outfd) == 0);
- if (support_record_failure_is_failed ())
- goto err;
- TEST_VERIFY (close (infd) == 0);
- if (support_record_failure_is_failed ())
- goto err;
+ if (chmod (execname, 02750) != 0)
+ FAIL_UNSUPPORTED ("cannot make \"%s\" SGID: %m ", execname);
/* We have the binary, now spawn the subprocess. Avoid using
support_subprogram because we only want the program exit status, not the
contents. */
- ret = 0;
- infd = outfd = -1;
char * const args[] = {execname, (char *) child_id, NULL};
+ int status = support_subprogram_wait (args[0], args);
- status = support_subprogram_wait (args[0], args);
+ free (execname);
+ free (dirname);
-err:
- if (outfd >= 0)
- close (outfd);
- if (infd >= 0)
- close (infd);
- if (execname != NULL)
- {
- unlink (execname);
- free (execname);
- }
- if (dirname != NULL)
+ if (WIFEXITED (status))
{
- rmdir (dirname);
- free (dirname);
+ if (WEXITSTATUS (status) == 0)
+ return;
+ else
+ exit (WEXITSTATUS (status));
}
-
- if (ret == 77)
- FAIL_UNSUPPORTED ("Failed to make sgid executable for test\n");
- if (ret != 0)
- FAIL_EXIT1 ("Failed to make sgid executable for test\n");
-
- return status;
+ else
+ FAIL_EXIT1 ("subprogram failed with status %d", status);
}
/* Returns true if a group with NAME has been found, and writes its
@@ -252,7 +192,7 @@ find_sgid_group (gid_t *target, const char *name)
return ok;
}
-int
+void
support_capture_subprogram_self_sgid (const char *child_id)
{
const int count = 64;
@@ -287,7 +227,7 @@ support_capture_subprogram_self_sgid (const char *child_id)
(intmax_t) getuid ());
}
- return copy_and_spawn_sgid (child_id, target);
+ copy_and_spawn_sgid (child_id, target);
}
void
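For reference, a minimal caller sketch (hypothetical test, not part of the patch) of the simplified interface, mirroring the adjusted callers above; after this change the helper either returns on success or exits itself, so the test body needs no wait-status handling:

#include <support/capture_subprocess.h>

/* Hypothetical marker used to detect the re-invoked SGID child.  */
static const char magic_argument[] = "run-actual-test";

static int
do_test (void)
{
  /* Copies this test binary, makes the copy SGID, and re-runs it with
     MAGIC_ARGUMENT.  On any child failure the helper exits with the
     child's status (or EXIT_UNSUPPORTED), so no WIFEXITED/WEXITSTATUS
     checks are needed here.  */
  support_capture_subprogram_self_sgid (magic_argument);
  return 0;
}

#include <support/test-driver.c>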


@ -145,7 +145,7 @@ Version: %{glibcversion}
# - It allows using the Release number without the %%dist tag in the dependency
# generator to make the generated requires interchangeable between Rawhide
# and ELN (.elnYY < .fcXX).
-%global baserelease 39
+%global baserelease 40
Release: %{baserelease}%{?dist}
# Licenses:
@ -508,6 +508,71 @@ Patch190: glibc-RHEL-75809.patch
Patch191: glibc-RHEL-75555.patch
Patch192: glibc-RHEL-75809-2.patch
Patch193: glibc-RHEL-75809-3.patch
Patch194: glibc-upstream-2.39-147.patch
Patch195: glibc-upstream-2.39-148.patch
Patch196: glibc-upstream-2.39-149.patch
Patch197: glibc-upstream-2.39-150.patch
Patch198: glibc-upstream-2.39-151.patch
Patch199: glibc-upstream-2.39-152.patch
Patch200: glibc-upstream-2.39-153.patch
Patch201: glibc-upstream-2.39-154.patch
Patch202: glibc-upstream-2.39-155.patch
Patch203: glibc-upstream-2.39-156.patch
Patch204: glibc-upstream-2.39-157.patch
Patch205: glibc-upstream-2.39-158.patch
Patch206: glibc-upstream-2.39-159.patch
Patch207: glibc-upstream-2.39-160.patch
Patch208: glibc-upstream-2.39-161.patch
Patch209: glibc-upstream-2.39-162.patch
Patch210: glibc-upstream-2.39-163.patch
Patch211: glibc-upstream-2.39-164.patch
Patch212: glibc-upstream-2.39-165.patch
Patch213: glibc-upstream-2.39-166.patch
Patch214: glibc-upstream-2.39-167.patch
Patch215: glibc-upstream-2.39-168.patch
Patch216: glibc-upstream-2.39-169.patch
Patch217: glibc-upstream-2.39-170.patch
Patch218: glibc-upstream-2.39-171.patch
Patch219: glibc-upstream-2.39-172.patch
Patch220: glibc-upstream-2.39-173.patch
Patch221: glibc-upstream-2.39-174.patch
Patch222: glibc-upstream-2.39-175.patch
Patch223: glibc-upstream-2.39-176.patch
Patch224: glibc-upstream-2.39-177.patch
Patch225: glibc-upstream-2.39-178.patch
Patch226: glibc-upstream-2.39-179.patch
Patch227: glibc-upstream-2.39-180.patch
Patch228: glibc-upstream-2.39-181.patch
Patch229: glibc-upstream-2.39-182.patch
Patch230: glibc-upstream-2.39-183.patch
Patch231: glibc-upstream-2.39-184.patch
Patch232: glibc-upstream-2.39-185.patch
Patch233: glibc-upstream-2.39-186.patch
Patch234: glibc-upstream-2.39-187.patch
Patch235: glibc-upstream-2.39-188.patch
Patch236: glibc-upstream-2.39-189.patch
Patch237: glibc-upstream-2.39-190.patch
Patch238: glibc-upstream-2.39-191.patch
Patch239: glibc-upstream-2.39-192.patch
Patch240: glibc-upstream-2.39-193.patch
Patch241: glibc-upstream-2.39-194.patch
Patch242: glibc-upstream-2.39-195.patch
Patch243: glibc-upstream-2.39-196.patch
Patch244: glibc-upstream-2.39-197.patch
Patch245: glibc-upstream-2.39-198.patch
Patch246: glibc-upstream-2.39-199.patch
Patch247: glibc-upstream-2.39-200.patch
Patch248: glibc-upstream-2.39-201.patch
Patch249: glibc-upstream-2.39-202.patch
Patch250: glibc-upstream-2.39-203.patch
Patch251: glibc-upstream-2.39-204.patch
Patch252: glibc-upstream-2.39-205.patch
Patch253: glibc-upstream-2.39-206.patch
Patch254: glibc-upstream-2.39-207.patch
Patch255: glibc-upstream-2.39-208.patch
Patch256: glibc-upstream-2.39-209.patch
Patch257: glibc-upstream-2.39-210.patch
Patch258: glibc-upstream-2.39-211.patch
##############################################################################
# Continued list of core "glibc" package information:
@ -2505,6 +2570,35 @@ update_gconv_modules_cache ()
%endif
%changelog
* Tue Jun 17 2025 Arjun Shankar <arjun@redhat.com> - 2.39-40
- Sync with upstream branch release/2.39/master (RHEL-87416)
- Upstream commit: cff1042cceec3502269947e96cf7023451af22f3
- CVE-2025-5702: Vector register overwrite bug in glibc (RHEL-95485)
- elf: Keep using minimal malloc after early DTV resize (RHEL-71923)
- libio: Fix a deadlock after fork in popen (RHEL-86433)
- Linux: Switch back to assembly syscall wrapper for prctl (RHEL-82286)
- Fix missed wakeup in POSIX thread condition variables (RHEL-82285)
- x86: Detect Intel Diamond Rapids
- x86: Handle unknown Intel processor with default tuning
- x86: Add ARL/PTL/CWF model detection support
- x86: Optimize xstate size calculation
- x86: Support and fixes for separate non-temporal tunable for memset
- x86: Fix a crash when running with XSAVEC disabled via tunables (RHEL-84837)
- x86_64: Add tanh, sinh, and atanh with FMA
- x86-64: Exclude FMA4 IFUNC functions for -mapxf
- nptl: clear the whole rseq area before registration
- math: Improve layout of exp/exp10 data
- AArch64: Add SVE memset
- math: Improve layout of expf data
- AArch64: Remove zva_128 from memset
- AArch64: Optimize memset
- AArch64: Improve generic strlen
- AArch64: Improve codegen for SVE tans and logs
- AArch64: Improve codegen in AdvSIMD logs, logf function family, and atan(2)(f)
- AArch64: Simplify rounding-multiply pattern in several AdvSIMD routines
- aarch64: Avoid redundant MOVs in AdvSIMD F32 logs
- aarch64: Fix AdvSIMD libmvec routines for big-endian
* Tue Jun 17 2025 Florian Weimer <fweimer@redhat.com> - 2.39-39
- langpacks: Use symlinks for LC_NAME, LC_NUMERIC files if possible (RHEL-97433)