Sync with upstream branch release/2.39/master (RHEL-87416)
Upstream commit: cff1042cceec3502269947e96cf7023451af22f3 Resolves: RHEL-87416 Resolves: RHEL-71923 Resolves: RHEL-82285 Resolves: RHEL-82286 Resolves: RHEL-84837 Resolves: RHEL-86433 Resolves: RHEL-95485
This commit is contained in:
parent
55a8279a3b
commit
70ebc1f0c6
136
glibc-upstream-2.39-147.patch
Normal file
136
glibc-upstream-2.39-147.patch
Normal file
@ -0,0 +1,136 @@
|
||||
commit c1f7bfbe081ebf807b6374a497ad5d5a9f499574
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Tue Dec 17 18:41:45 2024 +0800
|
||||
|
||||
Hide all malloc functions from compiler [BZ #32366]
|
||||
|
||||
Since -1 isn't a power of two, compiler may reject it, hide memalign from
|
||||
Clang 19 which issues an error:
|
||||
|
||||
tst-memalign.c:86:31: error: requested alignment is not a power of 2 [-Werror,-Wnon-power-of-two-alignment]
|
||||
86 | p = memalign (-1, pagesize);
|
||||
| ^~
|
||||
tst-memalign.c:86:31: error: requested alignment must be 4294967296 bytes or smaller; maximum alignment assumed [-Werror,-Wbuiltin-assume-aligned-alignment]
|
||||
86 | p = memalign (-1, pagesize);
|
||||
| ^~
|
||||
|
||||
Update tst-malloc-aux.h to hide all malloc functions and include it in
|
||||
all malloc tests to prevent compiler from optimizing out any malloc
|
||||
functions.
|
||||
|
||||
Tested with Clang 19.1.5 and GCC 15 20241206 for BZ #32366.
|
||||
|
||||
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
Reviewed-by: Sam James <sam@gentoo.org>
|
||||
(cherry picked from commit f9493a15ea9cfb63a815c00c23142369ec09d8ce)
|
||||
|
||||
diff --git a/malloc/tst-mallinfo2.c b/malloc/tst-mallinfo2.c
|
||||
index 2c02f5f700f5051e..f072b9f24b575792 100644
|
||||
--- a/malloc/tst-mallinfo2.c
|
||||
+++ b/malloc/tst-mallinfo2.c
|
||||
@@ -23,6 +23,8 @@
|
||||
#include <stdlib.h>
|
||||
#include <support/check.h>
|
||||
|
||||
+#include "tst-malloc-aux.h"
|
||||
+
|
||||
/* This is not specifically needed for the test, but (1) does
|
||||
something to the data so gcc doesn't optimize it away, and (2) may
|
||||
help when developing future tests. */
|
||||
diff --git a/malloc/tst-malloc-aux.h b/malloc/tst-malloc-aux.h
|
||||
index 54908b4a2464d510..3e1b61ce3414dad4 100644
|
||||
--- a/malloc/tst-malloc-aux.h
|
||||
+++ b/malloc/tst-malloc-aux.h
|
||||
@@ -22,20 +22,35 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
-
|
||||
-static void *(*volatile aligned_alloc_indirect)(size_t, size_t) = aligned_alloc;
|
||||
-static void *(*volatile calloc_indirect)(size_t, size_t) = calloc;
|
||||
-static void *(*volatile malloc_indirect)(size_t) = malloc;
|
||||
-static void *(*volatile realloc_indirect)(void*, size_t) = realloc;
|
||||
+#include <malloc.h>
|
||||
+
|
||||
+static __typeof (aligned_alloc) * volatile aligned_alloc_indirect
|
||||
+ = aligned_alloc;
|
||||
+static __typeof (calloc) * volatile calloc_indirect = calloc;
|
||||
+static __typeof (malloc) * volatile malloc_indirect = malloc;
|
||||
+static __typeof (memalign) * volatile memalign_indirect = memalign;
|
||||
+static __typeof (posix_memalign) * volatile posix_memalign_indirect
|
||||
+ = posix_memalign;
|
||||
+static __typeof (pvalloc) * volatile pvalloc_indirect = pvalloc;
|
||||
+static __typeof (realloc) * volatile realloc_indirect = realloc;
|
||||
+static __typeof (valloc) * volatile valloc_indirect = valloc;
|
||||
|
||||
#undef aligned_alloc
|
||||
#undef calloc
|
||||
#undef malloc
|
||||
+#undef memalign
|
||||
+#undef posix_memalign
|
||||
+#undef pvalloc
|
||||
#undef realloc
|
||||
+#undef valloc
|
||||
|
||||
#define aligned_alloc aligned_alloc_indirect
|
||||
#define calloc calloc_indirect
|
||||
#define malloc malloc_indirect
|
||||
+#define memalign memalign_indirect
|
||||
+#define posix_memalign posix_memalign_indirect
|
||||
+#define pvalloc pvalloc_indirect
|
||||
#define realloc realloc_indirect
|
||||
+#define valloc valloc_indirect
|
||||
|
||||
#endif /* TST_MALLOC_AUX_H */
|
||||
diff --git a/malloc/tst-malloc-backtrace.c b/malloc/tst-malloc-backtrace.c
|
||||
index c7b1d65e5c95c437..65fa91f6fdbdce91 100644
|
||||
--- a/malloc/tst-malloc-backtrace.c
|
||||
+++ b/malloc/tst-malloc-backtrace.c
|
||||
@@ -22,6 +22,8 @@
|
||||
#include <support/support.h>
|
||||
#include <libc-diag.h>
|
||||
|
||||
+#include "tst-malloc-aux.h"
|
||||
+
|
||||
#define SIZE 4096
|
||||
|
||||
/* Wrap free with a function to prevent gcc from optimizing it out. */
|
||||
diff --git a/malloc/tst-memalign.c b/malloc/tst-memalign.c
|
||||
index 563f6413d2da506b..ac9770d3f96313a7 100644
|
||||
--- a/malloc/tst-memalign.c
|
||||
+++ b/malloc/tst-memalign.c
|
||||
@@ -23,6 +23,8 @@
|
||||
#include <unistd.h>
|
||||
#include <libc-diag.h>
|
||||
|
||||
+#include "tst-malloc-aux.h"
|
||||
+
|
||||
static int errors = 0;
|
||||
|
||||
static void
|
||||
diff --git a/malloc/tst-safe-linking.c b/malloc/tst-safe-linking.c
|
||||
index 01dd07004d65a767..63a7e2bc8e8ff536 100644
|
||||
--- a/malloc/tst-safe-linking.c
|
||||
+++ b/malloc/tst-safe-linking.c
|
||||
@@ -26,6 +26,8 @@
|
||||
#include <support/capture_subprocess.h>
|
||||
#include <support/check.h>
|
||||
|
||||
+#include "tst-malloc-aux.h"
|
||||
+
|
||||
/* Run CALLBACK and check that the data on standard error equals
|
||||
EXPECTED. */
|
||||
static void
|
||||
diff --git a/malloc/tst-valloc.c b/malloc/tst-valloc.c
|
||||
index 9bab8c6470d4fd95..0243d3dfd494d329 100644
|
||||
--- a/malloc/tst-valloc.c
|
||||
+++ b/malloc/tst-valloc.c
|
||||
@@ -23,6 +23,8 @@
|
||||
#include <unistd.h>
|
||||
#include <libc-diag.h>
|
||||
|
||||
+#include "tst-malloc-aux.h"
|
||||
+
|
||||
static int errors = 0;
|
||||
|
||||
static void
|
55
glibc-upstream-2.39-148.patch
Normal file
55
glibc-upstream-2.39-148.patch
Normal file
@ -0,0 +1,55 @@
|
||||
commit 1432850ad8fbef6dea82d137e491b53840dc7f4d
|
||||
Author: Sam James <sam@gentoo.org>
|
||||
Date: Fri Jan 10 03:03:47 2025 +0000
|
||||
|
||||
malloc: obscure calloc use in tst-calloc
|
||||
|
||||
Similar to a9944a52c967ce76a5894c30d0274b824df43c7a and
|
||||
f9493a15ea9cfb63a815c00c23142369ec09d8ce, we need to hide calloc use from
|
||||
the compiler to accommodate GCC's r15-6566-g804e9d55d9e54c change.
|
||||
|
||||
First, include tst-malloc-aux.h, but then use `volatile` variables
|
||||
for size.
|
||||
|
||||
The test passes without the tst-malloc-aux.h change but IMO we want
|
||||
it there for consistency and to avoid future problems (possibly silent).
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit c3d1dac96bdd10250aa37bb367d5ef8334a093a1)
|
||||
|
||||
diff --git a/malloc/tst-calloc.c b/malloc/tst-calloc.c
|
||||
index 01f17f9e65591659..5a8c7ab121ef2d00 100644
|
||||
--- a/malloc/tst-calloc.c
|
||||
+++ b/malloc/tst-calloc.c
|
||||
@@ -23,6 +23,7 @@
|
||||
#include <stdio.h>
|
||||
#include <libc-diag.h>
|
||||
|
||||
+#include "tst-malloc-aux.h"
|
||||
|
||||
/* Number of samples per size. */
|
||||
#define N 50000
|
||||
@@ -94,16 +95,19 @@ random_test (void)
|
||||
static void
|
||||
null_test (void)
|
||||
{
|
||||
+ /* Obscure allocation size from the compiler. */
|
||||
+ volatile size_t max_size = UINT_MAX;
|
||||
+ volatile size_t zero_size = 0;
|
||||
/* If the size is 0 the result is implementation defined. Just make
|
||||
sure the program doesn't crash. The result of calloc is
|
||||
deliberately ignored, so do not warn about that. */
|
||||
DIAG_PUSH_NEEDS_COMMENT;
|
||||
DIAG_IGNORE_NEEDS_COMMENT (10, "-Wunused-result");
|
||||
calloc (0, 0);
|
||||
- calloc (0, UINT_MAX);
|
||||
- calloc (UINT_MAX, 0);
|
||||
- calloc (0, ~((size_t) 0));
|
||||
- calloc (~((size_t) 0), 0);
|
||||
+ calloc (0, max_size);
|
||||
+ calloc (max_size, 0);
|
||||
+ calloc (0, ~((size_t) zero_size));
|
||||
+ calloc (~((size_t) zero_size), 0);
|
||||
DIAG_POP_NEEDS_COMMENT;
|
||||
}
|
||||
|
67
glibc-upstream-2.39-149.patch
Normal file
67
glibc-upstream-2.39-149.patch
Normal file
@ -0,0 +1,67 @@
|
||||
commit 662516aca8b6bf6aa6555f471055d5eb512b1ddc
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Fri Jan 24 18:53:13 2025 +0800
|
||||
|
||||
stdlib: Test using setenv with updated environ [BZ #32588]
|
||||
|
||||
Add a test for setenv with updated environ. Verify that BZ #32588 is
|
||||
fixed.
|
||||
|
||||
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
Reviewed-by: Florian Weimer <fweimer@redhat.com>
|
||||
(cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da)
|
||||
|
||||
diff --git a/stdlib/Makefile b/stdlib/Makefile
|
||||
index f4dec9be46a573b9..12f8820fd0668039 100644
|
||||
--- a/stdlib/Makefile
|
||||
+++ b/stdlib/Makefile
|
||||
@@ -316,6 +316,7 @@ tests := \
|
||||
tst-setcontext9 \
|
||||
tst-setcontext10 \
|
||||
tst-setcontext11 \
|
||||
+ tst-setenv-environ \
|
||||
tst-stdbit-Wconversion \
|
||||
tst-stdbit-builtins \
|
||||
tst-stdc_bit_ceil \
|
||||
diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..02fcef96d098f1b7
|
||||
--- /dev/null
|
||||
+++ b/stdlib/tst-setenv-environ.c
|
||||
@@ -0,0 +1,36 @@
|
||||
+/* Test using setenv with updated environ.
|
||||
+ Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <stdlib.h>
|
||||
+#include <support/check.h>
|
||||
+
|
||||
+extern char **environ;
|
||||
+
|
||||
+int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ char *valp;
|
||||
+ static char *dummy_environ[] = { NULL };
|
||||
+ environ = dummy_environ;
|
||||
+ setenv ("A", "1", 0);
|
||||
+ valp = getenv ("A");
|
||||
+ TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0');
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
125
glibc-upstream-2.39-150.patch
Normal file
125
glibc-upstream-2.39-150.patch
Normal file
@ -0,0 +1,125 @@
|
||||
commit f6d48470aef9264d2d56f4c4533eb76db7f9c2e4
|
||||
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
Date: Fri Jan 31 12:16:30 2025 -0500
|
||||
|
||||
assert: Add test for CVE-2025-0395
|
||||
|
||||
Use the __progname symbol to override the program name to induce the
|
||||
failure that CVE-2025-0395 describes.
|
||||
|
||||
This is related to BZ #32582
|
||||
|
||||
Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
(cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2)
|
||||
|
||||
diff --git a/assert/Makefile b/assert/Makefile
|
||||
index 35dc908ddb8a14f2..c0fe660bd69f9ec8 100644
|
||||
--- a/assert/Makefile
|
||||
+++ b/assert/Makefile
|
||||
@@ -38,6 +38,7 @@ tests := \
|
||||
test-assert-perr \
|
||||
tst-assert-c++ \
|
||||
tst-assert-g++ \
|
||||
+ tst-assert-sa-2025-0001 \
|
||||
# tests
|
||||
|
||||
ifeq ($(have-cxx-thread_local),yes)
|
||||
diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..102cb0078dafa9c1
|
||||
--- /dev/null
|
||||
+++ b/assert/tst-assert-sa-2025-0001.c
|
||||
@@ -0,0 +1,92 @@
|
||||
+/* Test for CVE-2025-0395.
|
||||
+ Copyright The GNU Toolchain Authors.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Test that a large enough __progname does not result in a buffer overflow
|
||||
+ when printing an assertion failure. This was CVE-2025-0395. */
|
||||
+#include <assert.h>
|
||||
+#include <inttypes.h>
|
||||
+#include <signal.h>
|
||||
+#include <stdbool.h>
|
||||
+#include <string.h>
|
||||
+#include <sys/mman.h>
|
||||
+#include <support/check.h>
|
||||
+#include <support/support.h>
|
||||
+#include <support/xstdio.h>
|
||||
+#include <support/xunistd.h>
|
||||
+
|
||||
+extern const char *__progname;
|
||||
+
|
||||
+int
|
||||
+do_test (int argc, char **argv)
|
||||
+{
|
||||
+
|
||||
+ support_need_proc ("Reads /proc/self/maps to add guards to writable maps.");
|
||||
+ ignore_stderr ();
|
||||
+
|
||||
+ /* XXX assumes that the assert is on a 2 digit line number. */
|
||||
+ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n";
|
||||
+
|
||||
+ int ret = fprintf (stderr, prompt, __FILE__);
|
||||
+ if (ret < 0)
|
||||
+ FAIL_EXIT1 ("fprintf failed: %m\n");
|
||||
+
|
||||
+ size_t pagesize = getpagesize ();
|
||||
+ size_t namesize = pagesize - 1 - ret;
|
||||
+
|
||||
+ /* Alter the progname so that the assert message fills the entire page. */
|
||||
+ char progname[namesize];
|
||||
+ memset (progname, 'A', namesize - 1);
|
||||
+ progname[namesize - 1] = '\0';
|
||||
+ __progname = progname;
|
||||
+
|
||||
+ FILE *f = xfopen ("/proc/self/maps", "r");
|
||||
+ char *line = NULL;
|
||||
+ size_t len = 0;
|
||||
+ uintptr_t prev_to = 0;
|
||||
+
|
||||
+ /* Pad the beginning of every writable mapping with a PROT_NONE map. This
|
||||
+ ensures that the mmap in the assert_fail path never ends up below a
|
||||
+ writable map and will terminate immediately in case of a buffer
|
||||
+ overflow. */
|
||||
+ while (xgetline (&line, &len, f))
|
||||
+ {
|
||||
+ uintptr_t from, to;
|
||||
+ char perm[4];
|
||||
+
|
||||
+ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ",
|
||||
+ &from, &to,
|
||||
+ &perm[0], &perm[1], &perm[2], &perm[3]);
|
||||
+
|
||||
+ bool writable = (memchr (perm, 'w', 4) != NULL);
|
||||
+
|
||||
+ if (prev_to != 0 && from - prev_to > pagesize && writable)
|
||||
+ xmmap ((void *) from - pagesize, pagesize, PROT_NONE,
|
||||
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
|
||||
+
|
||||
+ prev_to = to;
|
||||
+ }
|
||||
+
|
||||
+ xfclose (f);
|
||||
+
|
||||
+ assert (argc < 1);
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#define EXPECTED_SIGNAL SIGABRT
|
||||
+#define TEST_FUNCTION_ARGV do_test
|
||||
+#include <support/test-driver.c>
|
230
glibc-upstream-2.39-151.patch
Normal file
230
glibc-upstream-2.39-151.patch
Normal file
@ -0,0 +1,230 @@
|
||||
commit d591876303e368fde0b03e1536efb69b64d9d483
|
||||
Author: Joe Ramsay <Joe.Ramsay@arm.com>
|
||||
Date: Thu May 2 16:43:13 2024 +0100
|
||||
|
||||
aarch64: Fix AdvSIMD libmvec routines for big-endian
|
||||
|
||||
Previously many routines used * to load from vector types stored
|
||||
in the data table. This is emitted as ldr, which byte-swaps the
|
||||
entire vector register, and causes bugs for big-endian when not
|
||||
all lanes contain the same value. When a vector is to be used
|
||||
this way, it has been replaced with an array and the load with an
|
||||
explicit ld1 intrinsic, which byte-swaps only within lanes.
|
||||
|
||||
As well, many routines previously used non-standard GCC syntax
|
||||
for vector operations such as indexing into vector types with []
|
||||
and assembling vectors using {}. This syntax should not be mixed
|
||||
with ACLE, as the former does not respect endianness whereas the
|
||||
latter does. Such examples have been replaced with, for instance,
|
||||
vcombine_* and vgetq_lane* intrinsics. Helpers which only use the
|
||||
GCC syntax, such as the v_call helpers, do not need changing as
|
||||
they do not use intrinsics.
|
||||
|
||||
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||
(cherry picked from commit 90a6ca8b28bf34e361e577e526e1b0f4c39a32a5)
|
||||
|
||||
diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
|
||||
index ab117b69da23e5f3..cf53e73290fcedb6 100644
|
||||
--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
|
||||
@@ -25,7 +25,8 @@
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
- float32x4_t log10_2_and_inv, shift;
|
||||
+ float log10_2_and_inv[4];
|
||||
+ float32x4_t shift;
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float32x4_t scale_thresh;
|
||||
@@ -111,10 +112,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
|
||||
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
|
||||
with poly(r) in [1/sqrt(2), sqrt(2)] and
|
||||
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
|
||||
- float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
|
||||
+ float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv);
|
||||
+ float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0);
|
||||
float32x4_t n = vsubq_f32 (z, d->shift);
|
||||
- float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
|
||||
- r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
|
||||
+ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1);
|
||||
+ r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2);
|
||||
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
|
||||
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
|
||||
diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
|
||||
index 3628398674468131..3db3b80c49292947 100644
|
||||
--- a/sysdeps/aarch64/fpu/expm1_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
|
||||
@@ -23,7 +23,9 @@
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[11];
|
||||
- float64x2_t invln2, ln2, shift;
|
||||
+ float64x2_t invln2;
|
||||
+ double ln2[2];
|
||||
+ float64x2_t shift;
|
||||
int64x2_t exponent_bias;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
uint64x2_t thresh, tiny_bound;
|
||||
@@ -92,8 +94,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
|
||||
where 2^i is exact because i is an integer. */
|
||||
float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
|
||||
int64x2_t i = vcvtq_s64_f64 (n);
|
||||
- float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
|
||||
- f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
|
||||
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
|
||||
+ float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
|
||||
+ f = vfmsq_laneq_f64 (f, n, ln2, 1);
|
||||
|
||||
/* Approximate expm1(f) using polynomial.
|
||||
Taylor expansion for expm1(x) has the form:
|
||||
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
|
||||
index 93db200f618379be..a0616ec7542cbfce 100644
|
||||
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
|
||||
@@ -23,7 +23,7 @@
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
- float32x4_t invln2_and_ln2;
|
||||
+ float invln2_and_ln2[4];
|
||||
float32x4_t shift;
|
||||
int32x4_t exponent_bias;
|
||||
#if WANT_SIMD_EXCEPT
|
||||
@@ -88,11 +88,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
|
||||
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
|
||||
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
|
||||
where 2^i is exact because i is an integer. */
|
||||
- float32x4_t j = vsubq_f32 (
|
||||
- vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
|
||||
+ float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
|
||||
+ float32x4_t j
|
||||
+ = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
|
||||
int32x4_t i = vcvtq_s32_f32 (j);
|
||||
- float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
|
||||
- f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
|
||||
+ float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
|
||||
+ f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
|
||||
|
||||
/* Approximate expm1(f) using polynomial.
|
||||
Taylor expansion for expm1(x) has the form:
|
||||
diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c
|
||||
index 1e5ef99e8907068b..c065aaebae8600fb 100644
|
||||
--- a/sysdeps/aarch64/fpu/log10_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log10_advsimd.c
|
||||
@@ -58,8 +58,10 @@ static inline struct entry
|
||||
lookup (uint64x2_t i)
|
||||
{
|
||||
struct entry e;
|
||||
- uint64_t i0 = (i[0] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
|
||||
- uint64_t i1 = (i[1] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
|
||||
+ uint64_t i0
|
||||
+ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
|
||||
+ uint64_t i1
|
||||
+ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
|
||||
float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
|
||||
float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
|
||||
e.invc = vuzp1q_f64 (e0, e1);
|
||||
diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c
|
||||
index a34978f6cf1cdb44..4057c552d8dfc0bb 100644
|
||||
--- a/sysdeps/aarch64/fpu/log2_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log2_advsimd.c
|
||||
@@ -55,8 +55,10 @@ static inline struct entry
|
||||
lookup (uint64x2_t i)
|
||||
{
|
||||
struct entry e;
|
||||
- uint64_t i0 = (i[0] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
|
||||
- uint64_t i1 = (i[1] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
|
||||
+ uint64_t i0
|
||||
+ = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
|
||||
+ uint64_t i1
|
||||
+ = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
|
||||
float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc);
|
||||
float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc);
|
||||
e.invc = vuzp1q_f64 (e0, e1);
|
||||
diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
|
||||
index 21df61728ca87374..015a6da7d7fd693e 100644
|
||||
--- a/sysdeps/aarch64/fpu/log_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log_advsimd.c
|
||||
@@ -54,17 +54,12 @@ lookup (uint64x2_t i)
|
||||
{
|
||||
/* Since N is a power of 2, n % N = n & (N - 1). */
|
||||
struct entry e;
|
||||
- uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
- uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
|
||||
float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
|
||||
float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
|
||||
-#if __BYTE_ORDER == __LITTLE_ENDIAN
|
||||
e.invc = vuzp1q_f64 (e0, e1);
|
||||
e.logc = vuzp2q_f64 (e0, e1);
|
||||
-#else
|
||||
- e.invc = vuzp1q_f64 (e1, e0);
|
||||
- e.logc = vuzp2q_f64 (e1, e0);
|
||||
-#endif
|
||||
return e;
|
||||
}
|
||||
|
||||
diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
|
||||
index 0459821ab25487a8..d56a102dd17a3463 100644
|
||||
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
|
||||
@@ -23,7 +23,8 @@
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[9];
|
||||
- float64x2_t half_pi, two_over_pi, shift;
|
||||
+ double half_pi[2];
|
||||
+ float64x2_t two_over_pi, shift;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float64x2_t range_val;
|
||||
#endif
|
||||
@@ -81,8 +82,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
|
||||
/* Use q to reduce x to r in [-pi/4, pi/4], by:
|
||||
r = x - q * pi/2, in extended precision. */
|
||||
float64x2_t r = x;
|
||||
- r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
|
||||
- r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
|
||||
+ float64x2_t half_pi = vld1q_f64 (dat->half_pi);
|
||||
+ r = vfmsq_laneq_f64 (r, q, half_pi, 0);
|
||||
+ r = vfmsq_laneq_f64 (r, q, half_pi, 1);
|
||||
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
|
||||
formula. */
|
||||
r = vmulq_n_f64 (r, 0.5);
|
||||
diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c
|
||||
index 5a7489390a9692c6..705586f0c0b664c1 100644
|
||||
--- a/sysdeps/aarch64/fpu/tanf_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/tanf_advsimd.c
|
||||
@@ -23,7 +23,7 @@
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[6];
|
||||
- float32x4_t pi_consts;
|
||||
+ float pi_consts[4];
|
||||
float32x4_t shift;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float32x4_t range_val;
|
||||
@@ -95,16 +95,17 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
|
||||
#endif
|
||||
|
||||
/* n = rint(x/(pi/2)). */
|
||||
- float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
|
||||
+ float32x4_t pi_consts = vld1q_f32 (d->pi_consts);
|
||||
+ float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3);
|
||||
float32x4_t n = vsubq_f32 (q, d->shift);
|
||||
/* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
|
||||
uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
|
||||
|
||||
/* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */
|
||||
float32x4_t r;
|
||||
- r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
|
||||
- r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
|
||||
- r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
|
||||
+ r = vfmaq_laneq_f32 (x, n, pi_consts, 0);
|
||||
+ r = vfmaq_laneq_f32 (r, n, pi_consts, 1);
|
||||
+ r = vfmaq_laneq_f32 (r, n, pi_consts, 2);
|
||||
|
||||
/* If x lives in an interval, where |tan(x)|
|
||||
- is finite, then use a polynomial approximation of the form
|
268
glibc-upstream-2.39-152.patch
Normal file
268
glibc-upstream-2.39-152.patch
Normal file
@ -0,0 +1,268 @@
|
||||
commit 80df456112d67e27660563b9540cbc1bb5475c84
|
||||
Author: Joe Ramsay <Joe.Ramsay@arm.com>
|
||||
Date: Mon Sep 9 13:00:01 2024 +0100
|
||||
|
||||
aarch64: Avoid redundant MOVs in AdvSIMD F32 logs
|
||||
|
||||
Since the last operation is destructive, the first argument to the FMA
|
||||
also has to be the first argument to the special-case in order to
|
||||
avoid unnecessary MOVs. Reorder arguments and adjust special-case
|
||||
bounds to facilitate this.
|
||||
|
||||
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
|
||||
(cherry picked from commit 8b09af572b208bfde4d31c6abbae047dcc217675)
|
||||
|
||||
diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
|
||||
index 9347422a771e3d4e..82228b599a5c061b 100644
|
||||
--- a/sysdeps/aarch64/fpu/log10f_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
|
||||
@@ -22,11 +22,11 @@
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- uint32x4_t min_norm;
|
||||
+ uint32x4_t off, offset_lower_bound;
|
||||
uint16x8_t special_bound;
|
||||
+ uint32x4_t mantissa_mask;
|
||||
float32x4_t poly[8];
|
||||
float32x4_t inv_ln10, ln2;
|
||||
- uint32x4_t off, mantissa_mask;
|
||||
} data = {
|
||||
/* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
|
||||
[-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
|
||||
@@ -35,18 +35,22 @@ static const struct data
|
||||
V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
|
||||
.ln2 = V4 (0x1.62e43p-1f),
|
||||
.inv_ln10 = V4 (0x1.bcb7b2p-2f),
|
||||
- .min_norm = V4 (0x00800000),
|
||||
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
|
||||
+ /* Lower bound is the smallest positive normal float 0x00800000. For
|
||||
+ optimised register use subnormals are detected after offset has been
|
||||
subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
|
||||
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
|
||||
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
|
||||
.off = V4 (0x3f2aaaab), /* 0.666667. */
|
||||
.mantissa_mask = V4 (0x007fffff),
|
||||
};
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
-special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
|
||||
- uint16x4_t cmp)
|
||||
+special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
|
||||
+ uint16x4_t cmp, const struct data *d)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
- return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
|
||||
+ return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
|
||||
+ vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
|
||||
}
|
||||
|
||||
/* Fast implementation of AdvSIMD log10f,
|
||||
@@ -58,15 +62,21 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
- uint32x4_t u = vreinterpretq_u32_f32 (x);
|
||||
- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
|
||||
- vget_low_u16 (d->special_bound));
|
||||
+
|
||||
+ /* To avoid having to mov x out of the way, keep u after offset has been
|
||||
+ applied, and recover x by adding the offset back in the special-case
|
||||
+ handler. */
|
||||
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
|
||||
|
||||
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
|
||||
- u = vsubq_u32 (u, d->off);
|
||||
+ u_off = vsubq_u32 (u_off, d->off);
|
||||
float32x4_t n = vcvtq_f32_s32 (
|
||||
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
|
||||
- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
|
||||
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
|
||||
+
|
||||
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
|
||||
+ vget_low_u16 (d->special_bound));
|
||||
+
|
||||
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
|
||||
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
|
||||
|
||||
/* y = log10(1+r) + n * log10(2). */
|
||||
@@ -77,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
|
||||
y = vmulq_f32 (y, d->inv_ln10);
|
||||
|
||||
if (__glibc_unlikely (v_any_u16h (special)))
|
||||
- return special_case (x, y, poly, r2, special);
|
||||
+ return special_case (y, u_off, poly, r2, special, d);
|
||||
return vfmaq_f32 (y, poly, r2);
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F1 (log10))
|
||||
diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
|
||||
index db218367495dc567..84effe4fe9492d08 100644
|
||||
--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
|
||||
@@ -22,9 +22,9 @@
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- uint32x4_t min_norm;
|
||||
+ uint32x4_t off, offset_lower_bound;
|
||||
uint16x8_t special_bound;
|
||||
- uint32x4_t off, mantissa_mask;
|
||||
+ uint32x4_t mantissa_mask;
|
||||
float32x4_t poly[9];
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm approximate
|
||||
@@ -34,18 +34,22 @@ static const struct data
|
||||
V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
|
||||
V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
|
||||
V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
|
||||
- .min_norm = V4 (0x00800000),
|
||||
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
|
||||
+ /* Lower bound is the smallest positive normal float 0x00800000. For
|
||||
+ optimised register use subnormals are detected after offset has been
|
||||
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
|
||||
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
|
||||
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
|
||||
.off = V4 (0x3f2aaaab), /* 0.666667. */
|
||||
.mantissa_mask = V4 (0x007fffff),
|
||||
};
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
-special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
|
||||
- uint16x4_t cmp)
|
||||
+special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
|
||||
+ uint16x4_t cmp, const struct data *d)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
- return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
|
||||
+ return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
|
||||
+ vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
|
||||
}
|
||||
|
||||
/* Fast implementation for single precision AdvSIMD log2,
|
||||
@@ -56,15 +60,21 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
- uint32x4_t u = vreinterpretq_u32_f32 (x);
|
||||
- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
|
||||
- vget_low_u16 (d->special_bound));
|
||||
+
|
||||
+ /* To avoid having to mov x out of the way, keep u after offset has been
|
||||
+ applied, and recover x by adding the offset back in the special-case
|
||||
+ handler. */
|
||||
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
|
||||
|
||||
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
|
||||
- u = vsubq_u32 (u, d->off);
|
||||
+ u_off = vsubq_u32 (u_off, d->off);
|
||||
float32x4_t n = vcvtq_f32_s32 (
|
||||
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
|
||||
- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
|
||||
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
|
||||
+
|
||||
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
|
||||
+ vget_low_u16 (d->special_bound));
|
||||
+
|
||||
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
|
||||
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
|
||||
|
||||
/* y = log2(1+r) + n. */
|
||||
@@ -72,7 +82,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
|
||||
float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
|
||||
|
||||
if (__glibc_unlikely (v_any_u16h (special)))
|
||||
- return special_case (x, n, p, r, special);
|
||||
+ return special_case (n, u_off, p, r, special, d);
|
||||
return vfmaq_f32 (n, p, r);
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F1 (log2))
|
||||
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
|
||||
index 3c0d0fcdc76f1004..c20dbfd6c088c0af 100644
|
||||
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
|
||||
@@ -21,20 +21,22 @@
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- uint32x4_t min_norm;
|
||||
+ uint32x4_t off, offset_lower_bound;
|
||||
uint16x8_t special_bound;
|
||||
+ uint32x4_t mantissa_mask;
|
||||
float32x4_t poly[7];
|
||||
- float32x4_t ln2, tiny_bound;
|
||||
- uint32x4_t off, mantissa_mask;
|
||||
+ float32x4_t ln2;
|
||||
} data = {
|
||||
/* 3.34 ulp error. */
|
||||
.poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
|
||||
V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
|
||||
V4 (-0x1.ffffc8p-2f) },
|
||||
.ln2 = V4 (0x1.62e43p-1f),
|
||||
- .tiny_bound = V4 (0x1p-126),
|
||||
- .min_norm = V4 (0x00800000),
|
||||
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
|
||||
+ /* Lower bound is the smallest positive normal float 0x00800000. For
|
||||
+ optimised register use subnormals are detected after offset has been
|
||||
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
|
||||
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
|
||||
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
|
||||
.off = V4 (0x3f2aaaab), /* 0.666667. */
|
||||
.mantissa_mask = V4 (0x007fffff)
|
||||
};
|
||||
@@ -42,32 +44,37 @@ static const struct data
|
||||
#define P(i) d->poly[7 - i]
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
-special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
|
||||
- uint16x4_t cmp)
|
||||
+special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
|
||||
+ uint16x4_t cmp, const struct data *d)
|
||||
{
|
||||
/* Fall back to scalar code. */
|
||||
- return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
|
||||
+ return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
|
||||
+ vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
|
||||
}
|
||||
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
float32x4_t n, p, q, r, r2, y;
|
||||
- uint32x4_t u;
|
||||
+ uint32x4_t u, u_off;
|
||||
uint16x4_t cmp;
|
||||
|
||||
- u = vreinterpretq_u32_f32 (x);
|
||||
- cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
|
||||
- vget_low_u16 (d->special_bound));
|
||||
+ /* To avoid having to mov x out of the way, keep u after offset has been
|
||||
+ applied, and recover x by adding the offset back in the special-case
|
||||
+ handler. */
|
||||
+ u_off = vreinterpretq_u32_f32 (x);
|
||||
|
||||
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
|
||||
- u = vsubq_u32 (u, d->off);
|
||||
+ u_off = vsubq_u32 (u_off, d->off);
|
||||
n = vcvtq_f32_s32 (
|
||||
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
|
||||
- u = vandq_u32 (u, d->mantissa_mask);
|
||||
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
|
||||
+ u = vandq_u32 (u_off, d->mantissa_mask);
|
||||
u = vaddq_u32 (u, d->off);
|
||||
r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
|
||||
|
||||
+ cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
|
||||
+ vget_low_u16 (d->special_bound));
|
||||
+
|
||||
/* y = log(1+r) + n*ln2. */
|
||||
r2 = vmulq_f32 (r, r);
|
||||
/* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
|
||||
@@ -80,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
|
||||
p = vfmaq_f32 (r, d->ln2, n);
|
||||
|
||||
if (__glibc_unlikely (v_any_u16h (cmp)))
|
||||
- return special_case (x, y, r2, p, cmp);
|
||||
+ return special_case (p, u_off, y, r2, cmp, d);
|
||||
return vfmaq_f32 (p, y, r2);
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F1 (log))
|
240
glibc-upstream-2.39-153.patch
Normal file
240
glibc-upstream-2.39-153.patch
Normal file
@ -0,0 +1,240 @@
|
||||
commit 5e354bf4e20ca3ccc15bda63c7b56ea0e97efa81
|
||||
Author: Joe Ramsay <Joe.Ramsay@arm.com>
|
||||
Date: Mon Sep 23 15:33:31 2024 +0100
|
||||
|
||||
AArch64: Simplify rounding-multiply pattern in several AdvSIMD routines
|
||||
|
||||
This operation can be simplified to use simpler multiply-round-convert
|
||||
sequence, which uses fewer instructions and constants.
|
||||
|
||||
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
|
||||
(cherry picked from commit 16a59571e4e9fd019d3fc23a2e7d73c1df8bb5cb)
|
||||
|
||||
diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
|
||||
index 3924c9ce44c30d4d..11a89b1530825b5f 100644
|
||||
--- a/sysdeps/aarch64/fpu/cos_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/cos_advsimd.c
|
||||
@@ -22,7 +22,7 @@
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[7];
|
||||
- float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
|
||||
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
|
||||
} data = {
|
||||
/* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
|
||||
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
|
||||
@@ -30,11 +30,9 @@ static const struct data
|
||||
V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
|
||||
V2 (-0x1.9e9540300a1p-41) },
|
||||
.inv_pi = V2 (0x1.45f306dc9c883p-2),
|
||||
- .half_pi = V2 (0x1.921fb54442d18p+0),
|
||||
.pi_1 = V2 (0x1.921fb54442d18p+1),
|
||||
.pi_2 = V2 (0x1.1a62633145c06p-53),
|
||||
.pi_3 = V2 (0x1.c1cd129024e09p-106),
|
||||
- .shift = V2 (0x1.8p52),
|
||||
.range_val = V2 (0x1p23)
|
||||
};
|
||||
|
||||
@@ -68,10 +66,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
|
||||
#endif
|
||||
|
||||
/* n = rint((|x|+pi/2)/pi) - 0.5. */
|
||||
- n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
|
||||
- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
|
||||
- n = vsubq_f64 (n, d->shift);
|
||||
- n = vsubq_f64 (n, v_f64 (0.5));
|
||||
+ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
|
||||
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
|
||||
+ n = vsubq_f64 (n, v_f64 (0.5f));
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
r = vfmsq_f64 (r, d->pi_1, n);
|
||||
diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
|
||||
index d0c285b03a8bfe22..85a1b373733123fa 100644
|
||||
--- a/sysdeps/aarch64/fpu/cosf_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
|
||||
@@ -22,7 +22,7 @@
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[4];
|
||||
- float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
|
||||
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
|
||||
} data = {
|
||||
/* 1.886 ulp error. */
|
||||
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
|
||||
@@ -33,8 +33,6 @@ static const struct data
|
||||
.pi_3 = V4 (-0x1.ee59dap-49f),
|
||||
|
||||
.inv_pi = V4 (0x1.45f306p-2f),
|
||||
- .shift = V4 (0x1.8p+23f),
|
||||
- .half_pi = V4 (0x1.921fb6p0f),
|
||||
.range_val = V4 (0x1p20f)
|
||||
};
|
||||
|
||||
@@ -69,9 +67,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
|
||||
#endif
|
||||
|
||||
/* n = rint((|x|+pi/2)/pi) - 0.5. */
|
||||
- n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
|
||||
- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
|
||||
- n = vsubq_f32 (n, d->shift);
|
||||
+ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
|
||||
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
|
||||
n = vsubq_f32 (n, v_f32 (0.5f));
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
|
||||
index 99d2e647aab70260..5c9cb726205ece6e 100644
|
||||
--- a/sysdeps/aarch64/fpu/expf_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/expf_advsimd.c
|
||||
@@ -22,7 +22,7 @@
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[5];
|
||||
- float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
|
||||
+ float32x4_t inv_ln2, ln2_hi, ln2_lo;
|
||||
uint32x4_t exponent_bias;
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
float32x4_t special_bound, scale_thresh;
|
||||
@@ -31,7 +31,6 @@ static const struct data
|
||||
/* maxerr: 1.45358 +0.5 ulp. */
|
||||
.poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
|
||||
V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
|
||||
- .shift = V4 (0x1.8p23f),
|
||||
.inv_ln2 = V4 (0x1.715476p+0f),
|
||||
.ln2_hi = V4 (0x1.62e4p-1f),
|
||||
.ln2_lo = V4 (0x1.7f7d1cp-20f),
|
||||
@@ -85,7 +84,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
- float32x4_t n, r, r2, scale, p, q, poly, z;
|
||||
+ float32x4_t n, r, r2, scale, p, q, poly;
|
||||
uint32x4_t cmp, e;
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
@@ -104,11 +103,10 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
|
||||
|
||||
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
- z = vfmaq_f32 (d->shift, x, d->inv_ln2);
|
||||
- n = vsubq_f32 (z, d->shift);
|
||||
+ n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
|
||||
r = vfmsq_f32 (x, n, d->ln2_hi);
|
||||
r = vfmsq_f32 (r, n, d->ln2_lo);
|
||||
- e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
|
||||
+ e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
|
||||
scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c
|
||||
index a0d9d3b81965db76..718125cbad81db41 100644
|
||||
--- a/sysdeps/aarch64/fpu/sin_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/sin_advsimd.c
|
||||
@@ -22,7 +22,7 @@
|
||||
static const struct data
|
||||
{
|
||||
float64x2_t poly[7];
|
||||
- float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
|
||||
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
|
||||
} data = {
|
||||
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
|
||||
V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
|
||||
@@ -34,12 +34,13 @@ static const struct data
|
||||
.pi_1 = V2 (0x1.921fb54442d18p+1),
|
||||
.pi_2 = V2 (0x1.1a62633145c06p-53),
|
||||
.pi_3 = V2 (0x1.c1cd129024e09p-106),
|
||||
- .shift = V2 (0x1.8p52),
|
||||
};
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
-# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
|
||||
-# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
|
||||
+/* asuint64(0x1p-253), below which multiply by inv_pi underflows. */
|
||||
+# define TinyBound v_u64 (0x3020000000000000)
|
||||
+/* RangeVal - TinyBound. */
|
||||
+# define Thresh v_u64 (0x1160000000000000)
|
||||
#endif
|
||||
|
||||
#define C(i) d->poly[i]
|
||||
@@ -72,16 +73,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
|
||||
fenv). These lanes will be fixed by special-case handler later. */
|
||||
uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
|
||||
cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
|
||||
- r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
|
||||
+ r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
|
||||
#else
|
||||
r = x;
|
||||
cmp = vcageq_f64 (x, d->range_val);
|
||||
#endif
|
||||
|
||||
/* n = rint(|x|/pi). */
|
||||
- n = vfmaq_f64 (d->shift, d->inv_pi, r);
|
||||
- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
|
||||
- n = vsubq_f64 (n, d->shift);
|
||||
+ n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
|
||||
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
|
||||
|
||||
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
r = vfmsq_f64 (r, d->pi_1, n);
|
||||
diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
|
||||
index 375dfc3331fa6a9c..6ee9a23d5b7fd13f 100644
|
||||
--- a/sysdeps/aarch64/fpu/sinf_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
|
||||
@@ -22,7 +22,7 @@
|
||||
static const struct data
|
||||
{
|
||||
float32x4_t poly[4];
|
||||
- float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
|
||||
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
|
||||
} data = {
|
||||
/* 1.886 ulp error. */
|
||||
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
|
||||
@@ -33,13 +33,14 @@ static const struct data
|
||||
.pi_3 = V4 (-0x1.ee59dap-49f),
|
||||
|
||||
.inv_pi = V4 (0x1.45f306p-2f),
|
||||
- .shift = V4 (0x1.8p+23f),
|
||||
.range_val = V4 (0x1p20f)
|
||||
};
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
-# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
|
||||
-# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
|
||||
+/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */
|
||||
+# define TinyBound v_u32 (0x22000000)
|
||||
+/* RangeVal - TinyBound. */
|
||||
+# define Thresh v_u32 (0x27800000)
|
||||
#endif
|
||||
|
||||
#define C(i) d->poly[i]
|
||||
@@ -64,23 +65,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
|
||||
/* If fenv exceptions are to be triggered correctly, set any special lanes
|
||||
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
|
||||
special-case handler later. */
|
||||
- r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
|
||||
+ r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
|
||||
#else
|
||||
r = x;
|
||||
cmp = vcageq_f32 (x, d->range_val);
|
||||
#endif
|
||||
|
||||
- /* n = rint(|x|/pi) */
|
||||
- n = vfmaq_f32 (d->shift, d->inv_pi, r);
|
||||
- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
|
||||
- n = vsubq_f32 (n, d->shift);
|
||||
+ /* n = rint(|x|/pi). */
|
||||
+ n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
|
||||
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
|
||||
|
||||
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
|
||||
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
|
||||
r = vfmsq_f32 (r, d->pi_1, n);
|
||||
r = vfmsq_f32 (r, d->pi_2, n);
|
||||
r = vfmsq_f32 (r, d->pi_3, n);
|
||||
|
||||
- /* y = sin(r) */
|
||||
+ /* y = sin(r). */
|
||||
r2 = vmulq_f32 (r, r);
|
||||
y = vfmaq_f32 (C (2), C (3), r2);
|
||||
y = vfmaq_f32 (C (1), y, r2);
|
404
glibc-upstream-2.39-154.patch
Normal file
404
glibc-upstream-2.39-154.patch
Normal file
@ -0,0 +1,404 @@
|
||||
commit 72156cb90bb845eddf3acd59dd1599cec365942e
|
||||
Author: Pierre Blanchard <pierre.blanchard@arm.com>
|
||||
Date: Mon Dec 9 15:54:34 2024 +0000
|
||||
|
||||
AArch64: Improve codegen in AdvSIMD logs
|
||||
|
||||
Remove spurious ADRP and a few MOVs.
|
||||
Reduce memory access by using more indexed MLAs in polynomial.
|
||||
Align notation so that algorithms are easier to compare.
|
||||
Speedup on Neoverse V1 for log10 (8%), log (8.5%), and log2 (10%).
|
||||
Update error threshold in AdvSIMD log (now matches SVE log).
|
||||
|
||||
(cherry picked from commit 8eb5ad2ebc94cc5bedbac57c226c02ec254479c7)
|
||||
|
||||
diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c
|
||||
index c065aaebae8600fb..f69ed21c3938d9a9 100644
|
||||
--- a/sysdeps/aarch64/fpu/log10_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log10_advsimd.c
|
||||
@@ -18,36 +18,36 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
-#include "poly_advsimd_f64.h"
|
||||
-
|
||||
-#define N (1 << V_LOG10_TABLE_BITS)
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- uint64x2_t min_norm;
|
||||
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
|
||||
uint32x4_t special_bound;
|
||||
- float64x2_t poly[5];
|
||||
- float64x2_t invln10, log10_2, ln2;
|
||||
- uint64x2_t sign_exp_mask;
|
||||
+ double invln10, log10_2;
|
||||
+ double c1, c3;
|
||||
+ float64x2_t c0, c2, c4;
|
||||
} data = {
|
||||
/* Computed from log coefficients divided by log(10) then rounded to double
|
||||
precision. */
|
||||
- .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3),
|
||||
- V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4),
|
||||
- V2 (-0x1.287461742fee4p-4) },
|
||||
- .ln2 = V2 (0x1.62e42fefa39efp-1),
|
||||
- .invln10 = V2 (0x1.bcb7b1526e50ep-2),
|
||||
- .log10_2 = V2 (0x1.34413509f79ffp-2),
|
||||
- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */
|
||||
- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
|
||||
+ .c0 = V2 (-0x1.bcb7b1526e506p-3),
|
||||
+ .c1 = 0x1.287a7636be1d1p-3,
|
||||
+ .c2 = V2 (-0x1.bcb7b158af938p-4),
|
||||
+ .c3 = 0x1.63c78734e6d07p-4,
|
||||
+ .c4 = V2 (-0x1.287461742fee4p-4),
|
||||
+ .invln10 = 0x1.bcb7b1526e50ep-2,
|
||||
+ .log10_2 = 0x1.34413509f79ffp-2,
|
||||
+ .off = V2 (0x3fe6900900000000),
|
||||
.sign_exp_mask = V2 (0xfff0000000000000),
|
||||
+ /* Lower bound is 0x0010000000000000. For
|
||||
+ optimised register use subnormals are detected after offset has been
|
||||
+ subtracted, so lower bound - offset (which wraps around). */
|
||||
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
|
||||
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */
|
||||
};
|
||||
|
||||
-#define Off v_u64 (0x3fe6900900000000)
|
||||
+#define N (1 << V_LOG10_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
-#define T(s, i) __v_log10_data.s[i]
|
||||
-
|
||||
struct entry
|
||||
{
|
||||
float64x2_t invc;
|
||||
@@ -70,10 +70,11 @@ lookup (uint64x2_t i)
|
||||
}
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
|
||||
- uint32x2_t special)
|
||||
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
|
||||
+ uint32x2_t special, const struct data *d)
|
||||
{
|
||||
- return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special));
|
||||
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
|
||||
+ return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
|
||||
}
|
||||
|
||||
/* Fast implementation of double-precision vector log10
|
||||
@@ -85,19 +86,24 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
|
||||
- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
|
||||
- vget_low_u32 (d->special_bound));
|
||||
+
|
||||
+ /* To avoid having to mov x out of the way, keep u after offset has been
|
||||
+ applied, and recover x by adding the offset back in the special-case
|
||||
+ handler. */
|
||||
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
|
||||
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
|
||||
|
||||
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
- uint64x2_t tmp = vsubq_u64 (ix, Off);
|
||||
- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
|
||||
- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
|
||||
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
|
||||
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
|
||||
float64x2_t z = vreinterpretq_f64_u64 (iz);
|
||||
|
||||
- struct entry e = lookup (tmp);
|
||||
+ struct entry e = lookup (u_off);
|
||||
+
|
||||
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
|
||||
+ vget_low_u32 (d->special_bound));
|
||||
|
||||
/* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
|
||||
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
|
||||
@@ -105,17 +111,22 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
|
||||
|
||||
/* hi = r / log(10) + log10(c) + k*log10(2).
|
||||
Constants in v_log10_data.c are computed (in extended precision) as
|
||||
- e.log10c := e.logc * ivln10. */
|
||||
- float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10);
|
||||
+ e.log10c := e.logc * invln10. */
|
||||
+ float64x2_t cte = vld1q_f64 (&d->invln10);
|
||||
+ float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0);
|
||||
|
||||
/* y = log10(1+r) + n * log10(2). */
|
||||
- float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2);
|
||||
+ hi = vfmaq_laneq_f64 (hi, kd, cte, 1);
|
||||
|
||||
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly);
|
||||
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
|
||||
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
|
||||
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
|
||||
+ y = vfmaq_f64 (y, d->c4, r2);
|
||||
+ y = vfmaq_f64 (p, y, r2);
|
||||
|
||||
if (__glibc_unlikely (v_any_u32h (special)))
|
||||
- return special_case (x, y, hi, r2, special);
|
||||
- return vfmaq_f64 (hi, r2, y);
|
||||
+ return special_case (hi, u_off, y, r2, special, d);
|
||||
+ return vfmaq_f64 (hi, y, r2);
|
||||
}
|
||||
diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c
|
||||
index 4057c552d8dfc0bb..1eea1f86ebdeab34 100644
|
||||
--- a/sysdeps/aarch64/fpu/log2_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log2_advsimd.c
|
||||
@@ -18,31 +18,33 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
-#include "poly_advsimd_f64.h"
|
||||
-
|
||||
-#define N (1 << V_LOG2_TABLE_BITS)
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- uint64x2_t min_norm;
|
||||
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
|
||||
uint32x4_t special_bound;
|
||||
- float64x2_t poly[5];
|
||||
- float64x2_t invln2;
|
||||
- uint64x2_t sign_exp_mask;
|
||||
+ float64x2_t c0, c2;
|
||||
+ double c1, c3, invln2, c4;
|
||||
} data = {
|
||||
/* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9
|
||||
and N = 128, then scaled by log2(e) in extended precision and rounded back
|
||||
to double precision. */
|
||||
- .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2),
|
||||
- V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2),
|
||||
- V2 (-0x1.ec738d616fe26p-3) },
|
||||
- .invln2 = V2 (0x1.71547652b82fep0),
|
||||
- .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */
|
||||
- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
|
||||
+ .c0 = V2 (-0x1.71547652b8300p-1),
|
||||
+ .c1 = 0x1.ec709dc340953p-2,
|
||||
+ .c2 = V2 (-0x1.71547651c8f35p-2),
|
||||
+ .c3 = 0x1.2777ebe12dda5p-2,
|
||||
+ .c4 = -0x1.ec738d616fe26p-3,
|
||||
+ .invln2 = 0x1.71547652b82fep0,
|
||||
+ .off = V2 (0x3fe6900900000000),
|
||||
.sign_exp_mask = V2 (0xfff0000000000000),
|
||||
+ /* Lower bound is 0x0010000000000000. For
|
||||
+ optimised register use subnormals are detected after offset has been
|
||||
+ subtracted, so lower bound - offset (which wraps around). */
|
||||
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
|
||||
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
|
||||
};
|
||||
|
||||
-#define Off v_u64 (0x3fe6900900000000)
|
||||
+#define N (1 << V_LOG2_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
struct entry
|
||||
@@ -67,10 +69,11 @@ lookup (uint64x2_t i)
|
||||
}
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
-special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2,
|
||||
- uint32x2_t special)
|
||||
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
|
||||
+ uint32x2_t special, const struct data *d)
|
||||
{
|
||||
- return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special));
|
||||
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
|
||||
+ return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
|
||||
}
|
||||
|
||||
/* Double-precision vector log2 routine. Implements the same algorithm as
|
||||
@@ -81,31 +84,41 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2,
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
|
||||
- uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
|
||||
- vget_low_u32 (d->special_bound));
|
||||
+
|
||||
+ /* To avoid having to mov x out of the way, keep u after offset has been
|
||||
+ applied, and recover x by adding the offset back in the special-case
|
||||
+ handler. */
|
||||
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
|
||||
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
|
||||
|
||||
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
- uint64x2_t tmp = vsubq_u64 (ix, Off);
|
||||
- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
|
||||
- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
|
||||
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
|
||||
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
|
||||
float64x2_t z = vreinterpretq_f64_u64 (iz);
|
||||
|
||||
- struct entry e = lookup (tmp);
|
||||
+ struct entry e = lookup (u_off);
|
||||
|
||||
- /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
|
||||
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
|
||||
+ vget_low_u32 (d->special_bound));
|
||||
|
||||
+ /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
|
||||
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
|
||||
float64x2_t kd = vcvtq_f64_s64 (k);
|
||||
- float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2);
|
||||
+
|
||||
+ float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2);
|
||||
+ float64x2_t hi
|
||||
+ = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0);
|
||||
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
- float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly);
|
||||
- w = vaddq_f64 (kd, w);
|
||||
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
|
||||
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
|
||||
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
|
||||
+ y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1);
|
||||
+ y = vfmaq_f64 (p, r2, y);
|
||||
|
||||
if (__glibc_unlikely (v_any_u32h (special)))
|
||||
- return special_case (x, y, w, r2, special);
|
||||
- return vfmaq_f64 (w, r2, y);
|
||||
+ return special_case (hi, u_off, y, r2, special, d);
|
||||
+ return vfmaq_f64 (hi, y, r2);
|
||||
}
|
||||
diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
|
||||
index 015a6da7d7fd693e..b1a27fbc290d918c 100644
|
||||
--- a/sysdeps/aarch64/fpu/log_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log_advsimd.c
|
||||
@@ -21,27 +21,29 @@
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- uint64x2_t min_norm;
|
||||
+ uint64x2_t off, sign_exp_mask, offset_lower_bound;
|
||||
uint32x4_t special_bound;
|
||||
- float64x2_t poly[5];
|
||||
- float64x2_t ln2;
|
||||
- uint64x2_t sign_exp_mask;
|
||||
+ float64x2_t c0, c2;
|
||||
+ double c1, c3, ln2, c4;
|
||||
} data = {
|
||||
- /* Worst-case error: 1.17 + 0.5 ulp.
|
||||
- Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
|
||||
- .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
|
||||
- V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
|
||||
- V2 (-0x1.554e550bd501ep-3) },
|
||||
- .ln2 = V2 (0x1.62e42fefa39efp-1),
|
||||
- .min_norm = V2 (0x0010000000000000),
|
||||
- .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
|
||||
- .sign_exp_mask = V2 (0xfff0000000000000)
|
||||
+ /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
|
||||
+ .c0 = V2 (-0x1.ffffffffffff7p-2),
|
||||
+ .c1 = 0x1.55555555170d4p-2,
|
||||
+ .c2 = V2 (-0x1.0000000399c27p-2),
|
||||
+ .c3 = 0x1.999b2e90e94cap-3,
|
||||
+ .c4 = -0x1.554e550bd501ep-3,
|
||||
+ .ln2 = 0x1.62e42fefa39efp-1,
|
||||
+ .sign_exp_mask = V2 (0xfff0000000000000),
|
||||
+ .off = V2 (0x3fe6900900000000),
|
||||
+ /* Lower bound is 0x0010000000000000. For
|
||||
+ optimised register use subnormals are detected after offset has been
|
||||
+ subtracted, so lower bound - offset (which wraps around). */
|
||||
+ .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
|
||||
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */
|
||||
};
|
||||
|
||||
-#define A(i) d->poly[i]
|
||||
#define N (1 << V_LOG_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
-#define Off v_u64 (0x3fe6900900000000)
|
||||
|
||||
struct entry
|
||||
{
|
||||
@@ -64,48 +66,56 @@ lookup (uint64x2_t i)
|
||||
}
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
-special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
|
||||
- uint32x2_t cmp)
|
||||
+special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
|
||||
+ uint32x2_t special, const struct data *d)
|
||||
{
|
||||
- return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
|
||||
+ float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
|
||||
+ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
|
||||
}
|
||||
|
||||
+/* Double-precision vector log routine.
|
||||
+ The maximum observed error is 2.17 ULP:
|
||||
+ _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
|
||||
+ want 0x1.ffffff1cca045p-2. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
- float64x2_t z, r, r2, p, y, kd, hi;
|
||||
- uint64x2_t ix, iz, tmp;
|
||||
- uint32x2_t cmp;
|
||||
- int64x2_t k;
|
||||
- struct entry e;
|
||||
|
||||
- ix = vreinterpretq_u64_f64 (x);
|
||||
- cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
|
||||
- vget_low_u32 (d->special_bound));
|
||||
+ /* To avoid having to mov x out of the way, keep u after offset has been
|
||||
+ applied, and recover x by adding the offset back in the special-case
|
||||
+ handler. */
|
||||
+ uint64x2_t u = vreinterpretq_u64_f64 (x);
|
||||
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
|
||||
|
||||
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
- tmp = vsubq_u64 (ix, Off);
|
||||
- k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
|
||||
- iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
|
||||
- z = vreinterpretq_f64_u64 (iz);
|
||||
- e = lookup (tmp);
|
||||
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
|
||||
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
|
||||
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
|
||||
+
|
||||
+ struct entry e = lookup (u_off);
|
||||
+
|
||||
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
|
||||
+ vget_low_u32 (d->special_bound));
|
||||
|
||||
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
|
||||
- r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
|
||||
- kd = vcvtq_f64_s64 (k);
|
||||
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
|
||||
+ float64x2_t kd = vcvtq_f64_s64 (k);
|
||||
|
||||
/* hi = r + log(c) + k*Ln2. */
|
||||
- hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
|
||||
+ float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2);
|
||||
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0);
|
||||
+
|
||||
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
|
||||
- r2 = vmulq_f64 (r, r);
|
||||
- y = vfmaq_f64 (A (2), A (3), r);
|
||||
- p = vfmaq_f64 (A (0), A (1), r);
|
||||
- y = vfmaq_f64 (y, A (4), r2);
|
||||
- y = vfmaq_f64 (p, y, r2);
|
||||
-
|
||||
- if (__glibc_unlikely (v_any_u32h (cmp)))
|
||||
- return special_case (x, y, hi, r2, cmp);
|
||||
+ float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
|
||||
+ float64x2_t r2 = vmulq_f64 (r, r);
|
||||
+ float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
|
||||
+ float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
|
||||
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1);
|
||||
+ y = vfmaq_f64 (p, r2, y);
|
||||
+
|
||||
+ if (__glibc_unlikely (v_any_u32h (special)))
|
||||
+ return special_case (hi, u_off, y, r2, special, d);
|
||||
return vfmaq_f64 (hi, y, r2);
|
||||
}
|
224
glibc-upstream-2.39-155.patch
Normal file
224
glibc-upstream-2.39-155.patch
Normal file
@ -0,0 +1,224 @@
|
||||
commit dcd1229e5bbc8c899cb35b22aaf89d8babc5af5a
|
||||
Author: Joana Cruz <Joana.Cruz@arm.com>
|
||||
Date: Tue Dec 17 14:47:31 2024 +0000
|
||||
|
||||
AArch64: Improve codegen of AdvSIMD logf function family
|
||||
|
||||
Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
|
||||
8% improvement in throughput microbenchmark on Neoverse V1 for log2 and log,
|
||||
and 2% for log10.
|
||||
|
||||
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
|
||||
(cherry picked from commit d6e034f5b222a9ed1aeb5de0c0c7d0dda8b63da3)
|
||||
|
||||
diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
|
||||
index 82228b599a5c061b..0d792c3df9a7216e 100644
|
||||
--- a/sysdeps/aarch64/fpu/log10f_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
|
||||
@@ -18,21 +18,25 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
-#include "poly_advsimd_f32.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
+ float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
|
||||
uint32x4_t off, offset_lower_bound;
|
||||
uint16x8_t special_bound;
|
||||
uint32x4_t mantissa_mask;
|
||||
- float32x4_t poly[8];
|
||||
- float32x4_t inv_ln10, ln2;
|
||||
+ float c1, c3, c5, c7;
|
||||
} data = {
|
||||
/* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
|
||||
[-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
|
||||
- .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f),
|
||||
- V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f),
|
||||
- V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
|
||||
+ .c0 = V4 (-0x1.bcb79cp-3f),
|
||||
+ .c1 = 0x1.2879c8p-3f,
|
||||
+ .c2 = V4 (-0x1.bcd472p-4f),
|
||||
+ .c3 = 0x1.6408f8p-4f,
|
||||
+ .c4 = V4 (-0x1.246f8p-4f),
|
||||
+ .c5 = 0x1.f0e514p-5f,
|
||||
+ .c6 = V4 (-0x1.0fc92cp-4f),
|
||||
+ .c7 = 0x1.f5f76ap-5f,
|
||||
.ln2 = V4 (0x1.62e43p-1f),
|
||||
.inv_ln10 = V4 (0x1.bcb7b2p-2f),
|
||||
/* Lower bound is the smallest positive normal float 0x00800000. For
|
||||
@@ -62,7 +66,7 @@ special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
-
|
||||
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
|
||||
/* To avoid having to mov x out of the way, keep u after offset has been
|
||||
applied, and recover x by adding the offset back in the special-case
|
||||
handler. */
|
||||
@@ -81,7 +85,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
|
||||
|
||||
/* y = log10(1+r) + n * log10(2). */
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
- float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly);
|
||||
+
|
||||
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
|
||||
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
|
||||
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
|
||||
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
|
||||
+
|
||||
+ float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
|
||||
+ float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
|
||||
+ float32x4_t poly = vfmaq_f32 (c01, r2, p27);
|
||||
+
|
||||
/* y = Log10(2) * n + poly * InvLn(10). */
|
||||
float32x4_t y = vfmaq_f32 (r, d->ln2, n);
|
||||
y = vmulq_f32 (y, d->inv_ln10);
|
||||
diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
|
||||
index 84effe4fe9492d08..116c36c8e2889f99 100644
|
||||
--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
|
||||
@@ -18,22 +18,27 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
-#include "poly_advsimd_f32.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
+ float32x4_t c0, c2, c4, c6, c8;
|
||||
uint32x4_t off, offset_lower_bound;
|
||||
uint16x8_t special_bound;
|
||||
uint32x4_t mantissa_mask;
|
||||
- float32x4_t poly[9];
|
||||
+ float c1, c3, c5, c7;
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm approximate
|
||||
log2(1+r)/r for r in [ -1/3, 1/3 ].
|
||||
rel error: 0x1.c4c4b0cp-26. */
|
||||
- .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
|
||||
- V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
|
||||
- V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
|
||||
- V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
|
||||
+ .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
|
||||
+ .c1 = -0x1.715458p-1f,
|
||||
+ .c2 = V4 (0x1.ec701cp-2f),
|
||||
+ .c3 = -0x1.7171a4p-2f,
|
||||
+ .c4 = V4 (0x1.27a0b8p-2f),
|
||||
+ .c5 = -0x1.e5143ep-3f,
|
||||
+ .c6 = V4 (0x1.9d8ecap-3f),
|
||||
+ .c7 = -0x1.c675bp-3f,
|
||||
+ .c8 = V4 (0x1.9e495p-3f),
|
||||
/* Lower bound is the smallest positive normal float 0x00800000. For
|
||||
optimised register use subnormals are detected after offset has been
|
||||
subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
|
||||
@@ -79,11 +84,21 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
|
||||
|
||||
/* y = log2(1+r) + n. */
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
- float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
|
||||
+
|
||||
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
|
||||
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
|
||||
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
|
||||
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
|
||||
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
|
||||
+ float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
|
||||
+ float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
|
||||
+ float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
|
||||
+ float32x4_t p = vfmaq_f32 (c01, r2, p28);
|
||||
|
||||
if (__glibc_unlikely (v_any_u16h (special)))
|
||||
return special_case (n, u_off, p, r, special, d);
|
||||
return vfmaq_f32 (n, p, r);
|
||||
}
|
||||
+
|
||||
libmvec_hidden_def (V_NAME_F1 (log2))
|
||||
HALF_WIDTH_ALIAS_F1 (log2)
|
||||
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
|
||||
index c20dbfd6c088c0af..d9e64c732d7d8d28 100644
|
||||
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
|
||||
@@ -21,16 +21,19 @@
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- uint32x4_t off, offset_lower_bound;
|
||||
+ float32x4_t c2, c4, c6, ln2;
|
||||
+ uint32x4_t off, offset_lower_bound, mantissa_mask;
|
||||
uint16x8_t special_bound;
|
||||
- uint32x4_t mantissa_mask;
|
||||
- float32x4_t poly[7];
|
||||
- float32x4_t ln2;
|
||||
+ float c1, c3, c5, c0;
|
||||
} data = {
|
||||
/* 3.34 ulp error. */
|
||||
- .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
|
||||
- V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
|
||||
- V4 (-0x1.ffffc8p-2f) },
|
||||
+ .c0 = -0x1.3e737cp-3f,
|
||||
+ .c1 = 0x1.5a9aa2p-3f,
|
||||
+ .c2 = V4 (-0x1.4f9934p-3f),
|
||||
+ .c3 = 0x1.961348p-3f,
|
||||
+ .c4 = V4 (-0x1.00187cp-2f),
|
||||
+ .c5 = 0x1.555d7cp-2f,
|
||||
+ .c6 = V4 (-0x1.ffffc8p-2f),
|
||||
.ln2 = V4 (0x1.62e43p-1f),
|
||||
/* Lower bound is the smallest positive normal float 0x00800000. For
|
||||
optimised register use subnormals are detected after offset has been
|
||||
@@ -41,8 +44,6 @@ static const struct data
|
||||
.mantissa_mask = V4 (0x007fffff)
|
||||
};
|
||||
|
||||
-#define P(i) d->poly[7 - i]
|
||||
-
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
|
||||
uint16x4_t cmp, const struct data *d)
|
||||
@@ -55,33 +56,30 @@ special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
- float32x4_t n, p, q, r, r2, y;
|
||||
- uint32x4_t u, u_off;
|
||||
- uint16x4_t cmp;
|
||||
+ float32x4_t c1350 = vld1q_f32 (&d->c1);
|
||||
|
||||
/* To avoid having to mov x out of the way, keep u after offset has been
|
||||
applied, and recover x by adding the offset back in the special-case
|
||||
handler. */
|
||||
- u_off = vreinterpretq_u32_f32 (x);
|
||||
+ uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
|
||||
|
||||
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
|
||||
- u_off = vsubq_u32 (u_off, d->off);
|
||||
- n = vcvtq_f32_s32 (
|
||||
+ float32x4_t n = vcvtq_f32_s32 (
|
||||
vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
|
||||
- u = vandq_u32 (u_off, d->mantissa_mask);
|
||||
- u = vaddq_u32 (u, d->off);
|
||||
- r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
|
||||
+ uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
|
||||
+ vget_low_u16 (d->special_bound));
|
||||
|
||||
- cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
|
||||
- vget_low_u16 (d->special_bound));
|
||||
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
|
||||
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
|
||||
|
||||
/* y = log(1+r) + n*ln2. */
|
||||
- r2 = vmulq_f32 (r, r);
|
||||
+ float32x4_t r2 = vmulq_f32 (r, r);
|
||||
/* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
|
||||
- p = vfmaq_f32 (P (5), P (6), r);
|
||||
- q = vfmaq_f32 (P (3), P (4), r);
|
||||
- y = vfmaq_f32 (P (1), P (2), r);
|
||||
- p = vfmaq_f32 (p, P (7), r2);
|
||||
+ float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
|
||||
+ float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
|
||||
+ float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
|
||||
+ p = vfmaq_laneq_f32 (p, r2, c1350, 3);
|
||||
+
|
||||
q = vfmaq_f32 (q, p, r2);
|
||||
y = vfmaq_f32 (y, q, r2);
|
||||
p = vfmaq_f32 (r, d->ln2, n);
|
404
glibc-upstream-2.39-156.patch
Normal file
404
glibc-upstream-2.39-156.patch
Normal file
@ -0,0 +1,404 @@
|
||||
commit a10183b6338baf4b2643b92cce1b0fba0e3ab62f
|
||||
Author: Joana Cruz <Joana.Cruz@arm.com>
|
||||
Date: Tue Dec 17 14:49:30 2024 +0000
|
||||
|
||||
AArch64: Improve codegen of AdvSIMD atan(2)(f)
|
||||
|
||||
Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
|
||||
8% improvement in throughput microbenchmark on Neoverse V1.
|
||||
|
||||
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
|
||||
(cherry picked from commit 6914774b9d3460876d9ad4482782213ec01a752e)
|
||||
|
||||
diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c
|
||||
index 2fd61641340c0315..5df4b99ff4277c6a 100644
|
||||
--- a/sysdeps/aarch64/fpu/atan2_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/atan2_advsimd.c
|
||||
@@ -22,40 +22,57 @@
|
||||
|
||||
static const struct data
|
||||
{
|
||||
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
|
||||
float64x2_t pi_over_2;
|
||||
- float64x2_t poly[20];
|
||||
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
|
||||
+ uint64x2_t zeroinfnan, minustwo;
|
||||
} data = {
|
||||
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
|
||||
- the interval [2**-1022, 1.0]. */
|
||||
- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
|
||||
- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
|
||||
- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
|
||||
- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
|
||||
- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
|
||||
- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
|
||||
- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
|
||||
- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
|
||||
- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
|
||||
- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
|
||||
+ [2**-1022, 1.0]. */
|
||||
+ .c0 = V2 (-0x1.5555555555555p-2),
|
||||
+ .c1 = 0x1.99999999996c1p-3,
|
||||
+ .c2 = V2 (-0x1.2492492478f88p-3),
|
||||
+ .c3 = 0x1.c71c71bc3951cp-4,
|
||||
+ .c4 = V2 (-0x1.745d160a7e368p-4),
|
||||
+ .c5 = 0x1.3b139b6a88ba1p-4,
|
||||
+ .c6 = V2 (-0x1.11100ee084227p-4),
|
||||
+ .c7 = 0x1.e1d0f9696f63bp-5,
|
||||
+ .c8 = V2 (-0x1.aebfe7b418581p-5),
|
||||
+ .c9 = 0x1.842dbe9b0d916p-5,
|
||||
+ .c10 = V2 (-0x1.5d30140ae5e99p-5),
|
||||
+ .c11 = 0x1.338e31eb2fbbcp-5,
|
||||
+ .c12 = V2 (-0x1.00e6eece7de8p-5),
|
||||
+ .c13 = 0x1.860897b29e5efp-6,
|
||||
+ .c14 = V2 (-0x1.0051381722a59p-6),
|
||||
+ .c15 = 0x1.14e9dc19a4a4ep-7,
|
||||
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9),
|
||||
+ .c17 = 0x1.17739e210171ap-10,
|
||||
+ .c18 = V2 (-0x1.ab24da7be7402p-13),
|
||||
+ .c19 = 0x1.358851160a528p-16,
|
||||
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
|
||||
+ .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
|
||||
+ .minustwo = V2 (0xc000000000000000),
|
||||
};
|
||||
|
||||
#define SignMask v_u64 (0x8000000000000000)
|
||||
|
||||
/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
-special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp)
|
||||
+special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
|
||||
+ uint64x2_t sign_xy, uint64x2_t cmp)
|
||||
{
|
||||
+ /* Account for the sign of x and y. */
|
||||
+ ret = vreinterpretq_f64_u64 (
|
||||
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
|
||||
return v_call2_f64 (atan2, y, x, ret, cmp);
|
||||
}
|
||||
|
||||
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
|
||||
static inline uint64x2_t
|
||||
-zeroinfnan (uint64x2_t i)
|
||||
+zeroinfnan (uint64x2_t i, const struct data *d)
|
||||
{
|
||||
/* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
|
||||
- return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)),
|
||||
- v_u64 (2 * asuint64 (INFINITY) - 1));
|
||||
+ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
|
||||
}
|
||||
|
||||
/* Fast implementation of vector atan2.
|
||||
@@ -65,12 +82,13 @@ zeroinfnan (uint64x2_t i)
|
||||
want 0x1.92d628ab678cfp-1. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
|
||||
{
|
||||
- const struct data *data_ptr = ptr_barrier (&data);
|
||||
+ const struct data *d = ptr_barrier (&data);
|
||||
|
||||
uint64x2_t ix = vreinterpretq_u64_f64 (x);
|
||||
uint64x2_t iy = vreinterpretq_u64_f64 (y);
|
||||
|
||||
- uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy));
|
||||
+ uint64x2_t special_cases
|
||||
+ = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));
|
||||
|
||||
uint64x2_t sign_x = vandq_u64 (ix, SignMask);
|
||||
uint64x2_t sign_y = vandq_u64 (iy, SignMask);
|
||||
@@ -80,18 +98,18 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
|
||||
float64x2_t ay = vabsq_f64 (y);
|
||||
|
||||
uint64x2_t pred_xlt0 = vcltzq_f64 (x);
|
||||
- uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax);
|
||||
+ uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
|
||||
|
||||
/* Set up z for call to atan. */
|
||||
float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
|
||||
- float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax);
|
||||
- float64x2_t z = vdivq_f64 (n, d);
|
||||
+ float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
|
||||
+ float64x2_t z = vdivq_f64 (n, q);
|
||||
|
||||
/* Work out the correct shift. */
|
||||
- float64x2_t shift = vreinterpretq_f64_u64 (
|
||||
- vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0))));
|
||||
+ float64x2_t shift
|
||||
+ = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
|
||||
shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
|
||||
- shift = vmulq_f64 (shift, data_ptr->pi_over_2);
|
||||
+ shift = vmulq_f64 (shift, d->pi_over_2);
|
||||
|
||||
/* Calculate the polynomial approximation.
|
||||
Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
|
||||
@@ -102,20 +120,52 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
|
||||
float64x2_t x2 = vmulq_f64 (z2, z2);
|
||||
float64x2_t x4 = vmulq_f64 (x2, x2);
|
||||
float64x2_t x8 = vmulq_f64 (x4, x4);
|
||||
- float64x2_t ret
|
||||
- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly),
|
||||
- v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8);
|
||||
+
|
||||
+ float64x2_t c13 = vld1q_f64 (&d->c1);
|
||||
+ float64x2_t c57 = vld1q_f64 (&d->c5);
|
||||
+ float64x2_t c911 = vld1q_f64 (&d->c9);
|
||||
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
|
||||
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
|
||||
+
|
||||
+ /* estrin_7. */
|
||||
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
|
||||
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
|
||||
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
|
||||
+
|
||||
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
|
||||
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
|
||||
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
|
||||
+
|
||||
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
|
||||
+
|
||||
+ /* estrin_11. */
|
||||
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
|
||||
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
|
||||
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
|
||||
+
|
||||
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
|
||||
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
|
||||
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
|
||||
+
|
||||
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
|
||||
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
|
||||
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
|
||||
+
|
||||
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
|
||||
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
|
||||
+
|
||||
+ float64x2_t ret = vfmaq_f64 (p07, p819, x8);
|
||||
|
||||
/* Finalize. y = shift + z + z^3 * P(z^2). */
|
||||
ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
|
||||
ret = vaddq_f64 (ret, shift);
|
||||
|
||||
+ if (__glibc_unlikely (v_any_u64 (special_cases)))
|
||||
+ return special_case (y, x, ret, sign_xy, special_cases);
|
||||
+
|
||||
/* Account for the sign of x and y. */
|
||||
ret = vreinterpretq_f64_u64 (
|
||||
veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
|
||||
|
||||
- if (__glibc_unlikely (v_any_u64 (special_cases)))
|
||||
- return special_case (y, x, ret, special_cases);
|
||||
-
|
||||
return ret;
|
||||
}
|
||||
diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c
|
||||
index 56e610caf18f6d77..88daacd76cdd3998 100644
|
||||
--- a/sysdeps/aarch64/fpu/atan2f_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c
|
||||
@@ -22,34 +22,39 @@
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- float32x4_t poly[8];
|
||||
- float32x4_t pi_over_2;
|
||||
+ float32x4_t c0, pi_over_2, c4, c6, c2;
|
||||
+ float c1, c3, c5, c7;
|
||||
+ uint32x4_t comp_const;
|
||||
} data = {
|
||||
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
|
||||
[2**-128, 1.0].
|
||||
Generated using fpminimax between FLT_MIN and 1. */
|
||||
- .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
|
||||
- V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
|
||||
- V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
|
||||
- .pi_over_2 = V4 (0x1.921fb6p+0f),
|
||||
+ .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f,
|
||||
+ .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f,
|
||||
+ .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f,
|
||||
+ .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f,
|
||||
+ .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
|
||||
};
|
||||
|
||||
#define SignMask v_u32 (0x80000000)
|
||||
|
||||
/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
-special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
|
||||
+special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
|
||||
+ uint32x4_t sign_xy, uint32x4_t cmp)
|
||||
{
|
||||
+ /* Account for the sign of x and y. */
|
||||
+ ret = vreinterpretq_f32_u32 (
|
||||
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
|
||||
return v_call2_f32 (atan2f, y, x, ret, cmp);
|
||||
}
|
||||
|
||||
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
|
||||
static inline uint32x4_t
|
||||
-zeroinfnan (uint32x4_t i)
|
||||
+zeroinfnan (uint32x4_t i, const struct data *d)
|
||||
{
|
||||
/* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
|
||||
- return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
|
||||
- v_u32 (2 * 0x7f800000lu - 1));
|
||||
+ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
|
||||
}
|
||||
|
||||
/* Fast implementation of vector atan2f. Maximum observed error is
|
||||
@@ -58,12 +63,13 @@ zeroinfnan (uint32x4_t i)
|
||||
want 0x1.967f00p-1. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
|
||||
{
|
||||
- const struct data *data_ptr = ptr_barrier (&data);
|
||||
+ const struct data *d = ptr_barrier (&data);
|
||||
|
||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||
uint32x4_t iy = vreinterpretq_u32_f32 (y);
|
||||
|
||||
- uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
|
||||
+ uint32x4_t special_cases
|
||||
+ = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));
|
||||
|
||||
uint32x4_t sign_x = vandq_u32 (ix, SignMask);
|
||||
uint32x4_t sign_y = vandq_u32 (iy, SignMask);
|
||||
@@ -77,14 +83,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
|
||||
|
||||
/* Set up z for call to atanf. */
|
||||
float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
|
||||
- float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
|
||||
- float32x4_t z = vdivq_f32 (n, d);
|
||||
+ float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
|
||||
+ float32x4_t z = vdivq_f32 (n, q);
|
||||
|
||||
/* Work out the correct shift. */
|
||||
float32x4_t shift = vreinterpretq_f32_u32 (
|
||||
vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
|
||||
shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
|
||||
- shift = vmulq_f32 (shift, data_ptr->pi_over_2);
|
||||
+ shift = vmulq_f32 (shift, d->pi_over_2);
|
||||
|
||||
/* Calculate the polynomial approximation.
|
||||
Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
|
||||
@@ -96,23 +102,27 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
|
||||
float32x4_t z2 = vmulq_f32 (z, z);
|
||||
float32x4_t z4 = vmulq_f32 (z2, z2);
|
||||
|
||||
- float32x4_t ret = vfmaq_f32 (
|
||||
- v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
|
||||
- vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
|
||||
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
|
||||
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
|
||||
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
|
||||
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
|
||||
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
|
||||
+ float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
|
||||
+ float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
|
||||
+
|
||||
+ float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));
|
||||
|
||||
/* y = shift + z * P(z^2). */
|
||||
ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
|
||||
|
||||
- /* Account for the sign of y. */
|
||||
- ret = vreinterpretq_f32_u32 (
|
||||
- veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
|
||||
-
|
||||
if (__glibc_unlikely (v_any_u32 (special_cases)))
|
||||
{
|
||||
- return special_case (y, x, ret, special_cases);
|
||||
+ return special_case (y, x, ret, sign_xy, special_cases);
|
||||
}
|
||||
|
||||
- return ret;
|
||||
+ /* Account for the sign of x and y. */
|
||||
+ return vreinterpretq_f32_u32 (
|
||||
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F2 (atan2))
|
||||
HALF_WIDTH_ALIAS_F2(atan2)
|
||||
diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c
|
||||
index a962be0f78e4a9c7..14f1809796f05246 100644
|
||||
--- a/sysdeps/aarch64/fpu/atan_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/atan_advsimd.c
|
||||
@@ -22,21 +22,22 @@
|
||||
|
||||
static const struct data
|
||||
{
|
||||
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
|
||||
float64x2_t pi_over_2;
|
||||
- float64x2_t poly[20];
|
||||
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
|
||||
} data = {
|
||||
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
|
||||
[2**-1022, 1.0]. */
|
||||
- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
|
||||
- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
|
||||
- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
|
||||
- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
|
||||
- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
|
||||
- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
|
||||
- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
|
||||
- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
|
||||
- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
|
||||
- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
|
||||
+ .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3,
|
||||
+ .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4,
|
||||
+ .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4,
|
||||
+ .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5,
|
||||
+ .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5,
|
||||
+ .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5,
|
||||
+ .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6,
|
||||
+ .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7,
|
||||
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10,
|
||||
+ .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16,
|
||||
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
|
||||
};
|
||||
|
||||
@@ -52,6 +53,11 @@ static const struct data
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
+ float64x2_t c13 = vld1q_f64 (&d->c1);
|
||||
+ float64x2_t c57 = vld1q_f64 (&d->c5);
|
||||
+ float64x2_t c911 = vld1q_f64 (&d->c9);
|
||||
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
|
||||
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
|
||||
|
||||
/* Small cases, infs and nans are supported by our approximation technique,
|
||||
but do not set fenv flags correctly. Only trigger special case if we need
|
||||
@@ -90,9 +96,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
|
||||
float64x2_t x2 = vmulq_f64 (z2, z2);
|
||||
float64x2_t x4 = vmulq_f64 (x2, x2);
|
||||
float64x2_t x8 = vmulq_f64 (x4, x4);
|
||||
- float64x2_t y
|
||||
- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
|
||||
- v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
|
||||
+
|
||||
+ /* estrin_7. */
|
||||
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
|
||||
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
|
||||
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
|
||||
+
|
||||
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
|
||||
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
|
||||
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
|
||||
+
|
||||
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
|
||||
+
|
||||
+ /* estrin_11. */
|
||||
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
|
||||
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
|
||||
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
|
||||
+
|
||||
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
|
||||
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
|
||||
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
|
||||
+
|
||||
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
|
||||
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
|
||||
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
|
||||
+
|
||||
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
|
||||
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
|
||||
+
|
||||
+ float64x2_t y = vfmaq_f64 (p07, p819, x8);
|
||||
|
||||
/* Finalize. y = shift + z + z^3 * P(z^2). */
|
||||
y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
|
211
glibc-upstream-2.39-157.patch
Normal file
211
glibc-upstream-2.39-157.patch
Normal file
@ -0,0 +1,211 @@
|
||||
commit 78abd3ef6e607853def82a97bf34a3c632db04e2
|
||||
Author: Luna Lamb <luna.lamb@arm.com>
|
||||
Date: Fri Jan 3 19:02:52 2025 +0000
|
||||
|
||||
AArch64: Improve codegen in SVE tans
|
||||
|
||||
Improves memory access.
|
||||
Tan: MOVPRFX 7 -> 2, LD1RD 12 -> 5, move MOV away from return.
|
||||
Tanf: MOV 2 -> 1, MOVPRFX 6 -> 3, LD1RW 5 -> 4, move mov away from return.
|
||||
|
||||
(cherry picked from commit aa6609feb20ebf8653db639dabe2a6afc77b02cc)
|
||||
|
||||
diff --git a/sysdeps/aarch64/fpu/tan_sve.c b/sysdeps/aarch64/fpu/tan_sve.c
|
||||
index b2e44473166845d0..a7318fd417dc7064 100644
|
||||
--- a/sysdeps/aarch64/fpu/tan_sve.c
|
||||
+++ b/sysdeps/aarch64/fpu/tan_sve.c
|
||||
@@ -22,24 +22,38 @@
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- double poly[9];
|
||||
- double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift;
|
||||
+ double c2, c4, c6, c8;
|
||||
+ double poly_1357[4];
|
||||
+ double c0, inv_half_pi;
|
||||
+ double half_pi_hi, half_pi_lo, range_val;
|
||||
} data = {
|
||||
/* Polynomial generated with FPMinimax. */
|
||||
- .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5,
|
||||
- 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9,
|
||||
- 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11,
|
||||
- 0x1.4e4fd14147622p-12, },
|
||||
+ .c2 = 0x1.ba1ba1bb46414p-5,
|
||||
+ .c4 = 0x1.226e5e5ecdfa3p-7,
|
||||
+ .c6 = 0x1.7ea75d05b583ep-10,
|
||||
+ .c8 = 0x1.4e4fd14147622p-12,
|
||||
+ .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6,
|
||||
+ 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 },
|
||||
+ .c0 = 0x1.5555555555556p-2,
|
||||
+ .inv_half_pi = 0x1.45f306dc9c883p-1,
|
||||
.half_pi_hi = 0x1.921fb54442d18p0,
|
||||
.half_pi_lo = 0x1.1a62633145c07p-54,
|
||||
- .inv_half_pi = 0x1.45f306dc9c883p-1,
|
||||
.range_val = 0x1p23,
|
||||
- .shift = 0x1.8p52,
|
||||
};
|
||||
|
||||
static svfloat64_t NOINLINE
|
||||
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
|
||||
+special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg,
|
||||
+ svbool_t special)
|
||||
{
|
||||
+ svbool_t use_recip = svcmpeq (
|
||||
+ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0);
|
||||
+
|
||||
+ svfloat64_t n = svmad_x (pg, p, p, -1);
|
||||
+ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2);
|
||||
+ svfloat64_t swap = n;
|
||||
+ n = svneg_m (n, use_recip, d);
|
||||
+ d = svsel (use_recip, swap, d);
|
||||
+ svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d);
|
||||
return sv_call_f64 (tan, x, y, special);
|
||||
}
|
||||
|
||||
@@ -50,15 +64,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
|
||||
svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
|
||||
{
|
||||
const struct data *dat = ptr_barrier (&data);
|
||||
-
|
||||
- /* Invert condition to catch NaNs and Infs as well as large values. */
|
||||
- svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
|
||||
-
|
||||
+ svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0);
|
||||
/* q = nearest integer to 2 * x / pi. */
|
||||
- svfloat64_t shift = sv_f64 (dat->shift);
|
||||
- svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi);
|
||||
- q = svsub_x (pg, q, shift);
|
||||
- svint64_t qi = svcvt_s64_x (pg, q);
|
||||
+ svfloat64_t q = svmul_lane (x, half_pi_c0, 1);
|
||||
+ q = svrinta_x (pg, q);
|
||||
|
||||
/* Use q to reduce x to r in [-pi/4, pi/4], by:
|
||||
r = x - q * pi/2, in extended precision. */
|
||||
@@ -68,7 +77,7 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
|
||||
r = svmls_lane (r, q, half_pi, 1);
|
||||
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
|
||||
formula. */
|
||||
- r = svmul_x (pg, r, 0.5);
|
||||
+ r = svmul_x (svptrue_b64 (), r, 0.5);
|
||||
|
||||
/* Approximate tan(r) using order 8 polynomial.
|
||||
tan(x) is odd, so polynomial has the form:
|
||||
@@ -76,29 +85,51 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
|
||||
Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
|
||||
Then compute the approximation by:
|
||||
tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */
|
||||
- svfloat64_t r2 = svmul_x (pg, r, r);
|
||||
- svfloat64_t r4 = svmul_x (pg, r2, r2);
|
||||
- svfloat64_t r8 = svmul_x (pg, r4, r4);
|
||||
+
|
||||
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
+ svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2);
|
||||
+ svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4);
|
||||
/* Use offset version coeff array by 1 to evaluate from C1 onwards. */
|
||||
- svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1);
|
||||
- p = svmad_x (pg, p, r2, dat->poly[0]);
|
||||
- p = svmla_x (pg, r, r2, svmul_x (pg, p, r));
|
||||
+ svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2);
|
||||
+ svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6);
|
||||
+
|
||||
+ /* Use offset version coeff array by 1 to evaluate from C1 onwards. */
|
||||
+ svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0);
|
||||
+ svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1);
|
||||
+ svfloat64_t p03 = svmla_x (pg, p01, p23, r4);
|
||||
+
|
||||
+ svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0);
|
||||
+ svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1);
|
||||
+ svfloat64_t p47 = svmla_x (pg, p45, p67, r4);
|
||||
+
|
||||
+ svfloat64_t p = svmla_x (pg, p03, p47, r8);
|
||||
+
|
||||
+ svfloat64_t z = svmul_x (svptrue_b64 (), p, r);
|
||||
+ z = svmul_x (svptrue_b64 (), r2, z);
|
||||
+ z = svmla_lane (z, r, half_pi_c0, 0);
|
||||
+ p = svmla_x (pg, r, r2, z);
|
||||
|
||||
/* Recombination uses double-angle formula:
|
||||
tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
|
||||
and reciprocity around pi/2:
|
||||
tan(x) = 1 / (tan(pi/2 - x))
|
||||
to assemble result using change-of-sign and conditional selection of
|
||||
- numerator/denominator dependent on odd/even-ness of q (hence quadrant). */
|
||||
- svbool_t use_recip
|
||||
- = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0);
|
||||
+ numerator/denominator dependent on odd/even-ness of q (quadrant). */
|
||||
+
|
||||
+ /* Invert condition to catch NaNs and Infs as well as large values. */
|
||||
+ svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
|
||||
+
|
||||
+ if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
+ {
|
||||
+ return special_case (x, p, q, pg, special);
|
||||
+ }
|
||||
+ svbool_t use_recip = svcmpeq (
|
||||
+ pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0);
|
||||
|
||||
svfloat64_t n = svmad_x (pg, p, p, -1);
|
||||
- svfloat64_t d = svmul_x (pg, p, 2);
|
||||
+ svfloat64_t d = svmul_x (svptrue_b64 (), p, 2);
|
||||
svfloat64_t swap = n;
|
||||
n = svneg_m (n, use_recip, d);
|
||||
d = svsel (use_recip, swap, d);
|
||||
- if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
- return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special);
|
||||
return svdiv_x (pg, n, d);
|
||||
}
|
||||
diff --git a/sysdeps/aarch64/fpu/tanf_sve.c b/sysdeps/aarch64/fpu/tanf_sve.c
|
||||
index f34258324114a360..e850fb4882e88380 100644
|
||||
--- a/sysdeps/aarch64/fpu/tanf_sve.c
|
||||
+++ b/sysdeps/aarch64/fpu/tanf_sve.c
|
||||
@@ -60,21 +60,16 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
- /* Determine whether input is too large to perform fast regression. */
|
||||
- svbool_t cmp = svacge (pg, x, d->range_val);
|
||||
-
|
||||
svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1);
|
||||
svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1);
|
||||
|
||||
/* n = rint(x/(pi/2)). */
|
||||
- svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3);
|
||||
- svfloat32_t n = svsub_x (pg, q, d->shift);
|
||||
+ svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3));
|
||||
/* n is already a signed integer, simply convert it. */
|
||||
svint32_t in = svcvt_s32_x (pg, n);
|
||||
/* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
|
||||
svint32_t alt = svand_x (pg, in, 1);
|
||||
svbool_t pred_alt = svcmpne (pg, alt, 0);
|
||||
-
|
||||
/* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */
|
||||
svfloat32_t r;
|
||||
r = svmls_lane (x, n, pi_vals, 0);
|
||||
@@ -93,7 +88,7 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
|
||||
|
||||
/* Evaluate polynomial approximation of tangent on [-pi/4, pi/4],
|
||||
using Estrin on z^2. */
|
||||
- svfloat32_t z2 = svmul_x (pg, z, z);
|
||||
+ svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r);
|
||||
svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0);
|
||||
svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1);
|
||||
svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2);
|
||||
@@ -106,13 +101,14 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
|
||||
|
||||
svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2));
|
||||
|
||||
- /* Transform result back, if necessary. */
|
||||
- svfloat32_t inv_y = svdivr_x (pg, y, 1.0f);
|
||||
-
|
||||
/* No need to pass pg to specialcase here since cmp is a strict subset,
|
||||
guaranteed by the cmpge above. */
|
||||
+
|
||||
+ /* Determine whether input is too large to perform fast regression. */
|
||||
+ svbool_t cmp = svacge (pg, x, d->range_val);
|
||||
if (__glibc_unlikely (svptest_any (pg, cmp)))
|
||||
- return special_case (x, svsel (pred_alt, inv_y, y), cmp);
|
||||
+ return special_case (x, svdivr_x (pg, y, 1.0f), cmp);
|
||||
|
||||
+ svfloat32_t inv_y = svdivr_x (pg, y, 1.0f);
|
||||
return svsel (pred_alt, inv_y, y);
|
||||
}
|
301
glibc-upstream-2.39-158.patch
Normal file
301
glibc-upstream-2.39-158.patch
Normal file
@ -0,0 +1,301 @@
|
||||
commit 4073e4ee2c68de89b7220afba8d0780f86d9c60e
|
||||
Author: Yat Long Poon <yatlong.poon@arm.com>
|
||||
Date: Fri Jan 3 19:07:30 2025 +0000
|
||||
|
||||
AArch64: Improve codegen for SVE logs
|
||||
|
||||
Reduce memory access by using lanewise MLA and moving constants to struct
|
||||
and reduce number of MOVPRFXs.
|
||||
Update maximum ULP error for double log_sve from 1 to 2.
|
||||
Speedup on Neoverse V1 for log (3%), log2 (5%), and log10 (4%).
|
||||
|
||||
(cherry picked from commit 32d193a372feb28f9da247bb7283d404b84429c6)
|
||||
|
||||
diff --git a/sysdeps/aarch64/fpu/log10_sve.c b/sysdeps/aarch64/fpu/log10_sve.c
|
||||
index ab7362128d9b3ffb..f1cad2759a31a178 100644
|
||||
--- a/sysdeps/aarch64/fpu/log10_sve.c
|
||||
+++ b/sysdeps/aarch64/fpu/log10_sve.c
|
||||
@@ -23,28 +23,49 @@
|
||||
#define Min 0x0010000000000000
|
||||
#define Max 0x7ff0000000000000
|
||||
#define Thres 0x7fe0000000000000 /* Max - Min. */
|
||||
-#define Off 0x3fe6900900000000
|
||||
#define N (1 << V_LOG10_TABLE_BITS)
|
||||
|
||||
+static const struct data
|
||||
+{
|
||||
+ double c0, c2;
|
||||
+ double c1, c3;
|
||||
+ double invln10, log10_2;
|
||||
+ double c4;
|
||||
+ uint64_t off;
|
||||
+} data = {
|
||||
+ .c0 = -0x1.bcb7b1526e506p-3,
|
||||
+ .c1 = 0x1.287a7636be1d1p-3,
|
||||
+ .c2 = -0x1.bcb7b158af938p-4,
|
||||
+ .c3 = 0x1.63c78734e6d07p-4,
|
||||
+ .c4 = -0x1.287461742fee4p-4,
|
||||
+ .invln10 = 0x1.bcb7b1526e50ep-2,
|
||||
+ .log10_2 = 0x1.34413509f79ffp-2,
|
||||
+ .off = 0x3fe6900900000000,
|
||||
+};
|
||||
+
|
||||
static svfloat64_t NOINLINE
|
||||
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
|
||||
+special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
|
||||
+ svbool_t special, const struct data *d)
|
||||
{
|
||||
- return sv_call_f64 (log10, x, y, special);
|
||||
+ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
|
||||
+ return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special);
|
||||
}
|
||||
|
||||
-/* SVE log10 algorithm.
|
||||
+/* Double-precision SVE log10 routine.
|
||||
Maximum measured error is 2.46 ulps.
|
||||
SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6
|
||||
want 0x1.fffbdf6eaa667p-6. */
|
||||
svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
|
||||
{
|
||||
+ const struct data *d = ptr_barrier (&data);
|
||||
+
|
||||
svuint64_t ix = svreinterpret_u64 (x);
|
||||
svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
|
||||
|
||||
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
- svuint64_t tmp = svsub_x (pg, ix, Off);
|
||||
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
|
||||
svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS);
|
||||
i = svand_x (pg, i, (N - 1) << 1);
|
||||
svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
|
||||
@@ -62,15 +83,19 @@ svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
|
||||
svfloat64_t r = svmad_x (pg, invc, z, -1.0);
|
||||
|
||||
/* hi = log(c) + k*log(2). */
|
||||
- svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10);
|
||||
- svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2);
|
||||
+ svfloat64_t invln10_log10_2 = svld1rq_f64 (svptrue_b64 (), &d->invln10);
|
||||
+ svfloat64_t w = svmla_lane_f64 (logc, r, invln10_log10_2, 0);
|
||||
+ svfloat64_t hi = svmla_lane_f64 (w, k, invln10_log10_2, 1);
|
||||
|
||||
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
|
||||
- svfloat64_t r2 = svmul_x (pg, r, r);
|
||||
- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly);
|
||||
+ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
|
||||
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
+ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
|
||||
+ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
|
||||
+ y = svmla_x (pg, y, r2, d->c4);
|
||||
+ y = svmla_x (pg, p, r2, y);
|
||||
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
|
||||
- special);
|
||||
+ return special_case (hi, tmp, y, r2, special, d);
|
||||
return svmla_x (pg, hi, r2, y);
|
||||
}
|
||||
diff --git a/sysdeps/aarch64/fpu/log2_sve.c b/sysdeps/aarch64/fpu/log2_sve.c
|
||||
index 743fa2a91392093b..908e638246abc13d 100644
|
||||
--- a/sysdeps/aarch64/fpu/log2_sve.c
|
||||
+++ b/sysdeps/aarch64/fpu/log2_sve.c
|
||||
@@ -21,15 +21,32 @@
|
||||
#include "poly_sve_f64.h"
|
||||
|
||||
#define N (1 << V_LOG2_TABLE_BITS)
|
||||
-#define Off 0x3fe6900900000000
|
||||
#define Max (0x7ff0000000000000)
|
||||
#define Min (0x0010000000000000)
|
||||
#define Thresh (0x7fe0000000000000) /* Max - Min. */
|
||||
|
||||
+static const struct data
|
||||
+{
|
||||
+ double c0, c2;
|
||||
+ double c1, c3;
|
||||
+ double invln2, c4;
|
||||
+ uint64_t off;
|
||||
+} data = {
|
||||
+ .c0 = -0x1.71547652b83p-1,
|
||||
+ .c1 = 0x1.ec709dc340953p-2,
|
||||
+ .c2 = -0x1.71547651c8f35p-2,
|
||||
+ .c3 = 0x1.2777ebe12dda5p-2,
|
||||
+ .c4 = -0x1.ec738d616fe26p-3,
|
||||
+ .invln2 = 0x1.71547652b82fep0,
|
||||
+ .off = 0x3fe6900900000000,
|
||||
+};
|
||||
+
|
||||
static svfloat64_t NOINLINE
|
||||
-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
|
||||
+special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
|
||||
+ svbool_t special, const struct data *d)
|
||||
{
|
||||
- return sv_call_f64 (log2, x, y, cmp);
|
||||
+ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
|
||||
+ return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special);
|
||||
}
|
||||
|
||||
/* Double-precision SVE log2 routine.
|
||||
@@ -40,13 +57,15 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
|
||||
want 0x1.fffb34198d9ddp-5. */
|
||||
svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
|
||||
{
|
||||
+ const struct data *d = ptr_barrier (&data);
|
||||
+
|
||||
svuint64_t ix = svreinterpret_u64 (x);
|
||||
svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
|
||||
|
||||
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
- svuint64_t tmp = svsub_x (pg, ix, Off);
|
||||
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
|
||||
svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS);
|
||||
i = svand_x (pg, i, (N - 1) << 1);
|
||||
svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
|
||||
@@ -59,15 +78,19 @@ svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
|
||||
|
||||
/* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
|
||||
|
||||
+ svfloat64_t invln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->invln2);
|
||||
svfloat64_t r = svmad_x (pg, invc, z, -1.0);
|
||||
- svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2);
|
||||
-
|
||||
- svfloat64_t r2 = svmul_x (pg, r, r);
|
||||
- svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly);
|
||||
+ svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0);
|
||||
w = svadd_x (pg, k, w);
|
||||
|
||||
+ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
|
||||
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
+ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
|
||||
+ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
|
||||
+ y = svmla_lane_f64 (y, r2, invln2_and_c4, 1);
|
||||
+ y = svmla_x (pg, p, r2, y);
|
||||
+
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
- return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y),
|
||||
- special);
|
||||
+ return special_case (w, tmp, y, r2, special, d);
|
||||
return svmla_x (pg, w, r2, y);
|
||||
}
|
||||
diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c
|
||||
index 9b689f2ec7190338..044223400ba2463b 100644
|
||||
--- a/sysdeps/aarch64/fpu/log_sve.c
|
||||
+++ b/sysdeps/aarch64/fpu/log_sve.c
|
||||
@@ -19,39 +19,54 @@
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
-#define P(i) sv_f64 (__v_log_data.poly[i])
|
||||
#define N (1 << V_LOG_TABLE_BITS)
|
||||
-#define Off (0x3fe6900900000000)
|
||||
-#define MaxTop (0x7ff)
|
||||
-#define MinTop (0x001)
|
||||
-#define ThreshTop (0x7fe) /* MaxTop - MinTop. */
|
||||
+#define Max (0x7ff0000000000000)
|
||||
+#define Min (0x0010000000000000)
|
||||
+#define Thresh (0x7fe0000000000000) /* Max - Min. */
|
||||
+
|
||||
+static const struct data
|
||||
+{
|
||||
+ double c0, c2;
|
||||
+ double c1, c3;
|
||||
+ double ln2, c4;
|
||||
+ uint64_t off;
|
||||
+} data = {
|
||||
+ .c0 = -0x1.ffffffffffff7p-2,
|
||||
+ .c1 = 0x1.55555555170d4p-2,
|
||||
+ .c2 = -0x1.0000000399c27p-2,
|
||||
+ .c3 = 0x1.999b2e90e94cap-3,
|
||||
+ .c4 = -0x1.554e550bd501ep-3,
|
||||
+ .ln2 = 0x1.62e42fefa39efp-1,
|
||||
+ .off = 0x3fe6900900000000,
|
||||
+};
|
||||
|
||||
static svfloat64_t NOINLINE
|
||||
-special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
|
||||
+special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2,
|
||||
+ svbool_t special, const struct data *d)
|
||||
{
|
||||
- return sv_call_f64 (log, x, y, cmp);
|
||||
+ svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off));
|
||||
+ return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special);
|
||||
}
|
||||
|
||||
-/* SVE port of AdvSIMD log algorithm.
|
||||
- Maximum measured error is 2.17 ulp:
|
||||
- SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
|
||||
- want 0x1.ffffff1cca045p-2. */
|
||||
+/* Double-precision SVE log routine.
|
||||
+ Maximum measured error is 2.64 ulp:
|
||||
+ SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6
|
||||
+ want 0x1.fffffffe88cafp+6. */
|
||||
svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
|
||||
{
|
||||
+ const struct data *d = ptr_barrier (&data);
|
||||
+
|
||||
svuint64_t ix = svreinterpret_u64 (x);
|
||||
- svuint64_t top = svlsr_x (pg, ix, 52);
|
||||
- svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop));
|
||||
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
|
||||
|
||||
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
- svuint64_t tmp = svsub_x (pg, ix, Off);
|
||||
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
|
||||
/* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N.
|
||||
The actual value of i is double this due to table layout. */
|
||||
svuint64_t i
|
||||
= svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
|
||||
- svint64_t k
|
||||
- = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */
|
||||
svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
|
||||
svfloat64_t z = svreinterpret_f64 (iz);
|
||||
/* Lookup in 2 global lists (length N). */
|
||||
@@ -59,18 +74,22 @@ svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
|
||||
svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
|
||||
|
||||
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
|
||||
- svfloat64_t r = svmad_x (pg, invc, z, -1);
|
||||
- svfloat64_t kd = svcvt_f64_x (pg, k);
|
||||
+ svfloat64_t kd = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
|
||||
/* hi = r + log(c) + k*Ln2. */
|
||||
- svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2);
|
||||
+ svfloat64_t ln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->ln2);
|
||||
+ svfloat64_t r = svmad_x (pg, invc, z, -1);
|
||||
+ svfloat64_t hi = svmla_lane_f64 (logc, kd, ln2_and_c4, 0);
|
||||
+ hi = svadd_x (pg, r, hi);
|
||||
+
|
||||
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
|
||||
- svfloat64_t r2 = svmul_x (pg, r, r);
|
||||
- svfloat64_t y = svmla_x (pg, P (2), r, P (3));
|
||||
- svfloat64_t p = svmla_x (pg, P (0), r, P (1));
|
||||
- y = svmla_x (pg, y, r2, P (4));
|
||||
+ svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1);
|
||||
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
+ svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1);
|
||||
+ svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0);
|
||||
+ y = svmla_lane_f64 (y, r2, ln2_and_c4, 1);
|
||||
y = svmla_x (pg, p, r2, y);
|
||||
|
||||
- if (__glibc_unlikely (svptest_any (pg, cmp)))
|
||||
- return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp);
|
||||
+ if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
+ return special_case (hi, tmp, y, r2, special, d);
|
||||
return svmla_x (pg, hi, r2, y);
|
||||
}
|
||||
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
|
||||
index 1d52bf9ebf534f1a..10788b790a963918 100644
|
||||
--- a/sysdeps/aarch64/libm-test-ulps
|
||||
+++ b/sysdeps/aarch64/libm-test-ulps
|
||||
@@ -1316,7 +1316,7 @@ float: 2
|
||||
ldouble: 1
|
||||
|
||||
Function: "log_sve":
|
||||
-double: 1
|
||||
+double: 2
|
||||
float: 3
|
||||
|
||||
Function: "log_towardzero":
|
87
glibc-upstream-2.39-159.patch
Normal file
87
glibc-upstream-2.39-159.patch
Normal file
@ -0,0 +1,87 @@
|
||||
commit 65a96a6f2bb9f6f6f896394662279d263d59cdd2
|
||||
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||
Date: Wed Aug 7 14:43:47 2024 +0100
|
||||
|
||||
AArch64: Improve generic strlen
|
||||
|
||||
Improve performance by handling another 16 bytes before entering the loop.
|
||||
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final
|
||||
size computation to avoid increasing latency. On Neoverse V1 performance
|
||||
of the random strlen benchmark improves by 4.6%.
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
(cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7)
|
||||
|
||||
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
|
||||
index ab2a576cdb5665e5..352fb40d3abbb44b 100644
|
||||
--- a/sysdeps/aarch64/strlen.S
|
||||
+++ b/sysdeps/aarch64/strlen.S
|
||||
@@ -1,4 +1,5 @@
|
||||
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
|
||||
+/* Generic optimized strlen using SIMD.
|
||||
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
@@ -56,36 +57,50 @@ ENTRY (STRLEN)
|
||||
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
fmov synd, dend
|
||||
lsr synd, synd, shift
|
||||
- cbz synd, L(loop)
|
||||
+ cbz synd, L(next16)
|
||||
|
||||
rbit synd, synd
|
||||
clz result, synd
|
||||
lsr result, result, 2
|
||||
ret
|
||||
|
||||
+L(next16):
|
||||
+ ldr data, [src, 16]
|
||||
+ cmeq vhas_nul.16b, vdata.16b, 0
|
||||
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
+ fmov synd, dend
|
||||
+ cbz synd, L(loop)
|
||||
+ add src, src, 16
|
||||
+#ifndef __AARCH64EB__
|
||||
+ rbit synd, synd
|
||||
+#endif
|
||||
+ sub result, src, srcin
|
||||
+ clz tmp, synd
|
||||
+ add result, result, tmp, lsr 2
|
||||
+ ret
|
||||
+
|
||||
.p2align 5
|
||||
L(loop):
|
||||
- ldr data, [src, 16]
|
||||
+ ldr data, [src, 32]!
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
|
||||
fmov synd, dend
|
||||
cbnz synd, L(loop_end)
|
||||
- ldr data, [src, 32]!
|
||||
+ ldr data, [src, 16]
|
||||
cmeq vhas_nul.16b, vdata.16b, 0
|
||||
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
|
||||
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
|
||||
fmov synd, dend
|
||||
cbz synd, L(loop)
|
||||
- sub src, src, 16
|
||||
+ add src, src, 16
|
||||
L(loop_end):
|
||||
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
|
||||
- sub result, src, srcin
|
||||
- fmov synd, dend
|
||||
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
|
||||
#ifndef __AARCH64EB__
|
||||
rbit synd, synd
|
||||
+ sub result, result, 3
|
||||
#endif
|
||||
- add result, result, 16
|
||||
clz tmp, synd
|
||||
- add result, result, tmp, lsr 2
|
||||
+ sub result, tmp, result
|
||||
+ lsr result, result, 2
|
||||
ret
|
||||
|
||||
END (STRLEN)
|
282
glibc-upstream-2.39-160.patch
Normal file
282
glibc-upstream-2.39-160.patch
Normal file
@ -0,0 +1,282 @@
|
||||
commit dd1e63ab580d801926265007796f290b84747ec8
|
||||
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||
Date: Mon Sep 9 15:26:47 2024 +0100
|
||||
|
||||
AArch64: Optimize memset
|
||||
|
||||
Improve small memsets by avoiding branches and use overlapping stores.
|
||||
Use DC ZVA for copies over 128 bytes. Remove unnecessary code for ZVA sizes
|
||||
other than 64 and 128. Performance of random memset benchmark improves by 24%
|
||||
on Neoverse N1.
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
(cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8)
|
||||
|
||||
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
|
||||
index 7ef77ee8c926de21..caafb019e2b6217b 100644
|
||||
--- a/sysdeps/aarch64/memset.S
|
||||
+++ b/sysdeps/aarch64/memset.S
|
||||
@@ -1,4 +1,5 @@
|
||||
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
|
||||
+/* Generic optimized memset using SIMD.
|
||||
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
|
||||
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
@@ -17,7 +18,6 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
-#include "memset-reg.h"
|
||||
|
||||
#ifndef MEMSET
|
||||
# define MEMSET memset
|
||||
@@ -25,130 +25,132 @@
|
||||
|
||||
/* Assumptions:
|
||||
*
|
||||
- * ARMv8-a, AArch64, unaligned accesses
|
||||
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
|
||||
*
|
||||
*/
|
||||
|
||||
-ENTRY (MEMSET)
|
||||
+#define dstin x0
|
||||
+#define val x1
|
||||
+#define valw w1
|
||||
+#define count x2
|
||||
+#define dst x3
|
||||
+#define dstend x4
|
||||
+#define zva_val x5
|
||||
+#define off x3
|
||||
+#define dstend2 x5
|
||||
|
||||
+ENTRY (MEMSET)
|
||||
PTR_ARG (0)
|
||||
SIZE_ARG (2)
|
||||
|
||||
dup v0.16B, valw
|
||||
+ cmp count, 16
|
||||
+ b.lo L(set_small)
|
||||
+
|
||||
add dstend, dstin, count
|
||||
+ cmp count, 64
|
||||
+ b.hs L(set_128)
|
||||
|
||||
- cmp count, 96
|
||||
- b.hi L(set_long)
|
||||
- cmp count, 16
|
||||
- b.hs L(set_medium)
|
||||
- mov val, v0.D[0]
|
||||
+ /* Set 16..63 bytes. */
|
||||
+ mov off, 16
|
||||
+ and off, off, count, lsr 1
|
||||
+ sub dstend2, dstend, off
|
||||
+ str q0, [dstin]
|
||||
+ str q0, [dstin, off]
|
||||
+ str q0, [dstend2, -16]
|
||||
+ str q0, [dstend, -16]
|
||||
+ ret
|
||||
|
||||
+ .p2align 4
|
||||
/* Set 0..15 bytes. */
|
||||
- tbz count, 3, 1f
|
||||
- str val, [dstin]
|
||||
- str val, [dstend, -8]
|
||||
- ret
|
||||
- nop
|
||||
-1: tbz count, 2, 2f
|
||||
- str valw, [dstin]
|
||||
- str valw, [dstend, -4]
|
||||
+L(set_small):
|
||||
+ add dstend, dstin, count
|
||||
+ cmp count, 4
|
||||
+ b.lo 2f
|
||||
+ lsr off, count, 3
|
||||
+ sub dstend2, dstend, off, lsl 2
|
||||
+ str s0, [dstin]
|
||||
+ str s0, [dstin, off, lsl 2]
|
||||
+ str s0, [dstend2, -4]
|
||||
+ str s0, [dstend, -4]
|
||||
ret
|
||||
+
|
||||
+ /* Set 0..3 bytes. */
|
||||
2: cbz count, 3f
|
||||
+ lsr off, count, 1
|
||||
strb valw, [dstin]
|
||||
- tbz count, 1, 3f
|
||||
- strh valw, [dstend, -2]
|
||||
+ strb valw, [dstin, off]
|
||||
+ strb valw, [dstend, -1]
|
||||
3: ret
|
||||
|
||||
- /* Set 17..96 bytes. */
|
||||
-L(set_medium):
|
||||
- str q0, [dstin]
|
||||
- tbnz count, 6, L(set96)
|
||||
- str q0, [dstend, -16]
|
||||
- tbz count, 5, 1f
|
||||
- str q0, [dstin, 16]
|
||||
- str q0, [dstend, -32]
|
||||
-1: ret
|
||||
-
|
||||
.p2align 4
|
||||
- /* Set 64..96 bytes. Write 64 bytes from the start and
|
||||
- 32 bytes from the end. */
|
||||
-L(set96):
|
||||
- str q0, [dstin, 16]
|
||||
+L(set_128):
|
||||
+ bic dst, dstin, 15
|
||||
+ cmp count, 128
|
||||
+ b.hi L(set_long)
|
||||
+ stp q0, q0, [dstin]
|
||||
stp q0, q0, [dstin, 32]
|
||||
+ stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
- .p2align 3
|
||||
- nop
|
||||
+ .p2align 4
|
||||
L(set_long):
|
||||
- and valw, valw, 255
|
||||
- bic dst, dstin, 15
|
||||
str q0, [dstin]
|
||||
- cmp count, 256
|
||||
- ccmp valw, 0, 0, cs
|
||||
- b.eq L(try_zva)
|
||||
-L(no_zva):
|
||||
- sub count, dstend, dst /* Count is 16 too large. */
|
||||
- sub dst, dst, 16 /* Dst is biased by -32. */
|
||||
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
|
||||
-1: stp q0, q0, [dst, 32]
|
||||
- stp q0, q0, [dst, 64]!
|
||||
-L(tail64):
|
||||
- subs count, count, 64
|
||||
- b.hi 1b
|
||||
-2: stp q0, q0, [dstend, -64]
|
||||
+ str q0, [dst, 16]
|
||||
+ tst valw, 255
|
||||
+ b.ne L(no_zva)
|
||||
+#ifndef ZVA64_ONLY
|
||||
+ mrs zva_val, dczid_el0
|
||||
+ and zva_val, zva_val, 31
|
||||
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
|
||||
+ b.ne L(zva_128)
|
||||
+#endif
|
||||
+ stp q0, q0, [dst, 32]
|
||||
+ bic dst, dstin, 63
|
||||
+ sub count, dstend, dst /* Count is now 64 too large. */
|
||||
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
|
||||
+
|
||||
+ /* Write last bytes before ZVA loop. */
|
||||
+ stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(zva64_loop):
|
||||
+ add dst, dst, 64
|
||||
+ dc zva, dst
|
||||
+ subs count, count, 64
|
||||
+ b.hi L(zva64_loop)
|
||||
ret
|
||||
|
||||
-L(try_zva):
|
||||
-#ifndef ZVA64_ONLY
|
||||
.p2align 3
|
||||
- mrs tmp1, dczid_el0
|
||||
- tbnz tmp1w, 4, L(no_zva)
|
||||
- and tmp1w, tmp1w, 15
|
||||
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
|
||||
- b.ne L(zva_128)
|
||||
- nop
|
||||
-#endif
|
||||
- /* Write the first and last 64 byte aligned block using stp rather
|
||||
- than using DC ZVA. This is faster on some cores.
|
||||
- */
|
||||
- .p2align 4
|
||||
-L(zva_64):
|
||||
- str q0, [dst, 16]
|
||||
+L(no_zva):
|
||||
+ sub count, dstend, dst /* Count is 32 too large. */
|
||||
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
|
||||
+L(no_zva_loop):
|
||||
stp q0, q0, [dst, 32]
|
||||
- bic dst, dst, 63
|
||||
stp q0, q0, [dst, 64]
|
||||
- stp q0, q0, [dst, 96]
|
||||
- sub count, dstend, dst /* Count is now 128 too large. */
|
||||
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
|
||||
- add dst, dst, 128
|
||||
-1: dc zva, dst
|
||||
add dst, dst, 64
|
||||
subs count, count, 64
|
||||
- b.hi 1b
|
||||
- stp q0, q0, [dst, 0]
|
||||
- stp q0, q0, [dst, 32]
|
||||
+ b.hi L(no_zva_loop)
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
#ifndef ZVA64_ONLY
|
||||
- .p2align 3
|
||||
+ .p2align 4
|
||||
L(zva_128):
|
||||
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
|
||||
- b.ne L(zva_other)
|
||||
+ cmp zva_val, 5 /* ZVA size is 128 bytes. */
|
||||
+ b.ne L(no_zva)
|
||||
|
||||
- str q0, [dst, 16]
|
||||
stp q0, q0, [dst, 32]
|
||||
stp q0, q0, [dst, 64]
|
||||
stp q0, q0, [dst, 96]
|
||||
bic dst, dst, 127
|
||||
sub count, dstend, dst /* Count is now 128 too large. */
|
||||
- sub count, count, 128+128 /* Adjust count and bias for loop. */
|
||||
- add dst, dst, 128
|
||||
-1: dc zva, dst
|
||||
- add dst, dst, 128
|
||||
+ sub count, count, 128 + 128 /* Adjust count and bias for loop. */
|
||||
+1: add dst, dst, 128
|
||||
+ dc zva, dst
|
||||
subs count, count, 128
|
||||
b.hi 1b
|
||||
stp q0, q0, [dstend, -128]
|
||||
@@ -156,35 +158,6 @@ L(zva_128):
|
||||
stp q0, q0, [dstend, -64]
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
-
|
||||
-L(zva_other):
|
||||
- mov tmp2w, 4
|
||||
- lsl zva_lenw, tmp2w, tmp1w
|
||||
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
|
||||
- cmp count, tmp1
|
||||
- blo L(no_zva)
|
||||
-
|
||||
- sub tmp2, zva_len, 1
|
||||
- add tmp1, dst, zva_len
|
||||
- add dst, dst, 16
|
||||
- subs count, tmp1, dst /* Actual alignment bytes to write. */
|
||||
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
|
||||
- beq 2f
|
||||
-1: stp q0, q0, [dst], 64
|
||||
- stp q0, q0, [dst, -32]
|
||||
- subs count, count, 64
|
||||
- b.hi 1b
|
||||
-2: mov dst, tmp1
|
||||
- sub count, dstend, tmp1 /* Remaining bytes to write. */
|
||||
- subs count, count, zva_len
|
||||
- b.lo 4f
|
||||
-3: dc zva, dst
|
||||
- add dst, dst, zva_len
|
||||
- subs count, count, zva_len
|
||||
- b.hs 3b
|
||||
-4: add count, count, zva_len
|
||||
- sub dst, dst, 32 /* Bias dst for tail loop. */
|
||||
- b L(tail64)
|
||||
#endif
|
||||
|
||||
END (MEMSET)
|
60
glibc-upstream-2.39-161.patch
Normal file
60
glibc-upstream-2.39-161.patch
Normal file
@ -0,0 +1,60 @@
|
||||
commit 0cd10047bf046a658f32e12833ccc42304b3b152
|
||||
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||
Date: Mon Nov 25 18:43:08 2024 +0000
|
||||
|
||||
AArch64: Remove zva_128 from memset
|
||||
|
||||
Remove ZVA 128 support from memset - the new memset no longer
|
||||
guarantees count >= 256, which can result in underflow and a
|
||||
crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA
|
||||
size of 128 and its memcpy implementation was removed in commit
|
||||
e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special
|
||||
case too.
|
||||
|
||||
[1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html
|
||||
|
||||
Reviewed-by: Andrew Pinski <quic_apinski@quicinc.com>
|
||||
(cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d)
|
||||
|
||||
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
|
||||
index caafb019e2b6217b..71814d0b2f6dd3a7 100644
|
||||
--- a/sysdeps/aarch64/memset.S
|
||||
+++ b/sysdeps/aarch64/memset.S
|
||||
@@ -104,7 +104,7 @@ L(set_long):
|
||||
mrs zva_val, dczid_el0
|
||||
and zva_val, zva_val, 31
|
||||
cmp zva_val, 4 /* ZVA size is 64 bytes. */
|
||||
- b.ne L(zva_128)
|
||||
+ b.ne L(no_zva)
|
||||
#endif
|
||||
stp q0, q0, [dst, 32]
|
||||
bic dst, dstin, 63
|
||||
@@ -137,28 +137,5 @@ L(no_zva_loop):
|
||||
stp q0, q0, [dstend, -32]
|
||||
ret
|
||||
|
||||
-#ifndef ZVA64_ONLY
|
||||
- .p2align 4
|
||||
-L(zva_128):
|
||||
- cmp zva_val, 5 /* ZVA size is 128 bytes. */
|
||||
- b.ne L(no_zva)
|
||||
-
|
||||
- stp q0, q0, [dst, 32]
|
||||
- stp q0, q0, [dst, 64]
|
||||
- stp q0, q0, [dst, 96]
|
||||
- bic dst, dst, 127
|
||||
- sub count, dstend, dst /* Count is now 128 too large. */
|
||||
- sub count, count, 128 + 128 /* Adjust count and bias for loop. */
|
||||
-1: add dst, dst, 128
|
||||
- dc zva, dst
|
||||
- subs count, count, 128
|
||||
- b.hi 1b
|
||||
- stp q0, q0, [dstend, -128]
|
||||
- stp q0, q0, [dstend, -96]
|
||||
- stp q0, q0, [dstend, -64]
|
||||
- stp q0, q0, [dstend, -32]
|
||||
- ret
|
||||
-#endif
|
||||
-
|
||||
END (MEMSET)
|
||||
libc_hidden_builtin_def (MEMSET)
|
29
glibc-upstream-2.39-162.patch
Normal file
29
glibc-upstream-2.39-162.patch
Normal file
@ -0,0 +1,29 @@
|
||||
commit 0cc12d9c47eb97d82c8f5af3724b4a4bc01df74a
|
||||
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||
Date: Wed Jul 24 15:17:47 2024 +0100
|
||||
|
||||
math: Improve layout of expf data
|
||||
|
||||
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
|
||||
changes the exp2f_data struct slightly so that the fields are better aligned.
|
||||
As a result on targets that support them, load-pair instructions accessing
|
||||
poly_scaled and invln2_scaled are now 16-byte aligned.
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
(cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a)
|
||||
|
||||
diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h
|
||||
index 729f22cd4f7dd9e4..dc07ebd45977e511 100644
|
||||
--- a/sysdeps/ieee754/flt-32/math_config.h
|
||||
+++ b/sysdeps/ieee754/flt-32/math_config.h
|
||||
@@ -166,9 +166,9 @@ extern const struct exp2f_data
|
||||
uint64_t tab[1 << EXP2F_TABLE_BITS];
|
||||
double shift_scaled;
|
||||
double poly[EXP2F_POLY_ORDER];
|
||||
- double shift;
|
||||
double invln2_scaled;
|
||||
double poly_scaled[EXP2F_POLY_ORDER];
|
||||
+ double shift;
|
||||
} __exp2f_data attribute_hidden;
|
||||
|
||||
#define LOGF_TABLE_BITS 4
|
189
glibc-upstream-2.39-163.patch
Normal file
189
glibc-upstream-2.39-163.patch
Normal file
@ -0,0 +1,189 @@
|
||||
commit d0e2133470d848e80eb4ba79ecd5d8c8b11fd2bb
|
||||
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||
Date: Tue Dec 24 18:01:59 2024 +0000
|
||||
|
||||
AArch64: Add SVE memset
|
||||
|
||||
Add SVE memset based on the generic memset with predicated load for sizes < 16.
|
||||
Unaligned memsets of 128-1024 are improved by ~20% on average by using aligned
|
||||
stores for the last 64 bytes. Performance of random memset benchmark improves
|
||||
by ~2% on Neoverse V1.
|
||||
|
||||
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
|
||||
(cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548)
|
||||
|
||||
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
|
||||
index e4720b746859f515..214b6137b0bc63a2 100644
|
||||
--- a/sysdeps/aarch64/multiarch/Makefile
|
||||
+++ b/sysdeps/aarch64/multiarch/Makefile
|
||||
@@ -14,6 +14,7 @@ sysdep_routines += \
|
||||
memset_generic \
|
||||
memset_kunpeng \
|
||||
memset_mops \
|
||||
+ memset_sve_zva64 \
|
||||
memset_zva64 \
|
||||
strlen_asimd \
|
||||
strlen_generic \
|
||||
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
|
||||
index ecd0f87de6a5b254..f8544fe3b525f775 100644
|
||||
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
|
||||
@@ -57,6 +57,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
|
||||
#if HAVE_AARCH64_SVE_ASM
|
||||
IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx)
|
||||
+ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64)
|
||||
#endif
|
||||
IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
|
||||
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
|
||||
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
|
||||
index 34bce045dd64ba9b..9d98664e6bc32212 100644
|
||||
--- a/sysdeps/aarch64/multiarch/memset.c
|
||||
+++ b/sysdeps/aarch64/multiarch/memset.c
|
||||
@@ -34,6 +34,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
|
||||
extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
|
||||
extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
|
||||
extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
|
||||
+extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden;
|
||||
|
||||
static inline __typeof (__redirect_memset) *
|
||||
select_memset_ifunc (void)
|
||||
@@ -47,6 +48,9 @@ select_memset_ifunc (void)
|
||||
{
|
||||
if (IS_A64FX (midr) && zva_size == 256)
|
||||
return __memset_a64fx;
|
||||
+
|
||||
+ if (zva_size == 64)
|
||||
+ return __memset_sve_zva64;
|
||||
}
|
||||
|
||||
if (IS_KUNPENG920 (midr))
|
||||
diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
|
||||
new file mode 100644
|
||||
index 0000000000000000..7fb40fdd9e927bb3
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
|
||||
@@ -0,0 +1,123 @@
|
||||
+/* Optimized memset for SVE.
|
||||
+ Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
+
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library. If not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+
|
||||
+/* Assumptions:
|
||||
+ *
|
||||
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
|
||||
+ * ZVA size is 64.
|
||||
+ */
|
||||
+
|
||||
+#if HAVE_AARCH64_SVE_ASM
|
||||
+
|
||||
+.arch armv8.2-a+sve
|
||||
+
|
||||
+#define dstin x0
|
||||
+#define val x1
|
||||
+#define valw w1
|
||||
+#define count x2
|
||||
+#define dst x3
|
||||
+#define dstend x4
|
||||
+#define zva_val x5
|
||||
+#define vlen x5
|
||||
+#define off x3
|
||||
+#define dstend2 x5
|
||||
+
|
||||
+ENTRY (__memset_sve_zva64)
|
||||
+ dup v0.16B, valw
|
||||
+ cmp count, 16
|
||||
+ b.lo L(set_16)
|
||||
+
|
||||
+ add dstend, dstin, count
|
||||
+ cmp count, 64
|
||||
+ b.hs L(set_128)
|
||||
+
|
||||
+ /* Set 16..63 bytes. */
|
||||
+ mov off, 16
|
||||
+ and off, off, count, lsr 1
|
||||
+ sub dstend2, dstend, off
|
||||
+ str q0, [dstin]
|
||||
+ str q0, [dstin, off]
|
||||
+ str q0, [dstend2, -16]
|
||||
+ str q0, [dstend, -16]
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(set_16):
|
||||
+ whilelo p0.b, xzr, count
|
||||
+ st1b z0.b, p0, [dstin]
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(set_128):
|
||||
+ bic dst, dstin, 15
|
||||
+ cmp count, 128
|
||||
+ b.hi L(set_long)
|
||||
+ stp q0, q0, [dstin]
|
||||
+ stp q0, q0, [dstin, 32]
|
||||
+ stp q0, q0, [dstend, -64]
|
||||
+ stp q0, q0, [dstend, -32]
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(set_long):
|
||||
+ cmp count, 256
|
||||
+ b.lo L(no_zva)
|
||||
+ tst valw, 255
|
||||
+ b.ne L(no_zva)
|
||||
+
|
||||
+ str q0, [dstin]
|
||||
+ str q0, [dst, 16]
|
||||
+ bic dst, dstin, 31
|
||||
+ stp q0, q0, [dst, 32]
|
||||
+ bic dst, dstin, 63
|
||||
+ sub count, dstend, dst /* Count is now 64 too large. */
|
||||
+ sub count, count, 128 /* Adjust count and bias for loop. */
|
||||
+
|
||||
+ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */
|
||||
+ bic x8, x8, 15
|
||||
+ stp q0, q0, [x8, -48]
|
||||
+ str q0, [x8, -16]
|
||||
+ str q0, [dstend, -16]
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(zva64_loop):
|
||||
+ add dst, dst, 64
|
||||
+ dc zva, dst
|
||||
+ subs count, count, 64
|
||||
+ b.hi L(zva64_loop)
|
||||
+ ret
|
||||
+
|
||||
+L(no_zva):
|
||||
+ str q0, [dstin]
|
||||
+ sub count, dstend, dst /* Count is 16 too large. */
|
||||
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
|
||||
+L(no_zva_loop):
|
||||
+ stp q0, q0, [dst, 16]
|
||||
+ stp q0, q0, [dst, 48]
|
||||
+ add dst, dst, 64
|
||||
+ subs count, count, 64
|
||||
+ b.hi L(no_zva_loop)
|
||||
+ stp q0, q0, [dstend, -64]
|
||||
+ stp q0, q0, [dstend, -32]
|
||||
+ ret
|
||||
+
|
||||
+END (__memset_sve_zva64)
|
||||
+#endif
|
24
glibc-upstream-2.39-164.patch
Normal file
24
glibc-upstream-2.39-164.patch
Normal file
@ -0,0 +1,24 @@
|
||||
commit a1b09e59e2de9a5634a864e1a915f9f46e2cdd3a
|
||||
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||
Date: Thu Feb 27 16:28:52 2025 +0000
|
||||
|
||||
AArch64: Use prefer_sve_ifuncs for SVE memset
|
||||
|
||||
Use prefer_sve_ifuncs for SVE memset just like memcpy.
|
||||
|
||||
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
|
||||
(cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4)
|
||||
|
||||
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
|
||||
index 9d98664e6bc32212..161624fe6028d9e9 100644
|
||||
--- a/sysdeps/aarch64/multiarch/memset.c
|
||||
+++ b/sysdeps/aarch64/multiarch/memset.c
|
||||
@@ -49,7 +49,7 @@ select_memset_ifunc (void)
|
||||
if (IS_A64FX (midr) && zva_size == 256)
|
||||
return __memset_a64fx;
|
||||
|
||||
- if (zva_size == 64)
|
||||
+ if (prefer_sve_ifuncs && zva_size == 64)
|
||||
return __memset_sve_zva64;
|
||||
}
|
||||
|
43
glibc-upstream-2.39-165.patch
Normal file
43
glibc-upstream-2.39-165.patch
Normal file
@ -0,0 +1,43 @@
|
||||
commit dd8c0c3bbd4e22e00a7275c75dc0d40f24bb0d68
|
||||
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
|
||||
Date: Fri Dec 13 15:43:07 2024 +0000
|
||||
|
||||
math: Improve layout of exp/exp10 data
|
||||
|
||||
GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch
|
||||
changes the exp_data struct slightly so that the fields are better aligned
|
||||
and without gaps. As a result on targets that support them, more load-pair
|
||||
instructions are used in exp. Exp10 is improved by moving invlog10_2N later
|
||||
so that neglog10_2hiN and neglog10_2loN can be loaded using load-pair.
|
||||
|
||||
The exp benchmark improves 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on
|
||||
Neoverse V2. Exp10 improves by 1.5%.
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
(cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7)
|
||||
|
||||
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
|
||||
index ef87cfa6be9860e3..05515fd95ad15d52 100644
|
||||
--- a/sysdeps/ieee754/dbl-64/math_config.h
|
||||
+++ b/sysdeps/ieee754/dbl-64/math_config.h
|
||||
@@ -195,16 +195,18 @@ check_uflow (double x)
|
||||
extern const struct exp_data
|
||||
{
|
||||
double invln2N;
|
||||
- double shift;
|
||||
double negln2hiN;
|
||||
double negln2loN;
|
||||
double poly[4]; /* Last four coefficients. */
|
||||
+ double shift;
|
||||
+
|
||||
double exp2_shift;
|
||||
double exp2_poly[EXP2_POLY_ORDER];
|
||||
- double invlog10_2N;
|
||||
+
|
||||
double neglog10_2hiN;
|
||||
double neglog10_2loN;
|
||||
double exp10_poly[5];
|
||||
+ double invlog10_2N;
|
||||
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
|
||||
} __exp_data attribute_hidden;
|
||||
|
54
glibc-upstream-2.39-166.patch
Normal file
54
glibc-upstream-2.39-166.patch
Normal file
@ -0,0 +1,54 @@
|
||||
commit e1fe22368e4fbc13ce300d89802b7fcc0d5cfb38
|
||||
Author: Michael Jeanson <mjeanson@efficios.com>
|
||||
Date: Fri Feb 14 13:54:22 2025 -0500
|
||||
|
||||
nptl: clear the whole rseq area before registration
|
||||
|
||||
Due to the extensible nature of the rseq area we can't explicitly
|
||||
initialize fields that are not part of the ABI yet. It was agreed with
|
||||
upstream that all new fields will be documented as zero initialized by
|
||||
userspace. Future kernels configured with CONFIG_DEBUG_RSEQ will
|
||||
validate the content of all fields during registration.
|
||||
|
||||
Replace the explicit field initialization with a memset of the whole
|
||||
rseq area which will cover fields as they are added to future kernels.
|
||||
|
||||
Signed-off-by: Michael Jeanson <mjeanson@efficios.com>
|
||||
Reviewed-by: Florian Weimer <fweimer@redhat.com>
|
||||
(cherry picked from commit 689a62a4217fae78b9ce0db781dc2a421f2b1ab4)
|
||||
|
||||
diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c
|
||||
index 7803e19fd16ad803..ed10185e3708f4b6 100644
|
||||
--- a/sysdeps/nptl/dl-tls_init_tp.c
|
||||
+++ b/sysdeps/nptl/dl-tls_init_tp.c
|
||||
@@ -23,6 +23,7 @@
|
||||
#include <tls.h>
|
||||
#include <rseq-internal.h>
|
||||
#include <thread_pointer.h>
|
||||
+#include <dl-symbol-redir-ifunc.h>
|
||||
|
||||
#define TUNABLE_NAMESPACE pthread
|
||||
#include <dl-tunables.h>
|
||||
diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h
|
||||
index ef3eab1fefd4d90d..76de2b7ff079eb0f 100644
|
||||
--- a/sysdeps/unix/sysv/linux/rseq-internal.h
|
||||
+++ b/sysdeps/unix/sysv/linux/rseq-internal.h
|
||||
@@ -52,13 +52,12 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq)
|
||||
but still expected size 32. */
|
||||
size = RSEQ_AREA_SIZE_INITIAL;
|
||||
|
||||
- /* Initialize the rseq fields that are read by the kernel on
|
||||
- registration, there is no guarantee that struct pthread is
|
||||
- cleared on all architectures. */
|
||||
+ /* Initialize the whole rseq area to zero prior to registration. */
|
||||
+ memset (&self->rseq_area, 0, size);
|
||||
+
|
||||
+ /* Set the cpu_id field to RSEQ_CPU_ID_UNINITIALIZED, this is checked by
|
||||
+ the kernel at registration when CONFIG_DEBUG_RSEQ is enabled. */
|
||||
THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED);
|
||||
- THREAD_SETMEM (self, rseq_area.cpu_id_start, 0);
|
||||
- THREAD_SETMEM (self, rseq_area.rseq_cs, 0);
|
||||
- THREAD_SETMEM (self, rseq_area.flags, 0);
|
||||
|
||||
int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area,
|
||||
size, 0, RSEQ_SIG);
|
193
glibc-upstream-2.39-167.patch
Normal file
193
glibc-upstream-2.39-167.patch
Normal file
@ -0,0 +1,193 @@
|
||||
commit 7ecf0d3bde54e4f9e6f025d2f43eff565ed97414
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Thu Apr 4 15:43:50 2024 -0700
|
||||
|
||||
x86-64: Exclude FMA4 IFUNC functions for -mapxf
|
||||
|
||||
When -mapxf is used to build glibc, the resulting glibc will never run
|
||||
on FMA4 machines. Exclude FMA4 IFUNC functions when -mapxf is used.
|
||||
This requires GCC which defines __APX_F__ for -mapxf with commit:
|
||||
|
||||
1df56719bd8 x86: Define __APX_F__ for -mapxf
|
||||
|
||||
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
(cherry picked from commit 9e1f4aef865ddeffeb4b5f6578fefab606783120)
|
||||
|
||||
diff --git a/config.h.in b/config.h.in
|
||||
index 1e647de58580bc2d..a5fdea0c3c7b070e 100644
|
||||
--- a/config.h.in
|
||||
+++ b/config.h.in
|
||||
@@ -295,4 +295,7 @@
|
||||
/* Define if -mmovbe is enabled by default on x86. */
|
||||
#undef HAVE_X86_MOVBE
|
||||
|
||||
+/* Define if -mapxf is enabled by default on x86. */
|
||||
+#undef HAVE_X86_APX
|
||||
+
|
||||
#endif
|
||||
diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
|
||||
index 04a534fa126a7bf7..07bdd40a37247c7b 100755
|
||||
--- a/sysdeps/x86_64/configure
|
||||
+++ b/sysdeps/x86_64/configure
|
||||
@@ -162,6 +162,38 @@ printf "%s\n" "$libc_cv_x86_have_amx_tile" >&6; }
|
||||
config_vars="$config_vars
|
||||
have-mamx-tile = $libc_cv_x86_have_amx_tile"
|
||||
|
||||
+# Check if -mapxf is enabled.
|
||||
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether -mapxf is enabled" >&5
|
||||
+printf %s "checking whether -mapxf is enabled... " >&6; }
|
||||
+if test ${libc_cv_x86_have_apx+y}
|
||||
+then :
|
||||
+ printf %s "(cached) " >&6
|
||||
+else $as_nop
|
||||
+ cat > conftest.c <<EOF
|
||||
+#ifndef __APX_F__
|
||||
+# error APX isn't enabled
|
||||
+#endif
|
||||
+EOF
|
||||
+ libc_cv_x86_have_apx=no
|
||||
+ if { ac_try='${CC-cc} -c $CFLAGS conftest.c -o conftest.o 1>&5'
|
||||
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
|
||||
+ (eval $ac_try) 2>&5
|
||||
+ ac_status=$?
|
||||
+ printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
|
||||
+ test $ac_status = 0; }; }; then
|
||||
+ libc_cv_x86_have_apx=yes
|
||||
+ fi
|
||||
+ rm -rf conftest*
|
||||
+fi
|
||||
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_x86_have_apx" >&5
|
||||
+printf "%s\n" "$libc_cv_x86_have_apx" >&6; }
|
||||
+if test $libc_cv_x86_have_apx = yes; then
|
||||
+ printf "%s\n" "#define HAVE_X86_APX 1" >>confdefs.h
|
||||
+
|
||||
+fi
|
||||
+config_vars="$config_vars
|
||||
+have-x86-apx = $libc_cv_x86_have_apx"
|
||||
+
|
||||
test -n "$critic_missing" && as_fn_error $? "
|
||||
*** $critic_missing" "$LINENO" 5
|
||||
|
||||
diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
|
||||
index c714c47351e70390..c7b68544a2c79ae5 100644
|
||||
--- a/sysdeps/x86_64/configure.ac
|
||||
+++ b/sysdeps/x86_64/configure.ac
|
||||
@@ -76,5 +76,23 @@ EOF
|
||||
rm -rf conftest*])
|
||||
LIBC_CONFIG_VAR([have-mamx-tile], [$libc_cv_x86_have_amx_tile])
|
||||
|
||||
+# Check if -mapxf is enabled.
|
||||
+AC_CACHE_CHECK(whether -mapxf is enabled,
|
||||
+ libc_cv_x86_have_apx, [dnl
|
||||
+cat > conftest.c <<EOF
|
||||
+#ifndef __APX_F__
|
||||
+# error APX isn't enabled
|
||||
+#endif
|
||||
+EOF
|
||||
+ libc_cv_x86_have_apx=no
|
||||
+ if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS conftest.c -o conftest.o 1>&AS_MESSAGE_LOG_FD); then
|
||||
+ libc_cv_x86_have_apx=yes
|
||||
+ fi
|
||||
+ rm -rf conftest*])
|
||||
+if test $libc_cv_x86_have_apx = yes; then
|
||||
+ AC_DEFINE(HAVE_X86_APX)
|
||||
+fi
|
||||
+LIBC_CONFIG_VAR([have-x86-apx], [$libc_cv_x86_have_apx])
|
||||
+
|
||||
test -n "$critic_missing" && AC_MSG_ERROR([
|
||||
*** $critic_missing])
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
index 6ddd50240ce33d22..cbe09d49f49581f1 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
@@ -38,29 +38,36 @@ libm-sysdep_routines += \
|
||||
s_truncf-avx \
|
||||
# libm-sysdep_routines
|
||||
else
|
||||
+ifeq (no,$(have-x86-apx))
|
||||
libm-sysdep_routines += \
|
||||
- e_asin-fma \
|
||||
e_asin-fma4 \
|
||||
+ e_atan2-fma4 \
|
||||
+ e_exp-fma4 \
|
||||
+ e_log-fma4 \
|
||||
+ e_pow-fma4 \
|
||||
+ s_atan-fma4 \
|
||||
+ s_sin-fma4 \
|
||||
+ s_sincos-fma4 \
|
||||
+ s_tan-fma4 \
|
||||
+# libm-sysdep_routines
|
||||
+endif
|
||||
+libm-sysdep_routines += \
|
||||
+ e_asin-fma \
|
||||
e_atan2-avx \
|
||||
e_atan2-fma \
|
||||
- e_atan2-fma4 \
|
||||
e_exp-avx \
|
||||
e_exp-fma \
|
||||
- e_exp-fma4 \
|
||||
e_exp2f-fma \
|
||||
e_expf-fma \
|
||||
e_log-avx \
|
||||
e_log-fma \
|
||||
- e_log-fma4 \
|
||||
e_log2-fma \
|
||||
e_log2f-fma \
|
||||
e_logf-fma \
|
||||
e_pow-fma \
|
||||
- e_pow-fma4 \
|
||||
e_powf-fma \
|
||||
s_atan-avx \
|
||||
s_atan-fma \
|
||||
- s_atan-fma4 \
|
||||
s_ceil-sse4_1 \
|
||||
s_ceilf-sse4_1 \
|
||||
s_cosf-fma \
|
||||
@@ -77,17 +84,14 @@ libm-sysdep_routines += \
|
||||
s_roundevenf-sse4_1 \
|
||||
s_sin-avx \
|
||||
s_sin-fma \
|
||||
- s_sin-fma4 \
|
||||
s_sincos-avx \
|
||||
s_sincos-fma \
|
||||
- s_sincos-fma4 \
|
||||
s_sincosf-fma \
|
||||
s_sincosf-sse2 \
|
||||
s_sinf-fma \
|
||||
s_sinf-sse2 \
|
||||
s_tan-avx \
|
||||
s_tan-fma \
|
||||
- s_tan-fma4 \
|
||||
s_trunc-sse4_1 \
|
||||
s_truncf-sse4_1 \
|
||||
# libm-sysdep_routines
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h
|
||||
index 7719188888fbec38..d126cf9cd5ae55e4 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h
|
||||
@@ -33,8 +33,10 @@ IFUNC_SELECTOR (void)
|
||||
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
||||
return OPTIMIZE (fma);
|
||||
|
||||
+#ifndef HAVE_X86_APX
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, FMA4))
|
||||
return OPTIMIZE (fma4);
|
||||
+#endif
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
|
||||
return OPTIMIZE (avx);
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h
|
||||
index c35ba13845b7914b..18d372d25cb598f2 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h
|
||||
@@ -32,8 +32,10 @@ IFUNC_SELECTOR (void)
|
||||
&& CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
||||
return OPTIMIZE (fma);
|
||||
|
||||
+#ifndef HAVE_X86_APX
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, FMA4))
|
||||
return OPTIMIZE (fma4);
|
||||
+#endif
|
||||
|
||||
return OPTIMIZE (sse2);
|
||||
}
|
106
glibc-upstream-2.39-168.patch
Normal file
106
glibc-upstream-2.39-168.patch
Normal file
@ -0,0 +1,106 @@
|
||||
commit 0edcc77fe7e13b29d99e7f4d7fe3373b3666468e
|
||||
Author: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
Date: Mon Mar 10 10:24:07 2025 -0700
|
||||
|
||||
x86_64: Add tanh with FMA
|
||||
|
||||
On Skylake, it improves tanh bench performance by:
|
||||
|
||||
Before After Improvement
|
||||
max 110.89 95.826 14%
|
||||
min 20.966 20.157 4%
|
||||
mean 30.9601 29.8431 4%
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit c6352111c72a20b3588ae304dd99b63e25dd6d85)
|
||||
|
||||
diff --git a/sysdeps/ieee754/dbl-64/s_tanh.c b/sysdeps/ieee754/dbl-64/s_tanh.c
|
||||
index 673a97102de292fd..13063db04ebb198c 100644
|
||||
--- a/sysdeps/ieee754/dbl-64/s_tanh.c
|
||||
+++ b/sysdeps/ieee754/dbl-64/s_tanh.c
|
||||
@@ -46,6 +46,11 @@ static char rcsid[] = "$NetBSD: s_tanh.c,v 1.7 1995/05/10 20:48:22 jtc Exp $";
|
||||
|
||||
static const double one = 1.0, two = 2.0, tiny = 1.0e-300;
|
||||
|
||||
+#ifndef SECTION
|
||||
+# define SECTION
|
||||
+#endif
|
||||
+
|
||||
+SECTION
|
||||
double
|
||||
__tanh (double x)
|
||||
{
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
index cbe09d49f49581f1..0f69f7089c06af73 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
@@ -10,6 +10,7 @@ CFLAGS-s_expm1-fma.c = -mfma -mavx2
|
||||
CFLAGS-s_log1p-fma.c = -mfma -mavx2
|
||||
CFLAGS-s_sin-fma.c = -mfma -mavx2
|
||||
CFLAGS-s_tan-fma.c = -mfma -mavx2
|
||||
+CFLAGS-s_tanh-fma.c = -mfma -mavx2
|
||||
CFLAGS-s_sincos-fma.c = -mfma -mavx2
|
||||
|
||||
CFLAGS-e_exp2f-fma.c = -mfma -mavx2
|
||||
@@ -92,6 +93,7 @@ libm-sysdep_routines += \
|
||||
s_sinf-sse2 \
|
||||
s_tan-avx \
|
||||
s_tan-fma \
|
||||
+ s_tanh-fma \
|
||||
s_trunc-sse4_1 \
|
||||
s_truncf-sse4_1 \
|
||||
# libm-sysdep_routines
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..1b808b1227f50cf5
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
|
||||
@@ -0,0 +1,11 @@
|
||||
+#define __tanh __tanh_fma
|
||||
+#define __expm1 __expm1_fma
|
||||
+
|
||||
+/* NB: __expm1 may be expanded to __expm1_fma in the following
|
||||
+ prototypes. */
|
||||
+extern long double __expm1l (long double);
|
||||
+extern long double __expm1f128 (long double);
|
||||
+
|
||||
+#define SECTION __attribute__ ((section (".text.fma")))
|
||||
+
|
||||
+#include <sysdeps/ieee754/dbl-64/s_tanh.c>
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh.c b/sysdeps/x86_64/fpu/multiarch/s_tanh.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..5539b6c61c63548d
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_tanh.c
|
||||
@@ -0,0 +1,31 @@
|
||||
+/* Multiple versions of tanh.
|
||||
+ Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdeps/x86/isa-level.h>
|
||||
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
|
||||
+
|
||||
+extern double __redirect_tanh (double);
|
||||
+
|
||||
+# define SYMBOL_NAME tanh
|
||||
+# include "ifunc-fma.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_tanh, __tanh, IFUNC_SELECTOR ());
|
||||
+
|
||||
+# define __tanh __tanh_sse2
|
||||
+#endif
|
||||
+#include <sysdeps/ieee754/dbl-64/s_tanh.c>
|
130
glibc-upstream-2.39-169.patch
Normal file
130
glibc-upstream-2.39-169.patch
Normal file
@ -0,0 +1,130 @@
|
||||
commit 01ed435e2ee8df18f107ac9d999e1c4db922f564
|
||||
Author: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
Date: Sat Mar 8 08:51:10 2025 -0800
|
||||
|
||||
x86_64: Add sinh with FMA
|
||||
|
||||
On SPR, it improves sinh bench performance by:
|
||||
|
||||
Before After Improvement
|
||||
reciprocal-throughput 14.2017 11.815 17%
|
||||
latency 36.4917 35.2114 4%
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit dded0d20f67ba1925ccbcb9cf28f0c75febe0dbe)
|
||||
|
||||
diff --git a/benchtests/sinh-inputs b/benchtests/sinh-inputs
|
||||
index 7b1ac46a39c0a0b0..2fcb2fabf82ce778 100644
|
||||
--- a/benchtests/sinh-inputs
|
||||
+++ b/benchtests/sinh-inputs
|
||||
@@ -1,6 +1,7 @@
|
||||
## args: double
|
||||
## ret: double
|
||||
## includes: math.h
|
||||
+## name: workload-random
|
||||
0x1.bcb6129b5ff2bp8
|
||||
-0x1.63057386325ebp9
|
||||
0x1.62f1d7dc4e8bfp9
|
||||
diff --git a/sysdeps/ieee754/dbl-64/e_sinh.c b/sysdeps/ieee754/dbl-64/e_sinh.c
|
||||
index b4b5857dddf90f7a..3f787967f93d72f0 100644
|
||||
--- a/sysdeps/ieee754/dbl-64/e_sinh.c
|
||||
+++ b/sysdeps/ieee754/dbl-64/e_sinh.c
|
||||
@@ -41,6 +41,11 @@ static char rcsid[] = "$NetBSD: e_sinh.c,v 1.7 1995/05/10 20:46:13 jtc Exp $";
|
||||
|
||||
static const double one = 1.0, shuge = 1.0e307;
|
||||
|
||||
+#ifndef SECTION
|
||||
+# define SECTION
|
||||
+#endif
|
||||
+
|
||||
+SECTION
|
||||
double
|
||||
__ieee754_sinh (double x)
|
||||
{
|
||||
@@ -90,4 +95,7 @@ __ieee754_sinh (double x)
|
||||
/* |x| > overflowthresold, sinh(x) overflow */
|
||||
return math_narrow_eval (x * shuge);
|
||||
}
|
||||
+
|
||||
+#ifndef __ieee754_sinh
|
||||
libm_alias_finite (__ieee754_sinh, __sinh)
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
index 0f69f7089c06af73..b527cab8d134be21 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
@@ -5,6 +5,7 @@ CFLAGS-e_exp-fma.c = -mfma -mavx2
|
||||
CFLAGS-e_log-fma.c = -mfma -mavx2
|
||||
CFLAGS-e_log2-fma.c = -mfma -mavx2
|
||||
CFLAGS-e_pow-fma.c = -mfma -mavx2
|
||||
+CFLAGS-e_sinh-fma.c = -mfma -mavx2
|
||||
CFLAGS-s_atan-fma.c = -mfma -mavx2
|
||||
CFLAGS-s_expm1-fma.c = -mfma -mavx2
|
||||
CFLAGS-s_log1p-fma.c = -mfma -mavx2
|
||||
@@ -67,6 +68,7 @@ libm-sysdep_routines += \
|
||||
e_logf-fma \
|
||||
e_pow-fma \
|
||||
e_powf-fma \
|
||||
+ e_sinh-fma \
|
||||
s_atan-avx \
|
||||
s_atan-fma \
|
||||
s_ceil-sse4_1 \
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..e0e1e39a7a606dc8
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
|
||||
@@ -0,0 +1,12 @@
|
||||
+#define __ieee754_sinh __ieee754_sinh_fma
|
||||
+#define __ieee754_exp __ieee754_exp_fma
|
||||
+#define __expm1 __expm1_fma
|
||||
+
|
||||
+/* NB: __expm1 may be expanded to __expm1_fma in the following
|
||||
+ prototypes. */
|
||||
+extern long double __expm1l (long double);
|
||||
+extern long double __expm1f128 (long double);
|
||||
+
|
||||
+#define SECTION __attribute__ ((section (".text.fma")))
|
||||
+
|
||||
+#include <sysdeps/ieee754/dbl-64/e_sinh.c>
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh.c b/sysdeps/x86_64/fpu/multiarch/e_sinh.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..3d3c18ccdf1d437a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/e_sinh.c
|
||||
@@ -0,0 +1,35 @@
|
||||
+/* Multiple versions of sinh.
|
||||
+ Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdeps/x86/isa-level.h>
|
||||
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
|
||||
+# include <libm-alias-finite.h>
|
||||
+
|
||||
+extern double __redirect_ieee754_sinh (double);
|
||||
+
|
||||
+# define SYMBOL_NAME ieee754_sinh
|
||||
+# include "ifunc-fma.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_ieee754_sinh, __ieee754_sinh,
|
||||
+ IFUNC_SELECTOR ());
|
||||
+
|
||||
+libm_alias_finite (__ieee754_sinh, __sinh)
|
||||
+
|
||||
+# define __ieee754_sinh __ieee754_sinh_sse2
|
||||
+#endif
|
||||
+#include <sysdeps/ieee754/dbl-64/e_sinh.c>
|
123
glibc-upstream-2.39-170.patch
Normal file
123
glibc-upstream-2.39-170.patch
Normal file
@ -0,0 +1,123 @@
|
||||
commit 4cf3f9df544a6f3dc27ea097b43bd2fb73113c3f
|
||||
Author: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
Date: Wed Mar 5 16:13:38 2025 -0800
|
||||
|
||||
x86_64: Add atanh with FMA
|
||||
|
||||
On SPR, it improves atanh bench performance by:
|
||||
|
||||
Before After Improvement
|
||||
reciprocal-throughput 15.1715 14.8628 2%
|
||||
latency 57.1941 56.1883 2%
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit c7c4a5906f326f1290b1c2413a83c530564ec4b8)
|
||||
|
||||
diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs
|
||||
index 455aa65b6500bccb..498529325436d48f 100644
|
||||
--- a/benchtests/atanh-inputs
|
||||
+++ b/benchtests/atanh-inputs
|
||||
@@ -1,6 +1,7 @@
|
||||
## args: double
|
||||
## ret: double
|
||||
## includes: math.h
|
||||
+## name: workload-random
|
||||
0x1.5a2730bacd94ap-1
|
||||
-0x1.b57eb40fc048ep-21
|
||||
-0x1.c0b185fb450e2p-17
|
||||
diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c
|
||||
index 11a2a45799d09f63..05ac0a1b30c164a7 100644
|
||||
--- a/sysdeps/ieee754/dbl-64/e_atanh.c
|
||||
+++ b/sysdeps/ieee754/dbl-64/e_atanh.c
|
||||
@@ -44,6 +44,11 @@
|
||||
|
||||
static const double huge = 1e300;
|
||||
|
||||
+#ifndef SECTION
|
||||
+# define SECTION
|
||||
+#endif
|
||||
+
|
||||
+SECTION
|
||||
double
|
||||
__ieee754_atanh (double x)
|
||||
{
|
||||
@@ -73,4 +78,7 @@ __ieee754_atanh (double x)
|
||||
|
||||
return copysign (t, x);
|
||||
}
|
||||
+
|
||||
+#ifndef __ieee754_atanh
|
||||
libm_alias_finite (__ieee754_atanh, __atanh)
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
index b527cab8d134be21..bc479b42d279825b 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
@@ -1,6 +1,7 @@
|
||||
ifeq ($(subdir),math)
|
||||
CFLAGS-e_asin-fma.c = -mfma -mavx2
|
||||
CFLAGS-e_atan2-fma.c = -mfma -mavx2
|
||||
+CFLAGS-e_atanh-fma.c = -mfma -mavx2
|
||||
CFLAGS-e_exp-fma.c = -mfma -mavx2
|
||||
CFLAGS-e_log-fma.c = -mfma -mavx2
|
||||
CFLAGS-e_log2-fma.c = -mfma -mavx2
|
||||
@@ -57,6 +58,7 @@ libm-sysdep_routines += \
|
||||
e_asin-fma \
|
||||
e_atan2-avx \
|
||||
e_atan2-fma \
|
||||
+ e_atanh-fma \
|
||||
e_exp-avx \
|
||||
e_exp-fma \
|
||||
e_exp2f-fma \
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..c3f2f9e5506ae363
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
|
||||
@@ -0,0 +1,6 @@
|
||||
+#define __ieee754_atanh __ieee754_atanh_fma
|
||||
+#define __log1p __log1p_fma
|
||||
+
|
||||
+#define SECTION __attribute__ ((section (".text.fma")))
|
||||
+
|
||||
+#include <sysdeps/ieee754/dbl-64/e_atanh.c>
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh.c b/sysdeps/x86_64/fpu/multiarch/e_atanh.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..d2b785dfc0268df8
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c
|
||||
@@ -0,0 +1,34 @@
|
||||
+/* Multiple versions of atanh.
|
||||
+ Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdeps/x86/isa-level.h>
|
||||
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
|
||||
+# include <libm-alias-finite.h>
|
||||
+
|
||||
+extern double __redirect_ieee754_atanh (double);
|
||||
+
|
||||
+# define SYMBOL_NAME ieee754_atanh
|
||||
+# include "ifunc-fma.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ());
|
||||
+
|
||||
+libm_alias_finite (__ieee754_atanh, __atanh)
|
||||
+
|
||||
+# define __ieee754_atanh __ieee754_atanh_sse2
|
||||
+#endif
|
||||
+#include <sysdeps/ieee754/dbl-64/e_atanh.c>
|
47
glibc-upstream-2.39-171.patch
Normal file
47
glibc-upstream-2.39-171.patch
Normal file
@ -0,0 +1,47 @@
|
||||
commit 60cd7123a6c4441a509c22cc1d5da60df2c1dfeb
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Fri Mar 28 09:26:06 2025 +0100
|
||||
|
||||
x86: Skip XSAVE state size reset if ISA level requires XSAVE
|
||||
|
||||
If we have to use XSAVE or XSAVEC trampolines, do not adjust the size
|
||||
information they need. Technically, it is an operator error to try to
|
||||
run with -XSAVE,-XSAVEC on such builds, but this change here disables
|
||||
some unnecessary code with higher ISA levels and simplifies testing.
|
||||
|
||||
Related to commit befe2d3c4dec8be2cdd01a47132e47bdb7020922
|
||||
("x86-64: Don't use SSE resolvers for ISA level 3 or above").
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit 59585ddaa2d44f22af04bb4b8bd4ad1e302c4c02)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 3d7c2819d7cc6643..4c535970d10a2d67 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -24,6 +24,7 @@
|
||||
#include <dl-cacheinfo.h>
|
||||
#include <dl-minsigstacksize.h>
|
||||
#include <dl-hwcap2.h>
|
||||
+#include <gcc-macros.h>
|
||||
|
||||
extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
|
||||
attribute_hidden;
|
||||
@@ -1092,6 +1093,9 @@ no_cpuid:
|
||||
TUNABLE_CALLBACK (set_prefer_map_32bit_exec));
|
||||
#endif
|
||||
|
||||
+ /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build
|
||||
+ requires AVX and therefore XSAVE or XSAVEC support. */
|
||||
+#ifndef GCCMACRO__AVX__
|
||||
bool disable_xsave_features = false;
|
||||
|
||||
if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE))
|
||||
@@ -1145,6 +1149,7 @@ no_cpuid:
|
||||
|
||||
CPU_FEATURE_UNSET (cpu_features, FMA4);
|
||||
}
|
||||
+#endif
|
||||
|
||||
#ifdef __x86_64__
|
||||
GLRO(dl_hwcap) = HWCAP_X86_64;
|
183
glibc-upstream-2.39-172.patch
Normal file
183
glibc-upstream-2.39-172.patch
Normal file
@ -0,0 +1,183 @@
|
||||
commit 87ab0c7f7f7c4bc16cda782c703b61cd28f383a3
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Fri Mar 28 09:26:59 2025 +0100
|
||||
|
||||
x86: Use separate variable for TLSDESC XSAVE/XSAVEC state size (bug 32810)
|
||||
|
||||
Previously, the initialization code reused the xsave_state_full_size
|
||||
member of struct cpu_features for the TLSDESC state size. However,
|
||||
the tunable processing code assumes that this member has the
|
||||
original XSAVE (non-compact) state size, so that it can use its
|
||||
value if XSAVEC is disabled via tunable.
|
||||
|
||||
This change uses a separate variable and not a struct member because
|
||||
the value is only needed in ld.so and the static libc, but not in
|
||||
libc.so. As a result, struct cpu_features layout does not change,
|
||||
helping a future backport of this change.
|
||||
|
||||
Fixes commit 9b7091415af47082664717210ac49d51551456ab ("x86-64:
|
||||
Update _dl_tlsdesc_dynamic to preserve AMX registers").
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit 145097dff170507fe73190e8e41194f5b5f7e6bf)
|
||||
|
||||
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
|
||||
index 5311b594aff62f7c..8819fba1b7164f45 100644
|
||||
--- a/sysdeps/x86/Makefile
|
||||
+++ b/sysdeps/x86/Makefile
|
||||
@@ -21,6 +21,9 @@ tests += \
|
||||
tst-cpu-features-supports-static \
|
||||
tst-get-cpu-features \
|
||||
tst-get-cpu-features-static \
|
||||
+ tst-gnu2-tls2-x86-noxsave \
|
||||
+ tst-gnu2-tls2-x86-noxsavec \
|
||||
+ tst-gnu2-tls2-x86-noxsavexsavec \
|
||||
tst-hwcap-tunables \
|
||||
# tests
|
||||
tests-static += \
|
||||
@@ -91,6 +94,22 @@ CFLAGS-tst-gnu2-tls2.c += -msse
|
||||
CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
|
||||
CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
|
||||
CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
|
||||
+
|
||||
+LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy
|
||||
+LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy
|
||||
+LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy
|
||||
+
|
||||
+# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled
|
||||
+# via tunable.
|
||||
+tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE
|
||||
+tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
|
||||
+tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC
|
||||
+$(objpfx)tst-gnu2-tls2-x86-noxsave.out \
|
||||
+$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \
|
||||
+$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \
|
||||
+ $(objpfx)tst-gnu2-tls2mod0.so \
|
||||
+ $(objpfx)tst-gnu2-tls2mod1.so \
|
||||
+ $(objpfx)tst-gnu2-tls2mod2.so
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),math)
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 4c535970d10a2d67..3be69558a4c3aa2d 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -84,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *)
|
||||
# include <dl-cet.h>
|
||||
#endif
|
||||
|
||||
+unsigned long int _dl_x86_features_tlsdesc_state_size;
|
||||
+
|
||||
static void
|
||||
update_active (struct cpu_features *cpu_features)
|
||||
{
|
||||
@@ -318,6 +320,7 @@ update_active (struct cpu_features *cpu_features)
|
||||
= xsave_state_full_size;
|
||||
cpu_features->xsave_state_full_size
|
||||
= xsave_state_full_size;
|
||||
+ _dl_x86_features_tlsdesc_state_size = xsave_state_full_size;
|
||||
|
||||
/* Check if XSAVEC is available. */
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC))
|
||||
@@ -406,11 +409,9 @@ update_active (struct cpu_features *cpu_features)
|
||||
= ALIGN_UP ((amx_size
|
||||
+ TLSDESC_CALL_REGISTER_SAVE_AREA),
|
||||
64);
|
||||
- /* Set xsave_state_full_size to the compact AMX
|
||||
- state size for XSAVEC. NB: xsave_state_full_size
|
||||
- is only used in _dl_tlsdesc_dynamic_xsave and
|
||||
- _dl_tlsdesc_dynamic_xsavec. */
|
||||
- cpu_features->xsave_state_full_size = amx_size;
|
||||
+ /* Set TLSDESC state size to the compact AMX
|
||||
+ state size for XSAVEC. */
|
||||
+ _dl_x86_features_tlsdesc_state_size = amx_size;
|
||||
#endif
|
||||
cpu_features->xsave_state_size
|
||||
= ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
|
||||
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
|
||||
index 89da7a03daa665f6..a72ba61d837c6383 100644
|
||||
--- a/sysdeps/x86/cpu-tunables.c
|
||||
+++ b/sysdeps/x86/cpu-tunables.c
|
||||
@@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||||
/* Update xsave_state_size to XSAVE state size. */
|
||||
cpu_features->xsave_state_size
|
||||
= cpu_features->xsave_state_full_size;
|
||||
+ _dl_x86_features_tlsdesc_state_size
|
||||
+ = cpu_features->xsave_state_full_size;
|
||||
CPU_FEATURE_UNSET (cpu_features, XSAVEC);
|
||||
}
|
||||
}
|
||||
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
|
||||
index c76ea3be16f6bead..9f10645ee9778741 100644
|
||||
--- a/sysdeps/x86/dl-diagnostics-cpu.c
|
||||
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
|
||||
@@ -78,6 +78,8 @@ _dl_diagnostics_cpu (void)
|
||||
cpu_features->xsave_state_size);
|
||||
print_cpu_features_value ("xsave_state_full_size",
|
||||
cpu_features->xsave_state_full_size);
|
||||
+ print_cpu_features_value ("tlsdesc_state_full_size",
|
||||
+ _dl_x86_features_tlsdesc_state_size);
|
||||
print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size);
|
||||
print_cpu_features_value ("shared_cache_size",
|
||||
cpu_features->shared_cache_size);
|
||||
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
|
||||
index cd7bd27cf35959fd..a11d4be30b696ac3 100644
|
||||
--- a/sysdeps/x86/include/cpu-features.h
|
||||
+++ b/sysdeps/x86/include/cpu-features.h
|
||||
@@ -934,8 +934,6 @@ struct cpu_features
|
||||
/* The full state size for XSAVE when XSAVEC is disabled by
|
||||
|
||||
GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
|
||||
-
|
||||
- and the AMX state size when XSAVEC is available.
|
||||
*/
|
||||
unsigned int xsave_state_full_size;
|
||||
/* Data cache size for use in memory and string routines, typically
|
||||
@@ -987,6 +985,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void)
|
||||
|
||||
#define __get_cpu_features() _dl_x86_get_cpu_features()
|
||||
|
||||
+#if IS_IN (rtld) || IS_IN (libc)
|
||||
+/* XSAVE/XSAVEC state size used by TLS descriptors. Compared to
|
||||
+ xsave_state_size from struct cpu_features, this includes additional
|
||||
+ registers. */
|
||||
+extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden;
|
||||
+#endif
|
||||
+
|
||||
#if defined (_LIBC) && !IS_IN (nonlib)
|
||||
/* Unused for x86. */
|
||||
# define INIT_ARCH()
|
||||
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..f0024c143d1a1df5
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
|
||||
@@ -0,0 +1 @@
|
||||
+#include <elf/tst-gnu2-tls2.c>
|
||||
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..f0024c143d1a1df5
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
|
||||
@@ -0,0 +1 @@
|
||||
+#include <elf/tst-gnu2-tls2.c>
|
||||
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..f0024c143d1a1df5
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
|
||||
@@ -0,0 +1 @@
|
||||
+#include <elf/tst-gnu2-tls2.c>
|
||||
diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
|
||||
index 9f02cfc3eb297ed2..44d948696fbe44af 100644
|
||||
--- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h
|
||||
+++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
|
||||
@@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic:
|
||||
# endif
|
||||
#else
|
||||
/* Allocate stack space of the required size to save the state. */
|
||||
- sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP
|
||||
+ sub _dl_x86_features_tlsdesc_state_size(%rip), %RSP_LP
|
||||
#endif
|
||||
/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
|
||||
r10 and r11. */
|
28
glibc-upstream-2.39-173.patch
Normal file
28
glibc-upstream-2.39-173.patch
Normal file
@ -0,0 +1,28 @@
|
||||
commit 837a36c371f18a3152d032e8060f4e5120c25e2b
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Mon Mar 31 21:33:18 2025 +0200
|
||||
|
||||
x86: Link tst-gnu2-tls2-x86-noxsave{,c,xsavec} with libpthread
|
||||
|
||||
This fixes a test build failure on Hurd.
|
||||
|
||||
Fixes commit 145097dff170507fe73190e8e41194f5b5f7e6bf ("x86: Use separate
|
||||
variable for TLSDESC XSAVE/XSAVEC state size (bug 32810)").
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
(cherry picked from commit c6e2895695118ab59c7b17feb0fcb75a53e3478c)
|
||||
|
||||
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
|
||||
index 8819fba1b7164f45..01b0192ddf5e23ca 100644
|
||||
--- a/sysdeps/x86/Makefile
|
||||
+++ b/sysdeps/x86/Makefile
|
||||
@@ -104,6 +104,9 @@ LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy
|
||||
tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE
|
||||
tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
|
||||
tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC
|
||||
+$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library)
|
||||
+$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library)
|
||||
+$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library)
|
||||
$(objpfx)tst-gnu2-tls2-x86-noxsave.out \
|
||||
$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \
|
||||
$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \
|
202
glibc-upstream-2.39-174.patch
Normal file
202
glibc-upstream-2.39-174.patch
Normal file
@ -0,0 +1,202 @@
|
||||
commit 0da58e8be087ca7011ec918977c2ffac9034d1d4
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri May 24 12:38:51 2024 -0500
|
||||
|
||||
x86: Add seperate non-temporal tunable for memset
|
||||
|
||||
The tuning for non-temporal stores for memset vs memcpy is not always
|
||||
the same. This includes both the exact value and whether non-temporal
|
||||
stores are profitable at all for a given arch.
|
||||
|
||||
This patch adds `x86_memset_non_temporal_threshold`. Currently we
|
||||
disable non-temporal stores for non Intel vendors as the only
|
||||
benchmarks showing its benefit have been on Intel hardware.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
|
||||
(cherry picked from commit 46b5e98ef6f1b9f4b53851f152ecb8209064b26c)
|
||||
|
||||
diff --git a/manual/tunables.texi b/manual/tunables.texi
|
||||
index be97190d67b1c82e..b255a149d10aecf6 100644
|
||||
--- a/manual/tunables.texi
|
||||
+++ b/manual/tunables.texi
|
||||
@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
|
||||
glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
|
||||
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
|
||||
glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
|
||||
+glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
|
||||
glibc.cpu.x86_shstk:
|
||||
glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
|
||||
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
|
||||
@@ -485,7 +486,8 @@ thread stack originally backup by Huge Pages to default pages.
|
||||
@cindex shared_cache_size tunables
|
||||
@cindex tunables, shared_cache_size
|
||||
@cindex non_temporal_threshold tunables
|
||||
-@cindex tunables, non_temporal_threshold
|
||||
+@cindex memset_non_temporal_threshold tunables
|
||||
+@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
|
||||
|
||||
@deftp {Tunable namespace} glibc.cpu
|
||||
Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
|
||||
@@ -561,6 +563,18 @@ like memmove and memcpy.
|
||||
This tunable is specific to i386 and x86-64.
|
||||
@end deftp
|
||||
|
||||
+@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
|
||||
+The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
|
||||
+the user to set threshold in bytes for non temporal store in
|
||||
+memset. Non temporal stores give a hint to the hardware to move data
|
||||
+directly to memory without displacing other data from the cache. This
|
||||
+tunable is used by some platforms to determine when to use non
|
||||
+temporal stores memset.
|
||||
+
|
||||
+This tunable is specific to i386 and x86-64.
|
||||
+@end deftp
|
||||
+
|
||||
+
|
||||
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
|
||||
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
|
||||
set threshold in bytes to start using "rep movsb". The value must be
|
||||
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
|
||||
index ab73556772209402..83491607c761ccc6 100644
|
||||
--- a/sysdeps/x86/cacheinfo.h
|
||||
+++ b/sysdeps/x86/cacheinfo.h
|
||||
@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
|
||||
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
|
||||
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
|
||||
|
||||
-/* Threshold to use non temporal store. */
|
||||
+/* Threshold to use non temporal store in memmove. */
|
||||
long int __x86_shared_non_temporal_threshold attribute_hidden;
|
||||
|
||||
+/* Threshold to use non temporal store in memset. */
|
||||
+long int __x86_memset_non_temporal_threshold attribute_hidden;
|
||||
+
|
||||
/* Threshold to use Enhanced REP MOVSB. */
|
||||
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
|
||||
|
||||
@@ -77,6 +80,9 @@ init_cacheinfo (void)
|
||||
__x86_shared_non_temporal_threshold
|
||||
= cpu_features->non_temporal_threshold;
|
||||
|
||||
+ __x86_memset_non_temporal_threshold
|
||||
+ = cpu_features->memset_non_temporal_threshold;
|
||||
+
|
||||
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
|
||||
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
|
||||
__x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
|
||||
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||
index 1f68968a9a457586..0e7c1e0415d4137b 100644
|
||||
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||
@@ -986,6 +986,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||
rep_movsb_threshold = 2112;
|
||||
|
||||
+ /* Non-temporal stores in memset have only been tested on Intel hardware.
|
||||
+ Until we benchmark data on other x86 processor, disable non-temporal
|
||||
+ stores in memset. */
|
||||
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||
+ if (cpu_features->basic.kind == arch_kind_intel)
|
||||
+ memset_non_temporal_threshold = non_temporal_threshold;
|
||||
+
|
||||
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||
cases slower than the vectorized path (and for some alignments,
|
||||
it is really slow, check BZ #30994). */
|
||||
@@ -1012,6 +1019,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
&& tunable_size <= maximum_non_temporal_threshold)
|
||||
non_temporal_threshold = tunable_size;
|
||||
|
||||
+ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
|
||||
+ if (tunable_size > minimum_non_temporal_threshold
|
||||
+ && tunable_size <= maximum_non_temporal_threshold)
|
||||
+ memset_non_temporal_threshold = tunable_size;
|
||||
+
|
||||
tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
|
||||
if (tunable_size > minimum_rep_movsb_threshold)
|
||||
rep_movsb_threshold = tunable_size;
|
||||
@@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
|
||||
minimum_non_temporal_threshold,
|
||||
maximum_non_temporal_threshold);
|
||||
+ TUNABLE_SET_WITH_BOUNDS (
|
||||
+ x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
|
||||
+ minimum_non_temporal_threshold, maximum_non_temporal_threshold);
|
||||
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
|
||||
minimum_rep_movsb_threshold, SIZE_MAX);
|
||||
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
|
||||
@@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
cpu_features->data_cache_size = data;
|
||||
cpu_features->shared_cache_size = shared;
|
||||
cpu_features->non_temporal_threshold = non_temporal_threshold;
|
||||
+ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
|
||||
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
|
||||
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
|
||||
cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
|
||||
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
|
||||
index 9f10645ee9778741..8113a93883cfe7a2 100644
|
||||
--- a/sysdeps/x86/dl-diagnostics-cpu.c
|
||||
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
|
||||
@@ -85,6 +85,8 @@ _dl_diagnostics_cpu (void)
|
||||
cpu_features->shared_cache_size);
|
||||
print_cpu_features_value ("non_temporal_threshold",
|
||||
cpu_features->non_temporal_threshold);
|
||||
+ print_cpu_features_value ("memset_non_temporal_threshold",
|
||||
+ cpu_features->memset_non_temporal_threshold);
|
||||
print_cpu_features_value ("rep_movsb_threshold",
|
||||
cpu_features->rep_movsb_threshold);
|
||||
print_cpu_features_value ("rep_movsb_stop_threshold",
|
||||
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
|
||||
index 7d82da0dece49c45..a0a12995927dc4f1 100644
|
||||
--- a/sysdeps/x86/dl-tunables.list
|
||||
+++ b/sysdeps/x86/dl-tunables.list
|
||||
@@ -30,6 +30,9 @@ glibc {
|
||||
x86_non_temporal_threshold {
|
||||
type: SIZE_T
|
||||
}
|
||||
+ x86_memset_non_temporal_threshold {
|
||||
+ type: SIZE_T
|
||||
+ }
|
||||
x86_rep_movsb_threshold {
|
||||
type: SIZE_T
|
||||
# Since there is overhead to set up REP MOVSB operation, REP
|
||||
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
|
||||
index a11d4be30b696ac3..03c71387dd08982b 100644
|
||||
--- a/sysdeps/x86/include/cpu-features.h
|
||||
+++ b/sysdeps/x86/include/cpu-features.h
|
||||
@@ -942,8 +942,10 @@ struct cpu_features
|
||||
/* Shared cache size for use in memory and string routines, typically
|
||||
L2 or L3 size. */
|
||||
unsigned long int shared_cache_size;
|
||||
- /* Threshold to use non temporal store. */
|
||||
+ /* Threshold to use non temporal store in memmove. */
|
||||
unsigned long int non_temporal_threshold;
|
||||
+ /* Threshold to use non temporal store in memset. */
|
||||
+ unsigned long int memset_non_temporal_threshold;
|
||||
/* Threshold to use "rep movsb". */
|
||||
unsigned long int rep_movsb_threshold;
|
||||
/* Threshold to stop using "rep movsb". */
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index 637caadb406b2544..88bf08e4f4a2260e 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -24,9 +24,9 @@
|
||||
5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
|
||||
4 VEC stores and store 4 * VEC at a time until done.
|
||||
6. On machines ERMS feature, if size is range
|
||||
- [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
|
||||
+ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
|
||||
then REP STOSB will be used.
|
||||
- 7. If size >= __x86_shared_non_temporal_threshold, use a
|
||||
+ 7. If size >= __x86_memset_non_temporal_threshold, use a
|
||||
non-temporal stores. */
|
||||
|
||||
#include <sysdep.h>
|
||||
@@ -318,7 +318,7 @@ L(return_vzeroupper):
|
||||
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
|
||||
range for 2-byte jump encoding. */
|
||||
L(stosb_local):
|
||||
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||||
+ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
|
||||
jae L(nt_memset)
|
||||
movzbl %sil, %eax
|
||||
mov %RDX_LP, %RCX_LP
|
43
glibc-upstream-2.39-175.patch
Normal file
43
glibc-upstream-2.39-175.patch
Normal file
@ -0,0 +1,43 @@
|
||||
commit cc59fa5dbc4db7c6d1fb792c55a5d83c54ee72bf
|
||||
Author: Joe Damato <jdamato@fastly.com>
|
||||
Date: Fri Jun 7 23:04:47 2024 +0000
|
||||
|
||||
x86: Enable non-temporal memset tunable for AMD
|
||||
|
||||
In commit 46b5e98ef6f1 ("x86: Add seperate non-temporal tunable for
|
||||
memset") a tunable threshold for enabling non-temporal memset was added,
|
||||
but only for Intel hardware.
|
||||
|
||||
Since that commit, new benchmark results suggest that non-temporal
|
||||
memset is beneficial on AMD, as well, so allow this tunable to be set
|
||||
for AMD.
|
||||
|
||||
See:
|
||||
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
|
||||
which has been updated to include data using different strategies for
|
||||
large memset on AMD Zen2, Zen3, and Zen4.
|
||||
|
||||
Signed-off-by: Joe Damato <jdamato@fastly.com>
|
||||
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
(cherry picked from commit bef2a827a55fc759693ccc5b0f614353b8ad712d)
|
||||
|
||||
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||
index 0e7c1e0415d4137b..9916c5d951361c90 100644
|
||||
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||
@@ -986,11 +986,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||
rep_movsb_threshold = 2112;
|
||||
|
||||
- /* Non-temporal stores in memset have only been tested on Intel hardware.
|
||||
- Until we benchmark data on other x86 processor, disable non-temporal
|
||||
- stores in memset. */
|
||||
+ /* Non-temporal stores are more performant on Intel and AMD hardware above
|
||||
+ non_temporal_threshold. Enable this for both Intel and AMD hardware. */
|
||||
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||
- if (cpu_features->basic.kind == arch_kind_intel)
|
||||
+ if (cpu_features->basic.kind == arch_kind_intel
|
||||
+ || cpu_features->basic.kind == arch_kind_amd)
|
||||
memset_non_temporal_threshold = non_temporal_threshold;
|
||||
|
||||
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
37
glibc-upstream-2.39-176.patch
Normal file
37
glibc-upstream-2.39-176.patch
Normal file
@ -0,0 +1,37 @@
|
||||
commit 38a7632f2d1ec86445904b356c54129591e8519b
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri Jun 14 13:01:58 2024 -0500
|
||||
|
||||
x86: Fix value for `x86_memset_non_temporal_threshold` when it is undesirable
|
||||
|
||||
When we don't want to use non-temporal stores for memset, we set
|
||||
`x86_memset_non_temporal_threshold` to SIZE_MAX.
|
||||
|
||||
The current code, however, was using `maximum_non_temporal_threshold`
|
||||
as the upper bound which is `SIZE_MAX >> 4` so we ended up with a
|
||||
value of `0`.
|
||||
|
||||
Fix is to just use `SIZE_MAX` as the upper bound for when setting the
|
||||
tunable.
|
||||
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
|
||||
(cherry picked from commit 5b54a33435e5533653a9956728f2de9d16a3b4ee)
|
||||
|
||||
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||
index 9916c5d951361c90..9b6f68e46de4bdaa 100644
|
||||
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||
@@ -1044,9 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
|
||||
minimum_non_temporal_threshold,
|
||||
maximum_non_temporal_threshold);
|
||||
- TUNABLE_SET_WITH_BOUNDS (
|
||||
- x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
|
||||
- minimum_non_temporal_threshold, maximum_non_temporal_threshold);
|
||||
+ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
|
||||
+ memset_non_temporal_threshold,
|
||||
+ minimum_non_temporal_threshold, SIZE_MAX);
|
||||
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
|
||||
minimum_rep_movsb_threshold, SIZE_MAX);
|
||||
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
|
121
glibc-upstream-2.39-177.patch
Normal file
121
glibc-upstream-2.39-177.patch
Normal file
@ -0,0 +1,121 @@
|
||||
commit bde201e92c1e64934f8ffe3e5b7d769100677037
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon Jul 15 16:19:17 2024 +0800
|
||||
|
||||
x86: Disable non-temporal memset on Skylake Server
|
||||
|
||||
The original commit enabling non-temporal memset on Skylake Server had
|
||||
erroneous benchmarks (actually done on ICX).
|
||||
|
||||
Further benchmarks indicate non-temporal stores may in fact be a
|
||||
regression on Skylake Server.
|
||||
|
||||
This commit may be over-cautious in some cases, but should avoid any
|
||||
regressions for 2.40.
|
||||
|
||||
Tested using qemu on all x86_64 cpu arch supported by both qemu +
|
||||
GLIBC.
|
||||
|
||||
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit 5bcf6265f215326d14dfacdce8532792c2c7f8f8)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 3be69558a4c3aa2d..77b5638daafe9a1e 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -872,11 +872,18 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
|
||||
/* Newer Bigcore microarch (larger non-temporal store
|
||||
threshold). */
|
||||
- case INTEL_BIGCORE_SKYLAKE:
|
||||
- case INTEL_BIGCORE_KABYLAKE:
|
||||
- case INTEL_BIGCORE_COMETLAKE:
|
||||
case INTEL_BIGCORE_SKYLAKE_AVX512:
|
||||
case INTEL_BIGCORE_CANNONLAKE:
|
||||
+ /* Benchmarks indicate non-temporal memset is not
|
||||
+ necessarily profitable on SKX (and in some cases much
|
||||
+ worse). This is likely unique to SKX due its it unique
|
||||
+ mesh interconnect (not present on ICX or BWD). Disable
|
||||
+ non-temporal on all Skylake servers. */
|
||||
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||
+ |= bit_arch_Avoid_Non_Temporal_Memset;
|
||||
+ case INTEL_BIGCORE_COMETLAKE:
|
||||
+ case INTEL_BIGCORE_SKYLAKE:
|
||||
+ case INTEL_BIGCORE_KABYLAKE:
|
||||
case INTEL_BIGCORE_ICELAKE:
|
||||
case INTEL_BIGCORE_TIGERLAKE:
|
||||
case INTEL_BIGCORE_ROCKETLAKE:
|
||||
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
|
||||
index a72ba61d837c6383..a71772c9c07d01d7 100644
|
||||
--- a/sysdeps/x86/cpu-tunables.c
|
||||
+++ b/sysdeps/x86/cpu-tunables.c
|
||||
@@ -245,6 +245,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||||
(n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
|
||||
}
|
||||
break;
|
||||
+ case 25:
|
||||
+ {
|
||||
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
|
||||
+ Avoid_Non_Temporal_Memset, 25);
|
||||
+ }
|
||||
case 26:
|
||||
{
|
||||
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
|
||||
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||
index 9b6f68e46de4bdaa..66e2b83fea0dc744 100644
|
||||
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||
@@ -989,13 +989,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
/* Non-temporal stores are more performant on Intel and AMD hardware above
|
||||
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
|
||||
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||
- if (cpu_features->basic.kind == arch_kind_intel
|
||||
- || cpu_features->basic.kind == arch_kind_amd)
|
||||
- memset_non_temporal_threshold = non_temporal_threshold;
|
||||
-
|
||||
- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||
- cases slower than the vectorized path (and for some alignments,
|
||||
- it is really slow, check BZ #30994). */
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
|
||||
+ && (cpu_features->basic.kind == arch_kind_intel
|
||||
+ || cpu_features->basic.kind == arch_kind_amd))
|
||||
+ memset_non_temporal_threshold = non_temporal_threshold;
|
||||
+
|
||||
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||
+ cases slower than the vectorized path (and for some alignments,
|
||||
+ it is really slow, check BZ #30994). */
|
||||
if (cpu_features->basic.kind == arch_kind_amd)
|
||||
rep_movsb_threshold = non_temporal_threshold;
|
||||
|
||||
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||
index 85e7f54ec8204328..61bbbc2e8983482e 100644
|
||||
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
|
||||
BIT (MathVec_Prefer_No_AVX512)
|
||||
BIT (Prefer_FSRM)
|
||||
BIT (Avoid_Short_Distance_REP_MOVSB)
|
||||
+BIT (Avoid_Non_Temporal_Memset)
|
||||
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
|
||||
index f6a65b88dea6d9dc..bc573c7435130dee 100644
|
||||
--- a/sysdeps/x86/tst-hwcap-tunables.c
|
||||
+++ b/sysdeps/x86/tst-hwcap-tunables.c
|
||||
@@ -60,7 +60,7 @@ static const struct test_t
|
||||
/* Disable everything. */
|
||||
"-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
|
||||
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
|
||||
- "-AVX_Fast_Unaligned_Load",
|
||||
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
|
||||
test_1,
|
||||
array_length (test_1)
|
||||
},
|
||||
@@ -68,7 +68,7 @@ static const struct test_t
|
||||
/* Same as before, but with some empty suboptions. */
|
||||
",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
|
||||
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
|
||||
- "-ERMS,-AVX_Fast_Unaligned_Load,-,",
|
||||
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
|
||||
test_1,
|
||||
array_length (test_1)
|
||||
}
|
24
glibc-upstream-2.39-178.patch
Normal file
24
glibc-upstream-2.39-178.patch
Normal file
@ -0,0 +1,24 @@
|
||||
commit 2be36448c46e9ef712e5f3d5381f38bf3138efdf
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Fri Aug 2 15:22:14 2024 +0200
|
||||
|
||||
x86: Tunables may incorrectly set Prefer_PMINUB_for_stringop (bug 32047)
|
||||
|
||||
Fixes commit 5bcf6265f215326d14dfacdce8532792c2c7f8f8 ("x86:
|
||||
Disable non-temporal memset on Skylake Server").
|
||||
|
||||
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
(cherry picked from commit 7a630f7d3392ca391a399486ce2846f9e4b4ee63)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
|
||||
index a71772c9c07d01d7..a0b31d80f64127c5 100644
|
||||
--- a/sysdeps/x86/cpu-tunables.c
|
||||
+++ b/sysdeps/x86/cpu-tunables.c
|
||||
@@ -250,6 +250,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||||
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
|
||||
Avoid_Non_Temporal_Memset, 25);
|
||||
}
|
||||
+ break;
|
||||
case 26:
|
||||
{
|
||||
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
|
90
glibc-upstream-2.39-179.patch
Normal file
90
glibc-upstream-2.39-179.patch
Normal file
@ -0,0 +1,90 @@
|
||||
commit 65ae73be01604699493d387d8ea6bba41df004ab
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed Aug 14 14:37:30 2024 +0800
|
||||
|
||||
x86: Use `Avoid_Non_Temporal_Memset` to control non-temporal path
|
||||
|
||||
This is just a refactor and there should be no behavioral change from
|
||||
this commit.
|
||||
|
||||
The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
|
||||
for controlling whether we use non-temporal memset rather than having
|
||||
extra logic based on vendor.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
|
||||
(cherry picked from commit b93dddfaf440aa12f45d7c356f6ffe9f27d35577)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 77b5638daafe9a1e..4490c0a782e25d8d 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -758,6 +758,12 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
unsigned int stepping = 0;
|
||||
enum cpu_features_kind kind;
|
||||
|
||||
+ /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
|
||||
+ as of writing this, we only have benchmarks indicatings it profitability
|
||||
+ on Intel/AMD. */
|
||||
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||
+ |= bit_arch_Avoid_Non_Temporal_Memset;
|
||||
+
|
||||
cpu_features->cachesize_non_temporal_divisor = 4;
|
||||
#if !HAS_CPUID
|
||||
if (__get_cpuid_max (0, 0) == 0)
|
||||
@@ -783,6 +789,11 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
|
||||
update_active (cpu_features);
|
||||
|
||||
+ /* Benchmarks indicate non-temporal memset can be profitable on Intel
|
||||
+ hardware. */
|
||||
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
|
||||
+
|
||||
if (family == 0x06)
|
||||
{
|
||||
model += extended_model;
|
||||
@@ -993,6 +1004,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
|
||||
|
||||
ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
|
||||
|
||||
+ /* Benchmarks indicate non-temporal memset can be profitable on AMD
|
||||
+ hardware. */
|
||||
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||
+ &= ~bit_arch_Avoid_Non_Temporal_Memset;
|
||||
+
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
|
||||
{
|
||||
/* Since the FMA4 bit is in CPUID_INDEX_80000001 and
|
||||
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||||
index 66e2b83fea0dc744..10ad18061a1b47af 100644
|
||||
--- a/sysdeps/x86/dl-cacheinfo.h
|
||||
+++ b/sysdeps/x86/dl-cacheinfo.h
|
||||
@@ -986,14 +986,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||
rep_movsb_threshold = 2112;
|
||||
|
||||
- /* Non-temporal stores are more performant on Intel and AMD hardware above
|
||||
- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
|
||||
- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||
- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
|
||||
- && (cpu_features->basic.kind == arch_kind_intel
|
||||
- || cpu_features->basic.kind == arch_kind_amd))
|
||||
- memset_non_temporal_threshold = non_temporal_threshold;
|
||||
-
|
||||
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
|
||||
cases slower than the vectorized path (and for some alignments,
|
||||
it is really slow, check BZ #30994). */
|
||||
@@ -1015,6 +1007,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
||||
if (tunable_size != 0)
|
||||
shared = tunable_size;
|
||||
|
||||
+ /* Non-temporal stores are more performant on some hardware above
|
||||
+ non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both
|
||||
+ Intel and AMD hardware. */
|
||||
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
|
||||
+ memset_non_temporal_threshold = non_temporal_threshold;
|
||||
+
|
||||
tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
|
||||
if (tunable_size > minimum_non_temporal_threshold
|
||||
&& tunable_size <= maximum_non_temporal_threshold)
|
160
glibc-upstream-2.39-180.patch
Normal file
160
glibc-upstream-2.39-180.patch
Normal file
@ -0,0 +1,160 @@
|
||||
commit 765ff3d0d49f039575dd20961e745fb2876339a7
|
||||
Author: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
Date: Thu Apr 3 13:00:45 2025 -0700
|
||||
|
||||
x86: Optimize xstate size calculation
|
||||
|
||||
Scan xstate IDs up to the maximum supported xstate ID. Remove the
|
||||
separate AMX xstate calculation. Instead, exclude the AMX space from
|
||||
the start of TILECFG to the end of TILEDATA in xsave_state_size.
|
||||
|
||||
Completed validation on SKL/SKX/SPR/SDE and compared xsave state size
|
||||
with "ld.so --list-diagnostics" option, no regression.
|
||||
|
||||
Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
|
||||
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
(cherry picked from commit 70b648855185e967e54668b101d24704c3fb869d)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 4490c0a782e25d8d..dc5cd01d489851b8 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -325,13 +325,8 @@ update_active (struct cpu_features *cpu_features)
|
||||
/* Check if XSAVEC is available. */
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC))
|
||||
{
|
||||
- unsigned int xstate_comp_offsets[32];
|
||||
- unsigned int xstate_comp_sizes[32];
|
||||
-#ifdef __x86_64__
|
||||
- unsigned int xstate_amx_comp_offsets[32];
|
||||
- unsigned int xstate_amx_comp_sizes[32];
|
||||
- unsigned int amx_ecx;
|
||||
-#endif
|
||||
+ unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1];
|
||||
+ unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1];
|
||||
unsigned int i;
|
||||
|
||||
xstate_comp_offsets[0] = 0;
|
||||
@@ -339,39 +334,16 @@ update_active (struct cpu_features *cpu_features)
|
||||
xstate_comp_offsets[2] = 576;
|
||||
xstate_comp_sizes[0] = 160;
|
||||
xstate_comp_sizes[1] = 256;
|
||||
-#ifdef __x86_64__
|
||||
- xstate_amx_comp_offsets[0] = 0;
|
||||
- xstate_amx_comp_offsets[1] = 160;
|
||||
- xstate_amx_comp_offsets[2] = 576;
|
||||
- xstate_amx_comp_sizes[0] = 160;
|
||||
- xstate_amx_comp_sizes[1] = 256;
|
||||
-#endif
|
||||
|
||||
- for (i = 2; i < 32; i++)
|
||||
+ for (i = 2; i <= X86_XSTATE_MAX_ID; i++)
|
||||
{
|
||||
if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0)
|
||||
{
|
||||
__cpuid_count (0xd, i, eax, ebx, ecx, edx);
|
||||
-#ifdef __x86_64__
|
||||
- /* Include this in xsave_state_full_size. */
|
||||
- amx_ecx = ecx;
|
||||
- xstate_amx_comp_sizes[i] = eax;
|
||||
- if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0)
|
||||
- {
|
||||
- /* Exclude this from xsave_state_size. */
|
||||
- ecx = 0;
|
||||
- xstate_comp_sizes[i] = 0;
|
||||
- }
|
||||
- else
|
||||
-#endif
|
||||
- xstate_comp_sizes[i] = eax;
|
||||
+ xstate_comp_sizes[i] = eax;
|
||||
}
|
||||
else
|
||||
{
|
||||
-#ifdef __x86_64__
|
||||
- amx_ecx = 0;
|
||||
- xstate_amx_comp_sizes[i] = 0;
|
||||
-#endif
|
||||
ecx = 0;
|
||||
xstate_comp_sizes[i] = 0;
|
||||
}
|
||||
@@ -380,42 +352,32 @@ update_active (struct cpu_features *cpu_features)
|
||||
{
|
||||
xstate_comp_offsets[i]
|
||||
= (xstate_comp_offsets[i - 1]
|
||||
- + xstate_comp_sizes[i -1]);
|
||||
+ + xstate_comp_sizes[i - 1]);
|
||||
if ((ecx & (1 << 1)) != 0)
|
||||
xstate_comp_offsets[i]
|
||||
= ALIGN_UP (xstate_comp_offsets[i], 64);
|
||||
-#ifdef __x86_64__
|
||||
- xstate_amx_comp_offsets[i]
|
||||
- = (xstate_amx_comp_offsets[i - 1]
|
||||
- + xstate_amx_comp_sizes[i - 1]);
|
||||
- if ((amx_ecx & (1 << 1)) != 0)
|
||||
- xstate_amx_comp_offsets[i]
|
||||
- = ALIGN_UP (xstate_amx_comp_offsets[i],
|
||||
- 64);
|
||||
-#endif
|
||||
}
|
||||
}
|
||||
|
||||
/* Use XSAVEC. */
|
||||
unsigned int size
|
||||
- = xstate_comp_offsets[31] + xstate_comp_sizes[31];
|
||||
+ = (xstate_comp_offsets[X86_XSTATE_MAX_ID]
|
||||
+ + xstate_comp_sizes[X86_XSTATE_MAX_ID]);
|
||||
if (size)
|
||||
{
|
||||
+ size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
|
||||
+ 64);
|
||||
#ifdef __x86_64__
|
||||
- unsigned int amx_size
|
||||
- = (xstate_amx_comp_offsets[31]
|
||||
- + xstate_amx_comp_sizes[31]);
|
||||
- amx_size
|
||||
- = ALIGN_UP ((amx_size
|
||||
- + TLSDESC_CALL_REGISTER_SAVE_AREA),
|
||||
- 64);
|
||||
- /* Set TLSDESC state size to the compact AMX
|
||||
- state size for XSAVEC. */
|
||||
- _dl_x86_features_tlsdesc_state_size = amx_size;
|
||||
+ _dl_x86_features_tlsdesc_state_size = size;
|
||||
+ /* Exclude the AMX space from the start of TILECFG
|
||||
+ space to the end of TILEDATA space. If CPU
|
||||
+ doesn't support AMX, TILECFG offset is the same
|
||||
+ as TILEDATA + 1 offset. Otherwise, they are
|
||||
+ multiples of 64. */
|
||||
+ size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1]
|
||||
+ - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]);
|
||||
#endif
|
||||
- cpu_features->xsave_state_size
|
||||
- = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
|
||||
- 64);
|
||||
+ cpu_features->xsave_state_size = size;
|
||||
CPU_FEATURE_SET (cpu_features, XSAVEC);
|
||||
}
|
||||
}
|
||||
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
|
||||
index 7359149e17ccf341..1d6cabd816bf84cc 100644
|
||||
--- a/sysdeps/x86/sysdep.h
|
||||
+++ b/sysdeps/x86/sysdep.h
|
||||
@@ -102,6 +102,9 @@
|
||||
| (1 << X86_XSTATE_ZMM_ID) \
|
||||
| (1 << X86_XSTATE_APX_F_ID))
|
||||
|
||||
+/* The maximum supported xstate ID. */
|
||||
+# define X86_XSTATE_MAX_ID X86_XSTATE_APX_F_ID
|
||||
+
|
||||
/* AMX state mask. */
|
||||
# define AMX_STATE_SAVE_MASK \
|
||||
((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID))
|
||||
@@ -123,6 +126,9 @@
|
||||
| (1 << X86_XSTATE_K_ID) \
|
||||
| (1 << X86_XSTATE_ZMM_H_ID))
|
||||
|
||||
+/* The maximum supported xstate ID. */
|
||||
+# define X86_XSTATE_MAX_ID X86_XSTATE_ZMM_H_ID
|
||||
+
|
||||
/* States to be included in xsave_state_size. */
|
||||
# define FULL_STATE_SAVE_MASK STATE_SAVE_MASK
|
||||
#endif
|
76
glibc-upstream-2.39-181.patch
Normal file
76
glibc-upstream-2.39-181.patch
Normal file
@ -0,0 +1,76 @@
|
||||
commit 7620d98186fc23e216773dbec5dc5da1fd8daf0f
|
||||
Author: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
Date: Thu Apr 3 18:14:20 2025 -0700
|
||||
|
||||
x86: Add ARL/PTL/CWF model detection support
|
||||
|
||||
- Add ARROWLAKE model detection.
|
||||
- Add PANTHERLAKE model detection.
|
||||
- Add CLEARWATERFOREST model detection.
|
||||
|
||||
Intel® Architecture Instruction Set Extensions Programming Reference
|
||||
https://cdrdv2.intel.com/v1/dl/getContent/671368 Section 1.2.
|
||||
|
||||
No regression, validated model detection on SDE.
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit e53eb952b970ac94c97d74fb447418fb327ca096)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index dc5cd01d489851b8..fb94477dad08ab02 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -512,6 +512,7 @@ enum
|
||||
INTEL_ATOM_GOLDMONT,
|
||||
INTEL_ATOM_GOLDMONT_PLUS,
|
||||
INTEL_ATOM_SIERRAFOREST,
|
||||
+ INTEL_ATOM_CLEARWATERFOREST,
|
||||
INTEL_ATOM_GRANDRIDGE,
|
||||
INTEL_ATOM_TREMONT,
|
||||
|
||||
@@ -539,6 +540,7 @@ enum
|
||||
INTEL_BIGCORE_METEORLAKE,
|
||||
INTEL_BIGCORE_LUNARLAKE,
|
||||
INTEL_BIGCORE_ARROWLAKE,
|
||||
+ INTEL_BIGCORE_PANTHERLAKE,
|
||||
INTEL_BIGCORE_GRANITERAPIDS,
|
||||
|
||||
/* Mixed (bigcore + atom SOC). */
|
||||
@@ -584,6 +586,8 @@ intel_get_fam6_microarch (unsigned int model,
|
||||
return INTEL_ATOM_GOLDMONT_PLUS;
|
||||
case 0xAF:
|
||||
return INTEL_ATOM_SIERRAFOREST;
|
||||
+ case 0xDD:
|
||||
+ return INTEL_ATOM_CLEARWATERFOREST;
|
||||
case 0xB6:
|
||||
return INTEL_ATOM_GRANDRIDGE;
|
||||
case 0x86:
|
||||
@@ -691,8 +695,12 @@ intel_get_fam6_microarch (unsigned int model,
|
||||
return INTEL_BIGCORE_METEORLAKE;
|
||||
case 0xbd:
|
||||
return INTEL_BIGCORE_LUNARLAKE;
|
||||
+ case 0xb5:
|
||||
+ case 0xc5:
|
||||
case 0xc6:
|
||||
return INTEL_BIGCORE_ARROWLAKE;
|
||||
+ case 0xCC:
|
||||
+ return INTEL_BIGCORE_PANTHERLAKE;
|
||||
case 0xAD:
|
||||
case 0xAE:
|
||||
return INTEL_BIGCORE_GRANITERAPIDS;
|
||||
@@ -808,6 +816,7 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
Default tuned atom microarch.
|
||||
case INTEL_ATOM_SIERRAFOREST:
|
||||
case INTEL_ATOM_GRANDRIDGE:
|
||||
+ case INTEL_ATOM_CLEARWATERFOREST:
|
||||
*/
|
||||
|
||||
/* Bigcore/Default Tuning. */
|
||||
@@ -864,6 +873,7 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
case INTEL_BIGCORE_METEORLAKE:
|
||||
case INTEL_BIGCORE_LUNARLAKE:
|
||||
case INTEL_BIGCORE_ARROWLAKE:
|
||||
+ case INTEL_BIGCORE_PANTHERLAKE:
|
||||
case INTEL_BIGCORE_SAPPHIRERAPIDS:
|
||||
case INTEL_BIGCORE_EMERALDRAPIDS:
|
||||
case INTEL_BIGCORE_GRANITERAPIDS:
|
352
glibc-upstream-2.39-182.patch
Normal file
352
glibc-upstream-2.39-182.patch
Normal file
@ -0,0 +1,352 @@
|
||||
commit e09436c2cb5b6453d922c5af6a30e2de0255cd61
|
||||
Author: Sunil K Pandey <sunil.k.pandey@intel.com>
|
||||
Date: Fri Apr 11 08:52:52 2025 -0700
|
||||
|
||||
x86: Handle unknown Intel processor with default tuning
|
||||
|
||||
Enable default tuning for unknown Intel processor.
|
||||
|
||||
Tested on x86, no regression.
|
||||
|
||||
Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit 9f0deff558d1d6b08c425c157f50de85013ada9c)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index fb94477dad08ab02..6d2e660b4b20ff06 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -502,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load
|
||||
"Incorrect index_arch_Fast_Unaligned_Load");
|
||||
|
||||
|
||||
-/* Intel Family-6 microarch list. */
|
||||
-enum
|
||||
+/* Intel microarch list. */
|
||||
+enum intel_microarch
|
||||
{
|
||||
/* Atom processors. */
|
||||
INTEL_ATOM_BONNELL,
|
||||
@@ -555,7 +555,7 @@ enum
|
||||
INTEL_UNKNOWN,
|
||||
};
|
||||
|
||||
-static unsigned int
|
||||
+static enum intel_microarch
|
||||
intel_get_fam6_microarch (unsigned int model,
|
||||
__attribute__ ((unused)) unsigned int stepping)
|
||||
{
|
||||
@@ -764,134 +764,20 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||
&= ~bit_arch_Avoid_Non_Temporal_Memset;
|
||||
|
||||
+ enum intel_microarch microarch = INTEL_UNKNOWN;
|
||||
if (family == 0x06)
|
||||
{
|
||||
model += extended_model;
|
||||
- unsigned int microarch
|
||||
- = intel_get_fam6_microarch (model, stepping);
|
||||
+ microarch = intel_get_fam6_microarch (model, stepping);
|
||||
|
||||
+ /* Disable TSX on some processors to avoid TSX on kernels that
|
||||
+ weren't updated with the latest microcode package (which
|
||||
+ disables broken feature by default). */
|
||||
switch (microarch)
|
||||
{
|
||||
- /* Atom / KNL tuning. */
|
||||
- case INTEL_ATOM_BONNELL:
|
||||
- /* BSF is slow on Bonnell. */
|
||||
- cpu_features->preferred[index_arch_Slow_BSF]
|
||||
- |= bit_arch_Slow_BSF;
|
||||
- break;
|
||||
-
|
||||
- /* Unaligned load versions are faster than SSSE3
|
||||
- on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
|
||||
- case INTEL_ATOM_AIRMONT:
|
||||
- case INTEL_ATOM_SILVERMONT:
|
||||
- case INTEL_ATOM_GOLDMONT:
|
||||
- case INTEL_ATOM_GOLDMONT_PLUS:
|
||||
-
|
||||
- /* Knights Landing. Enable Silvermont optimizations. */
|
||||
- case INTEL_KNIGHTS_LANDING:
|
||||
-
|
||||
- cpu_features->preferred[index_arch_Fast_Unaligned_Load]
|
||||
- |= (bit_arch_Fast_Unaligned_Load
|
||||
- | bit_arch_Fast_Unaligned_Copy
|
||||
- | bit_arch_Prefer_PMINUB_for_stringop
|
||||
- | bit_arch_Slow_SSE4_2);
|
||||
- break;
|
||||
-
|
||||
- case INTEL_ATOM_TREMONT:
|
||||
- /* Enable rep string instructions, unaligned load, unaligned
|
||||
- copy, pminub and avoid SSE 4.2 on Tremont. */
|
||||
- cpu_features->preferred[index_arch_Fast_Rep_String]
|
||||
- |= (bit_arch_Fast_Rep_String
|
||||
- | bit_arch_Fast_Unaligned_Load
|
||||
- | bit_arch_Fast_Unaligned_Copy
|
||||
- | bit_arch_Prefer_PMINUB_for_stringop
|
||||
- | bit_arch_Slow_SSE4_2);
|
||||
- break;
|
||||
-
|
||||
- /*
|
||||
- Default tuned Knights microarch.
|
||||
- case INTEL_KNIGHTS_MILL:
|
||||
- */
|
||||
-
|
||||
- /*
|
||||
- Default tuned atom microarch.
|
||||
- case INTEL_ATOM_SIERRAFOREST:
|
||||
- case INTEL_ATOM_GRANDRIDGE:
|
||||
- case INTEL_ATOM_CLEARWATERFOREST:
|
||||
- */
|
||||
-
|
||||
- /* Bigcore/Default Tuning. */
|
||||
default:
|
||||
- default_tuning:
|
||||
- /* Unknown family 0x06 processors. Assuming this is one
|
||||
- of Core i3/i5/i7 processors if AVX is available. */
|
||||
- if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
|
||||
- break;
|
||||
-
|
||||
- enable_modern_features:
|
||||
- /* Rep string instructions, unaligned load, unaligned copy,
|
||||
- and pminub are fast on Intel Core i3, i5 and i7. */
|
||||
- cpu_features->preferred[index_arch_Fast_Rep_String]
|
||||
- |= (bit_arch_Fast_Rep_String
|
||||
- | bit_arch_Fast_Unaligned_Load
|
||||
- | bit_arch_Fast_Unaligned_Copy
|
||||
- | bit_arch_Prefer_PMINUB_for_stringop);
|
||||
break;
|
||||
|
||||
- case INTEL_BIGCORE_NEHALEM:
|
||||
- case INTEL_BIGCORE_WESTMERE:
|
||||
- /* Older CPUs prefer non-temporal stores at lower threshold. */
|
||||
- cpu_features->cachesize_non_temporal_divisor = 8;
|
||||
- goto enable_modern_features;
|
||||
-
|
||||
- /* Older Bigcore microarch (smaller non-temporal store
|
||||
- threshold). */
|
||||
- case INTEL_BIGCORE_SANDYBRIDGE:
|
||||
- case INTEL_BIGCORE_IVYBRIDGE:
|
||||
- case INTEL_BIGCORE_HASWELL:
|
||||
- case INTEL_BIGCORE_BROADWELL:
|
||||
- cpu_features->cachesize_non_temporal_divisor = 8;
|
||||
- goto default_tuning;
|
||||
-
|
||||
- /* Newer Bigcore microarch (larger non-temporal store
|
||||
- threshold). */
|
||||
- case INTEL_BIGCORE_SKYLAKE_AVX512:
|
||||
- case INTEL_BIGCORE_CANNONLAKE:
|
||||
- /* Benchmarks indicate non-temporal memset is not
|
||||
- necessarily profitable on SKX (and in some cases much
|
||||
- worse). This is likely unique to SKX due its it unique
|
||||
- mesh interconnect (not present on ICX or BWD). Disable
|
||||
- non-temporal on all Skylake servers. */
|
||||
- cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||
- |= bit_arch_Avoid_Non_Temporal_Memset;
|
||||
- case INTEL_BIGCORE_COMETLAKE:
|
||||
- case INTEL_BIGCORE_SKYLAKE:
|
||||
- case INTEL_BIGCORE_KABYLAKE:
|
||||
- case INTEL_BIGCORE_ICELAKE:
|
||||
- case INTEL_BIGCORE_TIGERLAKE:
|
||||
- case INTEL_BIGCORE_ROCKETLAKE:
|
||||
- case INTEL_BIGCORE_RAPTORLAKE:
|
||||
- case INTEL_BIGCORE_METEORLAKE:
|
||||
- case INTEL_BIGCORE_LUNARLAKE:
|
||||
- case INTEL_BIGCORE_ARROWLAKE:
|
||||
- case INTEL_BIGCORE_PANTHERLAKE:
|
||||
- case INTEL_BIGCORE_SAPPHIRERAPIDS:
|
||||
- case INTEL_BIGCORE_EMERALDRAPIDS:
|
||||
- case INTEL_BIGCORE_GRANITERAPIDS:
|
||||
- cpu_features->cachesize_non_temporal_divisor = 2;
|
||||
- goto default_tuning;
|
||||
-
|
||||
- /* Default tuned Mixed (bigcore + atom SOC). */
|
||||
- case INTEL_MIXED_LAKEFIELD:
|
||||
- case INTEL_MIXED_ALDERLAKE:
|
||||
- cpu_features->cachesize_non_temporal_divisor = 2;
|
||||
- goto default_tuning;
|
||||
- }
|
||||
-
|
||||
- /* Disable TSX on some processors to avoid TSX on kernels that
|
||||
- weren't updated with the latest microcode package (which
|
||||
- disables broken feature by default). */
|
||||
- switch (microarch)
|
||||
- {
|
||||
case INTEL_BIGCORE_SKYLAKE_AVX512:
|
||||
/* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */
|
||||
if (stepping <= 5)
|
||||
@@ -900,38 +786,152 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
|
||||
case INTEL_BIGCORE_KABYLAKE:
|
||||
/* NB: Although the errata documents that for model == 0x8e
|
||||
- (kabylake skylake client), only 0xb stepping or lower are
|
||||
- impacted, the intention of the errata was to disable TSX on
|
||||
- all client processors on all steppings. Include 0xc
|
||||
- stepping which is an Intel Core i7-8665U, a client mobile
|
||||
- processor. */
|
||||
+ (kabylake skylake client), only 0xb stepping or lower are
|
||||
+ impacted, the intention of the errata was to disable TSX on
|
||||
+ all client processors on all steppings. Include 0xc
|
||||
+ stepping which is an Intel Core i7-8665U, a client mobile
|
||||
+ processor. */
|
||||
if (stepping > 0xc)
|
||||
break;
|
||||
/* Fall through. */
|
||||
case INTEL_BIGCORE_SKYLAKE:
|
||||
- /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
|
||||
- processors listed in:
|
||||
-
|
||||
-https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
|
||||
- */
|
||||
- disable_tsx:
|
||||
- CPU_FEATURE_UNSET (cpu_features, HLE);
|
||||
- CPU_FEATURE_UNSET (cpu_features, RTM);
|
||||
- CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
|
||||
- break;
|
||||
+ /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
|
||||
+ processors listed in:
|
||||
+
|
||||
+ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
|
||||
+ */
|
||||
+disable_tsx:
|
||||
+ CPU_FEATURE_UNSET (cpu_features, HLE);
|
||||
+ CPU_FEATURE_UNSET (cpu_features, RTM);
|
||||
+ CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
|
||||
+ break;
|
||||
|
||||
case INTEL_BIGCORE_HASWELL:
|
||||
- /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
|
||||
- TSX. Haswell also include other model numbers that have
|
||||
- working TSX. */
|
||||
- if (model == 0x3f && stepping >= 4)
|
||||
+ /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
|
||||
+ TSX. Haswell also includes other model numbers that have
|
||||
+ working TSX. */
|
||||
+ if (model == 0x3f && stepping >= 4)
|
||||
break;
|
||||
|
||||
- CPU_FEATURE_UNSET (cpu_features, RTM);
|
||||
- break;
|
||||
+ CPU_FEATURE_UNSET (cpu_features, RTM);
|
||||
+ break;
|
||||
}
|
||||
}
|
||||
|
||||
+ switch (microarch)
|
||||
+ {
|
||||
+ /* Atom / KNL tuning. */
|
||||
+ case INTEL_ATOM_BONNELL:
|
||||
+ /* BSF is slow on Bonnell. */
|
||||
+ cpu_features->preferred[index_arch_Slow_BSF]
|
||||
+ |= bit_arch_Slow_BSF;
|
||||
+ break;
|
||||
+
|
||||
+ /* Unaligned load versions are faster than SSSE3
|
||||
+ on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
|
||||
+ case INTEL_ATOM_AIRMONT:
|
||||
+ case INTEL_ATOM_SILVERMONT:
|
||||
+ case INTEL_ATOM_GOLDMONT:
|
||||
+ case INTEL_ATOM_GOLDMONT_PLUS:
|
||||
+
|
||||
+ /* Knights Landing. Enable Silvermont optimizations. */
|
||||
+ case INTEL_KNIGHTS_LANDING:
|
||||
+
|
||||
+ cpu_features->preferred[index_arch_Fast_Unaligned_Load]
|
||||
+ |= (bit_arch_Fast_Unaligned_Load
|
||||
+ | bit_arch_Fast_Unaligned_Copy
|
||||
+ | bit_arch_Prefer_PMINUB_for_stringop
|
||||
+ | bit_arch_Slow_SSE4_2);
|
||||
+ break;
|
||||
+
|
||||
+ case INTEL_ATOM_TREMONT:
|
||||
+ /* Enable rep string instructions, unaligned load, unaligned
|
||||
+ copy, pminub and avoid SSE 4.2 on Tremont. */
|
||||
+ cpu_features->preferred[index_arch_Fast_Rep_String]
|
||||
+ |= (bit_arch_Fast_Rep_String
|
||||
+ | bit_arch_Fast_Unaligned_Load
|
||||
+ | bit_arch_Fast_Unaligned_Copy
|
||||
+ | bit_arch_Prefer_PMINUB_for_stringop
|
||||
+ | bit_arch_Slow_SSE4_2);
|
||||
+ break;
|
||||
+
|
||||
+ /*
|
||||
+ Default tuned Knights microarch.
|
||||
+ case INTEL_KNIGHTS_MILL:
|
||||
+ */
|
||||
+
|
||||
+ /*
|
||||
+ Default tuned atom microarch.
|
||||
+ case INTEL_ATOM_SIERRAFOREST:
|
||||
+ case INTEL_ATOM_GRANDRIDGE:
|
||||
+ case INTEL_ATOM_CLEARWATERFOREST:
|
||||
+ */
|
||||
+
|
||||
+ /* Bigcore/Default Tuning. */
|
||||
+ default:
|
||||
+ default_tuning:
|
||||
+ /* Unknown Intel processors. Assuming this is one of Core
|
||||
+ i3/i5/i7 processors if AVX is available. */
|
||||
+ if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
|
||||
+ break;
|
||||
+
|
||||
+ enable_modern_features:
|
||||
+ /* Rep string instructions, unaligned load, unaligned copy,
|
||||
+ and pminub are fast on Intel Core i3, i5 and i7. */
|
||||
+ cpu_features->preferred[index_arch_Fast_Rep_String]
|
||||
+ |= (bit_arch_Fast_Rep_String
|
||||
+ | bit_arch_Fast_Unaligned_Load
|
||||
+ | bit_arch_Fast_Unaligned_Copy
|
||||
+ | bit_arch_Prefer_PMINUB_for_stringop);
|
||||
+ break;
|
||||
+
|
||||
+ case INTEL_BIGCORE_NEHALEM:
|
||||
+ case INTEL_BIGCORE_WESTMERE:
|
||||
+ /* Older CPUs prefer non-temporal stores at lower threshold. */
|
||||
+ cpu_features->cachesize_non_temporal_divisor = 8;
|
||||
+ goto enable_modern_features;
|
||||
+
|
||||
+ /* Older Bigcore microarch (smaller non-temporal store
|
||||
+ threshold). */
|
||||
+ case INTEL_BIGCORE_SANDYBRIDGE:
|
||||
+ case INTEL_BIGCORE_IVYBRIDGE:
|
||||
+ case INTEL_BIGCORE_HASWELL:
|
||||
+ case INTEL_BIGCORE_BROADWELL:
|
||||
+ cpu_features->cachesize_non_temporal_divisor = 8;
|
||||
+ goto default_tuning;
|
||||
+
|
||||
+ /* Newer Bigcore microarch (larger non-temporal store
|
||||
+ threshold). */
|
||||
+ case INTEL_BIGCORE_SKYLAKE_AVX512:
|
||||
+ case INTEL_BIGCORE_CANNONLAKE:
|
||||
+ /* Benchmarks indicate non-temporal memset is not
|
||||
+ necessarily profitable on SKX (and in some cases much
|
||||
+ worse). This is likely unique to SKX due to its unique
|
||||
+ mesh interconnect (not present on ICX or BWD). Disable
|
||||
+ non-temporal on all Skylake servers. */
|
||||
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|
||||
+ |= bit_arch_Avoid_Non_Temporal_Memset;
|
||||
+ /* fallthrough */
|
||||
+ case INTEL_BIGCORE_COMETLAKE:
|
||||
+ case INTEL_BIGCORE_SKYLAKE:
|
||||
+ case INTEL_BIGCORE_KABYLAKE:
|
||||
+ case INTEL_BIGCORE_ICELAKE:
|
||||
+ case INTEL_BIGCORE_TIGERLAKE:
|
||||
+ case INTEL_BIGCORE_ROCKETLAKE:
|
||||
+ case INTEL_BIGCORE_RAPTORLAKE:
|
||||
+ case INTEL_BIGCORE_METEORLAKE:
|
||||
+ case INTEL_BIGCORE_LUNARLAKE:
|
||||
+ case INTEL_BIGCORE_ARROWLAKE:
|
||||
+ case INTEL_BIGCORE_PANTHERLAKE:
|
||||
+ case INTEL_BIGCORE_SAPPHIRERAPIDS:
|
||||
+ case INTEL_BIGCORE_EMERALDRAPIDS:
|
||||
+ case INTEL_BIGCORE_GRANITERAPIDS:
|
||||
+ /* Default tuned Mixed (bigcore + atom SOC). */
|
||||
+ case INTEL_MIXED_LAKEFIELD:
|
||||
+ case INTEL_MIXED_ALDERLAKE:
|
||||
+ cpu_features->cachesize_non_temporal_divisor = 2;
|
||||
+ goto default_tuning;
|
||||
+ }
|
||||
|
||||
/* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
|
||||
if AVX512ER is available. Don't use AVX512 to avoid lower CPU
|
49
glibc-upstream-2.39-183.patch
Normal file
49
glibc-upstream-2.39-183.patch
Normal file
@ -0,0 +1,49 @@
|
||||
commit 3463100f2d47f2897a24ba8023a5c7aaf2d26550
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Sat Apr 12 08:37:29 2025 -0700
|
||||
|
||||
x86: Detect Intel Diamond Rapids
|
||||
|
||||
Detect Intel Diamond Rapids and tune it similarly to Intel Granite Rapids.
|
||||
|
||||
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
(cherry picked from commit de14f1959ee5f9b845a7cae43bee03068b8136f0)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 6d2e660b4b20ff06..47dc3b1510a68fc9 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -542,6 +542,7 @@ enum intel_microarch
|
||||
INTEL_BIGCORE_ARROWLAKE,
|
||||
INTEL_BIGCORE_PANTHERLAKE,
|
||||
INTEL_BIGCORE_GRANITERAPIDS,
|
||||
+ INTEL_BIGCORE_DIAMONDRAPIDS,
|
||||
|
||||
/* Mixed (bigcore + atom SOC). */
|
||||
INTEL_MIXED_LAKEFIELD,
|
||||
@@ -817,6 +818,16 @@ disable_tsx:
|
||||
break;
|
||||
}
|
||||
}
|
||||
+ else if (family == 19)
|
||||
+ switch (model)
|
||||
+ {
|
||||
+ case 0x01:
|
||||
+ microarch = INTEL_BIGCORE_DIAMONDRAPIDS;
|
||||
+ break;
|
||||
+
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
|
||||
switch (microarch)
|
||||
{
|
||||
@@ -926,6 +937,7 @@ disable_tsx:
|
||||
case INTEL_BIGCORE_SAPPHIRERAPIDS:
|
||||
case INTEL_BIGCORE_EMERALDRAPIDS:
|
||||
case INTEL_BIGCORE_GRANITERAPIDS:
|
||||
+ case INTEL_BIGCORE_DIAMONDRAPIDS:
|
||||
/* Default tuned Mixed (bigcore + atom SOC). */
|
||||
case INTEL_MIXED_LAKEFIELD:
|
||||
case INTEL_MIXED_ALDERLAKE:
|
444
glibc-upstream-2.39-184.patch
Normal file
444
glibc-upstream-2.39-184.patch
Normal file
@ -0,0 +1,444 @@
|
||||
commit 2451ef5c4a92e774c56111b3708eede7f98fe940
|
||||
Author: Frank Barrus <frankbarrus_sw@shaggy.cc>
|
||||
Date: Wed Dec 4 07:55:02 2024 -0500
|
||||
|
||||
pthreads NPTL: lost wakeup fix 2
|
||||
|
||||
This fixes the lost wakeup (from a bug in signal stealing) with a change
|
||||
in the usage of g_signals[] in the condition variable internal state.
|
||||
It also completely eliminates the concept and handling of signal stealing,
|
||||
as well as the need for signalers to block to wait for waiters to wake
|
||||
up every time there is a G1/G2 switch. This greatly reduces the average
|
||||
and maximum latency for pthread_cond_signal.
|
||||
|
||||
The g_signals[] field now contains a signal count that is relative to
|
||||
the current g1_start value. Since it is a 32-bit field, and the LSB is
|
||||
still reserved (though not currently used anymore), it has a 31-bit value
|
||||
that corresponds to the low 31 bits of the sequence number in g1_start.
|
||||
(since g1_start also has an LSB flag, this means bits 31:1 in g_signals
|
||||
correspond to bits 31:1 in g1_start, plus the current signal count)
|
||||
|
||||
By making the signal count relative to g1_start, there is no longer
|
||||
any ambiguity or A/B/A issue, and thus any checks before blocking,
|
||||
including the futex call itself, are guaranteed not to block if the G1/G2
|
||||
switch occurs, even if the signal count remains the same. This allows
|
||||
initially safely blocking in G2 until the switch to G1 occurs, and
|
||||
then transitioning from G1 to a new G1 or G2, and always being able to
|
||||
distinguish the state change. This removes the race condition and A/B/A
|
||||
problems that otherwise occurred if a late (pre-empted) waiter were to
|
||||
resume just as the futex call attempted to block on g_signal since
|
||||
otherwise there was no last opportunity to re-check things like whether
|
||||
the current G1 group was already closed.
|
||||
|
||||
By fixing these issues, the signal stealing code can be eliminated,
|
||||
since there is no concept of signal stealing anymore. The code to block
|
||||
for all waiters to exit g_refs can also be removed, since any waiters
|
||||
that are still in the g_refs region can be guaranteed to safely wake
|
||||
up and exit. If there are still any left at this time, they are all
|
||||
sent one final futex wakeup to ensure that they are not blocked any
|
||||
longer, but there is no need for the signaller to block and wait for
|
||||
them to wake up and exit the g_refs region.
|
||||
|
||||
The signal count is then effectively "zeroed" but since it is now
|
||||
relative to g1_start, this is done by advancing it to a new value that
|
||||
can be observed by any pending blocking waiters. Any late waiters can
|
||||
always tell the difference, and can thus just cleanly exit if they are
|
||||
in a stale G1 or G2. They can never steal a signal from the current
|
||||
G1 if they are not in the current G1, since the signal value that has
|
||||
to match in the cmpxchg has the low 31 bits of the g1_start value
|
||||
contained in it, and that's first checked, and then it won't match if
|
||||
there's a G1/G2 change.
|
||||
|
||||
Note: the 31-bit sequence number used in g_signals is designed to
|
||||
handle wrap-around when checking the signal count, but if the entire
|
||||
31-bit wraparound (2 billion signals) occurs while there is still a
|
||||
late waiter that has not yet resumed, and it happens to then match
|
||||
the current g1_start low bits, and the pre-emption occurs after the
|
||||
normal "closed group" checks (which are 64-bit) but then hits the
|
||||
futex syscall and signal consuming code, then an A/B/A issue could
|
||||
still result and cause an incorrect assumption about whether it
|
||||
should block. This particular scenario seems unlikely in practice.
|
||||
Note that once awake from the futex, the waiter would notice the
|
||||
closed group before consuming the signal (since that's still a 64-bit
|
||||
check that would not be aliased in the wrap-around in g_signals),
|
||||
so the biggest impact would be blocking on the futex until the next
|
||||
full wakeup from a G1/G2 switch.
|
||||
|
||||
Signed-off-by: Frank Barrus <frankbarrus_sw@shaggy.cc>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit 1db84775f831a1494993ce9c118deaf9537cc50a)
|
||||
|
||||
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
|
||||
index 3487557bb86c8186..4855b8899f887ad0 100644
|
||||
--- a/nptl/pthread_cond_common.c
|
||||
+++ b/nptl/pthread_cond_common.c
|
||||
@@ -201,7 +201,6 @@ static bool __attribute__ ((unused))
|
||||
__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
unsigned int *g1index, int private)
|
||||
{
|
||||
- const unsigned int maxspin = 0;
|
||||
unsigned int g1 = *g1index;
|
||||
|
||||
/* If there is no waiter in G2, we don't do anything. The expression may
|
||||
@@ -222,84 +221,46 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
* New waiters arriving concurrently with the group switching will all go
|
||||
into G2 until we atomically make the switch. Waiters existing in G2
|
||||
are not affected.
|
||||
- * Waiters in G1 will be closed out immediately by setting a flag in
|
||||
- __g_signals, which will prevent waiters from blocking using a futex on
|
||||
- __g_signals and also notifies them that the group is closed. As a
|
||||
- result, they will eventually remove their group reference, allowing us
|
||||
- to close switch group roles. */
|
||||
-
|
||||
- /* First, set the closed flag on __g_signals. This tells waiters that are
|
||||
- about to wait that they shouldn't do that anymore. This basically
|
||||
- serves as an advance notification of the upcoming change to __g1_start;
|
||||
- waiters interpret it as if __g1_start was larger than their waiter
|
||||
- sequence position. This allows us to change __g1_start after waiting
|
||||
- for all existing waiters with group references to leave, which in turn
|
||||
- makes recovery after stealing a signal simpler because it then can be
|
||||
- skipped if __g1_start indicates that the group is closed (otherwise,
|
||||
- we would have to recover always because waiters don't know how big their
|
||||
- groups are). Relaxed MO is fine. */
|
||||
- atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1);
|
||||
-
|
||||
- /* Wait until there are no group references anymore. The fetch-or operation
|
||||
- injects us into the modification order of __g_refs; release MO ensures
|
||||
- that waiters incrementing __g_refs after our fetch-or see the previous
|
||||
- changes to __g_signals and to __g1_start that had to happen before we can
|
||||
- switch this G1 and alias with an older group (we have two groups, so
|
||||
- aliasing requires switching group roles twice). Note that nobody else
|
||||
- can have set the wake-request flag, so we do not have to act upon it.
|
||||
-
|
||||
- Also note that it is harmless if older waiters or waiters from this G1
|
||||
- get a group reference after we have quiesced the group because it will
|
||||
- remain closed for them either because of the closed flag in __g_signals
|
||||
- or the later update to __g1_start. New waiters will never arrive here
|
||||
- but instead continue to go into the still current G2. */
|
||||
- unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0);
|
||||
- while ((r >> 1) > 0)
|
||||
- {
|
||||
- for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--)
|
||||
- {
|
||||
- /* TODO Back off. */
|
||||
- r = atomic_load_relaxed (cond->__data.__g_refs + g1);
|
||||
- }
|
||||
- if ((r >> 1) > 0)
|
||||
- {
|
||||
- /* There is still a waiter after spinning. Set the wake-request
|
||||
- flag and block. Relaxed MO is fine because this is just about
|
||||
- this futex word.
|
||||
-
|
||||
- Update r to include the set wake-request flag so that the upcoming
|
||||
- futex_wait only blocks if the flag is still set (otherwise, we'd
|
||||
- violate the basic client-side futex protocol). */
|
||||
- r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1) | 1;
|
||||
-
|
||||
- if ((r >> 1) > 0)
|
||||
- futex_wait_simple (cond->__data.__g_refs + g1, r, private);
|
||||
- /* Reload here so we eventually see the most recent value even if we
|
||||
- do not spin. */
|
||||
- r = atomic_load_relaxed (cond->__data.__g_refs + g1);
|
||||
- }
|
||||
- }
|
||||
- /* Acquire MO so that we synchronize with the release operation that waiters
|
||||
- use to decrement __g_refs and thus happen after the waiters we waited
|
||||
- for. */
|
||||
- atomic_thread_fence_acquire ();
|
||||
+ * Waiters in G1 will be closed out immediately by the advancing of
|
||||
+ __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
|
||||
+ which will prevent waiters from blocking using a futex on
|
||||
+ __g_signals since it provides enough signals for all possible
|
||||
+ remaining waiters. As a result, they can each consume a signal
|
||||
+ and they will eventually remove their group reference. */
|
||||
|
||||
/* Update __g1_start, which finishes closing this group. The value we add
|
||||
will never be negative because old_orig_size can only be zero when we
|
||||
switch groups the first time after a condvar was initialized, in which
|
||||
- case G1 will be at index 1 and we will add a value of 1. See above for
|
||||
- why this takes place after waiting for quiescence of the group.
|
||||
+ case G1 will be at index 1 and we will add a value of 1.
|
||||
Relaxed MO is fine because the change comes with no additional
|
||||
constraints that others would have to observe. */
|
||||
__condvar_add_g1_start_relaxed (cond,
|
||||
(old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
|
||||
|
||||
- /* Now reopen the group, thus enabling waiters to again block using the
|
||||
- futex controlled by __g_signals. Release MO so that observers that see
|
||||
- no signals (and thus can block) also see the write __g1_start and thus
|
||||
- that this is now a new group (see __pthread_cond_wait_common for the
|
||||
- matching acquire MO loads). */
|
||||
- atomic_store_release (cond->__data.__g_signals + g1, 0);
|
||||
+ unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
|
||||
+
|
||||
+ /* If any waiters still hold group references (and thus could be blocked),
|
||||
+ then wake them all up now and prevent any running ones from blocking.
|
||||
+ This is effectively a catch-all for any possible current or future
|
||||
+ bugs that can allow the group size to reach 0 before all G1 waiters
|
||||
+ have been awakened or at least given signals to consume, or any
|
||||
+ other case that can leave blocked (or about to block) older waiters.. */
|
||||
+ if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0)
|
||||
+ {
|
||||
+ /* First advance signals to the end of the group (i.e. enough signals
|
||||
+ for the entire G1 group) to ensure that waiters which have not
|
||||
+ yet blocked in the futex will not block.
|
||||
+ Note that in the vast majority of cases, this should never
|
||||
+ actually be necessary, since __g_signals will have enough
|
||||
+ signals for the remaining g_refs waiters. As an optimization,
|
||||
+ we could check this first before proceeding, although that
|
||||
+ could still leave the potential for futex lost wakeup bugs
|
||||
+ if the signal count was non-zero but the futex wakeup
|
||||
+ was somehow lost. */
|
||||
+ atomic_store_release (cond->__data.__g_signals + g1, lowseq);
|
||||
+
|
||||
+ futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
|
||||
+ }
|
||||
|
||||
/* At this point, the old G1 is now a valid new G2 (but not in use yet).
|
||||
No old waiter can neither grab a signal nor acquire a reference without
|
||||
@@ -311,6 +272,10 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
g1 ^= 1;
|
||||
*g1index ^= 1;
|
||||
|
||||
+ /* Now advance the new G1 g_signals to the new lowseq, giving it
|
||||
+ an effective signal count of 0 to start. */
|
||||
+ atomic_store_release (cond->__data.__g_signals + g1, lowseq);
|
||||
+
|
||||
/* These values are just observed by signalers, and thus protected by the
|
||||
lock. */
|
||||
unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
|
||||
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
|
||||
index 66786c7b9022b26c..3d290e39c8ccebb7 100644
|
||||
--- a/nptl/pthread_cond_wait.c
|
||||
+++ b/nptl/pthread_cond_wait.c
|
||||
@@ -238,9 +238,7 @@ __condvar_cleanup_waiting (void *arg)
|
||||
signaled), and a reference count.
|
||||
|
||||
The group reference count is used to maintain the number of waiters that
|
||||
- are using the group's futex. Before a group can change its role, the
|
||||
- reference count must show that no waiters are using the futex anymore; this
|
||||
- prevents ABA issues on the futex word.
|
||||
+ are using the group's futex.
|
||||
|
||||
To represent which intervals in the waiter sequence the groups cover (and
|
||||
thus also which group slot contains G1 or G2), we use a 64b counter to
|
||||
@@ -300,11 +298,12 @@ __condvar_cleanup_waiting (void *arg)
|
||||
last reference.
|
||||
* Reference count used by waiters concurrently with signalers that have
|
||||
acquired the condvar-internal lock.
|
||||
- __g_signals: The number of signals that can still be consumed.
|
||||
+ __g_signals: The number of signals that can still be consumed, relative to
|
||||
+ the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits
|
||||
+ 31 to 1 of g1_start with the signal count added)
|
||||
* Used as a futex word by waiters. Used concurrently by waiters and
|
||||
signalers.
|
||||
- * LSB is true iff this group has been completely signaled (i.e., it is
|
||||
- closed).
|
||||
+ * LSB is currently reserved and 0.
|
||||
__g_size: Waiters remaining in this group (i.e., which have not been
|
||||
signaled yet.
|
||||
* Accessed by signalers and waiters that cancel waiting (both do so only
|
||||
@@ -328,18 +327,6 @@ __condvar_cleanup_waiting (void *arg)
|
||||
sufficient because if a waiter can see a sufficiently large value, it could
|
||||
have also consume a signal in the waiters group.
|
||||
|
||||
- Waiters try to grab a signal from __g_signals without holding a reference
|
||||
- count, which can lead to stealing a signal from a more recent group after
|
||||
- their own group was already closed. They cannot always detect whether they
|
||||
- in fact did because they do not know when they stole, but they can
|
||||
- conservatively add a signal back to the group they stole from; if they
|
||||
- did so unnecessarily, all that happens is a spurious wake-up. To make this
|
||||
- even less likely, __g1_start contains the index of the current g2 too,
|
||||
- which allows waiters to check if there aliasing on the group slots; if
|
||||
- there wasn't, they didn't steal from the current G1, which means that the
|
||||
- G1 they stole from must have been already closed and they do not need to
|
||||
- fix anything.
|
||||
-
|
||||
It is essential that the last field in pthread_cond_t is __g_signals[1]:
|
||||
The previous condvar used a pointer-sized field in pthread_cond_t, so a
|
||||
PTHREAD_COND_INITIALIZER from that condvar implementation might only
|
||||
@@ -435,6 +422,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
{
|
||||
while (1)
|
||||
{
|
||||
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
+
|
||||
/* Spin-wait first.
|
||||
Note that spinning first without checking whether a timeout
|
||||
passed might lead to what looks like a spurious wake-up even
|
||||
@@ -446,35 +436,45 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
having to compare against the current time seems to be the right
|
||||
choice from a performance perspective for most use cases. */
|
||||
unsigned int spin = maxspin;
|
||||
- while (signals == 0 && spin > 0)
|
||||
+ while (spin > 0 && ((int)(signals - lowseq) < 2))
|
||||
{
|
||||
/* Check that we are not spinning on a group that's already
|
||||
closed. */
|
||||
- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
|
||||
- goto done;
|
||||
+ if (seq < (g1_start >> 1))
|
||||
+ break;
|
||||
|
||||
/* TODO Back off. */
|
||||
|
||||
/* Reload signals. See above for MO. */
|
||||
signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
+ g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
spin--;
|
||||
}
|
||||
|
||||
- /* If our group will be closed as indicated by the flag on signals,
|
||||
- don't bother grabbing a signal. */
|
||||
- if (signals & 1)
|
||||
- goto done;
|
||||
-
|
||||
- /* If there is an available signal, don't block. */
|
||||
- if (signals != 0)
|
||||
+ if (seq < (g1_start >> 1))
|
||||
+ {
|
||||
+ /* If the group is closed already,
|
||||
+ then this waiter originally had enough extra signals to
|
||||
+ consume, up until the time its group was closed. */
|
||||
+ goto done;
|
||||
+ }
|
||||
+
|
||||
+ /* If there is an available signal, don't block.
|
||||
+ If __g1_start has advanced at all, then we must be in G1
|
||||
+ by now, perhaps in the process of switching back to an older
|
||||
+ G2, but in either case we're allowed to consume the available
|
||||
+ signal and should not block anymore. */
|
||||
+ if ((int)(signals - lowseq) >= 2)
|
||||
break;
|
||||
|
||||
/* No signals available after spinning, so prepare to block.
|
||||
We first acquire a group reference and use acquire MO for that so
|
||||
that we synchronize with the dummy read-modify-write in
|
||||
__condvar_quiesce_and_switch_g1 if we read from that. In turn,
|
||||
- in this case this will make us see the closed flag on __g_signals
|
||||
- that designates a concurrent attempt to reuse the group's slot.
|
||||
+ in this case this will make us see the advancement of __g_signals
|
||||
+ to the upcoming new g1_start that occurs with a concurrent
|
||||
+ attempt to reuse the group's slot.
|
||||
We use acquire MO for the __g_signals check to make the
|
||||
__g1_start check work (see spinning above).
|
||||
Note that the group reference acquisition will not mask the
|
||||
@@ -482,15 +482,24 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
an atomic read-modify-write operation and thus extend the release
|
||||
sequence. */
|
||||
atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
|
||||
- if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
|
||||
- || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
|
||||
+ signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
+ g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
+
|
||||
+ if (seq < (g1_start >> 1))
|
||||
{
|
||||
- /* Our group is closed. Wake up any signalers that might be
|
||||
- waiting. */
|
||||
+ /* group is closed already, so don't block */
|
||||
__condvar_dec_grefs (cond, g, private);
|
||||
goto done;
|
||||
}
|
||||
|
||||
+ if ((int)(signals - lowseq) >= 2)
|
||||
+ {
|
||||
+ /* a signal showed up or G1/G2 switched after we grabbed the refcount */
|
||||
+ __condvar_dec_grefs (cond, g, private);
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
// Now block.
|
||||
struct _pthread_cleanup_buffer buffer;
|
||||
struct _condvar_cleanup_buffer cbuffer;
|
||||
@@ -501,7 +510,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
__pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
|
||||
|
||||
err = __futex_abstimed_wait_cancelable64 (
|
||||
- cond->__data.__g_signals + g, 0, clockid, abstime, private);
|
||||
+ cond->__data.__g_signals + g, signals, clockid, abstime, private);
|
||||
|
||||
__pthread_cleanup_pop (&buffer, 0);
|
||||
|
||||
@@ -524,6 +533,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
}
|
||||
|
||||
+ if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
|
||||
+ goto done;
|
||||
}
|
||||
/* Try to grab a signal. Use acquire MO so that we see an up-to-date value
|
||||
of __g1_start below (see spinning above for a similar case). In
|
||||
@@ -532,69 +543,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
|
||||
&signals, signals - 2));
|
||||
|
||||
- /* We consumed a signal but we could have consumed from a more recent group
|
||||
- that aliased with ours due to being in the same group slot. If this
|
||||
- might be the case our group must be closed as visible through
|
||||
- __g1_start. */
|
||||
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
- if (seq < (g1_start >> 1))
|
||||
- {
|
||||
- /* We potentially stole a signal from a more recent group but we do not
|
||||
- know which group we really consumed from.
|
||||
- We do not care about groups older than current G1 because they are
|
||||
- closed; we could have stolen from these, but then we just add a
|
||||
- spurious wake-up for the current groups.
|
||||
- We will never steal a signal from current G2 that was really intended
|
||||
- for G2 because G2 never receives signals (until it becomes G1). We
|
||||
- could have stolen a signal from G2 that was conservatively added by a
|
||||
- previous waiter that also thought it stole a signal -- but given that
|
||||
- that signal was added unnecessarily, it's not a problem if we steal
|
||||
- it.
|
||||
- Thus, the remaining case is that we could have stolen from the current
|
||||
- G1, where "current" means the __g1_start value we observed. However,
|
||||
- if the current G1 does not have the same slot index as we do, we did
|
||||
- not steal from it and do not need to undo that. This is the reason
|
||||
- for putting a bit with G2's index into__g1_start as well. */
|
||||
- if (((g1_start & 1) ^ 1) == g)
|
||||
- {
|
||||
- /* We have to conservatively undo our potential mistake of stealing
|
||||
- a signal. We can stop trying to do that when the current G1
|
||||
- changes because other spinning waiters will notice this too and
|
||||
- __condvar_quiesce_and_switch_g1 has checked that there are no
|
||||
- futex waiters anymore before switching G1.
|
||||
- Relaxed MO is fine for the __g1_start load because we need to
|
||||
- merely be able to observe this fact and not have to observe
|
||||
- something else as well.
|
||||
- ??? Would it help to spin for a little while to see whether the
|
||||
- current G1 gets closed? This might be worthwhile if the group is
|
||||
- small or close to being closed. */
|
||||
- unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
|
||||
- while (__condvar_load_g1_start_relaxed (cond) == g1_start)
|
||||
- {
|
||||
- /* Try to add a signal. We don't need to acquire the lock
|
||||
- because at worst we can cause a spurious wake-up. If the
|
||||
- group is in the process of being closed (LSB is true), this
|
||||
- has an effect similar to us adding a signal. */
|
||||
- if (((s & 1) != 0)
|
||||
- || atomic_compare_exchange_weak_relaxed
|
||||
- (cond->__data.__g_signals + g, &s, s + 2))
|
||||
- {
|
||||
- /* If we added a signal, we also need to add a wake-up on
|
||||
- the futex. We also need to do that if we skipped adding
|
||||
- a signal because the group is being closed because
|
||||
- while __condvar_quiesce_and_switch_g1 could have closed
|
||||
- the group, it might still be waiting for futex waiters to
|
||||
- leave (and one of those waiters might be the one we stole
|
||||
- the signal from, which cause it to block using the
|
||||
- futex). */
|
||||
- futex_wake (cond->__data.__g_signals + g, 1, private);
|
||||
- break;
|
||||
- }
|
||||
- /* TODO Back off. */
|
||||
- }
|
||||
- }
|
||||
- }
|
||||
-
|
||||
done:
|
||||
|
||||
/* Confirm that we have been woken. We do that before acquiring the mutex
|
134
glibc-upstream-2.39-185.patch
Normal file
134
glibc-upstream-2.39-185.patch
Normal file
@ -0,0 +1,134 @@
|
||||
commit ea13a35e37932cabeef7d7b018aaef1136287a5e
|
||||
Author: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Date: Wed Dec 4 07:55:22 2024 -0500
|
||||
|
||||
nptl: Update comments and indentation for new condvar implementation
|
||||
|
||||
Some comments were wrong after the most recent commit. This fixes that.
|
||||
|
||||
Also fixing indentation where it was using spaces instead of tabs.
|
||||
|
||||
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit 0cc973160c23bb67f895bc887dd6942d29f8fee3)
|
||||
|
||||
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
|
||||
index 4855b8899f887ad0..3475d1512354be3c 100644
|
||||
--- a/nptl/pthread_cond_common.c
|
||||
+++ b/nptl/pthread_cond_common.c
|
||||
@@ -221,8 +221,9 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
* New waiters arriving concurrently with the group switching will all go
|
||||
into G2 until we atomically make the switch. Waiters existing in G2
|
||||
are not affected.
|
||||
- * Waiters in G1 will be closed out immediately by the advancing of
|
||||
- __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
|
||||
+ * Waiters in G1 have already received a signal and been woken. If they
|
||||
+ haven't woken yet, they will be closed out immediately by the advancing
|
||||
+ of __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
|
||||
which will prevent waiters from blocking using a futex on
|
||||
__g_signals since it provides enough signals for all possible
|
||||
remaining waiters. As a result, they can each consume a signal
|
||||
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
|
||||
index 3d290e39c8ccebb7..ad2cee7d59ddc093 100644
|
||||
--- a/nptl/pthread_cond_wait.c
|
||||
+++ b/nptl/pthread_cond_wait.c
|
||||
@@ -249,7 +249,7 @@ __condvar_cleanup_waiting (void *arg)
|
||||
figure out whether they are in a group that has already been completely
|
||||
signaled (i.e., if the current G1 starts at a later position that the
|
||||
waiter's position). Waiters cannot determine whether they are currently
|
||||
- in G2 or G1 -- but they do not have too because all they are interested in
|
||||
+ in G2 or G1 -- but they do not have to because all they are interested in
|
||||
is whether there are available signals, and they always start in G2 (whose
|
||||
group slot they know because of the bit in the waiter sequence. Signalers
|
||||
will simply fill the right group until it is completely signaled and can
|
||||
@@ -412,7 +412,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
}
|
||||
|
||||
/* Now wait until a signal is available in our group or it is closed.
|
||||
- Acquire MO so that if we observe a value of zero written after group
|
||||
+ Acquire MO so that if we observe (signals == lowseq) after group
|
||||
switching in __condvar_quiesce_and_switch_g1, we synchronize with that
|
||||
store and will see the prior update of __g1_start done while switching
|
||||
groups too. */
|
||||
@@ -422,8 +422,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
{
|
||||
while (1)
|
||||
{
|
||||
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
|
||||
/* Spin-wait first.
|
||||
Note that spinning first without checking whether a timeout
|
||||
@@ -447,21 +447,21 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
|
||||
/* Reload signals. See above for MO. */
|
||||
signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
- g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
+ g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
spin--;
|
||||
}
|
||||
|
||||
- if (seq < (g1_start >> 1))
|
||||
+ if (seq < (g1_start >> 1))
|
||||
{
|
||||
- /* If the group is closed already,
|
||||
+ /* If the group is closed already,
|
||||
then this waiter originally had enough extra signals to
|
||||
consume, up until the time its group was closed. */
|
||||
goto done;
|
||||
- }
|
||||
+ }
|
||||
|
||||
/* If there is an available signal, don't block.
|
||||
- If __g1_start has advanced at all, then we must be in G1
|
||||
+ If __g1_start has advanced at all, then we must be in G1
|
||||
by now, perhaps in the process of switching back to an older
|
||||
G2, but in either case we're allowed to consume the available
|
||||
signal and should not block anymore. */
|
||||
@@ -483,22 +483,23 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
sequence. */
|
||||
atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
|
||||
signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
- g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
+ g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
+ lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
|
||||
- if (seq < (g1_start >> 1))
|
||||
+ if (seq < (g1_start >> 1))
|
||||
{
|
||||
- /* group is closed already, so don't block */
|
||||
+ /* group is closed already, so don't block */
|
||||
__condvar_dec_grefs (cond, g, private);
|
||||
goto done;
|
||||
}
|
||||
|
||||
if ((int)(signals - lowseq) >= 2)
|
||||
{
|
||||
- /* a signal showed up or G1/G2 switched after we grabbed the refcount */
|
||||
+ /* a signal showed up or G1/G2 switched after we grabbed the
|
||||
+ refcount */
|
||||
__condvar_dec_grefs (cond, g, private);
|
||||
break;
|
||||
- }
|
||||
+ }
|
||||
|
||||
// Now block.
|
||||
struct _pthread_cleanup_buffer buffer;
|
||||
@@ -536,10 +537,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
|
||||
goto done;
|
||||
}
|
||||
- /* Try to grab a signal. Use acquire MO so that we see an up-to-date value
|
||||
- of __g1_start below (see spinning above for a similar case). In
|
||||
- particular, if we steal from a more recent group, we will also see a
|
||||
- more recent __g1_start below. */
|
||||
+ /* Try to grab a signal. See above for MO. (if we do another loop
|
||||
+ iteration we need to see the correct value of g1_start) */
|
||||
while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
|
||||
&signals, signals - 2));
|
||||
|
68
glibc-upstream-2.39-186.patch
Normal file
68
glibc-upstream-2.39-186.patch
Normal file
@ -0,0 +1,68 @@
|
||||
commit d0da34ad302df61c4e4c3030845cbe9b986196bf
|
||||
Author: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Date: Wed Dec 4 07:55:50 2024 -0500
|
||||
|
||||
nptl: Remove unnecessary catch-all-wake in condvar group switch
|
||||
|
||||
This wake is unnecessary. We only switch groups after every sleeper in a group
|
||||
has been woken. Sure, they may take a while to actually wake up and may still
|
||||
hold a reference, but waking them a second time doesn't speed that up. Instead
|
||||
this just makes the code more complicated and may hide problems.
|
||||
|
||||
In particular this safety wake wouldn't even have helped with the bug that was
|
||||
fixed by Barrus' patch: The bug there was that pthread_cond_signal would not
|
||||
switch g1 when it should, so we wouldn't even have entered this code path.
|
||||
|
||||
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit b42cc6af11062c260c7dfa91f1c89891366fed3e)
|
||||
|
||||
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
|
||||
index 3475d1512354be3c..30b8eee149cee195 100644
|
||||
--- a/nptl/pthread_cond_common.c
|
||||
+++ b/nptl/pthread_cond_common.c
|
||||
@@ -221,13 +221,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
* New waiters arriving concurrently with the group switching will all go
|
||||
into G2 until we atomically make the switch. Waiters existing in G2
|
||||
are not affected.
|
||||
- * Waiters in G1 have already received a signal and been woken. If they
|
||||
- haven't woken yet, they will be closed out immediately by the advancing
|
||||
- of __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
|
||||
- which will prevent waiters from blocking using a futex on
|
||||
- __g_signals since it provides enough signals for all possible
|
||||
- remaining waiters. As a result, they can each consume a signal
|
||||
- and they will eventually remove their group reference. */
|
||||
+ * Waiters in G1 have already received a signal and been woken. */
|
||||
|
||||
/* Update __g1_start, which finishes closing this group. The value we add
|
||||
will never be negative because old_orig_size can only be zero when we
|
||||
@@ -240,29 +234,6 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
|
||||
unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
|
||||
|
||||
- /* If any waiters still hold group references (and thus could be blocked),
|
||||
- then wake them all up now and prevent any running ones from blocking.
|
||||
- This is effectively a catch-all for any possible current or future
|
||||
- bugs that can allow the group size to reach 0 before all G1 waiters
|
||||
- have been awakened or at least given signals to consume, or any
|
||||
- other case that can leave blocked (or about to block) older waiters.. */
|
||||
- if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0)
|
||||
- {
|
||||
- /* First advance signals to the end of the group (i.e. enough signals
|
||||
- for the entire G1 group) to ensure that waiters which have not
|
||||
- yet blocked in the futex will not block.
|
||||
- Note that in the vast majority of cases, this should never
|
||||
- actually be necessary, since __g_signals will have enough
|
||||
- signals for the remaining g_refs waiters. As an optimization,
|
||||
- we could check this first before proceeding, although that
|
||||
- could still leave the potential for futex lost wakeup bugs
|
||||
- if the signal count was non-zero but the futex wakeup
|
||||
- was somehow lost. */
|
||||
- atomic_store_release (cond->__data.__g_signals + g1, lowseq);
|
||||
-
|
||||
- futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
|
||||
- }
|
||||
-
|
||||
/* At this point, the old G1 is now a valid new G2 (but not in use yet).
|
||||
No old waiter can neither grab a signal nor acquire a reference without
|
||||
noticing that __g1_start is larger.
|
108
glibc-upstream-2.39-187.patch
Normal file
108
glibc-upstream-2.39-187.patch
Normal file
@ -0,0 +1,108 @@
|
||||
commit 6f5ba03968339122e11d5185fed5ff6f99ee4f28
|
||||
Author: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Date: Wed Dec 4 07:56:13 2024 -0500
|
||||
|
||||
nptl: Remove unnecessary quadruple check in pthread_cond_wait
|
||||
|
||||
pthread_cond_wait was checking whether it was in a closed group no less than
|
||||
four times. Checking once is enough. Here are the four checks:
|
||||
|
||||
1. While spin-waiting. This was dead code: maxspin is set to 0 and has been
|
||||
for years.
|
||||
2. Before deciding to go to sleep, and before incrementing grefs: I kept this
|
||||
3. After incrementing grefs. There is no reason to think that the group would
|
||||
close while we do an atomic increment. Obviously it could close at any
|
||||
point, but that doesn't mean we have to recheck after every step. This
|
||||
check was equally good as check 2, except it has to do more work.
|
||||
4. When we find ourselves in a group that has a signal. We only get here after
|
||||
we check that we're not in a closed group. There is no need to check again.
|
||||
The check would only have helped in cases where the compare_exchange in the
|
||||
next line would also have failed. Relying on the compare_exchange is fine.
|
||||
|
||||
Removing the duplicate checks clarifies the code.
|
||||
|
||||
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit 4f7b051f8ee3feff1b53b27a906f245afaa9cee1)
|
||||
|
||||
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
|
||||
index ad2cee7d59ddc093..cfdd13bb87c72fa5 100644
|
||||
--- a/nptl/pthread_cond_wait.c
|
||||
+++ b/nptl/pthread_cond_wait.c
|
||||
@@ -366,7 +366,6 @@ static __always_inline int
|
||||
__pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
clockid_t clockid, const struct __timespec64 *abstime)
|
||||
{
|
||||
- const int maxspin = 0;
|
||||
int err;
|
||||
int result = 0;
|
||||
|
||||
@@ -425,33 +424,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
|
||||
- /* Spin-wait first.
|
||||
- Note that spinning first without checking whether a timeout
|
||||
- passed might lead to what looks like a spurious wake-up even
|
||||
- though we should return ETIMEDOUT (e.g., if the caller provides
|
||||
- an absolute timeout that is clearly in the past). However,
|
||||
- (1) spurious wake-ups are allowed, (2) it seems unlikely that a
|
||||
- user will (ab)use pthread_cond_wait as a check for whether a
|
||||
- point in time is in the past, and (3) spinning first without
|
||||
- having to compare against the current time seems to be the right
|
||||
- choice from a performance perspective for most use cases. */
|
||||
- unsigned int spin = maxspin;
|
||||
- while (spin > 0 && ((int)(signals - lowseq) < 2))
|
||||
- {
|
||||
- /* Check that we are not spinning on a group that's already
|
||||
- closed. */
|
||||
- if (seq < (g1_start >> 1))
|
||||
- break;
|
||||
-
|
||||
- /* TODO Back off. */
|
||||
-
|
||||
- /* Reload signals. See above for MO. */
|
||||
- signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
- g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
- spin--;
|
||||
- }
|
||||
-
|
||||
if (seq < (g1_start >> 1))
|
||||
{
|
||||
/* If the group is closed already,
|
||||
@@ -482,24 +454,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
an atomic read-modify-write operation and thus extend the release
|
||||
sequence. */
|
||||
atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
|
||||
- signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
- g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
- lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
-
|
||||
- if (seq < (g1_start >> 1))
|
||||
- {
|
||||
- /* group is closed already, so don't block */
|
||||
- __condvar_dec_grefs (cond, g, private);
|
||||
- goto done;
|
||||
- }
|
||||
-
|
||||
- if ((int)(signals - lowseq) >= 2)
|
||||
- {
|
||||
- /* a signal showed up or G1/G2 switched after we grabbed the
|
||||
- refcount */
|
||||
- __condvar_dec_grefs (cond, g, private);
|
||||
- break;
|
||||
- }
|
||||
|
||||
// Now block.
|
||||
struct _pthread_cleanup_buffer buffer;
|
||||
@@ -533,9 +487,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
/* Reload signals. See above for MO. */
|
||||
signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
}
|
||||
-
|
||||
- if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
|
||||
- goto done;
|
||||
}
|
||||
/* Try to grab a signal. See above for MO. (if we do another loop
|
||||
iteration we need to see the correct value of g1_start) */
|
175
glibc-upstream-2.39-188.patch
Normal file
175
glibc-upstream-2.39-188.patch
Normal file
@ -0,0 +1,175 @@
|
||||
commit fc2a25417df71a1ef3613216269227b7721b21c8
|
||||
Author: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Date: Wed Dec 4 07:56:38 2024 -0500
|
||||
|
||||
nptl: Remove g_refs from condition variables
|
||||
|
||||
This variable used to be needed to wait in group switching until all sleepers
|
||||
have confirmed that they have woken. This is no longer needed. Nothing waits
|
||||
on this variable so there is no need to track how many threads are currently
|
||||
asleep in each group.
|
||||
|
||||
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit c36fc50781995e6758cae2b6927839d0157f213c)
|
||||
|
||||
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
|
||||
index cfdd13bb87c72fa5..411fc0380b78f482 100644
|
||||
--- a/nptl/pthread_cond_wait.c
|
||||
+++ b/nptl/pthread_cond_wait.c
|
||||
@@ -143,23 +143,6 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
|
||||
}
|
||||
}
|
||||
|
||||
-/* Wake up any signalers that might be waiting. */
|
||||
-static void
|
||||
-__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private)
|
||||
-{
|
||||
- /* Release MO to synchronize-with the acquire load in
|
||||
- __condvar_quiesce_and_switch_g1. */
|
||||
- if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3)
|
||||
- {
|
||||
- /* Clear the wake-up request flag before waking up. We do not need more
|
||||
- than relaxed MO and it doesn't matter if we apply this for an aliased
|
||||
- group because we wake all futex waiters right after clearing the
|
||||
- flag. */
|
||||
- atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1);
|
||||
- futex_wake (cond->__data.__g_refs + g, INT_MAX, private);
|
||||
- }
|
||||
-}
|
||||
-
|
||||
/* Clean-up for cancellation of waiters waiting for normal signals. We cancel
|
||||
our registration as a waiter, confirm we have woken up, and re-acquire the
|
||||
mutex. */
|
||||
@@ -171,8 +154,6 @@ __condvar_cleanup_waiting (void *arg)
|
||||
pthread_cond_t *cond = cbuffer->cond;
|
||||
unsigned g = cbuffer->wseq & 1;
|
||||
|
||||
- __condvar_dec_grefs (cond, g, cbuffer->private);
|
||||
-
|
||||
__condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private);
|
||||
/* FIXME With the current cancellation implementation, it is possible that
|
||||
a thread is cancelled after it has returned from a syscall. This could
|
||||
@@ -327,15 +308,6 @@ __condvar_cleanup_waiting (void *arg)
|
||||
sufficient because if a waiter can see a sufficiently large value, it could
|
||||
have also consume a signal in the waiters group.
|
||||
|
||||
- It is essential that the last field in pthread_cond_t is __g_signals[1]:
|
||||
- The previous condvar used a pointer-sized field in pthread_cond_t, so a
|
||||
- PTHREAD_COND_INITIALIZER from that condvar implementation might only
|
||||
- initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes
|
||||
- in total instead of the 48 we need). __g_signals[1] is not accessed before
|
||||
- the first group switch (G2 starts at index 0), which will set its value to
|
||||
- zero after a harmless fetch-or whose return value is ignored. This
|
||||
- effectively completes initialization.
|
||||
-
|
||||
|
||||
Limitations:
|
||||
* This condvar isn't designed to allow for more than
|
||||
@@ -440,21 +412,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
if ((int)(signals - lowseq) >= 2)
|
||||
break;
|
||||
|
||||
- /* No signals available after spinning, so prepare to block.
|
||||
- We first acquire a group reference and use acquire MO for that so
|
||||
- that we synchronize with the dummy read-modify-write in
|
||||
- __condvar_quiesce_and_switch_g1 if we read from that. In turn,
|
||||
- in this case this will make us see the advancement of __g_signals
|
||||
- to the upcoming new g1_start that occurs with a concurrent
|
||||
- attempt to reuse the group's slot.
|
||||
- We use acquire MO for the __g_signals check to make the
|
||||
- __g1_start check work (see spinning above).
|
||||
- Note that the group reference acquisition will not mask the
|
||||
- release MO when decrementing the reference count because we use
|
||||
- an atomic read-modify-write operation and thus extend the release
|
||||
- sequence. */
|
||||
- atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
|
||||
-
|
||||
// Now block.
|
||||
struct _pthread_cleanup_buffer buffer;
|
||||
struct _condvar_cleanup_buffer cbuffer;
|
||||
@@ -471,18 +428,11 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
|
||||
if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
|
||||
{
|
||||
- __condvar_dec_grefs (cond, g, private);
|
||||
- /* If we timed out, we effectively cancel waiting. Note that
|
||||
- we have decremented __g_refs before cancellation, so that a
|
||||
- deadlock between waiting for quiescence of our group in
|
||||
- __condvar_quiesce_and_switch_g1 and us trying to acquire
|
||||
- the lock during cancellation is not possible. */
|
||||
+ /* If we timed out, we effectively cancel waiting. */
|
||||
__condvar_cancel_waiting (cond, seq, g, private);
|
||||
result = err;
|
||||
goto done;
|
||||
}
|
||||
- else
|
||||
- __condvar_dec_grefs (cond, g, private);
|
||||
|
||||
/* Reload signals. See above for MO. */
|
||||
signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c
|
||||
index 1336e9c79d97ca70..bdcb45c53674a5fd 100644
|
||||
--- a/nptl/tst-cond22.c
|
||||
+++ b/nptl/tst-cond22.c
|
||||
@@ -106,13 +106,13 @@ do_test (void)
|
||||
status = 1;
|
||||
}
|
||||
|
||||
- printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n",
|
||||
+ printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n",
|
||||
c.__data.__wseq.__value32.__high,
|
||||
c.__data.__wseq.__value32.__low,
|
||||
c.__data.__g1_start.__value32.__high,
|
||||
c.__data.__g1_start.__value32.__low,
|
||||
- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
|
||||
- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
|
||||
+ c.__data.__g_signals[0], c.__data.__g_size[0],
|
||||
+ c.__data.__g_signals[1], c.__data.__g_size[1],
|
||||
c.__data.__g1_orig_size, c.__data.__wrefs);
|
||||
|
||||
if (pthread_create (&th, NULL, tf, (void *) 1l) != 0)
|
||||
@@ -152,13 +152,13 @@ do_test (void)
|
||||
status = 1;
|
||||
}
|
||||
|
||||
- printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n",
|
||||
+ printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n",
|
||||
c.__data.__wseq.__value32.__high,
|
||||
c.__data.__wseq.__value32.__low,
|
||||
c.__data.__g1_start.__value32.__high,
|
||||
c.__data.__g1_start.__value32.__low,
|
||||
- c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0],
|
||||
- c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1],
|
||||
+ c.__data.__g_signals[0], c.__data.__g_size[0],
|
||||
+ c.__data.__g_signals[1], c.__data.__g_size[1],
|
||||
c.__data.__g1_orig_size, c.__data.__wrefs);
|
||||
|
||||
return status;
|
||||
diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
|
||||
index df54eef6f71f2cee..a3d482f80f7d0d35 100644
|
||||
--- a/sysdeps/nptl/bits/thread-shared-types.h
|
||||
+++ b/sysdeps/nptl/bits/thread-shared-types.h
|
||||
@@ -95,8 +95,7 @@ struct __pthread_cond_s
|
||||
{
|
||||
__atomic_wide_counter __wseq;
|
||||
__atomic_wide_counter __g1_start;
|
||||
- unsigned int __g_refs[2] __LOCK_ALIGNMENT;
|
||||
- unsigned int __g_size[2];
|
||||
+ unsigned int __g_size[2] __LOCK_ALIGNMENT;
|
||||
unsigned int __g1_orig_size;
|
||||
unsigned int __wrefs;
|
||||
unsigned int __g_signals[2];
|
||||
diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h
|
||||
index 3d4f4a756c66750d..9af75d6eae090218 100644
|
||||
--- a/sysdeps/nptl/pthread.h
|
||||
+++ b/sysdeps/nptl/pthread.h
|
||||
@@ -152,7 +152,7 @@ enum
|
||||
|
||||
|
||||
/* Conditional variable handling. */
|
||||
-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, {0, 0}, 0, 0, {0, 0} } }
|
||||
+#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } }
|
||||
|
||||
|
||||
/* Cleanup buffers */
|
92
glibc-upstream-2.39-189.patch
Normal file
92
glibc-upstream-2.39-189.patch
Normal file
@ -0,0 +1,92 @@
|
||||
commit 582c99b2c04d6da95743b36bf8e5c54dec178274
|
||||
Author: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Date: Wed Dec 4 08:03:44 2024 -0500
|
||||
|
||||
nptl: Use a single loop in pthread_cond_wait instead of a nested loop
|
||||
|
||||
The loop was a little more complicated than necessary. There was only one
|
||||
break statement out of the inner loop, and the outer loop was nearly empty.
|
||||
So just remove the outer loop, moving its code to the one break statement in
|
||||
the inner loop. This allows us to replace all gotos with break statements.
|
||||
|
||||
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit 929a4764ac90382616b6a21f099192b2475da674)
|
||||
|
||||
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
|
||||
index 411fc0380b78f482..683cb2b133f2163f 100644
|
||||
--- a/nptl/pthread_cond_wait.c
|
||||
+++ b/nptl/pthread_cond_wait.c
|
||||
@@ -382,17 +382,15 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
return err;
|
||||
}
|
||||
|
||||
- /* Now wait until a signal is available in our group or it is closed.
|
||||
- Acquire MO so that if we observe (signals == lowseq) after group
|
||||
- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
|
||||
- store and will see the prior update of __g1_start done while switching
|
||||
- groups too. */
|
||||
- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
-
|
||||
- do
|
||||
- {
|
||||
+
|
||||
while (1)
|
||||
{
|
||||
+ /* Now wait until a signal is available in our group or it is closed.
|
||||
+ Acquire MO so that if we observe (signals == lowseq) after group
|
||||
+ switching in __condvar_quiesce_and_switch_g1, we synchronize with that
|
||||
+ store and will see the prior update of __g1_start done while switching
|
||||
+ groups too. */
|
||||
+ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
|
||||
@@ -401,7 +399,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
/* If the group is closed already,
|
||||
then this waiter originally had enough extra signals to
|
||||
consume, up until the time its group was closed. */
|
||||
- goto done;
|
||||
+ break;
|
||||
}
|
||||
|
||||
/* If there is an available signal, don't block.
|
||||
@@ -410,7 +408,16 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
G2, but in either case we're allowed to consume the available
|
||||
signal and should not block anymore. */
|
||||
if ((int)(signals - lowseq) >= 2)
|
||||
- break;
|
||||
+ {
|
||||
+ /* Try to grab a signal. See above for MO. (if we do another loop
|
||||
+ iteration we need to see the correct value of g1_start) */
|
||||
+ if (atomic_compare_exchange_weak_acquire (
|
||||
+ cond->__data.__g_signals + g,
|
||||
+ &signals, signals - 2))
|
||||
+ break;
|
||||
+ else
|
||||
+ continue;
|
||||
+ }
|
||||
|
||||
// Now block.
|
||||
struct _pthread_cleanup_buffer buffer;
|
||||
@@ -431,19 +438,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
/* If we timed out, we effectively cancel waiting. */
|
||||
__condvar_cancel_waiting (cond, seq, g, private);
|
||||
result = err;
|
||||
- goto done;
|
||||
+ break;
|
||||
}
|
||||
-
|
||||
- /* Reload signals. See above for MO. */
|
||||
- signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
}
|
||||
- }
|
||||
- /* Try to grab a signal. See above for MO. (if we do another loop
|
||||
- iteration we need to see the correct value of g1_start) */
|
||||
- while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
|
||||
- &signals, signals - 2));
|
||||
-
|
||||
- done:
|
||||
|
||||
/* Confirm that we have been woken. We do that before acquiring the mutex
|
||||
to allow for execution of pthread_cond_destroy while having acquired the
|
139
glibc-upstream-2.39-190.patch
Normal file
139
glibc-upstream-2.39-190.patch
Normal file
@ -0,0 +1,139 @@
|
||||
commit 2fdc0afd0763377dc51870449b476f77baeb8aa0
|
||||
Author: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Date: Wed Dec 4 08:04:10 2024 -0500
|
||||
|
||||
nptl: Fix indentation
|
||||
|
||||
In my previous change I turned a nested loop into a simple loop. I'm doing
|
||||
the resulting indentation changes in a separate commit to make the diff on
|
||||
the previous commit easier to review.
|
||||
|
||||
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit ee6c14ed59d480720721aaacc5fb03213dc153da)
|
||||
|
||||
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
|
||||
index 683cb2b133f2163f..7fc9dadf15aa9bc6 100644
|
||||
--- a/nptl/pthread_cond_wait.c
|
||||
+++ b/nptl/pthread_cond_wait.c
|
||||
@@ -383,65 +383,65 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
}
|
||||
|
||||
|
||||
- while (1)
|
||||
- {
|
||||
- /* Now wait until a signal is available in our group or it is closed.
|
||||
- Acquire MO so that if we observe (signals == lowseq) after group
|
||||
- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
|
||||
- store and will see the prior update of __g1_start done while switching
|
||||
- groups too. */
|
||||
- unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
-
|
||||
- if (seq < (g1_start >> 1))
|
||||
- {
|
||||
- /* If the group is closed already,
|
||||
- then this waiter originally had enough extra signals to
|
||||
- consume, up until the time its group was closed. */
|
||||
- break;
|
||||
- }
|
||||
-
|
||||
- /* If there is an available signal, don't block.
|
||||
- If __g1_start has advanced at all, then we must be in G1
|
||||
- by now, perhaps in the process of switching back to an older
|
||||
- G2, but in either case we're allowed to consume the available
|
||||
- signal and should not block anymore. */
|
||||
- if ((int)(signals - lowseq) >= 2)
|
||||
- {
|
||||
- /* Try to grab a signal. See above for MO. (if we do another loop
|
||||
- iteration we need to see the correct value of g1_start) */
|
||||
- if (atomic_compare_exchange_weak_acquire (
|
||||
- cond->__data.__g_signals + g,
|
||||
+ while (1)
|
||||
+ {
|
||||
+ /* Now wait until a signal is available in our group or it is closed.
|
||||
+ Acquire MO so that if we observe (signals == lowseq) after group
|
||||
+ switching in __condvar_quiesce_and_switch_g1, we synchronize with that
|
||||
+ store and will see the prior update of __g1_start done while switching
|
||||
+ groups too. */
|
||||
+ unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
+ unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
+
|
||||
+ if (seq < (g1_start >> 1))
|
||||
+ {
|
||||
+ /* If the group is closed already,
|
||||
+ then this waiter originally had enough extra signals to
|
||||
+ consume, up until the time its group was closed. */
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ /* If there is an available signal, don't block.
|
||||
+ If __g1_start has advanced at all, then we must be in G1
|
||||
+ by now, perhaps in the process of switching back to an older
|
||||
+ G2, but in either case we're allowed to consume the available
|
||||
+ signal and should not block anymore. */
|
||||
+ if ((int)(signals - lowseq) >= 2)
|
||||
+ {
|
||||
+ /* Try to grab a signal. See above for MO. (if we do another loop
|
||||
+ iteration we need to see the correct value of g1_start) */
|
||||
+ if (atomic_compare_exchange_weak_acquire (
|
||||
+ cond->__data.__g_signals + g,
|
||||
&signals, signals - 2))
|
||||
- break;
|
||||
- else
|
||||
- continue;
|
||||
- }
|
||||
-
|
||||
- // Now block.
|
||||
- struct _pthread_cleanup_buffer buffer;
|
||||
- struct _condvar_cleanup_buffer cbuffer;
|
||||
- cbuffer.wseq = wseq;
|
||||
- cbuffer.cond = cond;
|
||||
- cbuffer.mutex = mutex;
|
||||
- cbuffer.private = private;
|
||||
- __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
|
||||
-
|
||||
- err = __futex_abstimed_wait_cancelable64 (
|
||||
- cond->__data.__g_signals + g, signals, clockid, abstime, private);
|
||||
-
|
||||
- __pthread_cleanup_pop (&buffer, 0);
|
||||
-
|
||||
- if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
|
||||
- {
|
||||
- /* If we timed out, we effectively cancel waiting. */
|
||||
- __condvar_cancel_waiting (cond, seq, g, private);
|
||||
- result = err;
|
||||
break;
|
||||
- }
|
||||
+ else
|
||||
+ continue;
|
||||
}
|
||||
|
||||
+ // Now block.
|
||||
+ struct _pthread_cleanup_buffer buffer;
|
||||
+ struct _condvar_cleanup_buffer cbuffer;
|
||||
+ cbuffer.wseq = wseq;
|
||||
+ cbuffer.cond = cond;
|
||||
+ cbuffer.mutex = mutex;
|
||||
+ cbuffer.private = private;
|
||||
+ __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
|
||||
+
|
||||
+ err = __futex_abstimed_wait_cancelable64 (
|
||||
+ cond->__data.__g_signals + g, signals, clockid, abstime, private);
|
||||
+
|
||||
+ __pthread_cleanup_pop (&buffer, 0);
|
||||
+
|
||||
+ if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
|
||||
+ {
|
||||
+ /* If we timed out, we effectively cancel waiting. */
|
||||
+ __condvar_cancel_waiting (cond, seq, g, private);
|
||||
+ result = err;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
/* Confirm that we have been woken. We do that before acquiring the mutex
|
||||
to allow for execution of pthread_cond_destroy while having acquired the
|
||||
mutex. */
|
148
glibc-upstream-2.39-191.patch
Normal file
148
glibc-upstream-2.39-191.patch
Normal file
@ -0,0 +1,148 @@
|
||||
commit ac5da3c0e4ed9cbdbb88928c5c9886d02a6dd7ed
|
||||
Author: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Date: Wed Dec 4 08:04:54 2024 -0500
|
||||
|
||||
nptl: rename __condvar_quiesce_and_switch_g1
|
||||
|
||||
This function no longer waits for threads to leave g1, so rename it to
|
||||
__condvar_switch_g1
|
||||
|
||||
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit 4b79e27a5073c02f6bff9aa8f4791230a0ab1867)
|
||||
|
||||
diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c
|
||||
index aada91639a346f19..38bba17bfc8a0083 100644
|
||||
--- a/nptl/pthread_cond_broadcast.c
|
||||
+++ b/nptl/pthread_cond_broadcast.c
|
||||
@@ -60,7 +60,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
|
||||
cond->__data.__g_size[g1] << 1);
|
||||
cond->__data.__g_size[g1] = 0;
|
||||
|
||||
- /* We need to wake G1 waiters before we quiesce G1 below. */
|
||||
+ /* We need to wake G1 waiters before we switch G1 below. */
|
||||
/* TODO Only set it if there are indeed futex waiters. We could
|
||||
also try to move this out of the critical section in cases when
|
||||
G2 is empty (and we don't need to quiesce). */
|
||||
@@ -69,7 +69,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
|
||||
|
||||
/* G1 is complete. Step (2) is next unless there are no waiters in G2, in
|
||||
which case we can stop. */
|
||||
- if (__condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private))
|
||||
+ if (__condvar_switch_g1 (cond, wseq, &g1, private))
|
||||
{
|
||||
/* Step (3): Send signals to all waiters in the old G2 / new G1. */
|
||||
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
|
||||
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
|
||||
index 30b8eee149cee195..5044273cc265ce94 100644
|
||||
--- a/nptl/pthread_cond_common.c
|
||||
+++ b/nptl/pthread_cond_common.c
|
||||
@@ -189,16 +189,15 @@ __condvar_get_private (int flags)
|
||||
return FUTEX_SHARED;
|
||||
}
|
||||
|
||||
-/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to
|
||||
- leave G1, converts G1 into a fresh G2, and then switches group roles so that
|
||||
- the former G2 becomes the new G1 ending at the current __wseq value when we
|
||||
- eventually make the switch (WSEQ is just an observation of __wseq by the
|
||||
- signaler).
|
||||
+/* This closes G1 (whose index is in G1INDEX), converts G1 into a fresh G2,
|
||||
+ and then switches group roles so that the former G2 becomes the new G1
|
||||
+ ending at the current __wseq value when we eventually make the switch
|
||||
+ (WSEQ is just an observation of __wseq by the signaler).
|
||||
If G2 is empty, it will not switch groups because then it would create an
|
||||
empty G1 which would require switching groups again on the next signal.
|
||||
Returns false iff groups were not switched because G2 was empty. */
|
||||
static bool __attribute__ ((unused))
|
||||
-__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
+__condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
unsigned int *g1index, int private)
|
||||
{
|
||||
unsigned int g1 = *g1index;
|
||||
@@ -214,8 +213,7 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
+ cond->__data.__g_size[g1 ^ 1]) == 0)
|
||||
return false;
|
||||
|
||||
- /* Now try to close and quiesce G1. We have to consider the following kinds
|
||||
- of waiters:
|
||||
+ /* We have to consider the following kinds of waiters:
|
||||
* Waiters from less recent groups than G1 are not affected because
|
||||
nothing will change for them apart from __g1_start getting larger.
|
||||
* New waiters arriving concurrently with the group switching will all go
|
||||
@@ -223,12 +221,12 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
are not affected.
|
||||
* Waiters in G1 have already received a signal and been woken. */
|
||||
|
||||
- /* Update __g1_start, which finishes closing this group. The value we add
|
||||
- will never be negative because old_orig_size can only be zero when we
|
||||
- switch groups the first time after a condvar was initialized, in which
|
||||
- case G1 will be at index 1 and we will add a value of 1.
|
||||
- Relaxed MO is fine because the change comes with no additional
|
||||
- constraints that others would have to observe. */
|
||||
+ /* Update __g1_start, which closes this group. The value we add will never
|
||||
+ be negative because old_orig_size can only be zero when we switch groups
|
||||
+ the first time after a condvar was initialized, in which case G1 will be
|
||||
+ at index 1 and we will add a value of 1. Relaxed MO is fine because the
|
||||
+ change comes with no additional constraints that others would have to
|
||||
+ observe. */
|
||||
__condvar_add_g1_start_relaxed (cond,
|
||||
(old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
|
||||
|
||||
diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c
|
||||
index 43d6286ecdf63f51..f09549714299c370 100644
|
||||
--- a/nptl/pthread_cond_signal.c
|
||||
+++ b/nptl/pthread_cond_signal.c
|
||||
@@ -69,18 +69,17 @@ ___pthread_cond_signal (pthread_cond_t *cond)
|
||||
bool do_futex_wake = false;
|
||||
|
||||
/* If G1 is still receiving signals, we put the signal there. If not, we
|
||||
- check if G2 has waiters, and if so, quiesce and switch G1 to the former
|
||||
- G2; if this results in a new G1 with waiters (G2 might have cancellations
|
||||
- already, see __condvar_quiesce_and_switch_g1), we put the signal in the
|
||||
- new G1. */
|
||||
+ check if G2 has waiters, and if so, switch G1 to the former G2; if this
|
||||
+ results in a new G1 with waiters (G2 might have cancellations already,
|
||||
+ see __condvar_switch_g1), we put the signal in the new G1. */
|
||||
if ((cond->__data.__g_size[g1] != 0)
|
||||
- || __condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private))
|
||||
+ || __condvar_switch_g1 (cond, wseq, &g1, private))
|
||||
{
|
||||
/* Add a signal. Relaxed MO is fine because signaling does not need to
|
||||
- establish a happens-before relation (see above). We do not mask the
|
||||
- release-MO store when initializing a group in
|
||||
- __condvar_quiesce_and_switch_g1 because we use an atomic
|
||||
- read-modify-write and thus extend that store's release sequence. */
|
||||
+ establish a happens-before relation (see above). We do not mask the
|
||||
+ release-MO store when initializing a group in __condvar_switch_g1
|
||||
+ because we use an atomic read-modify-write and thus extend that
|
||||
+ store's release sequence. */
|
||||
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2);
|
||||
cond->__data.__g_size[g1]--;
|
||||
/* TODO Only set it if there are indeed futex waiters. */
|
||||
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
|
||||
index 7fc9dadf15aa9bc6..80bb7282118775b8 100644
|
||||
--- a/nptl/pthread_cond_wait.c
|
||||
+++ b/nptl/pthread_cond_wait.c
|
||||
@@ -354,8 +354,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
because we do not need to establish any happens-before relation with
|
||||
signalers (see __pthread_cond_signal); modification order alone
|
||||
establishes a total order of waiters/signals. We do need acquire MO
|
||||
- to synchronize with group reinitialization in
|
||||
- __condvar_quiesce_and_switch_g1. */
|
||||
+ to synchronize with group reinitialization in __condvar_switch_g1. */
|
||||
uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
|
||||
/* Find our group's index. We always go into what was G2 when we acquired
|
||||
our position. */
|
||||
@@ -387,9 +386,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
{
|
||||
/* Now wait until a signal is available in our group or it is closed.
|
||||
Acquire MO so that if we observe (signals == lowseq) after group
|
||||
- switching in __condvar_quiesce_and_switch_g1, we synchronize with that
|
||||
- store and will see the prior update of __g1_start done while switching
|
||||
- groups too. */
|
||||
+ switching in __condvar_switch_g1, we synchronize with that store and
|
||||
+ will see the prior update of __g1_start done while switching groups
|
||||
+ too. */
|
||||
unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
180
glibc-upstream-2.39-192.patch
Normal file
180
glibc-upstream-2.39-192.patch
Normal file
@ -0,0 +1,180 @@
|
||||
commit b1eb369aee9cafefdbe5a65375310a918ef0c3ec
|
||||
Author: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Date: Wed Dec 4 08:05:40 2024 -0500
|
||||
|
||||
nptl: Use all of g1_start and g_signals
|
||||
|
||||
The LSB of g_signals was unused. The LSB of g1_start was used to indicate
|
||||
which group is G2. This was used to always go to sleep in pthread_cond_wait
|
||||
if a waiter is in G2. A comment earlier in the file says that this is not
|
||||
correct to do:
|
||||
|
||||
"Waiters cannot determine whether they are currently in G2 or G1 -- but they
|
||||
do not have to because all they are interested in is whether there are
|
||||
available signals"
|
||||
|
||||
I either would have had to update the comment, or get rid of the check. I
|
||||
chose to get rid of the check. In fact I don't quite know why it was there.
|
||||
There will never be available signals for group G2, so we didn't need the
|
||||
special case. Even if there were, this would just be a spurious wake. This
|
||||
might have caught some cases where the count has wrapped around, but it
|
||||
wouldn't reliably do that, (and even if it did, why would you want to force a
|
||||
sleep in that case?) and we don't support that many concurrent waiters
|
||||
anyway. Getting rid of it allows us to use one more bit, making us more
|
||||
robust to wraparound.
|
||||
|
||||
Signed-off-by: Malte Skarupke <malteskarupke@fastmail.fm>
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit 91bb902f58264a2fd50fbce8f39a9a290dd23706)
|
||||
|
||||
diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c
|
||||
index 38bba17bfc8a0083..51afa62adf7da4c1 100644
|
||||
--- a/nptl/pthread_cond_broadcast.c
|
||||
+++ b/nptl/pthread_cond_broadcast.c
|
||||
@@ -57,7 +57,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
|
||||
{
|
||||
/* Add as many signals as the remaining size of the group. */
|
||||
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
|
||||
- cond->__data.__g_size[g1] << 1);
|
||||
+ cond->__data.__g_size[g1]);
|
||||
cond->__data.__g_size[g1] = 0;
|
||||
|
||||
/* We need to wake G1 waiters before we switch G1 below. */
|
||||
@@ -73,7 +73,7 @@ ___pthread_cond_broadcast (pthread_cond_t *cond)
|
||||
{
|
||||
/* Step (3): Send signals to all waiters in the old G2 / new G1. */
|
||||
atomic_fetch_add_relaxed (cond->__data.__g_signals + g1,
|
||||
- cond->__data.__g_size[g1] << 1);
|
||||
+ cond->__data.__g_size[g1]);
|
||||
cond->__data.__g_size[g1] = 0;
|
||||
/* TODO Only set it if there are indeed futex waiters. */
|
||||
do_futex_wake = true;
|
||||
diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
|
||||
index 5044273cc265ce94..389402913c7b7714 100644
|
||||
--- a/nptl/pthread_cond_common.c
|
||||
+++ b/nptl/pthread_cond_common.c
|
||||
@@ -208,9 +208,9 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
behavior.
|
||||
Note that this works correctly for a zero-initialized condvar too. */
|
||||
unsigned int old_orig_size = __condvar_get_orig_size (cond);
|
||||
- uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
|
||||
- if (((unsigned) (wseq - old_g1_start - old_orig_size)
|
||||
- + cond->__data.__g_size[g1 ^ 1]) == 0)
|
||||
+ uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
+ uint64_t new_g1_start = old_g1_start + old_orig_size;
|
||||
+ if (((unsigned) (wseq - new_g1_start) + cond->__data.__g_size[g1 ^ 1]) == 0)
|
||||
return false;
|
||||
|
||||
/* We have to consider the following kinds of waiters:
|
||||
@@ -221,16 +221,10 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
are not affected.
|
||||
* Waiters in G1 have already received a signal and been woken. */
|
||||
|
||||
- /* Update __g1_start, which closes this group. The value we add will never
|
||||
- be negative because old_orig_size can only be zero when we switch groups
|
||||
- the first time after a condvar was initialized, in which case G1 will be
|
||||
- at index 1 and we will add a value of 1. Relaxed MO is fine because the
|
||||
- change comes with no additional constraints that others would have to
|
||||
- observe. */
|
||||
- __condvar_add_g1_start_relaxed (cond,
|
||||
- (old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
|
||||
-
|
||||
- unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
|
||||
+ /* Update __g1_start, which closes this group. Relaxed MO is fine because
|
||||
+ the change comes with no additional constraints that others would have
|
||||
+ to observe. */
|
||||
+ __condvar_add_g1_start_relaxed (cond, old_orig_size);
|
||||
|
||||
/* At this point, the old G1 is now a valid new G2 (but not in use yet).
|
||||
No old waiter can neither grab a signal nor acquire a reference without
|
||||
@@ -242,13 +236,13 @@ __condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
|
||||
g1 ^= 1;
|
||||
*g1index ^= 1;
|
||||
|
||||
- /* Now advance the new G1 g_signals to the new lowseq, giving it
|
||||
+ /* Now advance the new G1 g_signals to the new g1_start, giving it
|
||||
an effective signal count of 0 to start. */
|
||||
- atomic_store_release (cond->__data.__g_signals + g1, lowseq);
|
||||
+ atomic_store_release (cond->__data.__g_signals + g1, (unsigned)new_g1_start);
|
||||
|
||||
/* These values are just observed by signalers, and thus protected by the
|
||||
lock. */
|
||||
- unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
|
||||
+ unsigned int orig_size = wseq - new_g1_start;
|
||||
__condvar_set_orig_size (cond, orig_size);
|
||||
/* Use and addition to not loose track of cancellations in what was
|
||||
previously G2. */
|
||||
diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c
|
||||
index f09549714299c370..fa3a5c3d8f731687 100644
|
||||
--- a/nptl/pthread_cond_signal.c
|
||||
+++ b/nptl/pthread_cond_signal.c
|
||||
@@ -80,7 +80,7 @@ ___pthread_cond_signal (pthread_cond_t *cond)
|
||||
release-MO store when initializing a group in __condvar_switch_g1
|
||||
because we use an atomic read-modify-write and thus extend that
|
||||
store's release sequence. */
|
||||
- atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2);
|
||||
+ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 1);
|
||||
cond->__data.__g_size[g1]--;
|
||||
/* TODO Only set it if there are indeed futex waiters. */
|
||||
do_futex_wake = true;
|
||||
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
|
||||
index 80bb7282118775b8..0f1dfcb595941eba 100644
|
||||
--- a/nptl/pthread_cond_wait.c
|
||||
+++ b/nptl/pthread_cond_wait.c
|
||||
@@ -84,7 +84,7 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
|
||||
not hold a reference on the group. */
|
||||
__condvar_acquire_lock (cond, private);
|
||||
|
||||
- uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
|
||||
+ uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
if (g1_start > seq)
|
||||
{
|
||||
/* Our group is closed, so someone provided enough signals for it.
|
||||
@@ -259,7 +259,6 @@ __condvar_cleanup_waiting (void *arg)
|
||||
* Waiters fetch-add while having acquire the mutex associated with the
|
||||
condvar. Signalers load it and fetch-xor it concurrently.
|
||||
__g1_start: Starting position of G1 (inclusive)
|
||||
- * LSB is index of current G2.
|
||||
* Modified by signalers while having acquired the condvar-internal lock
|
||||
and observed concurrently by waiters.
|
||||
__g1_orig_size: Initial size of G1
|
||||
@@ -280,11 +279,9 @@ __condvar_cleanup_waiting (void *arg)
|
||||
* Reference count used by waiters concurrently with signalers that have
|
||||
acquired the condvar-internal lock.
|
||||
__g_signals: The number of signals that can still be consumed, relative to
|
||||
- the current g1_start. (i.e. bits 31 to 1 of __g_signals are bits
|
||||
- 31 to 1 of g1_start with the signal count added)
|
||||
+ the current g1_start. (i.e. g1_start with the signal count added)
|
||||
* Used as a futex word by waiters. Used concurrently by waiters and
|
||||
signalers.
|
||||
- * LSB is currently reserved and 0.
|
||||
__g_size: Waiters remaining in this group (i.e., which have not been
|
||||
signaled yet.
|
||||
* Accessed by signalers and waiters that cancel waiting (both do so only
|
||||
@@ -391,9 +388,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
too. */
|
||||
unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);
|
||||
uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
|
||||
- unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
|
||||
|
||||
- if (seq < (g1_start >> 1))
|
||||
+ if (seq < g1_start)
|
||||
{
|
||||
/* If the group is closed already,
|
||||
then this waiter originally had enough extra signals to
|
||||
@@ -406,13 +402,13 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
|
||||
by now, perhaps in the process of switching back to an older
|
||||
G2, but in either case we're allowed to consume the available
|
||||
signal and should not block anymore. */
|
||||
- if ((int)(signals - lowseq) >= 2)
|
||||
+ if ((int)(signals - (unsigned int)g1_start) > 0)
|
||||
{
|
||||
/* Try to grab a signal. See above for MO. (if we do another loop
|
||||
iteration we need to see the correct value of g1_start) */
|
||||
if (atomic_compare_exchange_weak_acquire (
|
||||
cond->__data.__g_signals + g,
|
||||
- &signals, signals - 2))
|
||||
+ &signals, signals - 1))
|
||||
break;
|
||||
else
|
||||
continue;
|
41
glibc-upstream-2.39-193.patch
Normal file
41
glibc-upstream-2.39-193.patch
Normal file
@ -0,0 +1,41 @@
|
||||
commit d33d10642fb24091e8fc8b9115f0a17d9f78491d
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Thu Mar 13 06:07:07 2025 +0100
|
||||
|
||||
nptl: PTHREAD_COND_INITIALIZER compatibility with pre-2.41 versions (bug 32786)
|
||||
|
||||
The new initializer and struct layout do not initialize the
|
||||
__g_signals field in the old struct layout before the change in
|
||||
commit c36fc50781995e6758cae2b6927839d0157f213c ("nptl: Remove
|
||||
g_refs from condition variables"). Bring back fields at the end
|
||||
of struct __pthread_cond_s, so that they are again zero-initialized.
|
||||
|
||||
Reviewed-by: Sam James <sam@gentoo.org>
|
||||
(cherry picked from commit dbc5a50d12eff4cb3f782129029d04b8a76f58e7)
|
||||
|
||||
diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
|
||||
index a3d482f80f7d0d35..bccc2003ec6dea5c 100644
|
||||
--- a/sysdeps/nptl/bits/thread-shared-types.h
|
||||
+++ b/sysdeps/nptl/bits/thread-shared-types.h
|
||||
@@ -99,6 +99,8 @@ struct __pthread_cond_s
|
||||
unsigned int __g1_orig_size;
|
||||
unsigned int __wrefs;
|
||||
unsigned int __g_signals[2];
|
||||
+ unsigned int __unused_initialized_1;
|
||||
+ unsigned int __unused_initialized_2;
|
||||
};
|
||||
|
||||
typedef unsigned int __tss_t;
|
||||
diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h
|
||||
index 9af75d6eae090218..e0f24418fe4233f0 100644
|
||||
--- a/sysdeps/nptl/pthread.h
|
||||
+++ b/sysdeps/nptl/pthread.h
|
||||
@@ -152,7 +152,7 @@ enum
|
||||
|
||||
|
||||
/* Conditional variable handling. */
|
||||
-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } }
|
||||
+#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0}, 0, 0 } }
|
||||
|
||||
|
||||
/* Cleanup buffers */
|
63
glibc-upstream-2.39-194.patch
Normal file
63
glibc-upstream-2.39-194.patch
Normal file
@ -0,0 +1,63 @@
|
||||
commit 68f3f1a1d08f7f3e0fb74391461699717efbb4bc
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Sat Feb 17 09:17:04 2024 +0100
|
||||
|
||||
Linux: Switch back to assembly syscall wrapper for prctl (bug 29770)
|
||||
|
||||
Commit ff026950e280bc3e9487b41b460fb31bc5b57721 ("Add a C wrapper for
|
||||
prctl [BZ #25896]") replaced the assembler wrapper with a C function.
|
||||
However, on powerpc64le-linux-gnu, the C variadic function
|
||||
implementation requires extra work in the caller to set up the
|
||||
parameter save area. Calling a function that needs a parameter save
|
||||
area without one (because the prototype used indicates the function is
|
||||
not variadic) corrupts the caller's stack. The Linux manual pages
|
||||
project documents prctl as a non-variadic function. This has resulted
|
||||
in various projects over the years using non-variadic prototypes,
|
||||
including the sanitizer libraries in LLVM and GCC (GCC PR 113728).
|
||||
|
||||
This commit switches back to the assembler implementation on most
|
||||
targets and only keeps the C implementation for x86-64 x32.
|
||||
|
||||
Also add the __prctl_time64 alias from commit
|
||||
b39ffab860cd743a82c91946619f1b8158b0b65e ("Linux: Add time64 alias for
|
||||
prctl") to sysdeps/unix/sysv/linux/syscalls.list; it was not yet
|
||||
present in commit ff026950e280bc3e9487b41b460fb31bc5b57721.
|
||||
|
||||
This restores the old ABI on powerpc64le-linux-gnu, thus fixing
|
||||
bug 29770.
|
||||
|
||||
Reviewed-By: Simon Chopin <simon.chopin@canonical.com>
|
||||
(cherry picked from commit 6a04404521ac4119ae36827eeb288ea84eee7cf6)
|
||||
|
||||
diff --git a/sysdeps/unix/sysv/linux/syscalls.list b/sysdeps/unix/sysv/linux/syscalls.list
|
||||
index 73e941ef894cd72c..9ac42c3436dd1520 100644
|
||||
--- a/sysdeps/unix/sysv/linux/syscalls.list
|
||||
+++ b/sysdeps/unix/sysv/linux/syscalls.list
|
||||
@@ -46,6 +46,7 @@ open_tree EXTRA open_tree i:isU open_tree
|
||||
pipe2 - pipe2 i:fi __pipe2 pipe2
|
||||
pidfd_open EXTRA pidfd_open i:iU pidfd_open
|
||||
pidfd_getfd EXTRA pidfd_getfd i:iiU pidfd_getfd
|
||||
+prctl EXTRA prctl i:iiiii __prctl prctl __prctl_time64
|
||||
pivot_root EXTRA pivot_root i:ss pivot_root
|
||||
pidfd_send_signal EXTRA pidfd_send_signal i:iiPU pidfd_send_signal
|
||||
process_madvise EXTRA process_madvise i:iPniU process_madvise
|
||||
diff --git a/sysdeps/unix/sysv/linux/prctl.c b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c
|
||||
similarity index 93%
|
||||
rename from sysdeps/unix/sysv/linux/prctl.c
|
||||
rename to sysdeps/unix/sysv/linux/x86_64/x32/prctl.c
|
||||
index 52d234ea0df4cc48..4bf1b479a07c6e8f 100644
|
||||
--- a/sysdeps/unix/sysv/linux/prctl.c
|
||||
+++ b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c
|
||||
@@ -1,4 +1,4 @@
|
||||
-/* prctl - Linux specific syscall.
|
||||
+/* prctl - Linux specific syscall. x86-64 x32 version.
|
||||
Copyright (C) 2020-2024 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
@@ -40,6 +40,3 @@ __prctl (int option, ...)
|
||||
|
||||
libc_hidden_def (__prctl)
|
||||
weak_alias (__prctl, prctl)
|
||||
-#if __TIMESIZE != 64
|
||||
-weak_alias (__prctl, __prctl_time64)
|
||||
-#endif
|
139
glibc-upstream-2.39-195.patch
Normal file
139
glibc-upstream-2.39-195.patch
Normal file
@ -0,0 +1,139 @@
|
||||
commit e31ac9a639306c8611e1ebe9fa405037337c70e0
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Tue Apr 30 09:21:16 2024 -0700
|
||||
|
||||
libio: Sort test variables in Makefile
|
||||
|
||||
Sort test variables in libio/Makefile using scripts/sort-makefile-lines.py.
|
||||
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
|
||||
(cherry picked from commit ddf71c550a5940deca74cc676f1cae134a891717)
|
||||
|
||||
diff --git a/libio/Makefile b/libio/Makefile
|
||||
index b92aeaf62634f1cb..0c1f16ee3b54c2d3 100644
|
||||
--- a/libio/Makefile
|
||||
+++ b/libio/Makefile
|
||||
@@ -68,22 +68,76 @@ routines_no_fortify += \
|
||||
wprintf \
|
||||
# routines_no_fortify
|
||||
|
||||
-tests = tst_swprintf tst_wprintf tst_swscanf tst_wscanf tst_getwc tst_putwc \
|
||||
- tst_wprintf2 tst-widetext test-fmemopen tst-ext tst-ext2 \
|
||||
- tst-fgetws tst-ungetwc1 tst-ungetwc2 tst-swscanf tst-sscanf \
|
||||
- tst-mmap-setvbuf bug-ungetwc1 bug-ungetwc2 tst-atime tst-eof \
|
||||
- tst-freopen bug-rewind bug-rewind2 bug-ungetc bug-fseek \
|
||||
- tst-mmap-eofsync tst-mmap-fflushsync bug-mmap-fflush \
|
||||
- tst-mmap2-eofsync tst-mmap-offend bug-fopena+ bug-wfflush \
|
||||
- bug-ungetc2 bug-ftell bug-ungetc3 bug-ungetc4 tst-fopenloc2 \
|
||||
- tst-memstream1 tst-memstream2 tst-memstream3 tst-memstream4 \
|
||||
- tst-wmemstream1 tst-wmemstream2 tst-wmemstream3 tst-wmemstream4 \
|
||||
- tst-wmemstream5 bug-memstream1 bug-wmemstream1 \
|
||||
- tst-setvbuf1 tst-popen1 tst-fgetwc bug-wsetpos tst-fseek \
|
||||
- tst-fwrite-error tst-ftell-partial-wide tst-ftell-active-handler \
|
||||
- tst-ftell-append tst-fputws tst-bz22415 tst-fgetc-after-eof \
|
||||
- tst-sprintf-ub tst-sprintf-chk-ub tst-bz24051 tst-bz24153 \
|
||||
- tst-wfile-sync tst-bz28828 tst-getdelim
|
||||
+tests = \
|
||||
+ bug-fopena+ \
|
||||
+ bug-fseek \
|
||||
+ bug-ftell \
|
||||
+ bug-memstream1 \
|
||||
+ bug-mmap-fflush \
|
||||
+ bug-rewind \
|
||||
+ bug-rewind2 \
|
||||
+ bug-ungetc \
|
||||
+ bug-ungetc2 \
|
||||
+ bug-ungetc3 \
|
||||
+ bug-ungetc4 \
|
||||
+ bug-ungetwc1 \
|
||||
+ bug-ungetwc2 \
|
||||
+ bug-wfflush \
|
||||
+ bug-wmemstream1 \
|
||||
+ bug-wsetpos \
|
||||
+ test-fmemopen \
|
||||
+ tst-atime \
|
||||
+ tst-bz22415 \
|
||||
+ tst-bz24051 \
|
||||
+ tst-bz24153 \
|
||||
+ tst-bz28828 \
|
||||
+ tst-eof \
|
||||
+ tst-ext \
|
||||
+ tst-ext2 \
|
||||
+ tst-fgetc-after-eof \
|
||||
+ tst-fgetwc \
|
||||
+ tst-fgetws \
|
||||
+ tst-fopenloc2 \
|
||||
+ tst-fputws \
|
||||
+ tst-freopen \
|
||||
+ tst-fseek \
|
||||
+ tst-ftell-active-handler \
|
||||
+ tst-ftell-append \
|
||||
+ tst-ftell-partial-wide \
|
||||
+ tst-fwrite-error \
|
||||
+ tst-getdelim \
|
||||
+ tst-memstream1 \
|
||||
+ tst-memstream2 \
|
||||
+ tst-memstream3 \
|
||||
+ tst-memstream4 \
|
||||
+ tst-mmap-eofsync \
|
||||
+ tst-mmap-fflushsync \
|
||||
+ tst-mmap-offend \
|
||||
+ tst-mmap-setvbuf \
|
||||
+ tst-mmap2-eofsync \
|
||||
+ tst-popen1 \
|
||||
+ tst-setvbuf1 \
|
||||
+ tst-sprintf-chk-ub \
|
||||
+ tst-sprintf-ub \
|
||||
+ tst-sscanf \
|
||||
+ tst-swscanf \
|
||||
+ tst-ungetwc1 \
|
||||
+ tst-ungetwc2 \
|
||||
+ tst-wfile-sync \
|
||||
+ tst-widetext \
|
||||
+ tst-wmemstream1 \
|
||||
+ tst-wmemstream2 \
|
||||
+ tst-wmemstream3 \
|
||||
+ tst-wmemstream4 \
|
||||
+ tst-wmemstream5 \
|
||||
+ tst_getwc \
|
||||
+ tst_putwc \
|
||||
+ tst_swprintf \
|
||||
+ tst_swscanf \
|
||||
+ tst_wprintf \
|
||||
+ tst_wprintf2 \
|
||||
+ tst_wscanf \
|
||||
+ # tests
|
||||
|
||||
tests-internal = tst-vtables tst-vtables-interposed
|
||||
|
||||
@@ -235,16 +289,26 @@ tests-special += $(objpfx)tst-fopenloc-cmp.out $(objpfx)tst-fopenloc-mem.out \
|
||||
$(objpfx)tst-bz24228-mem.out
|
||||
endif
|
||||
|
||||
-tests += tst-cleanup-default tst-cleanup-default-static
|
||||
+tests += \
|
||||
+ tst-cleanup-default \
|
||||
+ tst-cleanup-default-static \
|
||||
+ # tests
|
||||
tests-static += tst-cleanup-default-static
|
||||
tests-special += $(objpfx)tst-cleanup-default-cmp.out $(objpfx)tst-cleanup-default-static-cmp.out
|
||||
LDFLAGS-tst-cleanup-default = -Wl,--gc-sections
|
||||
LDFLAGS-tst-cleanup-default-static = -Wl,--gc-sections
|
||||
|
||||
ifeq ($(have-gnu-retain)$(have-z-start-stop-gc),yesyes)
|
||||
-tests += tst-cleanup-start-stop-gc tst-cleanup-start-stop-gc-static \
|
||||
- tst-cleanup-nostart-stop-gc tst-cleanup-nostart-stop-gc-static
|
||||
-tests-static += tst-cleanup-start-stop-gc-static tst-cleanup-nostart-stop-gc-static
|
||||
+tests += \
|
||||
+ tst-cleanup-nostart-stop-gc \
|
||||
+ tst-cleanup-nostart-stop-gc-static \
|
||||
+ tst-cleanup-start-stop-gc \
|
||||
+ tst-cleanup-start-stop-gc-static \
|
||||
+ # tests
|
||||
+tests-static += \
|
||||
+ tst-cleanup-nostart-stop-gc-static \
|
||||
+ tst-cleanup-start-stop-gc-static \
|
||||
+ # tests-static
|
||||
tests-special += $(objpfx)tst-cleanup-start-stop-gc-cmp.out \
|
||||
$(objpfx)tst-cleanup-start-stop-gc-static-cmp.out \
|
||||
$(objpfx)tst-cleanup-nostart-stop-gc-cmp.out \
|
195
glibc-upstream-2.39-196.patch
Normal file
195
glibc-upstream-2.39-196.patch
Normal file
@ -0,0 +1,195 @@
|
||||
commit 1dcfb9479df400160208ac3d8ab33128d8f1aae5
|
||||
Author: Arjun Shankar <arjun@redhat.com>
|
||||
Date: Fri Oct 18 16:03:25 2024 +0200
|
||||
|
||||
libio: Fix a deadlock after fork in popen
|
||||
|
||||
popen modifies its file handler book-keeping under a lock that wasn't
|
||||
being taken during fork. This meant that a concurrent popen and fork
|
||||
could end up copying the lock in a "locked" state into the fork child,
|
||||
where subsequently calling popen would lead to a deadlock due to the
|
||||
already (spuriously) held lock.
|
||||
|
||||
This commit fixes the deadlock by appropriately taking the lock before
|
||||
fork, and releasing/resetting it in the parent/child after the fork.
|
||||
|
||||
A new test for concurrent popen and fork is also added. It consistently
|
||||
hangs (and therefore fails via timeout) without the fix applied.
|
||||
Reviewed-by: Florian Weimer <fweimer@redhat.com>
|
||||
|
||||
(cherry picked from commit 9f0d2c0ee6c728643fcf9a4879e9f20f5e45ce5f)
|
||||
|
||||
diff --git a/libio/Makefile b/libio/Makefile
|
||||
index 0c1f16ee3b54c2d3..d1f2342867601735 100644
|
||||
--- a/libio/Makefile
|
||||
+++ b/libio/Makefile
|
||||
@@ -115,6 +115,7 @@ tests = \
|
||||
tst-mmap-offend \
|
||||
tst-mmap-setvbuf \
|
||||
tst-mmap2-eofsync \
|
||||
+ tst-popen-fork \
|
||||
tst-popen1 \
|
||||
tst-setvbuf1 \
|
||||
tst-sprintf-chk-ub \
|
||||
diff --git a/libio/iopopen.c b/libio/iopopen.c
|
||||
index d01cb0648e3aac54..352513a2914a9d36 100644
|
||||
--- a/libio/iopopen.c
|
||||
+++ b/libio/iopopen.c
|
||||
@@ -57,6 +57,26 @@ unlock (void *not_used)
|
||||
}
|
||||
#endif
|
||||
|
||||
+/* These lock/unlock/resetlock functions are used during fork. */
|
||||
+
|
||||
+void
|
||||
+_IO_proc_file_chain_lock (void)
|
||||
+{
|
||||
+ _IO_lock_lock (proc_file_chain_lock);
|
||||
+}
|
||||
+
|
||||
+void
|
||||
+_IO_proc_file_chain_unlock (void)
|
||||
+{
|
||||
+ _IO_lock_unlock (proc_file_chain_lock);
|
||||
+}
|
||||
+
|
||||
+void
|
||||
+_IO_proc_file_chain_resetlock (void)
|
||||
+{
|
||||
+ _IO_lock_init (proc_file_chain_lock);
|
||||
+}
|
||||
+
|
||||
/* POSIX states popen shall ensure that any streams from previous popen()
|
||||
calls that remain open in the parent process should be closed in the new
|
||||
child process.
|
||||
diff --git a/libio/libioP.h b/libio/libioP.h
|
||||
index 616253fcd00f04db..a83a411fdf7d93c9 100644
|
||||
--- a/libio/libioP.h
|
||||
+++ b/libio/libioP.h
|
||||
@@ -429,6 +429,12 @@ libc_hidden_proto (_IO_list_resetlock)
|
||||
extern void _IO_enable_locks (void) __THROW;
|
||||
libc_hidden_proto (_IO_enable_locks)
|
||||
|
||||
+/* Functions for operating popen's proc_file_chain_lock during fork. */
|
||||
+
|
||||
+extern void _IO_proc_file_chain_lock (void) __THROW attribute_hidden;
|
||||
+extern void _IO_proc_file_chain_unlock (void) __THROW attribute_hidden;
|
||||
+extern void _IO_proc_file_chain_resetlock (void) __THROW attribute_hidden;
|
||||
+
|
||||
/* Default jumptable functions. */
|
||||
|
||||
extern int _IO_default_underflow (FILE *) __THROW;
|
||||
diff --git a/libio/tst-popen-fork.c b/libio/tst-popen-fork.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..1df30fc6c0a3f583
|
||||
--- /dev/null
|
||||
+++ b/libio/tst-popen-fork.c
|
||||
@@ -0,0 +1,80 @@
|
||||
+/* Test concurrent popen and fork.
|
||||
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <stdio.h>
|
||||
+#include <stdatomic.h>
|
||||
+#include <pthread.h>
|
||||
+#include <unistd.h>
|
||||
+#include <sys/wait.h>
|
||||
+
|
||||
+#include <support/check.h>
|
||||
+#include <support/xthread.h>
|
||||
+#include <support/xunistd.h>
|
||||
+
|
||||
+static void
|
||||
+popen_and_pclose (void)
|
||||
+{
|
||||
+ FILE *f = popen ("true", "r");
|
||||
+ TEST_VERIFY_EXIT (f != NULL);
|
||||
+ pclose (f);
|
||||
+ return;
|
||||
+}
|
||||
+
|
||||
+static atomic_bool done = ATOMIC_VAR_INIT (0);
|
||||
+
|
||||
+static void *
|
||||
+popen_and_pclose_forever (__attribute__ ((unused))
|
||||
+ void *arg)
|
||||
+{
|
||||
+ while (!atomic_load_explicit (&done, memory_order_acquire))
|
||||
+ popen_and_pclose ();
|
||||
+ return NULL;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+
|
||||
+ /* Repeatedly call popen in a loop during the entire test. */
|
||||
+ pthread_t t = xpthread_create (NULL, popen_and_pclose_forever, NULL);
|
||||
+
|
||||
+ /* Repeatedly fork off and reap child processes one-by-one.
|
||||
+ Each child calls popen once, then exits, leading to the possibility
|
||||
+ that a child forks *during* our own popen call, thus inheriting any
|
||||
+ intermediate popen state, possibly including lock state(s). */
|
||||
+ for (int i = 0; i < 100; i++)
|
||||
+ {
|
||||
+ int cpid = xfork ();
|
||||
+
|
||||
+ if (cpid == 0)
|
||||
+ {
|
||||
+ popen_and_pclose ();
|
||||
+ _exit (0);
|
||||
+ }
|
||||
+ else
|
||||
+ xwaitpid (cpid, NULL, 0);
|
||||
+ }
|
||||
+
|
||||
+ /* Stop calling popen. */
|
||||
+ atomic_store_explicit (&done, 1, memory_order_release);
|
||||
+ xpthread_join (t);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/posix/fork.c b/posix/fork.c
|
||||
index 298765a1ffd08b75..cf9b80e7c059e748 100644
|
||||
--- a/posix/fork.c
|
||||
+++ b/posix/fork.c
|
||||
@@ -62,6 +62,7 @@ __libc_fork (void)
|
||||
call_function_static_weak (__nss_database_fork_prepare_parent,
|
||||
&nss_database_data);
|
||||
|
||||
+ _IO_proc_file_chain_lock ();
|
||||
_IO_list_lock ();
|
||||
|
||||
/* Acquire malloc locks. This needs to come last because fork
|
||||
@@ -92,6 +93,7 @@ __libc_fork (void)
|
||||
|
||||
/* Reset locks in the I/O code. */
|
||||
_IO_list_resetlock ();
|
||||
+ _IO_proc_file_chain_resetlock ();
|
||||
|
||||
call_function_static_weak (__nss_database_fork_subprocess,
|
||||
&nss_database_data);
|
||||
@@ -121,6 +123,7 @@ __libc_fork (void)
|
||||
|
||||
/* We execute this even if the 'fork' call failed. */
|
||||
_IO_list_unlock ();
|
||||
+ _IO_proc_file_chain_unlock ();
|
||||
}
|
||||
|
||||
/* Run the handlers registered for the parent. */
|
27
glibc-upstream-2.39-197.patch
Normal file
27
glibc-upstream-2.39-197.patch
Normal file
@ -0,0 +1,27 @@
|
||||
commit 14ec225d859091c048ec54e5c4ddf6738498aee7
|
||||
Author: Arjun Shankar <arjun@redhat.com>
|
||||
Date: Fri Oct 25 09:33:45 2024 +0200
|
||||
|
||||
libio: Correctly link tst-popen-fork against libpthread
|
||||
|
||||
tst-popen-fork failed to build for Hurd due to not being linked with
|
||||
libpthread. This commit fixes that.
|
||||
|
||||
Tested with build-many-glibcs.py for i686-gnu.
|
||||
|
||||
Reviewed-by: Florian Weimer <fweimer@redhat.com>
|
||||
(cherry picked from commit 6a290b2895b77be839fcb7c44a6a9879560097ad)
|
||||
|
||||
diff --git a/libio/Makefile b/libio/Makefile
|
||||
index d1f2342867601735..92d6c6bcab1818d0 100644
|
||||
--- a/libio/Makefile
|
||||
+++ b/libio/Makefile
|
||||
@@ -140,6 +140,8 @@ tests = \
|
||||
tst_wscanf \
|
||||
# tests
|
||||
|
||||
+$(objpfx)tst-popen-fork: $(shared-thread-library)
|
||||
+
|
||||
tests-internal = tst-vtables tst-vtables-interposed
|
||||
|
||||
ifeq (yes,$(build-shared))
|
34
glibc-upstream-2.39-198.patch
Normal file
34
glibc-upstream-2.39-198.patch
Normal file
@ -0,0 +1,34 @@
|
||||
commit 9fe51d34bbce71d186e7adee74e523ccc64a9727
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Thu Feb 15 03:22:55 2024 -0800
|
||||
|
||||
sort-makefile-lines.py: Allow '_' in name and "^# name"
|
||||
|
||||
'_' is used in Makefile variable names and many variables end with
|
||||
"^# name". Relax sort-makefile-lines.py to allow '_' in name and
|
||||
"^# name" as variable end. This fixes BZ #31385.
|
||||
|
||||
(cherry picked from commit 6a2512bf1605a4208dd94ef67408488d8acb2409)
|
||||
|
||||
diff --git a/scripts/sort-makefile-lines.py b/scripts/sort-makefile-lines.py
|
||||
index f65ee40e27fb85ff..b2249aef6d028cf7 100755
|
||||
--- a/scripts/sort-makefile-lines.py
|
||||
+++ b/scripts/sort-makefile-lines.py
|
||||
@@ -129,7 +129,7 @@ def sort_makefile_lines():
|
||||
for i in range(len(lines)):
|
||||
# Look for things like "var = \", "var := \" or "var += \"
|
||||
# to start the sorted list.
|
||||
- var = re.search(r'^([a-zA-Z0-9-]*) [\+:]?\= \\$', lines[i])
|
||||
+ var = re.search(r'^([-_a-zA-Z0-9]*) [\+:]?\= \\$', lines[i])
|
||||
if var:
|
||||
# Remember the index and the name.
|
||||
startmarks.append((i, var.group(1)))
|
||||
@@ -140,7 +140,7 @@ def sort_makefile_lines():
|
||||
rangemarks = []
|
||||
for sm in startmarks:
|
||||
# Look for things like " # var" to end the sorted list.
|
||||
- reg = r'^ # ' + sm[1] + r'$'
|
||||
+ reg = r'^ *# ' + sm[1] + r'$'
|
||||
for j in range(sm[0] + 1, len(lines)):
|
||||
if re.search(reg, lines[j]):
|
||||
# Remember the block to sort (inclusive).
|
32
glibc-upstream-2.39-199.patch
Normal file
32
glibc-upstream-2.39-199.patch
Normal file
@ -0,0 +1,32 @@
|
||||
commit 37b30b6a685c5facccdff61663eb3adf0dd253cd
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Thu Feb 15 11:12:13 2024 -0800
|
||||
|
||||
sysdeps/x86_64/Makefile (tests): Add the end marker
|
||||
|
||||
(cherry picked from commit 71d133c500b0d23f6b6a7c6e3595e3fc447bfe91)
|
||||
|
||||
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
|
||||
index 0ede447405d549b5..08ec882159990e97 100644
|
||||
--- a/sysdeps/x86_64/Makefile
|
||||
+++ b/sysdeps/x86_64/Makefile
|
||||
@@ -32,7 +32,8 @@ sysdep_routines += \
|
||||
# sysdep_routines
|
||||
gen-as-const-headers += locale-defines.sym
|
||||
tests += \
|
||||
- tst-rsi-strlen
|
||||
+ tst-rsi-strlen \
|
||||
+# tests
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),elf)
|
||||
@@ -232,7 +233,8 @@ sysdep_routines += \
|
||||
# sysdep_routines
|
||||
|
||||
tests += \
|
||||
- tst-rsi-wcslen
|
||||
+ tst-rsi-wcslen \
|
||||
+# tests
|
||||
endif
|
||||
|
||||
|
39
glibc-upstream-2.39-200.patch
Normal file
39
glibc-upstream-2.39-200.patch
Normal file
@ -0,0 +1,39 @@
|
||||
commit 4e5ee49a432b8569137bdacc302fc696ed37b1bd
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Wed Feb 28 05:46:40 2024 -0800
|
||||
|
||||
sysdeps/unix/sysv/linux/x86_64/Makefile: Add the end marker
|
||||
|
||||
Add the end marker to tests, tests-container and modules-names.
|
||||
|
||||
(cherry picked from commit e6350be7e9cae8f71c96c1f06eab61b9acb227c8)
|
||||
|
||||
diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile b/sysdeps/unix/sysv/linux/x86_64/Makefile
|
||||
index 9a1e7aa6461725af..fcbffd81cbaa031d 100644
|
||||
--- a/sysdeps/unix/sysv/linux/x86_64/Makefile
|
||||
+++ b/sysdeps/unix/sysv/linux/x86_64/Makefile
|
||||
@@ -17,18 +17,21 @@ endif
|
||||
ifeq ($(subdir),elf)
|
||||
ifeq (yes,$(enable-x86-isa-level))
|
||||
tests += \
|
||||
- tst-glibc-hwcaps-2
|
||||
+ tst-glibc-hwcaps-2 \
|
||||
+# tests
|
||||
ifeq (no,$(build-hardcoded-path-in-tests))
|
||||
# This is an ld.so.cache test, and RPATH/RUNPATH in the executable
|
||||
# interferes with its test objectives.
|
||||
tests-container += \
|
||||
- tst-glibc-hwcaps-2-cache
|
||||
+ tst-glibc-hwcaps-2-cache \
|
||||
+# tests-container
|
||||
endif
|
||||
modules-names += \
|
||||
libx86-64-isa-level-1 \
|
||||
libx86-64-isa-level-2 \
|
||||
libx86-64-isa-level-3 \
|
||||
- libx86-64-isa-level-4
|
||||
+ libx86-64-isa-level-4 \
|
||||
+# modules-names
|
||||
|
||||
$(objpfx)tst-glibc-hwcaps-2: $(objpfx)libx86-64-isa-level.so
|
||||
|
182
glibc-upstream-2.39-201.patch
Normal file
182
glibc-upstream-2.39-201.patch
Normal file
@ -0,0 +1,182 @@
|
||||
commit 147bed0a71a6c5cbf83d05f4081e923d74a6847e
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Thu Feb 13 21:56:52 2025 +0100
|
||||
|
||||
elf: Keep using minimal malloc after early DTV resize (bug 32412)
|
||||
|
||||
If an auditor loads many TLS-using modules during startup, it is
|
||||
possible to trigger DTV resizing. Previously, the DTV was marked
|
||||
as allocated by the main malloc afterwards, even if the minimal
|
||||
malloc was still in use. With this change, _dl_resize_dtv marks
|
||||
the resized DTV as allocated with the minimal malloc.
|
||||
|
||||
The new test reuses TLS-using modules from other auditing tests.
|
||||
|
||||
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||||
(cherry picked from commit aa3d7bd5299b33bffc118aa618b59bfa66059bcb)
|
||||
|
||||
diff --git a/elf/Makefile b/elf/Makefile
|
||||
index 8a5678aa63736812..f2e9cb1075adc8a5 100644
|
||||
--- a/elf/Makefile
|
||||
+++ b/elf/Makefile
|
||||
@@ -376,6 +376,7 @@ tests += \
|
||||
tst-align3 \
|
||||
tst-audit-tlsdesc \
|
||||
tst-audit-tlsdesc-dlopen \
|
||||
+ tst-audit-tlsdesc-dlopen2 \
|
||||
tst-audit1 \
|
||||
tst-audit2 \
|
||||
tst-audit8 \
|
||||
@@ -802,6 +803,7 @@ modules-names += \
|
||||
tst-auditmanymod8 \
|
||||
tst-auditmanymod9 \
|
||||
tst-auditmod-tlsdesc \
|
||||
+ tst-auditmod-tlsdesc2 \
|
||||
tst-auditmod1 \
|
||||
tst-auditmod11 \
|
||||
tst-auditmod12 \
|
||||
@@ -3012,6 +3014,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
|
||||
tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
|
||||
$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
|
||||
tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
|
||||
+$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \
|
||||
+ $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules))
|
||||
+tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so
|
||||
|
||||
$(objpfx)tst-dlmopen-twice.out: \
|
||||
$(objpfx)tst-dlmopen-twice-mod1.so \
|
||||
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
|
||||
index 3d529b722cb271d9..b13e752358a059a4 100644
|
||||
--- a/elf/dl-tls.c
|
||||
+++ b/elf/dl-tls.c
|
||||
@@ -528,6 +528,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
|
||||
if (newp == NULL)
|
||||
oom ();
|
||||
memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t));
|
||||
+#ifdef SHARED
|
||||
+ /* Auditors can trigger a DTV resize event while the full malloc
|
||||
+ is not yet in use. Mark the new DTV allocation as the
|
||||
+ initial allocation. */
|
||||
+ if (!__rtld_malloc_is_complete ())
|
||||
+ GL(dl_initial_dtv) = &newp[1];
|
||||
+#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..7ba2c4129a9bcc53
|
||||
--- /dev/null
|
||||
+++ b/elf/tst-audit-tlsdesc-dlopen2.c
|
||||
@@ -0,0 +1,46 @@
|
||||
+/* Loading TLS-using modules from auditors (bug 32412). Main program.
|
||||
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <support/xdlfcn.h>
|
||||
+#include <stdio.h>
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ puts ("info: start of main program");
|
||||
+
|
||||
+ /* Load TLS-using modules, to trigger DTV resizing. The dynamic
|
||||
+ linker will load them again (requiring their own TLS) because the
|
||||
+ dlopen calls from the auditor were in the auditing namespace. */
|
||||
+ for (int i = 1; i <= 19; ++i)
|
||||
+ {
|
||||
+ char dso[30];
|
||||
+ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
|
||||
+ char sym[30];
|
||||
+ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
|
||||
+
|
||||
+ void *handle = xdlopen (dso, RTLD_LAZY);
|
||||
+ int (*func) (void) = xdlsym (handle, sym);
|
||||
+ /* Trigger TLS allocation. */
|
||||
+ func ();
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..50275cd34d1219c6
|
||||
--- /dev/null
|
||||
+++ b/elf/tst-auditmod-tlsdesc2.c
|
||||
@@ -0,0 +1,59 @@
|
||||
+/* Loading TLS-using modules from auditors (bug 32412). Audit module.
|
||||
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <dlfcn.h>
|
||||
+#include <link.h>
|
||||
+#include <stdbool.h>
|
||||
+#include <stdio.h>
|
||||
+#include <unistd.h>
|
||||
+
|
||||
+unsigned int
|
||||
+la_version (unsigned int version)
|
||||
+{
|
||||
+ /* Open some modules, to trigger DTV resizing before the switch to
|
||||
+ the main malloc. */
|
||||
+ for (int i = 1; i <= 19; ++i)
|
||||
+ {
|
||||
+ char dso[30];
|
||||
+ snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
|
||||
+ char sym[30];
|
||||
+ snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
|
||||
+
|
||||
+ void *handle = dlopen (dso, RTLD_LAZY);
|
||||
+ if (handle == NULL)
|
||||
+ {
|
||||
+ printf ("error: dlmopen from auditor: %s\n", dlerror ());
|
||||
+ fflush (stdout);
|
||||
+ _exit (1);
|
||||
+ }
|
||||
+ int (*func) (void) = dlsym (handle, sym);
|
||||
+ if (func == NULL)
|
||||
+ {
|
||||
+ printf ("error: dlsym from auditor: %s\n", dlerror ());
|
||||
+ fflush (stdout);
|
||||
+ _exit (1);
|
||||
+ }
|
||||
+ /* Trigger TLS allocation. */
|
||||
+ func ();
|
||||
+ }
|
||||
+
|
||||
+ puts ("info: TLS-using modules loaded from auditor");
|
||||
+ fflush (stdout);
|
||||
+
|
||||
+ return LAV_CURRENT;
|
||||
+}
|
57
glibc-upstream-2.39-202.patch
Normal file
57
glibc-upstream-2.39-202.patch
Normal file
@ -0,0 +1,57 @@
|
||||
commit abdeb4b5200e0afb05e6a7863c52d2fbe7029b47
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Tue May 20 19:36:02 2025 +0200
|
||||
|
||||
support: Use const char * argument in support_capture_subprogram_self_sgid
|
||||
|
||||
The function does not modify the passed-in string, so make this clear
|
||||
via the prototype.
|
||||
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit f0c09fe61678df6f7f18fe1ebff074e62fa5ca7a)
|
||||
|
||||
diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h
|
||||
index 1ecbdfe4fc4aa123..f2765278d920839d 100644
|
||||
--- a/support/capture_subprocess.h
|
||||
+++ b/support/capture_subprocess.h
|
||||
@@ -44,8 +44,7 @@ struct support_capture_subprocess support_capture_subprogram
|
||||
/* Copy the running program into a setgid binary and run it with CHILD_ID
|
||||
argument. If execution is successful, return the exit status of the child
|
||||
program, otherwise return a non-zero failure exit code. */
|
||||
-int support_capture_subprogram_self_sgid
|
||||
- (char *child_id);
|
||||
+int support_capture_subprogram_self_sgid (const char *child_id);
|
||||
|
||||
/* Deallocate the subprocess data captured by
|
||||
support_capture_subprocess. */
|
||||
diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
|
||||
index ffced8a89fca37a5..eb72a2c21cf99ee2 100644
|
||||
--- a/support/support_capture_subprocess.c
|
||||
+++ b/support/support_capture_subprocess.c
|
||||
@@ -109,7 +109,7 @@ support_capture_subprogram (const char *file, char *const argv[])
|
||||
safely make it SGID with the TARGET group ID. Then runs the
|
||||
executable. */
|
||||
static int
|
||||
-copy_and_spawn_sgid (char *child_id, gid_t gid)
|
||||
+copy_and_spawn_sgid (const char *child_id, gid_t gid)
|
||||
{
|
||||
char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
|
||||
test_dir, (intmax_t) getpid ());
|
||||
@@ -181,7 +181,7 @@ copy_and_spawn_sgid (char *child_id, gid_t gid)
|
||||
ret = 0;
|
||||
infd = outfd = -1;
|
||||
|
||||
- char * const args[] = {execname, child_id, NULL};
|
||||
+ char * const args[] = {execname, (char *) child_id, NULL};
|
||||
|
||||
status = support_subprogram_wait (args[0], args);
|
||||
|
||||
@@ -210,7 +210,7 @@ err:
|
||||
}
|
||||
|
||||
int
|
||||
-support_capture_subprogram_self_sgid (char *child_id)
|
||||
+support_capture_subprogram_self_sgid (const char *child_id)
|
||||
{
|
||||
gid_t target = 0;
|
||||
const int count = 64;
|
43
glibc-upstream-2.39-203.patch
Normal file
43
glibc-upstream-2.39-203.patch
Normal file
@ -0,0 +1,43 @@
|
||||
commit 71ddb11ccd76843cec6e793977218e227fe51c07
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Mon Dec 23 13:57:55 2024 +0100
|
||||
|
||||
support: Add support_record_failure_barrier
|
||||
|
||||
This can be used to stop execution after a TEST_COMPARE_BLOB
|
||||
failure, for example.
|
||||
|
||||
(cherry picked from commit d0b8aa6de4529231fadfe604ac2c434e559c2d9e)
|
||||
|
||||
diff --git a/support/check.h b/support/check.h
|
||||
index 7ea22c7a2cba5cfd..8f41e5b99fc17472 100644
|
||||
--- a/support/check.h
|
||||
+++ b/support/check.h
|
||||
@@ -207,6 +207,9 @@ void support_record_failure_reset (void);
|
||||
failures or not. */
|
||||
int support_record_failure_is_failed (void);
|
||||
|
||||
+/* Terminate the process if any failures have been encountered so far. */
|
||||
+void support_record_failure_barrier (void);
|
||||
+
|
||||
__END_DECLS
|
||||
|
||||
#endif /* SUPPORT_CHECK_H */
|
||||
diff --git a/support/support_record_failure.c b/support/support_record_failure.c
|
||||
index 978123701d128795..72ee2b232fb2b08c 100644
|
||||
--- a/support/support_record_failure.c
|
||||
+++ b/support/support_record_failure.c
|
||||
@@ -112,3 +112,13 @@ support_record_failure_is_failed (void)
|
||||
synchronization for reliable test error reporting anyway. */
|
||||
return __atomic_load_n (&state->failed, __ATOMIC_RELAXED);
|
||||
}
|
||||
+
|
||||
+void
|
||||
+support_record_failure_barrier (void)
|
||||
+{
|
||||
+ if (__atomic_load_n (&state->failed, __ATOMIC_RELAXED))
|
||||
+ {
|
||||
+ puts ("error: exiting due to previous errors");
|
||||
+ exit (1);
|
||||
+ }
|
||||
+}
|
155
glibc-upstream-2.39-204.patch
Normal file
155
glibc-upstream-2.39-204.patch
Normal file
@ -0,0 +1,155 @@
|
||||
commit ca99d55315b80277a7b189f5a9630f5b08ccaa6d
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Tue May 20 19:45:06 2025 +0200
|
||||
|
||||
elf: Test case for bug 32976 (CVE-2025-4802)
|
||||
|
||||
Check that LD_LIBRARY_PATH is ignored for AT_SECURE statically
|
||||
linked binaries, using support_capture_subprogram_self_sgid.
|
||||
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit d8f7a79335b0d861c12c42aec94c04cd5bb181e2)
|
||||
|
||||
diff --git a/elf/Makefile b/elf/Makefile
|
||||
index f2e9cb1075adc8a5..51d52b57876fc5ba 100644
|
||||
--- a/elf/Makefile
|
||||
+++ b/elf/Makefile
|
||||
@@ -266,6 +266,7 @@ tests-static-normal := \
|
||||
tst-array1-static \
|
||||
tst-array5-static \
|
||||
tst-dl-iter-static \
|
||||
+ tst-dlopen-sgid \
|
||||
tst-dst-static \
|
||||
tst-env-setuid-static \
|
||||
tst-getauxval-static \
|
||||
@@ -844,6 +845,7 @@ modules-names += \
|
||||
tst-dlmopen-twice-mod1 \
|
||||
tst-dlmopen-twice-mod2 \
|
||||
tst-dlmopen1mod \
|
||||
+ tst-dlopen-sgid-mod \
|
||||
tst-dlopen-tlsreinitmod1 \
|
||||
tst-dlopen-tlsreinitmod2 \
|
||||
tst-dlopen-tlsreinitmod3 \
|
||||
@@ -3125,3 +3127,5 @@ $(objpfx)tst-dlopen-tlsreinit3.out: $(objpfx)tst-auditmod1.so
|
||||
tst-dlopen-tlsreinit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
|
||||
$(objpfx)tst-dlopen-tlsreinit4.out: $(objpfx)tst-auditmod1.so
|
||||
tst-dlopen-tlsreinit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so
|
||||
+
|
||||
+$(objpfx)tst-dlopen-sgid.out: $(objpfx)tst-dlopen-sgid-mod.so
|
||||
diff --git a/elf/tst-dlopen-sgid-mod.c b/elf/tst-dlopen-sgid-mod.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..5eb79eef485da4c9
|
||||
--- /dev/null
|
||||
+++ b/elf/tst-dlopen-sgid-mod.c
|
||||
@@ -0,0 +1 @@
|
||||
+/* Opening this object should not succeed. */
|
||||
diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..47829a405e90b6b9
|
||||
--- /dev/null
|
||||
+++ b/elf/tst-dlopen-sgid.c
|
||||
@@ -0,0 +1,104 @@
|
||||
+/* Test case for ignored LD_LIBRARY_PATH in static startup (bug 32976).
|
||||
+ Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <dlfcn.h>
|
||||
+#include <gnu/lib-names.h>
|
||||
+#include <stddef.h>
|
||||
+#include <stdint.h>
|
||||
+#include <stdlib.h>
|
||||
+#include <string.h>
|
||||
+#include <support/capture_subprocess.h>
|
||||
+#include <support/check.h>
|
||||
+#include <support/support.h>
|
||||
+#include <support/temp_file.h>
|
||||
+#include <unistd.h>
|
||||
+
|
||||
+/* This is the name of our test object. Use a custom module for
|
||||
+ testing, so that this object does not get picked up from the system
|
||||
+ path. */
|
||||
+static const char dso_name[] = "tst-dlopen-sgid-mod.so";
|
||||
+
|
||||
+/* Used to mark the recursive invocation. */
|
||||
+static const char magic_argument[] = "run-actual-test";
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+/* Pathname of the directory that receives the shared objects this
|
||||
+ test attempts to load. */
|
||||
+ char *libdir = support_create_temp_directory ("tst-dlopen-sgid-");
|
||||
+
|
||||
+ /* This is supposed to be ignored and stripped. */
|
||||
+ TEST_COMPARE (setenv ("LD_LIBRARY_PATH", libdir, 1), 0);
|
||||
+
|
||||
+ /* Copy of libc.so.6. */
|
||||
+ {
|
||||
+ char *from = xasprintf ("%s/%s", support_objdir_root, LIBC_SO);
|
||||
+ char *to = xasprintf ("%s/%s", libdir, LIBC_SO);
|
||||
+ add_temp_file (to);
|
||||
+ support_copy_file (from, to);
|
||||
+ free (to);
|
||||
+ free (from);
|
||||
+ }
|
||||
+
|
||||
+ /* Copy of the test object. */
|
||||
+ {
|
||||
+ char *from = xasprintf ("%s/elf/%s", support_objdir_root, dso_name);
|
||||
+ char *to = xasprintf ("%s/%s", libdir, dso_name);
|
||||
+ add_temp_file (to);
|
||||
+ support_copy_file (from, to);
|
||||
+ free (to);
|
||||
+ free (from);
|
||||
+ }
|
||||
+
|
||||
+ TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0);
|
||||
+
|
||||
+ free (libdir);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static void
|
||||
+alternative_main (int argc, char **argv)
|
||||
+{
|
||||
+ if (argc == 2 && strcmp (argv[1], magic_argument) == 0)
|
||||
+ {
|
||||
+ if (getgid () == getegid ())
|
||||
+ /* This can happen if the file system is mounted nosuid. */
|
||||
+ FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n",
|
||||
+ (intmax_t) getgid ());
|
||||
+
|
||||
+ /* Should be removed due to SGID. */
|
||||
+ TEST_COMPARE_STRING (getenv ("LD_LIBRARY_PATH"), NULL);
|
||||
+
|
||||
+ TEST_VERIFY (dlopen (dso_name, RTLD_NOW) == NULL);
|
||||
+ {
|
||||
+ const char *message = dlerror ();
|
||||
+ TEST_COMPARE_STRING (message,
|
||||
+ "tst-dlopen-sgid-mod.so:"
|
||||
+ " cannot open shared object file:"
|
||||
+ " No such file or directory");
|
||||
+ }
|
||||
+
|
||||
+ support_record_failure_barrier ();
|
||||
+ exit (EXIT_SUCCESS);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+#define PREPARE alternative_main
|
||||
+#include <support/test-driver.c>
|
33
glibc-upstream-2.39-205.patch
Normal file
33
glibc-upstream-2.39-205.patch
Normal file
@ -0,0 +1,33 @@
|
||||
commit 9e25c0f445606e809996329b8a21d3342529474d
|
||||
Author: Sunil K Pandey <sunil.k.pandey@intel.com>
|
||||
Date: Tue May 20 10:07:27 2025 -0700
|
||||
|
||||
x86_64: Fix typo in ifunc-impl-list.c.
|
||||
|
||||
Fix wcsncpy and wcpncpy typo in ifunc-impl-list.c.
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
(cherry picked from commit f2aeb6ff941dccc4c777b5621e77addea6cc076c)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index c4a21d4b7ca8f01a..c34c94cb58394b56 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -928,7 +928,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
&& CPU_FEATURE_USABLE (BMI2)),
|
||||
__wcsncpy_avx2)
|
||||
- X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
|
||||
+ X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
|
||||
1,
|
||||
__wcsncpy_generic))
|
||||
|
||||
@@ -958,7 +958,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
&& CPU_FEATURE_USABLE (BMI2)),
|
||||
__wcpncpy_avx2)
|
||||
- X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
|
||||
+ X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
|
||||
1,
|
||||
__wcpncpy_generic))
|
||||
|
43
glibc-upstream-2.39-206.patch
Normal file
43
glibc-upstream-2.39-206.patch
Normal file
@ -0,0 +1,43 @@
|
||||
commit 2caef2827f76af88d495eb382da174896d08900a
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Wed May 21 08:43:32 2025 +0200
|
||||
|
||||
elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987)
|
||||
|
||||
This should really move into support_capture_subprogram_self_sgid.
|
||||
|
||||
Reviewed-by: Sam James <sam@gentoo.org>
|
||||
(cherry picked from commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2)
|
||||
|
||||
diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c
|
||||
index 47829a405e90b6b9..5688b79f2e870b1d 100644
|
||||
--- a/elf/tst-dlopen-sgid.c
|
||||
+++ b/elf/tst-dlopen-sgid.c
|
||||
@@ -26,6 +26,8 @@
|
||||
#include <support/check.h>
|
||||
#include <support/support.h>
|
||||
#include <support/temp_file.h>
|
||||
+#include <support/test-driver.h>
|
||||
+#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/* This is the name of our test object. Use a custom module for
|
||||
@@ -66,10 +68,16 @@ do_test (void)
|
||||
free (from);
|
||||
}
|
||||
|
||||
- TEST_COMPARE (support_capture_subprogram_self_sgid (magic_argument), 0);
|
||||
-
|
||||
free (libdir);
|
||||
|
||||
+ int status = support_capture_subprogram_self_sgid (magic_argument);
|
||||
+
|
||||
+ if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
|
||||
+ return EXIT_UNSUPPORTED;
|
||||
+
|
||||
+ if (!WIFEXITED (status))
|
||||
+ FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
|
215
glibc-upstream-2.39-207.patch
Normal file
215
glibc-upstream-2.39-207.patch
Normal file
@ -0,0 +1,215 @@
|
||||
commit c6240a11f7325031651e634309ca1a43a7484bd4
|
||||
Author: Carlos O'Donell <carlos@redhat.com>
|
||||
Date: Wed Jun 11 09:43:50 2025 -0400
|
||||
|
||||
ppc64le: Revert "powerpc: Fix performance issues of strcmp power10" (CVE-2025-5702)
|
||||
|
||||
This reverts commit 90bcc8721ef82b7378d2b080141228660e862d56
|
||||
|
||||
This change is in the chain of the final revert that fixes the CVE
|
||||
i.e. 3367d8e180848030d1646f088759f02b8dfe0d6f
|
||||
|
||||
Reason for revert: Power10 strcmp clobbers non-volatile vector
|
||||
registers (Bug 33056)
|
||||
|
||||
Tested on ppc64le with no regressions.
|
||||
|
||||
(cherry picked from commit c22de63588df7a8a0edceea9bb02534064c9d201)
|
||||
|
||||
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
|
||||
index f0d6732a25efc63b..00f1e9c1707f5dd1 100644
|
||||
--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
|
||||
+++ b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
|
||||
@@ -62,7 +62,7 @@
|
||||
lxvl 32+v5,reg2,r0; \
|
||||
add reg1,reg1,len_reg; \
|
||||
add reg2,reg2,len_reg; \
|
||||
- vcmpnezb v7,v4,v5; \
|
||||
+ vcmpnezb. v7,v4,v5; \
|
||||
vctzlsbb r6,v7; \
|
||||
cmpld cr7,r6,len_reg; \
|
||||
blt cr7,L(different); \
|
||||
@@ -72,110 +72,70 @@
|
||||
|
||||
.machine power9
|
||||
ENTRY_TOCLESS (STRCMP, 4)
|
||||
- andi. r7,r3,4095
|
||||
- andi. r8,r4,4095
|
||||
- cmpldi cr0,r7,4096-16
|
||||
- cmpldi cr1,r8,4096-16
|
||||
- bgt cr0,L(crosses)
|
||||
- bgt cr1,L(crosses)
|
||||
- COMPARE_16(v4,v5,0)
|
||||
-
|
||||
-L(crosses):
|
||||
- andi. r7,r3,15
|
||||
- subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */
|
||||
- andi. r9,r4,15
|
||||
- subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). */
|
||||
- cmpld cr7,r7,r5
|
||||
- beq cr7,L(same_aligned)
|
||||
- blt cr7,L(nalign1_min)
|
||||
+ li r11,16
|
||||
+ /* eq bit of cr1 used as swap status flag to indicate if
|
||||
+ source pointers were swapped. */
|
||||
+ crclr 4*cr1+eq
|
||||
+ vspltisb v19,-1
|
||||
+ andi. r7,r3,15
|
||||
+ sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */
|
||||
+ andi. r9,r4,15
|
||||
+ sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */
|
||||
+ cmpld cr7,r7,r5
|
||||
+ beq cr7,L(same_aligned)
|
||||
+ blt cr7,L(nalign1_min)
|
||||
+ /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the
|
||||
+ pointer which is closer to the next 16B boundary so that only
|
||||
+ one CHECK_N_BYTES is needed before entering the loop below. */
|
||||
+ mr r8,r4
|
||||
+ mr r4,r3
|
||||
+ mr r3,r8
|
||||
+ mr r12,r7
|
||||
+ mr r7,r5
|
||||
+ mr r5,r12
|
||||
+ crset 4*cr1+eq /* Set bit on swapping source pointers. */
|
||||
|
||||
- /* nalign2 is minimum and s2 pointer is aligned. */
|
||||
- CHECK_N_BYTES(r3,r4,r5)
|
||||
- /* Are we on the 64B hunk which crosses a page? */
|
||||
- andi. r10,r3,63 /* Determine offset into 64B hunk. */
|
||||
- andi. r8,r3,15 /* The offset into the 16B hunk. */
|
||||
- neg r7,r3
|
||||
- andi. r9,r7,15 /* Number of bytes after a 16B cross. */
|
||||
- rlwinm. r7,r7,26,0x3F /* ((r3-4096))>>6&63. */
|
||||
- beq L(compare_64_pagecross)
|
||||
- mtctr r7
|
||||
- b L(compare_64B_unaligned)
|
||||
-
|
||||
- /* nalign1 is minimum and s1 pointer is aligned. */
|
||||
+ .p2align 5
|
||||
L(nalign1_min):
|
||||
CHECK_N_BYTES(r3,r4,r7)
|
||||
- /* Are we on the 64B hunk which crosses a page? */
|
||||
- andi. r10,r4,63 /* Determine offset into 64B hunk. */
|
||||
- andi. r8,r4,15 /* The offset into the 16B hunk. */
|
||||
- neg r7,r4
|
||||
- andi. r9,r7,15 /* Number of bytes after a 16B cross. */
|
||||
- rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */
|
||||
- beq L(compare_64_pagecross)
|
||||
- mtctr r7
|
||||
|
||||
.p2align 5
|
||||
-L(compare_64B_unaligned):
|
||||
- COMPARE_16(v4,v5,0)
|
||||
- COMPARE_16(v4,v5,16)
|
||||
- COMPARE_16(v4,v5,32)
|
||||
- COMPARE_16(v4,v5,48)
|
||||
- addi r3,r3,64
|
||||
- addi r4,r4,64
|
||||
- bdnz L(compare_64B_unaligned)
|
||||
+L(s1_aligned):
|
||||
+ /* r9 and r5 is number of bytes to be read after and before
|
||||
+ page boundary correspondingly. */
|
||||
+ sub r5,r5,r7
|
||||
+ subfic r9,r5,16
|
||||
+ /* Now let r7 hold the count of quadwords which can be
|
||||
+ checked without crossing a page boundary. quadword offset is
|
||||
+ (str2>>4)&0xFF. */
|
||||
+ rlwinm r7,r4,28,0xFF
|
||||
+ /* Below check is required only for first iteration. For second
|
||||
+ iteration and beyond, the new loop counter is always 255. */
|
||||
+ cmpldi r7,255
|
||||
+ beq L(L3)
|
||||
+ /* Get the initial loop count by 255-((str2>>4)&0xFF). */
|
||||
+ subfic r11,r7,255
|
||||
|
||||
- /* Cross the page boundary of s2, carefully. Only for first
|
||||
- iteration we have to get the count of 64B blocks to be checked.
|
||||
- From second iteration and beyond, loop counter is always 63. */
|
||||
-L(compare_64_pagecross):
|
||||
- li r11, 63
|
||||
+ .p2align 5
|
||||
+L(L1):
|
||||
mtctr r11
|
||||
- cmpldi r10,16
|
||||
- ble L(cross_4)
|
||||
- cmpldi r10,32
|
||||
- ble L(cross_3)
|
||||
- cmpldi r10,48
|
||||
- ble L(cross_2)
|
||||
-L(cross_1):
|
||||
- CHECK_N_BYTES(r3,r4,r9)
|
||||
- CHECK_N_BYTES(r3,r4,r8)
|
||||
- COMPARE_16(v4,v5,0)
|
||||
- COMPARE_16(v4,v5,16)
|
||||
- COMPARE_16(v4,v5,32)
|
||||
- addi r3,r3,48
|
||||
- addi r4,r4,48
|
||||
- b L(compare_64B_unaligned)
|
||||
-L(cross_2):
|
||||
- COMPARE_16(v4,v5,0)
|
||||
- addi r3,r3,16
|
||||
- addi r4,r4,16
|
||||
- CHECK_N_BYTES(r3,r4,r9)
|
||||
- CHECK_N_BYTES(r3,r4,r8)
|
||||
- COMPARE_16(v4,v5,0)
|
||||
- COMPARE_16(v4,v5,16)
|
||||
- addi r3,r3,32
|
||||
- addi r4,r4,32
|
||||
- b L(compare_64B_unaligned)
|
||||
-L(cross_3):
|
||||
- COMPARE_16(v4,v5,0)
|
||||
- COMPARE_16(v4,v5,16)
|
||||
- addi r3,r3,32
|
||||
- addi r4,r4,32
|
||||
- CHECK_N_BYTES(r3,r4,r9)
|
||||
- CHECK_N_BYTES(r3,r4,r8)
|
||||
- COMPARE_16(v4,v5,0)
|
||||
+
|
||||
+ .p2align 5
|
||||
+L(L2):
|
||||
+ COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */
|
||||
addi r3,r3,16
|
||||
addi r4,r4,16
|
||||
- b L(compare_64B_unaligned)
|
||||
-L(cross_4):
|
||||
- COMPARE_16(v4,v5,0)
|
||||
- COMPARE_16(v4,v5,16)
|
||||
- COMPARE_16(v4,v5,32)
|
||||
- addi r3,r3,48
|
||||
- addi r4,r4,48
|
||||
+ bdnz L(L2)
|
||||
+ /* Cross the page boundary of s2, carefully. */
|
||||
+
|
||||
+ .p2align 5
|
||||
+L(L3):
|
||||
+ CHECK_N_BYTES(r3,r4,r5)
|
||||
CHECK_N_BYTES(r3,r4,r9)
|
||||
- CHECK_N_BYTES(r3,r4,r8)
|
||||
- b L(compare_64B_unaligned)
|
||||
+ li r11,255 /* Load the new loop counter. */
|
||||
+ b L(L1)
|
||||
|
||||
+ .p2align 5
|
||||
L(same_aligned):
|
||||
CHECK_N_BYTES(r3,r4,r7)
|
||||
/* Align s1 to 32B and adjust s2 address.
|
||||
@@ -208,7 +168,18 @@ L(16B_aligned_loop):
|
||||
|
||||
/* Calculate and return the difference. */
|
||||
L(different):
|
||||
- TAIL(v4,v5)
|
||||
+ vctzlsbb r6,v7
|
||||
+ vextubrx r5,r6,v4
|
||||
+ vextubrx r4,r6,v5
|
||||
+ bt 4*cr1+eq,L(swapped)
|
||||
+ subf r3,r4,r5
|
||||
+ blr
|
||||
+
|
||||
+ /* If src pointers were swapped, then swap the
|
||||
+ indices and calculate the return value. */
|
||||
+L(swapped):
|
||||
+ subf r3,r5,r4
|
||||
+ blr
|
||||
|
||||
.p2align 5
|
||||
L(32B_aligned_loop):
|
443
glibc-upstream-2.39-208.patch
Normal file
443
glibc-upstream-2.39-208.patch
Normal file
@ -0,0 +1,443 @@
|
||||
commit 3875045da55e3df9b2a05392504888b88cd68edb
|
||||
Author: Carlos O'Donell <carlos@redhat.com>
|
||||
Date: Wed Jun 11 09:33:45 2025 -0400
|
||||
|
||||
ppc64le: Revert "powerpc : Add optimized memchr for POWER10" (Bug 33059)
|
||||
|
||||
This reverts commit b9182c793caa05df5d697427c0538936e6396d4b
|
||||
|
||||
Reason for revert: Power10 memchr clobbers v20 vector register
|
||||
(Bug 33059)
|
||||
|
||||
This is not a security issue, unlike CVE-2025-5745 and
|
||||
CVE-2025-5702.
|
||||
|
||||
Tested on ppc64le without regression.
|
||||
|
||||
(cherry picked from commit a7877bb6685300f159fa095c9f50b22b112cddb8)
|
||||
|
||||
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S
|
||||
deleted file mode 100644
|
||||
index 53e5716d72e133d5..0000000000000000
|
||||
--- a/sysdeps/powerpc/powerpc64/le/power10/memchr.S
|
||||
+++ /dev/null
|
||||
@@ -1,315 +0,0 @@
|
||||
-/* Optimized memchr implementation for POWER10 LE.
|
||||
- Copyright (C) 2021-2024 Free Software Foundation, Inc.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <https://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#include <sysdep.h>
|
||||
-
|
||||
-# ifndef MEMCHR
|
||||
-# define MEMCHR __memchr
|
||||
-# endif
|
||||
-# define M_VREG_ZERO v20
|
||||
-# define M_OFF_START_LOOP 256
|
||||
-# define MEMCHR_SUBTRACT_VECTORS \
|
||||
- vsububm v4,v4,v18; \
|
||||
- vsububm v5,v5,v18; \
|
||||
- vsububm v6,v6,v18; \
|
||||
- vsububm v7,v7,v18;
|
||||
-# define M_TAIL(vreg,increment) \
|
||||
- vctzlsbb r4,vreg; \
|
||||
- cmpld r5,r4; \
|
||||
- ble L(null); \
|
||||
- addi r4,r4,increment; \
|
||||
- add r3,r6,r4; \
|
||||
- blr
|
||||
-
|
||||
-/* TODO: Replace macros by the actual instructions when minimum binutils becomes
|
||||
- >= 2.35. This is used to keep compatibility with older versions. */
|
||||
-#define M_VEXTRACTBM(rt,vrb) \
|
||||
- .long(((4)<<(32-6)) \
|
||||
- | ((rt)<<(32-11)) \
|
||||
- | ((8)<<(32-16)) \
|
||||
- | ((vrb)<<(32-21)) \
|
||||
- | 1602)
|
||||
-
|
||||
-#define M_LXVP(xtp,dq,ra) \
|
||||
- .long(((6)<<(32-6)) \
|
||||
- | ((((xtp)-32)>>1)<<(32-10)) \
|
||||
- | ((1)<<(32-11)) \
|
||||
- | ((ra)<<(32-16)) \
|
||||
- | dq)
|
||||
-
|
||||
-#define CHECK16B(vreg,offset,addr,label) \
|
||||
- lxv vreg+32,offset(addr); \
|
||||
- vcmpequb. vreg,vreg,v18; \
|
||||
- bne cr6,L(label); \
|
||||
- cmpldi r5,16; \
|
||||
- ble L(null); \
|
||||
- addi r5,r5,-16;
|
||||
-
|
||||
-/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has #
|
||||
- of bytes already checked. */
|
||||
-#define CHECK64B(offset,addr,label) \
|
||||
- M_LXVP(v4+32,offset,addr); \
|
||||
- M_LXVP(v6+32,offset+32,addr); \
|
||||
- MEMCHR_SUBTRACT_VECTORS; \
|
||||
- vminub v14,v4,v5; \
|
||||
- vminub v15,v6,v7; \
|
||||
- vminub v16,v14,v15; \
|
||||
- vcmpequb. v0,v16,M_VREG_ZERO; \
|
||||
- beq cr6,$+12; \
|
||||
- li r7,offset; \
|
||||
- b L(label); \
|
||||
- cmpldi r5,64; \
|
||||
- ble L(null); \
|
||||
- addi r5,r5,-64
|
||||
-
|
||||
-/* Implements the function
|
||||
- void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). */
|
||||
-
|
||||
- .machine power9
|
||||
-
|
||||
-ENTRY_TOCLESS (MEMCHR)
|
||||
- CALL_MCOUNT 3
|
||||
-
|
||||
- cmpldi r5,0
|
||||
- beq L(null)
|
||||
- mr r0,r5
|
||||
- xori r6,r4,0xff
|
||||
-
|
||||
- mtvsrd v18+32,r4 /* matching char in v18 */
|
||||
- mtvsrd v19+32,r6 /* non matching char in v19 */
|
||||
-
|
||||
- vspltb v18,v18,7 /* replicate */
|
||||
- vspltb v19,v19,7 /* replicate */
|
||||
- vspltisb M_VREG_ZERO,0
|
||||
-
|
||||
- /* Next 16B-aligned address. Prepare address for L(aligned). */
|
||||
- addi r6,r3,16
|
||||
- clrrdi r6,r6,4
|
||||
-
|
||||
- /* Align data and fill bytes not loaded with non matching char. */
|
||||
- lvx v0,0,r3
|
||||
- lvsr v1,0,r3
|
||||
- vperm v0,v19,v0,v1
|
||||
-
|
||||
- vcmpequb. v6,v0,v18
|
||||
- bne cr6,L(found)
|
||||
- sub r4,r6,r3
|
||||
- cmpld r5,r4
|
||||
- ble L(null)
|
||||
- sub r5,r5,r4
|
||||
-
|
||||
- /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is
|
||||
- optimized for longer strings, so checking the first bytes in 16B
|
||||
- chunks benefits a lot small strings. */
|
||||
- .p2align 5
|
||||
-L(aligned):
|
||||
- cmpldi r5,0
|
||||
- beq L(null)
|
||||
-
|
||||
- CHECK16B(v0,0,r6,tail1)
|
||||
- CHECK16B(v1,16,r6,tail2)
|
||||
- CHECK16B(v2,32,r6,tail3)
|
||||
- CHECK16B(v3,48,r6,tail4)
|
||||
- CHECK16B(v4,64,r6,tail5)
|
||||
- CHECK16B(v5,80,r6,tail6)
|
||||
- CHECK16B(v6,96,r6,tail7)
|
||||
- CHECK16B(v7,112,r6,tail8)
|
||||
- CHECK16B(v8,128,r6,tail9)
|
||||
- CHECK16B(v9,144,r6,tail10)
|
||||
- CHECK16B(v10,160,r6,tail11)
|
||||
- CHECK16B(v0,176,r6,tail12)
|
||||
- CHECK16B(v1,192,r6,tail13)
|
||||
- CHECK16B(v2,208,r6,tail14)
|
||||
- CHECK16B(v3,224,r6,tail15)
|
||||
-
|
||||
- cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to
|
||||
- choose how we will perform the main loop. */
|
||||
-
|
||||
- /* Prepare address for the loop. */
|
||||
- addi r4,r3,M_OFF_START_LOOP
|
||||
- clrrdi r4,r4,6
|
||||
- sub r6,r4,r3
|
||||
- sub r5,r0,r6
|
||||
- addi r6,r4,128
|
||||
-
|
||||
- /* If c == 0, use the loop without the vsububm. */
|
||||
- beq cr5,L(loop)
|
||||
-
|
||||
- /* This is very similar to the block after L(loop), the difference is
|
||||
- that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
|
||||
- each byte loaded by the char we are looking for, this way we can keep
|
||||
- using vminub to merge the results and checking for nulls. */
|
||||
- .p2align 5
|
||||
-L(memchr_loop):
|
||||
- CHECK64B(0,r4,pre_tail_64b)
|
||||
- CHECK64B(64,r4,pre_tail_64b)
|
||||
- addi r4,r4,256
|
||||
-
|
||||
- CHECK64B(0,r6,tail_64b)
|
||||
- CHECK64B(64,r6,tail_64b)
|
||||
- addi r6,r6,256
|
||||
-
|
||||
- CHECK64B(0,r4,pre_tail_64b)
|
||||
- CHECK64B(64,r4,pre_tail_64b)
|
||||
- addi r4,r4,256
|
||||
-
|
||||
- CHECK64B(0,r6,tail_64b)
|
||||
- CHECK64B(64,r6,tail_64b)
|
||||
- addi r6,r6,256
|
||||
-
|
||||
- b L(memchr_loop)
|
||||
- /* Switch to a more aggressive approach checking 64B each time. Use 2
|
||||
- pointers 128B apart and unroll the loop once to make the pointer
|
||||
- updates and usages separated enough to avoid stalls waiting for
|
||||
- address calculation. */
|
||||
- .p2align 5
|
||||
-L(loop):
|
||||
-#undef MEMCHR_SUBTRACT_VECTORS
|
||||
-#define MEMCHR_SUBTRACT_VECTORS /* nothing */
|
||||
- CHECK64B(0,r4,pre_tail_64b)
|
||||
- CHECK64B(64,r4,pre_tail_64b)
|
||||
- addi r4,r4,256
|
||||
-
|
||||
- CHECK64B(0,r6,tail_64b)
|
||||
- CHECK64B(64,r6,tail_64b)
|
||||
- addi r6,r6,256
|
||||
-
|
||||
- CHECK64B(0,r4,pre_tail_64b)
|
||||
- CHECK64B(64,r4,pre_tail_64b)
|
||||
- addi r4,r4,256
|
||||
-
|
||||
- CHECK64B(0,r6,tail_64b)
|
||||
- CHECK64B(64,r6,tail_64b)
|
||||
- addi r6,r6,256
|
||||
-
|
||||
- b L(loop)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(pre_tail_64b):
|
||||
- mr r6,r4
|
||||
-L(tail_64b):
|
||||
- /* OK, we found a null byte. Let's look for it in the current 64-byte
|
||||
- block and mark it in its corresponding VR. lxvp vx,0(ry) puts the
|
||||
- low 16B bytes into vx+1, and the high into vx, so the order here is
|
||||
- v5, v4, v7, v6. */
|
||||
- vcmpequb v1,v5,M_VREG_ZERO
|
||||
- vcmpequb v2,v4,M_VREG_ZERO
|
||||
- vcmpequb v3,v7,M_VREG_ZERO
|
||||
- vcmpequb v4,v6,M_VREG_ZERO
|
||||
-
|
||||
- /* Take into account the other 64B blocks we had already checked. */
|
||||
- add r6,r6,r7
|
||||
- /* Extract first bit of each byte. */
|
||||
- M_VEXTRACTBM(r8,v1)
|
||||
- M_VEXTRACTBM(r9,v2)
|
||||
- M_VEXTRACTBM(r10,v3)
|
||||
- M_VEXTRACTBM(r11,v4)
|
||||
-
|
||||
- /* Shift each value into their corresponding position. */
|
||||
- sldi r9,r9,16
|
||||
- sldi r10,r10,32
|
||||
- sldi r11,r11,48
|
||||
-
|
||||
- /* Merge the results. */
|
||||
- or r8,r8,r9
|
||||
- or r9,r10,r11
|
||||
- or r11,r9,r8
|
||||
-
|
||||
- cnttzd r0,r11 /* Count trailing zeros before the match. */
|
||||
- cmpld r5,r0
|
||||
- ble L(null)
|
||||
- add r3,r6,r0 /* Compute final address. */
|
||||
- blr
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail1):
|
||||
- M_TAIL(v0,0)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail2):
|
||||
- M_TAIL(v1,16)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail3):
|
||||
- M_TAIL(v2,32)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail4):
|
||||
- M_TAIL(v3,48)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail5):
|
||||
- M_TAIL(v4,64)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail6):
|
||||
- M_TAIL(v5,80)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail7):
|
||||
- M_TAIL(v6,96)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail8):
|
||||
- M_TAIL(v7,112)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail9):
|
||||
- M_TAIL(v8,128)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail10):
|
||||
- M_TAIL(v9,144)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail11):
|
||||
- M_TAIL(v10,160)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail12):
|
||||
- M_TAIL(v0,176)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail13):
|
||||
- M_TAIL(v1,192)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail14):
|
||||
- M_TAIL(v2,208)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(tail15):
|
||||
- M_TAIL(v3,224)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(found):
|
||||
- vctzlsbb r7,v6
|
||||
- cmpld r5,r7
|
||||
- ble L(null)
|
||||
- add r3,r3,r7
|
||||
- blr
|
||||
-
|
||||
- .p2align 5
|
||||
-L(null):
|
||||
- li r3,0
|
||||
- blr
|
||||
-
|
||||
-END (MEMCHR)
|
||||
-
|
||||
-weak_alias (__memchr, memchr)
|
||||
-libc_hidden_builtin_def (memchr)
|
||||
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||
index 594fbb8058569d95..d7824a922b0de470 100644
|
||||
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||
@@ -31,10 +31,10 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
|
||||
strncase-power8
|
||||
|
||||
ifneq (,$(filter %le,$(config-machine)))
|
||||
-sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \
|
||||
- memmove-power10 memset-power10 rawmemchr-power9 \
|
||||
- rawmemchr-power10 strcmp-power9 strcmp-power10 \
|
||||
- strncmp-power9 strcpy-power9 stpcpy-power9 \
|
||||
+sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
|
||||
+ rawmemchr-power9 rawmemchr-power10 \
|
||||
+ strcmp-power9 strcmp-power10 strncmp-power9 \
|
||||
+ strcpy-power9 stpcpy-power9 \
|
||||
strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
|
||||
endif
|
||||
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
|
||||
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||
index 5b2d6a90ab59e561..e2f733eb82fa6199 100644
|
||||
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||
@@ -226,12 +226,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
|
||||
/* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. */
|
||||
IFUNC_IMPL (i, name, memchr,
|
||||
-#ifdef __LITTLE_ENDIAN__
|
||||
- IFUNC_IMPL_ADD (array, i, memchr,
|
||||
- hwcap2 & PPC_FEATURE2_ARCH_3_1
|
||||
- && hwcap & PPC_FEATURE_HAS_VSX,
|
||||
- __memchr_power10)
|
||||
-#endif
|
||||
IFUNC_IMPL_ADD (array, i, memchr,
|
||||
hwcap2 & PPC_FEATURE2_ARCH_2_07
|
||||
&& hwcap & PPC_FEATURE_HAS_ALTIVEC,
|
||||
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S
|
||||
deleted file mode 100644
|
||||
index 7d35ef28a91255ba..0000000000000000
|
||||
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S
|
||||
+++ /dev/null
|
||||
@@ -1,28 +0,0 @@
|
||||
-/* Optimized memchr implementation for POWER10/PPC64.
|
||||
- Copyright (C) 2016-2024 Free Software Foundation, Inc.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <https://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
|
||||
-#define MEMCHR __memchr_power10
|
||||
-
|
||||
-#undef libc_hidden_builtin_def
|
||||
-#define libc_hidden_builtin_def(name)
|
||||
-#undef weak_alias
|
||||
-#define weak_alias(name,alias)
|
||||
-
|
||||
-#include <sysdeps/powerpc/powerpc64/le/power10/memchr.S>
|
||||
-#endif
|
||||
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
|
||||
index 57d23e7b18587e82..b4655dfcaa482774 100644
|
||||
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
|
||||
+++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
|
||||
@@ -25,23 +25,15 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden;
|
||||
extern __typeof (__memchr) __memchr_power7 attribute_hidden;
|
||||
extern __typeof (__memchr) __memchr_power8 attribute_hidden;
|
||||
|
||||
-# ifdef __LITTLE_ENDIAN__
|
||||
-extern __typeof (__memchr) __memchr_power10 attribute_hidden;
|
||||
-# endif
|
||||
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
|
||||
ifunc symbol properly. */
|
||||
libc_ifunc (__memchr,
|
||||
-# ifdef __LITTLE_ENDIAN__
|
||||
- (hwcap2 & PPC_FEATURE2_ARCH_3_1
|
||||
- && hwcap & PPC_FEATURE_HAS_VSX)
|
||||
- ? __memchr_power10 :
|
||||
-# endif
|
||||
- (hwcap2 & PPC_FEATURE2_ARCH_2_07
|
||||
- && hwcap & PPC_FEATURE_HAS_ALTIVEC)
|
||||
- ? __memchr_power8 :
|
||||
- (hwcap & PPC_FEATURE_ARCH_2_06)
|
||||
- ? __memchr_power7
|
||||
- : __memchr_ppc);
|
||||
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
|
||||
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
|
||||
+ ? __memchr_power8 :
|
||||
+ (hwcap & PPC_FEATURE_ARCH_2_06)
|
||||
+ ? __memchr_power7
|
||||
+ : __memchr_ppc);
|
||||
|
||||
weak_alias (__memchr, memchr)
|
||||
libc_hidden_builtin_def (memchr)
|
307
glibc-upstream-2.39-209.patch
Normal file
307
glibc-upstream-2.39-209.patch
Normal file
@ -0,0 +1,307 @@
|
||||
commit 06a70769fd0b2e1f2a3085ad50ab620282bd77b3
|
||||
Author: Carlos O'Donell <carlos@redhat.com>
|
||||
Date: Mon Jun 16 13:09:57 2025 -0400
|
||||
|
||||
ppc64le: Revert "powerpc: Optimized strcmp for power10" (CVE-2025-5702)
|
||||
|
||||
This reverts commit 3367d8e180848030d1646f088759f02b8dfe0d6f
|
||||
|
||||
Reason for revert: Power10 strcmp clobbers non-volatile vector
|
||||
registers (Bug 33056)
|
||||
|
||||
Tested on ppc64le without regression.
|
||||
|
||||
(cherry picked from commit 15808c77b35319e67ee0dc8f984a9a1a434701bc)
|
||||
|
||||
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
|
||||
deleted file mode 100644
|
||||
index 00f1e9c1707f5dd1..0000000000000000
|
||||
--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
|
||||
+++ /dev/null
|
||||
@@ -1,204 +0,0 @@
|
||||
-/* Optimized strcmp implementation for PowerPC64/POWER10.
|
||||
- Copyright (C) 2021-2024 Free Software Foundation, Inc.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <https://www.gnu.org/licenses/>. */
|
||||
-#include <sysdep.h>
|
||||
-
|
||||
-#ifndef STRCMP
|
||||
-# define STRCMP strcmp
|
||||
-#endif
|
||||
-
|
||||
-/* Implements the function
|
||||
- int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). */
|
||||
-
|
||||
-/* TODO: Change this to actual instructions when minimum binutils is upgraded
|
||||
- to 2.27. Macros are defined below for these newer instructions in order
|
||||
- to maintain compatibility. */
|
||||
-
|
||||
-#define LXVP(xtp,dq,ra) \
|
||||
- .long(((6)<<(32-6)) \
|
||||
- | ((((xtp)-32)>>1)<<(32-10)) \
|
||||
- | ((1)<<(32-11)) \
|
||||
- | ((ra)<<(32-16)) \
|
||||
- | dq)
|
||||
-
|
||||
-#define COMPARE_16(vreg1,vreg2,offset) \
|
||||
- lxv vreg1+32,offset(r3); \
|
||||
- lxv vreg2+32,offset(r4); \
|
||||
- vcmpnezb. v7,vreg1,vreg2; \
|
||||
- bne cr6,L(different); \
|
||||
-
|
||||
-#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \
|
||||
- LXVP(vreg1+32,offset,r3); \
|
||||
- LXVP(vreg2+32,offset,r4); \
|
||||
- vcmpnezb. v7,vreg1+1,vreg2+1; \
|
||||
- bne cr6,L(label1); \
|
||||
- vcmpnezb. v7,vreg1,vreg2; \
|
||||
- bne cr6,L(label2); \
|
||||
-
|
||||
-#define TAIL(vreg1,vreg2) \
|
||||
- vctzlsbb r6,v7; \
|
||||
- vextubrx r5,r6,vreg1; \
|
||||
- vextubrx r4,r6,vreg2; \
|
||||
- subf r3,r4,r5; \
|
||||
- blr; \
|
||||
-
|
||||
-#define CHECK_N_BYTES(reg1,reg2,len_reg) \
|
||||
- sldi r0,len_reg,56; \
|
||||
- lxvl 32+v4,reg1,r0; \
|
||||
- lxvl 32+v5,reg2,r0; \
|
||||
- add reg1,reg1,len_reg; \
|
||||
- add reg2,reg2,len_reg; \
|
||||
- vcmpnezb. v7,v4,v5; \
|
||||
- vctzlsbb r6,v7; \
|
||||
- cmpld cr7,r6,len_reg; \
|
||||
- blt cr7,L(different); \
|
||||
-
|
||||
- /* TODO: change this to .machine power10 when the minimum required
|
||||
- binutils allows it. */
|
||||
-
|
||||
- .machine power9
|
||||
-ENTRY_TOCLESS (STRCMP, 4)
|
||||
- li r11,16
|
||||
- /* eq bit of cr1 used as swap status flag to indicate if
|
||||
- source pointers were swapped. */
|
||||
- crclr 4*cr1+eq
|
||||
- vspltisb v19,-1
|
||||
- andi. r7,r3,15
|
||||
- sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */
|
||||
- andi. r9,r4,15
|
||||
- sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */
|
||||
- cmpld cr7,r7,r5
|
||||
- beq cr7,L(same_aligned)
|
||||
- blt cr7,L(nalign1_min)
|
||||
- /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the
|
||||
- pointer which is closer to the next 16B boundary so that only
|
||||
- one CHECK_N_BYTES is needed before entering the loop below. */
|
||||
- mr r8,r4
|
||||
- mr r4,r3
|
||||
- mr r3,r8
|
||||
- mr r12,r7
|
||||
- mr r7,r5
|
||||
- mr r5,r12
|
||||
- crset 4*cr1+eq /* Set bit on swapping source pointers. */
|
||||
-
|
||||
- .p2align 5
|
||||
-L(nalign1_min):
|
||||
- CHECK_N_BYTES(r3,r4,r7)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(s1_aligned):
|
||||
- /* r9 and r5 is number of bytes to be read after and before
|
||||
- page boundary correspondingly. */
|
||||
- sub r5,r5,r7
|
||||
- subfic r9,r5,16
|
||||
- /* Now let r7 hold the count of quadwords which can be
|
||||
- checked without crossing a page boundary. quadword offset is
|
||||
- (str2>>4)&0xFF. */
|
||||
- rlwinm r7,r4,28,0xFF
|
||||
- /* Below check is required only for first iteration. For second
|
||||
- iteration and beyond, the new loop counter is always 255. */
|
||||
- cmpldi r7,255
|
||||
- beq L(L3)
|
||||
- /* Get the initial loop count by 255-((str2>>4)&0xFF). */
|
||||
- subfic r11,r7,255
|
||||
-
|
||||
- .p2align 5
|
||||
-L(L1):
|
||||
- mtctr r11
|
||||
-
|
||||
- .p2align 5
|
||||
-L(L2):
|
||||
- COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */
|
||||
- addi r3,r3,16
|
||||
- addi r4,r4,16
|
||||
- bdnz L(L2)
|
||||
- /* Cross the page boundary of s2, carefully. */
|
||||
-
|
||||
- .p2align 5
|
||||
-L(L3):
|
||||
- CHECK_N_BYTES(r3,r4,r5)
|
||||
- CHECK_N_BYTES(r3,r4,r9)
|
||||
- li r11,255 /* Load the new loop counter. */
|
||||
- b L(L1)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(same_aligned):
|
||||
- CHECK_N_BYTES(r3,r4,r7)
|
||||
- /* Align s1 to 32B and adjust s2 address.
|
||||
- Use lxvp only if both s1 and s2 are 32B aligned. */
|
||||
- COMPARE_16(v4,v5,0)
|
||||
- COMPARE_16(v4,v5,16)
|
||||
- COMPARE_16(v4,v5,32)
|
||||
- COMPARE_16(v4,v5,48)
|
||||
- addi r3,r3,64
|
||||
- addi r4,r4,64
|
||||
- COMPARE_16(v4,v5,0)
|
||||
- COMPARE_16(v4,v5,16)
|
||||
-
|
||||
- clrldi r6,r3,59
|
||||
- subfic r5,r6,32
|
||||
- add r3,r3,r5
|
||||
- add r4,r4,r5
|
||||
- andi. r5,r4,0x1F
|
||||
- beq cr0,L(32B_aligned_loop)
|
||||
-
|
||||
- .p2align 5
|
||||
-L(16B_aligned_loop):
|
||||
- COMPARE_16(v4,v5,0)
|
||||
- COMPARE_16(v4,v5,16)
|
||||
- COMPARE_16(v4,v5,32)
|
||||
- COMPARE_16(v4,v5,48)
|
||||
- addi r3,r3,64
|
||||
- addi r4,r4,64
|
||||
- b L(16B_aligned_loop)
|
||||
-
|
||||
- /* Calculate and return the difference. */
|
||||
-L(different):
|
||||
- vctzlsbb r6,v7
|
||||
- vextubrx r5,r6,v4
|
||||
- vextubrx r4,r6,v5
|
||||
- bt 4*cr1+eq,L(swapped)
|
||||
- subf r3,r4,r5
|
||||
- blr
|
||||
-
|
||||
- /* If src pointers were swapped, then swap the
|
||||
- indices and calculate the return value. */
|
||||
-L(swapped):
|
||||
- subf r3,r5,r4
|
||||
- blr
|
||||
-
|
||||
- .p2align 5
|
||||
-L(32B_aligned_loop):
|
||||
- COMPARE_32(v14,v16,0,tail1,tail2)
|
||||
- COMPARE_32(v18,v20,32,tail3,tail4)
|
||||
- COMPARE_32(v22,v24,64,tail5,tail6)
|
||||
- COMPARE_32(v26,v28,96,tail7,tail8)
|
||||
- addi r3,r3,128
|
||||
- addi r4,r4,128
|
||||
- b L(32B_aligned_loop)
|
||||
-
|
||||
-L(tail1): TAIL(v15,v17)
|
||||
-L(tail2): TAIL(v14,v16)
|
||||
-L(tail3): TAIL(v19,v21)
|
||||
-L(tail4): TAIL(v18,v20)
|
||||
-L(tail5): TAIL(v23,v25)
|
||||
-L(tail6): TAIL(v22,v24)
|
||||
-L(tail7): TAIL(v27,v29)
|
||||
-L(tail8): TAIL(v26,v28)
|
||||
-
|
||||
-END (STRCMP)
|
||||
-libc_hidden_builtin_def (strcmp)
|
||||
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||
index d7824a922b0de470..27d8495503a5a1fe 100644
|
||||
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||
@@ -33,8 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
|
||||
ifneq (,$(filter %le,$(config-machine)))
|
||||
sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
|
||||
rawmemchr-power9 rawmemchr-power10 \
|
||||
- strcmp-power9 strcmp-power10 strncmp-power9 \
|
||||
- strcpy-power9 stpcpy-power9 \
|
||||
+ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
|
||||
strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
|
||||
endif
|
||||
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
|
||||
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||
index e2f733eb82fa6199..ad6080f1991f4080 100644
|
||||
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||
@@ -377,10 +377,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */
|
||||
IFUNC_IMPL (i, name, strcmp,
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
- IFUNC_IMPL_ADD (array, i, strcmp,
|
||||
- (hwcap2 & PPC_FEATURE2_ARCH_3_1)
|
||||
- && (hwcap & PPC_FEATURE_HAS_VSX),
|
||||
- __strcmp_power10)
|
||||
IFUNC_IMPL_ADD (array, i, strcmp,
|
||||
hwcap2 & PPC_FEATURE2_ARCH_3_00
|
||||
&& hwcap & PPC_FEATURE_HAS_ALTIVEC,
|
||||
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
|
||||
deleted file mode 100644
|
||||
index 1a9f6069f589a95c..0000000000000000
|
||||
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
|
||||
+++ /dev/null
|
||||
@@ -1,26 +0,0 @@
|
||||
-/* Optimized strcmp implementation for POWER10/PPC64.
|
||||
- Copyright (C) 2021-2024 Free Software Foundation, Inc.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <https://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
|
||||
-#define STRCMP __strcmp_power10
|
||||
-
|
||||
-#undef libc_hidden_builtin_def
|
||||
-#define libc_hidden_builtin_def(name)
|
||||
-
|
||||
-#include <sysdeps/powerpc/powerpc64/le/power10/strcmp.S>
|
||||
-#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */
|
||||
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
|
||||
index ff32496fabba2e47..06b9b4090ff23ee1 100644
|
||||
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
|
||||
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
|
||||
@@ -29,16 +29,12 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
|
||||
extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
|
||||
# ifdef __LITTLE_ENDIAN__
|
||||
extern __typeof (strcmp) __strcmp_power9 attribute_hidden;
|
||||
-extern __typeof (strcmp) __strcmp_power10 attribute_hidden;
|
||||
# endif
|
||||
|
||||
# undef strcmp
|
||||
|
||||
libc_ifunc_redirected (__redirect_strcmp, strcmp,
|
||||
# ifdef __LITTLE_ENDIAN__
|
||||
- (hwcap2 & PPC_FEATURE2_ARCH_3_1
|
||||
- && hwcap & PPC_FEATURE_HAS_VSX)
|
||||
- ? __strcmp_power10 :
|
||||
(hwcap2 & PPC_FEATURE2_ARCH_3_00
|
||||
&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
|
||||
? __strcmp_power9 :
|
106
glibc-upstream-2.39-210.patch
Normal file
106
glibc-upstream-2.39-210.patch
Normal file
@ -0,0 +1,106 @@
|
||||
commit 1924d341c0acbb9bf9ec77f1971fdb109933d12f
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Wed May 21 16:47:34 2025 +0200
|
||||
|
||||
support: Pick group in support_capture_subprogram_self_sgid if UID == 0
|
||||
|
||||
When running as root, it is likely that we can run under any group.
|
||||
Pick a harmless group from /etc/group in this case.
|
||||
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit 2f769cec448d84a62b7dd0d4ff56978fe22c0cd6)
|
||||
|
||||
diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
|
||||
index eb72a2c21cf99ee2..1a30ae3f31f041d4 100644
|
||||
--- a/support/support_capture_subprocess.c
|
||||
+++ b/support/support_capture_subprocess.c
|
||||
@@ -21,7 +21,11 @@
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
+#include <grp.h>
|
||||
+#include <scratch_buffer.h>
|
||||
+#include <stdio_ext.h>
|
||||
#include <stdlib.h>
|
||||
+#include <string.h>
|
||||
#include <support/check.h>
|
||||
#include <support/xunistd.h>
|
||||
#include <support/xsocket.h>
|
||||
@@ -209,10 +213,48 @@ err:
|
||||
return status;
|
||||
}
|
||||
|
||||
+/* Returns true if a group with NAME has been found, and writes its
|
||||
+ GID to *TARGET. */
|
||||
+static bool
|
||||
+find_sgid_group (gid_t *target, const char *name)
|
||||
+{
|
||||
+ /* Do not use getgrnam_r because it does not work in statically
|
||||
+ linked binaries if the system libc is different. */
|
||||
+ FILE *fp = fopen ("/etc/group", "rce");
|
||||
+ if (fp == NULL)
|
||||
+ return false;
|
||||
+ __fsetlocking (fp, FSETLOCKING_BYCALLER);
|
||||
+
|
||||
+ bool ok = false;
|
||||
+ struct scratch_buffer buf;
|
||||
+ scratch_buffer_init (&buf);
|
||||
+ while (true)
|
||||
+ {
|
||||
+ struct group grp;
|
||||
+ struct group *result = NULL;
|
||||
+ int status = fgetgrent_r (fp, &grp, buf.data, buf.length, &result);
|
||||
+ if (status == 0 && result != NULL)
|
||||
+ {
|
||||
+ if (strcmp (result->gr_name, name) == 0)
|
||||
+ {
|
||||
+ *target = result->gr_gid;
|
||||
+ ok = true;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ else if (errno != ERANGE)
|
||||
+ break;
|
||||
+ else if (!scratch_buffer_grow (&buf))
|
||||
+ break;
|
||||
+ }
|
||||
+ scratch_buffer_free (&buf);
|
||||
+ fclose (fp);
|
||||
+ return ok;
|
||||
+}
|
||||
+
|
||||
int
|
||||
support_capture_subprogram_self_sgid (const char *child_id)
|
||||
{
|
||||
- gid_t target = 0;
|
||||
const int count = 64;
|
||||
gid_t groups[count];
|
||||
|
||||
@@ -224,6 +266,7 @@ support_capture_subprogram_self_sgid (const char *child_id)
|
||||
(intmax_t) getuid ());
|
||||
|
||||
gid_t current = getgid ();
|
||||
+ gid_t target = current;
|
||||
for (int i = 0; i < ret; ++i)
|
||||
{
|
||||
if (groups[i] != current)
|
||||
@@ -233,9 +276,16 @@ support_capture_subprogram_self_sgid (const char *child_id)
|
||||
}
|
||||
}
|
||||
|
||||
- if (target == 0)
|
||||
- FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n",
|
||||
- (intmax_t) getuid ());
|
||||
+ if (target == current)
|
||||
+ {
|
||||
+ /* If running as root, try to find a harmless group for SGID. */
|
||||
+ if (getuid () != 0
|
||||
+ || (!find_sgid_group (&target, "nogroup")
|
||||
+ && !find_sgid_group (&target, "bin")
|
||||
+ && !find_sgid_group (&target, "daemon")))
|
||||
+ FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n",
|
||||
+ (intmax_t) getuid ());
|
||||
+ }
|
||||
|
||||
return copy_and_spawn_sgid (child_id, target);
|
||||
}
|
318
glibc-upstream-2.39-211.patch
Normal file
318
glibc-upstream-2.39-211.patch
Normal file
@ -0,0 +1,318 @@
|
||||
commit cff1042cceec3502269947e96cf7023451af22f3
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Thu May 22 14:36:37 2025 +0200
|
||||
|
||||
Fix error reporting (false negatives) in SGID tests
|
||||
|
||||
And simplify the interface of support_capture_subprogram_self_sgid.
|
||||
|
||||
Use the existing framework for temporary directories (now with
|
||||
mode 0700) and directory/file deletion. Handle all execution
|
||||
errors within support_capture_subprogram_self_sgid. In particular,
|
||||
this includes test failures because the invoked program did not
|
||||
exit with exit status zero. Existing tests that expect exit
|
||||
status 42 are adjusted to use zero instead.
|
||||
|
||||
In addition, fix callers not to call exit (0) with test failures
|
||||
pending (which may mask them, especially when running with --direct).
|
||||
|
||||
Fixes commit 35fc356fa3b4f485bd3ba3114c9f774e5df7d3c2
|
||||
("elf: Fix subprocess status handling for tst-dlopen-sgid (bug 32987)").
|
||||
|
||||
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||
(cherry picked from commit 3a3fb2ed83f79100c116c824454095ecfb335ad7)
|
||||
|
||||
diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c
|
||||
index 5688b79f2e870b1d..8aec52e19fc56aba 100644
|
||||
--- a/elf/tst-dlopen-sgid.c
|
||||
+++ b/elf/tst-dlopen-sgid.c
|
||||
@@ -70,13 +70,7 @@ do_test (void)
|
||||
|
||||
free (libdir);
|
||||
|
||||
- int status = support_capture_subprogram_self_sgid (magic_argument);
|
||||
-
|
||||
- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
|
||||
- return EXIT_UNSUPPORTED;
|
||||
-
|
||||
- if (!WIFEXITED (status))
|
||||
- FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
|
||||
+ support_capture_subprogram_self_sgid (magic_argument);
|
||||
|
||||
return 0;
|
||||
}
|
||||
diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c
|
||||
index a47219047f0f602d..233eec7631ed837c 100644
|
||||
--- a/elf/tst-env-setuid-tunables.c
|
||||
+++ b/elf/tst-env-setuid-tunables.c
|
||||
@@ -105,10 +105,7 @@ do_test (int argc, char **argv)
|
||||
|
||||
if (ret != 0)
|
||||
exit (1);
|
||||
-
|
||||
- /* Special return code to make sure that the child executed all the way
|
||||
- through. */
|
||||
- exit (42);
|
||||
+ return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -127,18 +124,7 @@ do_test (int argc, char **argv)
|
||||
continue;
|
||||
}
|
||||
|
||||
- int status = support_capture_subprogram_self_sgid (buf);
|
||||
-
|
||||
- /* Bail out early if unsupported. */
|
||||
- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
|
||||
- return EXIT_UNSUPPORTED;
|
||||
-
|
||||
- if (WEXITSTATUS (status) != 42)
|
||||
- {
|
||||
- printf (" [%d] child failed with status %d\n", i,
|
||||
- WEXITSTATUS (status));
|
||||
- support_record_failure ();
|
||||
- }
|
||||
+ support_capture_subprogram_self_sgid (buf);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c
|
||||
index 43047c48f3ecd555..c084aa4c1a382152 100644
|
||||
--- a/elf/tst-env-setuid.c
|
||||
+++ b/elf/tst-env-setuid.c
|
||||
@@ -148,10 +148,7 @@ do_test (int argc, char **argv)
|
||||
|
||||
if (ret != 0)
|
||||
exit (1);
|
||||
-
|
||||
- /* Special return code to make sure that the child executed all the way
|
||||
- through. */
|
||||
- exit (42);
|
||||
+ return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -175,17 +172,7 @@ do_test (int argc, char **argv)
|
||||
free (profilepath);
|
||||
}
|
||||
|
||||
- int status = support_capture_subprogram_self_sgid (SETGID_CHILD);
|
||||
-
|
||||
- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
|
||||
- exit (EXIT_UNSUPPORTED);
|
||||
-
|
||||
- if (WEXITSTATUS (status) != 42)
|
||||
- {
|
||||
- printf (" child failed with status %d\n",
|
||||
- WEXITSTATUS (status));
|
||||
- support_record_failure ();
|
||||
- }
|
||||
+ support_capture_subprogram_self_sgid (SETGID_CHILD);
|
||||
|
||||
return 0;
|
||||
}
|
||||
diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c
|
||||
index cc26ed6d15803c99..cefee58d46f25ebb 100644
|
||||
--- a/stdlib/tst-secure-getenv.c
|
||||
+++ b/stdlib/tst-secure-getenv.c
|
||||
@@ -57,13 +57,7 @@ do_test (void)
|
||||
exit (1);
|
||||
}
|
||||
|
||||
- int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT);
|
||||
-
|
||||
- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
|
||||
- return EXIT_UNSUPPORTED;
|
||||
-
|
||||
- if (!WIFEXITED (status))
|
||||
- FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
|
||||
+ support_capture_subprogram_self_sgid (MAGIC_ARGUMENT);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -82,6 +76,7 @@ alternative_main (int argc, char **argv)
|
||||
if (secure_getenv ("PATH") != NULL)
|
||||
FAIL_EXIT (4, "PATH variable not filtered out\n");
|
||||
|
||||
+ support_record_failure_barrier ();
|
||||
exit (EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h
|
||||
index f2765278d920839d..8cbdca3b9dfb41ba 100644
|
||||
--- a/support/capture_subprocess.h
|
||||
+++ b/support/capture_subprocess.h
|
||||
@@ -41,10 +41,12 @@ struct support_capture_subprocess support_capture_subprocess
|
||||
struct support_capture_subprocess support_capture_subprogram
|
||||
(const char *file, char *const argv[]);
|
||||
|
||||
-/* Copy the running program into a setgid binary and run it with CHILD_ID
|
||||
- argument. If execution is successful, return the exit status of the child
|
||||
- program, otherwise return a non-zero failure exit code. */
|
||||
-int support_capture_subprogram_self_sgid (const char *child_id);
|
||||
+/* Copy the running program into a setgid binary and run it with
|
||||
+ CHILD_ID argument. If the program exits with a non-zero status,
|
||||
+ exit with that exit status (or status 1 if the program did not exit
|
||||
+ normally). If the test cannot be performed, exit with
|
||||
+ EXIT_UNSUPPORTED. */
|
||||
+void support_capture_subprogram_self_sgid (const char *child_id);
|
||||
|
||||
/* Deallocate the subprocess data captured by
|
||||
support_capture_subprocess. */
|
||||
diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
|
||||
index 1a30ae3f31f041d4..8dc95f8aa723b6bc 100644
|
||||
--- a/support/support_capture_subprocess.c
|
||||
+++ b/support/support_capture_subprocess.c
|
||||
@@ -31,6 +31,7 @@
|
||||
#include <support/xsocket.h>
|
||||
#include <support/xspawn.h>
|
||||
#include <support/support.h>
|
||||
+#include <support/temp_file.h>
|
||||
#include <support/test-driver.h>
|
||||
|
||||
static void
|
||||
@@ -112,105 +113,44 @@ support_capture_subprogram (const char *file, char *const argv[])
|
||||
/* Copies the executable into a restricted directory, so that we can
|
||||
safely make it SGID with the TARGET group ID. Then runs the
|
||||
executable. */
|
||||
-static int
|
||||
+static void
|
||||
copy_and_spawn_sgid (const char *child_id, gid_t gid)
|
||||
{
|
||||
- char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
|
||||
- test_dir, (intmax_t) getpid ());
|
||||
+ char *dirname = support_create_temp_directory ("tst-glibc-sgid-");
|
||||
char *execname = xasprintf ("%s/bin", dirname);
|
||||
- int infd = -1;
|
||||
- int outfd = -1;
|
||||
- int ret = 1, status = 1;
|
||||
-
|
||||
- TEST_VERIFY (mkdir (dirname, 0700) == 0);
|
||||
- if (support_record_failure_is_failed ())
|
||||
- goto err;
|
||||
+ add_temp_file (execname);
|
||||
|
||||
- infd = open ("/proc/self/exe", O_RDONLY);
|
||||
- if (infd < 0)
|
||||
+ if (access ("/proc/self/exe", R_OK) != 0)
|
||||
FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n");
|
||||
|
||||
- outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700);
|
||||
- TEST_VERIFY (outfd >= 0);
|
||||
- if (support_record_failure_is_failed ())
|
||||
- goto err;
|
||||
-
|
||||
- char buf[4096];
|
||||
- for (;;)
|
||||
- {
|
||||
- ssize_t rdcount = read (infd, buf, sizeof (buf));
|
||||
- TEST_VERIFY (rdcount >= 0);
|
||||
- if (support_record_failure_is_failed ())
|
||||
- goto err;
|
||||
- if (rdcount == 0)
|
||||
- break;
|
||||
- char *p = buf;
|
||||
- char *end = buf + rdcount;
|
||||
- while (p != end)
|
||||
- {
|
||||
- ssize_t wrcount = write (outfd, buf, end - p);
|
||||
- if (wrcount == 0)
|
||||
- errno = ENOSPC;
|
||||
- TEST_VERIFY (wrcount > 0);
|
||||
- if (support_record_failure_is_failed ())
|
||||
- goto err;
|
||||
- p += wrcount;
|
||||
- }
|
||||
- }
|
||||
+ support_copy_file ("/proc/self/exe", execname);
|
||||
|
||||
- bool chowned = false;
|
||||
- TEST_VERIFY ((chowned = fchown (outfd, getuid (), gid) == 0)
|
||||
- || errno == EPERM);
|
||||
- if (support_record_failure_is_failed ())
|
||||
- goto err;
|
||||
- else if (!chowned)
|
||||
- {
|
||||
- ret = 77;
|
||||
- goto err;
|
||||
- }
|
||||
+ if (chown (execname, getuid (), gid) != 0)
|
||||
+ FAIL_UNSUPPORTED ("cannot change group of \"%s\" to %jd: %m",
|
||||
+ execname, (intmax_t) gid);
|
||||
|
||||
- TEST_VERIFY (fchmod (outfd, 02750) == 0);
|
||||
- if (support_record_failure_is_failed ())
|
||||
- goto err;
|
||||
- TEST_VERIFY (close (outfd) == 0);
|
||||
- if (support_record_failure_is_failed ())
|
||||
- goto err;
|
||||
- TEST_VERIFY (close (infd) == 0);
|
||||
- if (support_record_failure_is_failed ())
|
||||
- goto err;
|
||||
+ if (chmod (execname, 02750) != 0)
|
||||
+ FAIL_UNSUPPORTED ("cannot make \"%s\" SGID: %m ", execname);
|
||||
|
||||
/* We have the binary, now spawn the subprocess. Avoid using
|
||||
support_subprogram because we only want the program exit status, not the
|
||||
contents. */
|
||||
- ret = 0;
|
||||
- infd = outfd = -1;
|
||||
|
||||
char * const args[] = {execname, (char *) child_id, NULL};
|
||||
+ int status = support_subprogram_wait (args[0], args);
|
||||
|
||||
- status = support_subprogram_wait (args[0], args);
|
||||
+ free (execname);
|
||||
+ free (dirname);
|
||||
|
||||
-err:
|
||||
- if (outfd >= 0)
|
||||
- close (outfd);
|
||||
- if (infd >= 0)
|
||||
- close (infd);
|
||||
- if (execname != NULL)
|
||||
- {
|
||||
- unlink (execname);
|
||||
- free (execname);
|
||||
- }
|
||||
- if (dirname != NULL)
|
||||
+ if (WIFEXITED (status))
|
||||
{
|
||||
- rmdir (dirname);
|
||||
- free (dirname);
|
||||
+ if (WEXITSTATUS (status) == 0)
|
||||
+ return;
|
||||
+ else
|
||||
+ exit (WEXITSTATUS (status));
|
||||
}
|
||||
-
|
||||
- if (ret == 77)
|
||||
- FAIL_UNSUPPORTED ("Failed to make sgid executable for test\n");
|
||||
- if (ret != 0)
|
||||
- FAIL_EXIT1 ("Failed to make sgid executable for test\n");
|
||||
-
|
||||
- return status;
|
||||
+ else
|
||||
+ FAIL_EXIT1 ("subprogram failed with status %d", status);
|
||||
}
|
||||
|
||||
/* Returns true if a group with NAME has been found, and writes its
|
||||
@@ -252,7 +192,7 @@ find_sgid_group (gid_t *target, const char *name)
|
||||
return ok;
|
||||
}
|
||||
|
||||
-int
|
||||
+void
|
||||
support_capture_subprogram_self_sgid (const char *child_id)
|
||||
{
|
||||
const int count = 64;
|
||||
@@ -287,7 +227,7 @@ support_capture_subprogram_self_sgid (const char *child_id)
|
||||
(intmax_t) getuid ());
|
||||
}
|
||||
|
||||
- return copy_and_spawn_sgid (child_id, target);
|
||||
+ copy_and_spawn_sgid (child_id, target);
|
||||
}
|
||||
|
||||
void
|
96
glibc.spec
96
glibc.spec
@ -145,7 +145,7 @@ Version: %{glibcversion}
|
||||
# - It allows using the Release number without the %%dist tag in the dependency
|
||||
# generator to make the generated requires interchangeable between Rawhide
|
||||
# and ELN (.elnYY < .fcXX).
|
||||
%global baserelease 39
|
||||
%global baserelease 40
|
||||
Release: %{baserelease}%{?dist}
|
||||
|
||||
# Licenses:
|
||||
@ -508,6 +508,71 @@ Patch190: glibc-RHEL-75809.patch
|
||||
Patch191: glibc-RHEL-75555.patch
|
||||
Patch192: glibc-RHEL-75809-2.patch
|
||||
Patch193: glibc-RHEL-75809-3.patch
|
||||
Patch194: glibc-upstream-2.39-147.patch
|
||||
Patch195: glibc-upstream-2.39-148.patch
|
||||
Patch196: glibc-upstream-2.39-149.patch
|
||||
Patch197: glibc-upstream-2.39-150.patch
|
||||
Patch198: glibc-upstream-2.39-151.patch
|
||||
Patch199: glibc-upstream-2.39-152.patch
|
||||
Patch200: glibc-upstream-2.39-153.patch
|
||||
Patch201: glibc-upstream-2.39-154.patch
|
||||
Patch202: glibc-upstream-2.39-155.patch
|
||||
Patch203: glibc-upstream-2.39-156.patch
|
||||
Patch204: glibc-upstream-2.39-157.patch
|
||||
Patch205: glibc-upstream-2.39-158.patch
|
||||
Patch206: glibc-upstream-2.39-159.patch
|
||||
Patch207: glibc-upstream-2.39-160.patch
|
||||
Patch208: glibc-upstream-2.39-161.patch
|
||||
Patch209: glibc-upstream-2.39-162.patch
|
||||
Patch210: glibc-upstream-2.39-163.patch
|
||||
Patch211: glibc-upstream-2.39-164.patch
|
||||
Patch212: glibc-upstream-2.39-165.patch
|
||||
Patch213: glibc-upstream-2.39-166.patch
|
||||
Patch214: glibc-upstream-2.39-167.patch
|
||||
Patch215: glibc-upstream-2.39-168.patch
|
||||
Patch216: glibc-upstream-2.39-169.patch
|
||||
Patch217: glibc-upstream-2.39-170.patch
|
||||
Patch218: glibc-upstream-2.39-171.patch
|
||||
Patch219: glibc-upstream-2.39-172.patch
|
||||
Patch220: glibc-upstream-2.39-173.patch
|
||||
Patch221: glibc-upstream-2.39-174.patch
|
||||
Patch222: glibc-upstream-2.39-175.patch
|
||||
Patch223: glibc-upstream-2.39-176.patch
|
||||
Patch224: glibc-upstream-2.39-177.patch
|
||||
Patch225: glibc-upstream-2.39-178.patch
|
||||
Patch226: glibc-upstream-2.39-179.patch
|
||||
Patch227: glibc-upstream-2.39-180.patch
|
||||
Patch228: glibc-upstream-2.39-181.patch
|
||||
Patch229: glibc-upstream-2.39-182.patch
|
||||
Patch230: glibc-upstream-2.39-183.patch
|
||||
Patch231: glibc-upstream-2.39-184.patch
|
||||
Patch232: glibc-upstream-2.39-185.patch
|
||||
Patch233: glibc-upstream-2.39-186.patch
|
||||
Patch234: glibc-upstream-2.39-187.patch
|
||||
Patch235: glibc-upstream-2.39-188.patch
|
||||
Patch236: glibc-upstream-2.39-189.patch
|
||||
Patch237: glibc-upstream-2.39-190.patch
|
||||
Patch238: glibc-upstream-2.39-191.patch
|
||||
Patch239: glibc-upstream-2.39-192.patch
|
||||
Patch240: glibc-upstream-2.39-193.patch
|
||||
Patch241: glibc-upstream-2.39-194.patch
|
||||
Patch242: glibc-upstream-2.39-195.patch
|
||||
Patch243: glibc-upstream-2.39-196.patch
|
||||
Patch244: glibc-upstream-2.39-197.patch
|
||||
Patch245: glibc-upstream-2.39-198.patch
|
||||
Patch246: glibc-upstream-2.39-199.patch
|
||||
Patch247: glibc-upstream-2.39-200.patch
|
||||
Patch248: glibc-upstream-2.39-201.patch
|
||||
Patch249: glibc-upstream-2.39-202.patch
|
||||
Patch250: glibc-upstream-2.39-203.patch
|
||||
Patch251: glibc-upstream-2.39-204.patch
|
||||
Patch252: glibc-upstream-2.39-205.patch
|
||||
Patch253: glibc-upstream-2.39-206.patch
|
||||
Patch254: glibc-upstream-2.39-207.patch
|
||||
Patch255: glibc-upstream-2.39-208.patch
|
||||
Patch256: glibc-upstream-2.39-209.patch
|
||||
Patch257: glibc-upstream-2.39-210.patch
|
||||
Patch258: glibc-upstream-2.39-211.patch
|
||||
|
||||
##############################################################################
|
||||
# Continued list of core "glibc" package information:
|
||||
@ -2505,6 +2570,35 @@ update_gconv_modules_cache ()
|
||||
%endif
|
||||
|
||||
%changelog
|
||||
* Tue Jun 17 2025 Arjun Shankar <arjun@redhat.com> - 2.39-40
|
||||
- Sync with upstream branch release/2.39/master (RHEL-87416)
|
||||
- Upstream commit: cff1042cceec3502269947e96cf7023451af22f3
|
||||
- CVE-2025-5702: Vector register overwrite bug in glibc (RHEL-95485)
|
||||
- elf: Keep using minimal malloc after early DTV resize (RHEL-71923)
|
||||
- libio: Fix a deadlock after fork in popen (RHEL-86433)
|
||||
- Linux: Switch back to assembly syscall wrapper for prctl (RHEL-82286)
|
||||
- Fix missed wakeup in POSIX thread condition variables (RHEL-82285)
|
||||
- x86: Detect Intel Diamond Rapids
|
||||
- x86: Handle unknown Intel processor with default tuning
|
||||
- x86: Add ARL/PTL/CWF model detection support
|
||||
- x86: Optimize xstate size calculation
|
||||
- x86: Support and fixes for separate non-temporal tunable for memset
|
||||
- x86: Fix a crash when running with XSAVEC disabled via tunables (RHEL-84837)
|
||||
- x86_64: Add tanh, sinh, and atanh with FMA
|
||||
- x86-64: Exclude FMA4 IFUNC functions for -mapxf
|
||||
- nptl: clear the whole rseq area before registration
|
||||
- math: Improve layout of exp/exp10 data
|
||||
- AArch64: Add SVE memset
|
||||
- math: Improve layout of expf data
|
||||
- AArch64: Remove zva_128 from memset
|
||||
- AArch64: Optimize memset
|
||||
- AArch64: Improve generic strlen
|
||||
- AArch64: Improve codegen for SVE tans and logs
|
||||
- AArch64: Improve codegen in AdvSIMD logs, logf function family, and atan(2)(f)
|
||||
- AArch64: Simplify rounding-multiply pattern in several AdvSIMD routines
|
||||
- aarch64: Avoid redundant MOVs in AdvSIMD F32 logs
|
||||
- aarch64: Fix AdvSIMD libmvec routines for big-endian
|
||||
|
||||
* Tue Jun 17 2025 Florian Weimer <fweimer@redhat.com> - 2.39-39
|
||||
- langpacks: Use symlinks for LC_NAME, LC_NUMERIC files if possible (RHEL-97433)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user