glibc/glibc-RHEL-118273-42.patch
Yuki Inoguchi 9dd92cac18 aarch64: Add GLIBC_2.40 vector functions and performance fixes (RHEL-118273)
This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix build error with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273
2025-12-05 16:24:54 +01:00


commit e20ca759af46fbb7eae20c52b857e7636eb50e1b
Author: remph <lhr@disroot.org>
Date: Thu Sep 4 12:53:56 2025 +0000
AArch64: add optimised strspn/strcspn
Requires Neon (aka. Advanced SIMD). Looks up 16 characters at a time,
for a 2-3x performance improvement, and a ~30% speedup on the strtok &
strsep benchtests, as tested on Cortex-A{53,72}.
Signed-off-by: remph <lhr@disroot.org>
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
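
The approach: build a 256-bit membership table (one bit per possible byte
value) from SET, then test bytes of S against it, 16 at a time, with a Neon
TBL lookup. A minimal scalar C sketch of the same idea follows; the name
bitmap_strspn is hypothetical, not part of the patch:

#include <stdint.h>
#include <stddef.h>

/* Scalar model of the table used below: one bit per byte value, with
   set bits marking bytes that stop the scan.  Starting from all-ones
   and clearing the bits of bytes in SET leaves NUL's bit set, so the
   scan loop needs no separate terminator check -- the same trick the
   assembly uses.  */
static size_t
bitmap_strspn (const char *s, const char *set)
{
  uint64_t table[4] = { ~0ULL, ~0ULL, ~0ULL, ~0ULL };

  /* Clear the bit of every byte value that appears in SET.  */
  for (const unsigned char *p = (const unsigned char *) set; *p != '\0'; p++)
    table[*p >> 6] &= ~((uint64_t) 1 << (*p & 63));

  /* Count leading bytes of S whose bit is still clear (accepted).  */
  size_t i = 0;
  for (const unsigned char *p = (const unsigned char *) s;
       !((table[p[i] >> 6] >> (p[i] & 63)) & 1);
       i++)
    ;
  return i;
}

For strcspn the table is filled the other way round: it starts almost empty,
with only NUL's bit set, and the SBT macro (orr instead of bic) sets a bit
for each byte of SET, so both variants share the same scan loop.
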
diff --git a/sysdeps/aarch64/strcspn.S b/sysdeps/aarch64/strcspn.S
new file mode 100644
index 0000000000000000..f2a69e9856cba04c
--- /dev/null
+++ b/sysdeps/aarch64/strcspn.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCSPN 1
+#include "strspn.S"
diff --git a/sysdeps/aarch64/strspn.S b/sysdeps/aarch64/strspn.S
new file mode 100644
index 0000000000000000..edbb705b15991e39
--- /dev/null
+++ b/sysdeps/aarch64/strspn.S
@@ -0,0 +1,146 @@
+/* Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STRCSPN
+# define STRSPN strcspn
+# define SBT orr /* SBT -- `set bit' */
+#else
+# define STRSPN strspn
+# define SBT bic
+#endif
+
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+# define LS_BK lsr
+#else
+# define LS_FW lsr
+# define LS_BK lsl
+#endif
+
+#define og_s x0
+#define set x1 /* ACCEPT for strspn, REJECT for strcspn */
+
+#define byte_i x3
+#define bits_i x4
+#define one x6
+
+#define syndrome x5
+#define s x6
+
+#define vbyte_i v1.16b
+#define vbits_i v2.16b
+#define table v4.16b-v5.16b
+#define table_a v4
+#define table_b v5
+#define sevens v7.16b
+
+ENTRY(STRSPN)
+ ldrb w2, [set]
+ cbz w2, L(early)
+#ifdef USE_AS_STRCSPN
+ ldrb w3, [set, 1]
+ cbz w3, L(early)
+#endif
+
+ /* Table has ones for bytes to reject and zeros for bytes to accept */
+ mov one, 1
+#ifdef USE_AS_STRCSPN
+ stp one, xzr, [sp, -32]!
+ .cfi_def_cfa_offset 32
+ stp xzr, xzr, [sp, 16]
+#else
+ mvni v0.4s, 0
+ stp q0, q0, [sp, -32]!
+ .cfi_def_cfa_offset 32
+#endif
+
+ .p2align 4
+L(fill_table):
+ lsr byte_i, x2, 6 /* x2 / 64 */
+ lsl bits_i, one, x2 /* x2 % 64 implicitly */
+ ldrb w2, [set, 1]!
+ ldr x5, [sp, byte_i, lsl 3]
+ SBT x5, x5, bits_i
+ str x5, [sp, byte_i, lsl 3]
+ cbnz w2, L(fill_table)
+
+ ld1 {table_a.2d-table_b.2d}, [sp], 32
+ .cfi_def_cfa_offset 0
+ ubfiz syndrome, og_s, 2, 4 /* Bottom 4 bits, times 4 to count nibbles */
+ and s, og_s, -16 /* Round S down to 16-byte boundary */
+ movi sevens, 7
+ /* Bias the syndrome to mask off these nibbles */
+ mov x8, -1
+ LS_BK syndrome, x8, syndrome
+ mvn syndrome, syndrome
+
+L(loop):
+ ldr q0, [s], 16
+ ushr vbyte_i, v0.16b, 3
+ bic vbits_i, sevens, v0.16b
+ tbl v0.16b, {table}, vbyte_i
+ /* Bring the relevant bit to the MSB of each byte */
+ sshl v0.16b, v0.16b, vbits_i
+ /* Set every bit of each byte to its MSB */
+ cmlt v0.16b, v0.16b, 0
+ /* Bytes->nibbles */
+ shrn v0.8b, v0.8h, 4
+ fmov x2, d0
+ bic syndrome, x2, syndrome
+ cbz syndrome, L(loop)
+
+#ifndef __AARCH64EB__
+ rbit syndrome, syndrome
+#endif
+ sub s, s, 16
+ clz syndrome, syndrome
+ sub x0, s, og_s
+ add x0, x0, syndrome, lsr 2
+ ret
+
+ .balign 8 /* For strspn, which has only 2 instructions here */
+L(early):
+#ifdef USE_AS_STRCSPN
+ /* strlen(set) < 2: call strchrnul(s, *set) and get its offset from S */
+ stp fp, lr, [sp, -32]!
+ .cfi_def_cfa_offset 32
+ .cfi_offset fp, -32
+ .cfi_offset lr, -24
+ str x19, [sp, 16]
+ .cfi_offset 19, -16
+ mov w1, w2
+ mov fp, sp
+ mov x19, x0
+ bl __strchrnul
+ sub x0, x0, x19
+ ldr x19, [sp, 16]
+ ldp fp, lr, [sp], 32
+ .cfi_restore lr
+ .cfi_restore fp
+ .cfi_restore 19
+ .cfi_def_cfa_offset 0
+#else
+ mov w0, 0
+#endif
+ ret
+END(STRSPN)
+
+#undef set
+libc_hidden_def(STRSPN)
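
The syndrome extraction at the end of L(loop) compresses the 16 per-byte test
results into a 64-bit scalar: cmlt widens each result to 0x00/0xFF, shrn #4
narrows each byte to a nibble (4 identical bits per input byte), and rbit+clz
(a count-trailing-zeros on little-endian) locates the first stop byte, with
lsr 2 converting the bit index back to a byte offset. A scalar model of the
little-endian path, with the hypothetical helper name first_stop_index and
TABLE as built by L(fill_table):

#include <stdint.h>

/* Each of the 16 bytes in a block maps to one 4-bit field of a 64-bit
   syndrome; the first stop byte is then at ctz (syndrome) / 4, which
   mirrors the rbit+clz+lsr-2 sequence above.  Returns 16 when the
   whole block passes, where the assembly instead loops.  */
static unsigned
first_stop_index (const unsigned char block[16], const uint64_t table[4])
{
  uint64_t syndrome = 0;

  for (int i = 0; i < 16; i++)
    if ((table[block[i] >> 6] >> (block[i] & 63)) & 1)
      syndrome |= (uint64_t) 0xf << (4 * i);

  return syndrome == 0 ? 16 : __builtin_ctzll (syndrome) / 4;
}

The ubfiz/LS_BK sequence before the loop builds a mask of the nibbles that
precede the rounded-down start of S, so the first, aligned load can read
bytes before the string without them registering in the syndrome; once the
first iteration passes, the syndrome register is zero and the bic becomes a
no-op for the remaining aligned blocks.
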