This combines the following upstream commits: e45af510bc AArch64: Fix instability in AdvSIMD sinh 6c22823da5 AArch64: Fix instability in AdvSIMD tan aebaeb2c33 AArch64: Update math-vector-fortran.h e20ca759af AArch64: add optimised strspn/strcspn aac077645a AArch64: Fix SVE powf routine [BZ #33299] 1e3d1ddf97 AArch64: Optimize SVE exp functions dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics 6849c5b791 AArch64: Improve codegen SVE log1p helper 09795c5612 AArch64: Fix builderror with GCC 12.1/12.2 aa18367c11 AArch64: Improve enabling of SVE for libmvec 691edbdf77 aarch64: fix unwinding in longjmp 4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612) cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines 8f0e7fe61e Aarch64: Improve codegen in SVE asinh c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline f5ff34cb3c AArch64: Improve codegen for SVE erfcf 0b195651db AArch64: Improve codegen for SVE pow 95e807209b AArch64: Improve codegen for SVE powf d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS f86b4cf875 AArch64: Improve codegen in SVE expm1f and users 140b985e5a AArch64: Improve codegen in AdvSIMD asinh 91c1fadba3 AArch64: Improve codegen for SVE log1pf users cff9648d0b AArch64: Improve codegen of AdvSIMD expf family 569cfaaf49 AArch64: Improve codegen in AdvSIMD pow ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper 13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper 2d82d781a5 AArch64: Remove SVE erf and erfc tables 1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc 7b8c134b54 AArch64: Improve codegen in SVE expf & related routines a15b1394b5 AArch64: Improve codegen in SVE F32 logs 5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper 7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper 0fed0b250f aarch64/fpu: Add vector variants of pow 
75207bde68 aarch64/fpu: Add vector variants of cbrt 157f89fa3d aarch64/fpu: Add vector variants of hypot 90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian 87cb1dfcd6 aarch64/fpu: Add vector variants of erfc 3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh eedbbca0bf aarch64/fpu: Add vector variants of sinh 8b67920528 aarch64/fpu: Add vector variants of atanh 81406ea3c5 aarch64/fpu: Add vector variants of asinh b09fee1d21 aarch64/fpu: Add vector variants of acosh bdb5705b7b aarch64/fpu: Add vector variants of cosh cb5d84f1f8 aarch64/fpu: Add vector variants of erf Resolves: RHEL-118273
175 lines
4.3 KiB
Diff
175 lines
4.3 KiB
Diff
commit e20ca759af46fbb7eae20c52b857e7636eb50e1b
|
|
Author: remph <lhr@disroot.org>
|
|
Date: Thu Sep 4 12:53:56 2025 +0000
|
|
|
|
AArch64: add optimised strspn/strcspn
|
|
|
|
Requires Neon (aka. Advanced SIMD). Looks up 16 characters at a time,
|
|
for a 2-3x performance improvement, and a ~30% speedup on the strtok &
|
|
strsep benchtests, as tested on Cortex-A53 and Cortex-A72.
|
|
|
|
Signed-off-by: remph <lhr@disroot.org>
|
|
|
|
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
|
|
|
|
diff --git a/sysdeps/aarch64/strcspn.S b/sysdeps/aarch64/strcspn.S
|
|
new file mode 100644
|
|
index 0000000000000000..f2a69e9856cba04c
|
|
--- /dev/null
|
|
+++ b/sysdeps/aarch64/strcspn.S
|
|
@@ -0,0 +1,2 @@
|
|
+#define USE_AS_STRCSPN 1
|
|
+#include "strspn.S"
|
|
diff --git a/sysdeps/aarch64/strspn.S b/sysdeps/aarch64/strspn.S
|
|
new file mode 100644
|
|
index 0000000000000000..edbb705b15991e39
|
|
--- /dev/null
|
|
+++ b/sysdeps/aarch64/strspn.S
|
|
@@ -0,0 +1,146 @@
|
|
+/* Copyright (C) 2025 Free Software Foundation, Inc.
|
|
+
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library. If not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include <sysdep.h>
|
|
+
|
|
+#ifdef USE_AS_STRCSPN
|
|
+# define STRSPN strcspn
|
|
+# define SBT orr /* SBT -- `set bit' */
|
|
+#else
|
|
+# define STRSPN strspn
|
|
+# define SBT bic
|
|
+#endif
|
|
+
|
|
+#ifdef __AARCH64EB__
|
|
+# define LS_FW lsl
|
|
+# define LS_BK lsr
|
|
+#else
|
|
+# define LS_FW lsr
|
|
+# define LS_BK lsl
|
|
+#endif
|
|
+
|
|
+#define og_s x0
|
|
+#define set x1 /* ACCEPT for strspn, REJECT for strcspn */
|
|
+
|
|
+#define byte_i x3
|
|
+#define bits_i x4
|
|
+#define one x6
|
|
+
|
|
+#define syndrome x5
|
|
+#define s x6
|
|
+
|
|
+#define vbyte_i v1.16b
|
|
+#define vbits_i v2.16b
|
|
+#define table v4.16b-v5.16b
|
|
+#define table_a v4
|
|
+#define table_b v5
|
|
+#define sevens v7.16b
|
|
+
|
|
+ENTRY(STRSPN)
|
|
+ ldrb w2, [set]
|
|
+ cbz w2, L(early)
|
|
+#ifdef USE_AS_STRCSPN
|
|
+ ldrb w3, [set, 1]
|
|
+ cbz w3, L(early)
|
|
+#endif
|
|
+
|
|
+ /* Table has ones for bytes to reject and zeros for bytes to accept */
|
|
+ mov one, 1
|
|
+#ifdef USE_AS_STRCSPN
|
|
+ stp one, xzr, [sp, -32]!
|
|
+ .cfi_def_cfa_offset 32
|
|
+ stp xzr, xzr, [sp, 16]
|
|
+#else
|
|
+ mvni v0.4s, 0
|
|
+ stp q0, q0, [sp, -32]!
|
|
+ .cfi_def_cfa_offset 32
|
|
+#endif
|
|
+
|
|
+ .p2align 4
|
|
+L(fill_table):
|
|
+ lsr byte_i, x2, 6 /* x2 / 64 */
|
|
+ lsl bits_i, one, x2 /* x2 % 64 implicitly */
|
|
+ ldrb w2, [set, 1]!
|
|
+ ldr x5, [sp, byte_i, lsl 3]
|
|
+ SBT x5, x5, bits_i
|
|
+ str x5, [sp, byte_i, lsl 3]
|
|
+ cbnz w2, L(fill_table)
|
|
+
|
|
+ ld1 {table_a.2d-table_b.2d}, [sp], 32
|
|
+ .cfi_def_cfa_offset 0
|
|
+ ubfiz syndrome, og_s, 2, 4 /* Bottom 4 bits, times 4 to count nibbles */
|
|
+ and s, og_s, -16 /* Round S down to 16-byte boundary */
|
|
+ movi sevens, 7
|
|
+ /* Bias the syndrome to mask off these nibbles */
|
|
+ mov x8, -1
|
|
+ LS_BK syndrome, x8, syndrome
|
|
+ mvn syndrome, syndrome
|
|
+
|
|
+L(loop):
|
|
+ ldr q0, [s], 16
|
|
+ ushr vbyte_i, v0.16b, 3
|
|
+ bic vbits_i, sevens, v0.16b
|
|
+ tbl v0.16b, {table}, vbyte_i
|
|
+ /* Bring the relevant bit to the MSB of each byte */
|
|
+ sshl v0.16b, v0.16b, vbits_i
|
|
+ /* Set every bit of each byte to its MSB */
|
|
+ cmlt v0.16b, v0.16b, 0
|
|
+ /* Bytes->nibbles */
|
|
+ shrn v0.8b, v0.8h, 4
|
|
+ fmov x2, d0
|
|
+ bic syndrome, x2, syndrome
|
|
+ cbz syndrome, L(loop)
|
|
+
|
|
+#ifndef __AARCH64EB__
|
|
+ rbit syndrome, syndrome
|
|
+#endif
|
|
+ sub s, s, 16
|
|
+ clz syndrome, syndrome
|
|
+ sub x0, s, og_s
|
|
+ add x0, x0, syndrome, lsr 2
|
|
+ ret
|
|
+
|
|
+ .balign 8 /* For strspn, which has only 2 instructions here */
|
|
+L(early):
|
|
+#ifdef USE_AS_STRCSPN
|
|
+ /* strlen(set) < 2: call strchrnul(s, *set) and get its offset from S */
|
|
+ stp fp, lr, [sp, -32]!
|
|
+ .cfi_def_cfa_offset 32
|
|
+ .cfi_offset fp, -32
|
|
+ .cfi_offset lr, -24
|
|
+ str x19, [sp, 16]
|
|
+ .cfi_offset 19, -16
|
|
+ mov w1, w2
|
|
+ mov fp, sp
|
|
+ mov x19, x0
|
|
+ bl __strchrnul
|
|
+ sub x0, x0, x19
|
|
+ ldr x19, [sp, 16]
|
|
+ ldp fp, lr, [sp], 32
|
|
+ .cfi_restore lr
|
|
+ .cfi_restore fp
|
|
+ .cfi_restore 19
|
|
+ .cfi_def_cfa_offset 0
|
|
+#else
|
|
+ mov w0, 0
|
|
+#endif
|
|
+ ret
|
|
+END(STRSPN)
|
|
+
|
|
+#undef set
|
|
+libc_hidden_def(STRSPN)
|