aarch64: Add GLIBC_2.40 vector functions and performance fixes (RHEL-118273)

This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix build error with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273
Author: Yuki Inoguchi
Date: 2025-11-12 13:08:45 -05:00
Committed-by: Florian Weimer
Parent: 7361fbbfab
Commit: 9dd92cac18

45 changed files with 29319 additions and 0 deletions
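
The new symbols can be reached through compiler auto-vectorization (GCC maps scalar math calls to the _ZGV* names via the declarations in bits/math-vector.h when vectorizing with fast-math semantics), or by calling the vector-PCS entry points directly. Below is a minimal sketch of a direct call; the file name and test values are illustrative, not part of this change, and it assumes GCC on aarch64 linking against a glibc that provides the GLIBC_2.40 libmvec symbols (gcc -O2 vec_hypot_demo.c -lmvec -lm):

/* vec_hypot_demo.c - hypothetical demo, not part of the patch set.  */
#include <arm_neon.h>
#include <stdio.h>

/* Vector-PCS prototype of the AdvSIMD double-precision hypot added here.  */
__attribute__ ((aarch64_vector_pcs))
float64x2_t _ZGVnN2vv_hypot (float64x2_t, float64x2_t);

int
main (void)
{
  float64x2_t x = { 3.0, 5.0 };
  float64x2_t y = { 4.0, 12.0 };
  float64x2_t r = _ZGVnN2vv_hypot (x, y);
  printf ("%g %g\n", r[0], r[1]); /* Expect 5 and 13.  */
  return 0;
}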

glibc-RHEL-118273-1.patch (new file, 4742 lines; diff suppressed because it is too large)

glibc-RHEL-118273-10.patch (new file, 514 lines)

commit 157f89fa3d616729c8d7797168a9b3eaaa6ebf6e
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Apr 30 13:49:58 2024 +0100
aarch64/fpu: Add vector variants of hypot
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index e8af35099d7b9f8f..06657782a1ee7106 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -13,6 +13,7 @@ libmvec-supported-funcs = acos \
exp10 \
exp2 \
expm1 \
+ hypot \
log \
log10 \
log1p \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 3cb1b82bd2785a4b..aedae9457b148983 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -109,6 +109,11 @@ libmvec {
_ZGVnN4v_erfcf;
_ZGVsMxv_erfc;
_ZGVsMxv_erfcf;
+ _ZGVnN4vv_hypotf;
+ _ZGVnN2vv_hypotf;
+ _ZGVnN2vv_hypot;
+ _ZGVsMxvv_hypotf;
+ _ZGVsMxvv_hypot;
_ZGVnN2v_sinh;
_ZGVnN2v_sinhf;
_ZGVnN4v_sinhf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 383c4369729a3452..a8889a92fd041585 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -31,6 +31,7 @@ libmvec_hidden_proto (V_NAME_F1(exp10));
libmvec_hidden_proto (V_NAME_F1(exp2));
libmvec_hidden_proto (V_NAME_F1(exp));
libmvec_hidden_proto (V_NAME_F1(expm1));
+libmvec_hidden_proto (V_NAME_F2(hypot));
libmvec_hidden_proto (V_NAME_F1(log10));
libmvec_hidden_proto (V_NAME_F1(log1p));
libmvec_hidden_proto (V_NAME_F1(log2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index e29b2d1c09273969..ca3017733959702f 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -89,6 +89,10 @@
# define __DECL_SIMD_expm1 __DECL_SIMD_aarch64
# undef __DECL_SIMD_expm1f
# define __DECL_SIMD_expm1f __DECL_SIMD_aarch64
+# undef __DECL_SIMD_hypot
+# define __DECL_SIMD_hypot __DECL_SIMD_aarch64
+# undef __DECL_SIMD_hypotf
+# define __DECL_SIMD_hypotf __DECL_SIMD_aarch64
# undef __DECL_SIMD_log
# define __DECL_SIMD_log __DECL_SIMD_aarch64
# undef __DECL_SIMD_logf
@@ -162,6 +166,7 @@ __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
@@ -186,6 +191,7 @@ __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
@@ -215,6 +221,7 @@ __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxvv_hypotf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
@@ -239,6 +246,7 @@ __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxvv_hypot (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/hypot_advsimd.c b/sysdeps/aarch64/fpu/hypot_advsimd.c
new file mode 100644
index 0000000000000000..e4e279fa0c362336
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypot_advsimd.c
@@ -0,0 +1,97 @@
+/* Double-precision vector (Advanced SIMD) hypot function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+ uint64x2_t tiny_bound, thres;
+} data = {
+ .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */
+ .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
+};
+#else
+static const struct data
+{
+ uint64x2_t tiny_bound;
+ uint32x4_t thres;
+} data = {
+ .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */
+ .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
+};
+#endif
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum,
+ uint32x2_t special)
+{
+ return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special));
+}
+
+/* Vector implementation of double-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222)
+ got 0x1.6a1b19400964ep-204
+ want 0x1.6a1b19400964dp-204. */
+#if WANT_SIMD_EXCEPT
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ float64x2_t ay = vabsq_f64 (y);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (ax);
+ uint64x2_t iy = vreinterpretq_u64_f64 (ay);
+
+ /* Extreme values, NaNs, and infinities should be handled by the scalar
+ fallback for correct flag handling. */
+ uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres);
+ uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres);
+ ax = v_zerofy_f64 (ax, specialx);
+ ay = v_zerofy_f64 (ay, specialy);
+ uint32x2_t special = vaddhn_u64 (specialx, specialy);
+
+ float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay);
+
+ if (__glibc_unlikely (v_any_u32h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f64 (sqsum);
+}
+#else
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y);
+
+ uint32x2_t special = vcge_u32 (
+ vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
+ vget_low_u32 (d->thres));
+
+ if (__glibc_unlikely (v_any_u32h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f64 (sqsum);
+}
+#endif
diff --git a/sysdeps/aarch64/fpu/hypot_sve.c b/sysdeps/aarch64/fpu/hypot_sve.c
new file mode 100644
index 0000000000000000..74417040acb2f32f
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypot_sve.c
@@ -0,0 +1,54 @@
+/* Double-precision vector (SVE) hypot function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+static const struct data
+{
+ uint64_t tiny_bound, thres;
+} data = {
+ .tiny_bound = 0x0c80000000000000, /* asuint (0x1p-102). */
+ .thres = 0x7300000000000000, /* asuint (inf) - tiny_bound. */
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t sqsum, svfloat64_t x, svfloat64_t y, svbool_t pg,
+ svbool_t special)
+{
+ return sv_call2_f64 (hypot, x, y, svsqrt_x (pg, sqsum), special);
+}
+
+/* SVE implementation of double-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVsMxvv_hypot (-0x1.6a22d0412cdd3p+352, 0x1.d3d89bd66fb1ap+330)
+ got 0x1.6a22d0412cfp+352
+ want 0x1.6a22d0412cf01p+352. */
+svfloat64_t SV_NAME_D2 (hypot) (svfloat64_t x, svfloat64_t y, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y);
+
+ svbool_t special = svcmpge (
+ pg, svsub_x (pg, svreinterpret_u64 (sqsum), d->tiny_bound), d->thres);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (sqsum, x, y, pg, special);
+ return svsqrt_x (pg, sqsum);
+}
diff --git a/sysdeps/aarch64/fpu/hypotf_advsimd.c b/sysdeps/aarch64/fpu/hypotf_advsimd.c
new file mode 100644
index 0000000000000000..34818b021abce1b7
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypotf_advsimd.c
@@ -0,0 +1,98 @@
+/* Single-precision vector (Advanced SIMD) hypot function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+ uint32x4_t tiny_bound, thres;
+} data = {
+ .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */
+ .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
+};
+#else
+static const struct data
+{
+ uint32x4_t tiny_bound;
+ uint16x8_t thres;
+} data = {
+ .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */
+ .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
+};
+#endif
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
+ uint16x4_t special)
+{
+ return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special));
+}
+
+/* Vector implementation of single-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13
+ want 0x1.6a41dp-13. */
+#if WANT_SIMD_EXCEPT
+
+float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ay = vabsq_f32 (y);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (ax);
+ uint32x4_t iy = vreinterpretq_u32_f32 (ay);
+
+ /* Extreme values, NaNs, and infinities should be handled by the scalar
+ fallback for correct flag handling. */
+ uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres);
+ uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres);
+ ax = v_zerofy_f32 (ax, specialx);
+ ay = v_zerofy_f32 (ay, specialy);
+ uint16x4_t special = vaddhn_u32 (specialx, specialy);
+
+ float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay);
+
+ if (__glibc_unlikely (v_any_u16h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f32 (sqsum);
+}
+#else
+
+float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);
+
+ uint16x4_t special = vcge_u16 (
+ vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
+ vget_low_u16 (d->thres));
+
+ if (__glibc_unlikely (v_any_u16h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f32 (sqsum);
+}
+#endif
+libmvec_hidden_def (V_NAME_F2 (hypot))
+HALF_WIDTH_ALIAS_F2(hypot)
diff --git a/sysdeps/aarch64/fpu/hypotf_sve.c b/sysdeps/aarch64/fpu/hypotf_sve.c
new file mode 100644
index 0000000000000000..3a403de66eb091f4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypotf_sve.c
@@ -0,0 +1,48 @@
+/* Single-precision vector (SVE) hypot function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+#define TinyBound 0x0c800000 /* asuint (0x1p-102). */
+#define Thres 0x73000000 /* 0x7f800000 - TinyBound. */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t sqsum, svfloat32_t x, svfloat32_t y, svbool_t pg,
+ svbool_t special)
+{
+ return sv_call2_f32 (hypotf, x, y, svsqrt_x (pg, sqsum), special);
+}
+
+/* SVE implementation of single-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVsMxvv_hypotf (0x1.6a213cp-19, -0x1.32b982p-26) got 0x1.6a2346p-19
+ want 0x1.6a2344p-19. */
+svfloat32_t SV_NAME_F2 (hypot) (svfloat32_t x, svfloat32_t y,
+ const svbool_t pg)
+{
+ svfloat32_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y);
+
+ svbool_t special = svcmpge (
+ pg, svsub_x (pg, svreinterpret_u32 (sqsum), TinyBound), Thres);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (sqsum, x, y, pg, special);
+
+ return svsqrt_x (pg, sqsum);
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index f2d8714075ab99b8..417125be476cd75f 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -38,6 +38,7 @@ VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1)
+VPCS_VECTOR_WRAPPER_ff (hypot_advsimd, _ZGVnN2vv_hypot)
VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 37873d5e432ae9e8..31ebf18705f68856 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -57,6 +57,7 @@ SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1)
+SVE_VECTOR_WRAPPER_ff (hypot_sve, _ZGVsMxvv_hypot)
SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 08e33115b9dc6f5e..dab0f1cfcb79a305 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -38,6 +38,7 @@ VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f)
+VPCS_VECTOR_WRAPPER_ff (hypotf_advsimd, _ZGVnN4vv_hypotf)
VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 025daa662efd6f7f..2aa6cbcc28d69cf8 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -57,6 +57,7 @@ SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f)
+SVE_VECTOR_WRAPPER_ff (hypotf_sve, _ZGVsMxvv_hypotf)
SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 055da83d639a2430..17723d0c9e2dfcf5 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1174,10 +1174,18 @@ double: 1
float: 1
ldouble: 1
+Function: "hypot_advsimd":
+double: 1
+float: 1
+
Function: "hypot_downward":
double: 1
ldouble: 1
+Function: "hypot_sve":
+double: 1
+float: 1
+
Function: "hypot_towardzero":
double: 1
ldouble: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 26c3fbf18b2f12a9..1184374efd25cfa6 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -89,6 +89,8 @@ GLIBC_2.40 _ZGVnN2v_sinh F
GLIBC_2.40 _ZGVnN2v_sinhf F
GLIBC_2.40 _ZGVnN2v_tanh F
GLIBC_2.40 _ZGVnN2v_tanhf F
+GLIBC_2.40 _ZGVnN2vv_hypot F
+GLIBC_2.40 _ZGVnN2vv_hypotf F
GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_atanhf F
@@ -97,6 +99,7 @@ GLIBC_2.40 _ZGVnN4v_erfcf F
GLIBC_2.40 _ZGVnN4v_erff F
GLIBC_2.40 _ZGVnN4v_sinhf F
GLIBC_2.40 _ZGVnN4v_tanhf F
+GLIBC_2.40 _ZGVnN4vv_hypotf F
GLIBC_2.40 _ZGVsMxv_acosh F
GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_asinh F
@@ -113,3 +116,5 @@ GLIBC_2.40 _ZGVsMxv_sinh F
GLIBC_2.40 _ZGVsMxv_sinhf F
GLIBC_2.40 _ZGVsMxv_tanh F
GLIBC_2.40 _ZGVsMxv_tanhf F
+GLIBC_2.40 _ZGVsMxvv_hypot F
+GLIBC_2.40 _ZGVsMxvv_hypotf F
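
The core of both hypot variants in this patch is a single unsigned compare that routes underflow-prone, overflowing, infinite, and NaN inputs to the scalar fallback, while the common case takes a bare square root. A scalar sketch of that check, reusing the constants from hypot_sve.c above — illustrative only, with the vector lane handling stripped out:

#include <math.h>
#include <stdint.h>
#include <string.h>

static uint64_t
asuint64 (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

static double
hypot_sketch (double x, double y)
{
  double sqsum = fma (x, x, y * y);               /* x*x + y*y, fused.  */
  const uint64_t tiny_bound = 0x0c80000000000000; /* From hypot_sve.c.  */
  const uint64_t thres = 0x7300000000000000;
  /* One unsigned compare catches both tails: values below tiny_bound
     wrap around to huge numbers, values up at inf/NaN exceed
     tiny_bound + thres.  */
  if (asuint64 (sqsum) - tiny_bound >= thres)
    return hypot (x, y);                          /* Scalar fallback.  */
  return sqrt (sqsum);
}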

glibc-RHEL-118273-11.patch (new file, 715 lines)

commit 75207bde6870eb4b258e16fbb41252b2e6377675
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Apr 30 13:49:59 2024 +0100
aarch64/fpu: Add vector variants of cbrt
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 06657782a1ee7106..990d1135b93485c5 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -5,6 +5,7 @@ libmvec-supported-funcs = acos \
atan \
atanh \
atan2 \
+ cbrt \
cos \
cosh \
erf \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index aedae9457b148983..36a9e4df1e058c46 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -94,6 +94,11 @@ libmvec {
_ZGVnN4v_atanhf;
_ZGVsMxv_atanh;
_ZGVsMxv_atanhf;
+ _ZGVnN2v_cbrt;
+ _ZGVnN2v_cbrtf;
+ _ZGVnN4v_cbrtf;
+ _ZGVsMxv_cbrt;
+ _ZGVsMxv_cbrtf;
_ZGVnN2v_cosh;
_ZGVnN2v_coshf;
_ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index a8889a92fd041585..54858efd8aa0ff82 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -23,6 +23,7 @@ libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(asinh));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(atanh));
+libmvec_hidden_proto (V_NAME_F1(cbrt));
libmvec_hidden_proto (V_NAME_F1(cos));
libmvec_hidden_proto (V_NAME_F1(cosh));
libmvec_hidden_proto (V_NAME_F1(erf));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index ca3017733959702f..b1c024fe13a7dc32 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -57,6 +57,10 @@
# define __DECL_SIMD_atan2 __DECL_SIMD_aarch64
# undef __DECL_SIMD_atan2f
# define __DECL_SIMD_atan2f __DECL_SIMD_aarch64
+# undef __DECL_SIMD_cbrt
+# define __DECL_SIMD_cbrt __DECL_SIMD_aarch64
+# undef __DECL_SIMD_cbrtf
+# define __DECL_SIMD_cbrtf __DECL_SIMD_aarch64
# undef __DECL_SIMD_cos
# define __DECL_SIMD_cos __DECL_SIMD_aarch64
# undef __DECL_SIMD_cosf
@@ -158,6 +162,7 @@ __vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
@@ -183,6 +188,7 @@ __vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
@@ -213,6 +219,7 @@ __sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanhf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_cbrtf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
@@ -238,6 +245,7 @@ __sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atanh (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_cbrt (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/cbrt_advsimd.c b/sysdeps/aarch64/fpu/cbrt_advsimd.c
new file mode 100644
index 0000000000000000..adfbb60cd3918c95
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrt_advsimd.c
@@ -0,0 +1,121 @@
+/* Double-precision vector (AdvSIMD) cbrt function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+const static struct data
+{
+ float64x2_t poly[4], one_third, shift;
+ int64x2_t exp_bias;
+ uint64x2_t abs_mask, tiny_bound;
+ uint32x4_t thresh;
+ double table[5];
+} data = {
+ .shift = V2 (0x1.8p52),
+ .poly = { /* Generated with fpminimax in [0.5, 1]. */
+ V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1),
+ V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) },
+ .exp_bias = V2 (1022),
+ .abs_mask = V2(0x7fffffffffffffff),
+ .tiny_bound = V2(0x0010000000000000), /* Smallest normal. */
+ .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound. */
+ .one_third = V2(0x1.5555555555555p-2),
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
+ 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 }
+};
+
+#define MantissaMask v_u64 (0x000fffffffffffff)
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint32x2_t special)
+{
+ return v_call_f64 (cbrt, x, y, vmovl_u32 (special));
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order polynomial
+ and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
+ according to the exponent, for instance an error observed for double value
+ m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+ integer.
+ __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+ want 0x1.965fe72821e99p+0. */
+VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ /* Subnormal, +/-0 and special values. */
+ uint32x2_t special
+ = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh));
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexp, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5));
+ int64x2_t exp_bias = d->exp_bias;
+ uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
+ int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);
+
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
+ Newton iterations. */
+ float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
+ float64x2_t one_third = d->one_third;
+ /* Two iterations of Newton's method for iteratively approximating cbrt. */
+ float64x2_t m_by_3 = vmulq_f64 (m, one_third);
+ float64x2_t two_thirds = vaddq_f64 (one_third, one_third);
+ float64x2_t a
+ = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p);
+ a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
+ an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+
+ float64x2_t ef = vcvtq_f64_s64 (e);
+ float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third));
+ int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3)));
+ int64x2_t ey = vcvtq_s64_f64 (eb3f);
+
+ float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] };
+ my = vmulq_f64 (my, a);
+
+ /* Vector version of ldexp. */
+ float64x2_t y = vreinterpretq_f64_s64 (
+ vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52));
+ y = vmulq_f64 (y, my);
+
+ if (__glibc_unlikely (v_any_u32h (special)))
+ return special_case (x, vbslq_f64 (d->abs_mask, y, x), special);
+
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
diff --git a/sysdeps/aarch64/fpu/cbrt_sve.c b/sysdeps/aarch64/fpu/cbrt_sve.c
new file mode 100644
index 0000000000000000..fc976eda2a6018f7
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrt_sve.c
@@ -0,0 +1,128 @@
+/* Double-precision vector (SVE) cbrt function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+const static struct data
+{
+ float64_t poly[4];
+ float64_t table[5];
+ float64_t one_third, two_thirds, shift;
+ int64_t exp_bias;
+ uint64_t tiny_bound, thresh;
+} data = {
+ /* Generated with FPMinimax in [0.5, 1]. */
+ .poly = { 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1,
+ 0x1.2c74eaa3ba428p-3, },
+ /* table[i] = 2^((i - 2) / 3). */
+ .table = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
+ 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0, },
+ .one_third = 0x1.5555555555555p-2,
+ .two_thirds = 0x1.5555555555555p-1,
+ .shift = 0x1.8p52,
+ .exp_bias = 1022,
+ .tiny_bound = 0x0010000000000000, /* Smallest normal. */
+ .thresh = 0x7fe0000000000000, /* asuint64 (infinity) - tiny_bound. */
+};
+
+#define MantissaMask 0x000fffffffffffff
+#define HalfExp 0x3fe0000000000000
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (cbrt, x, y, special);
+}
+
+static inline svfloat64_t
+shifted_lookup (const svbool_t pg, const float64_t *table, svint64_t i)
+{
+ return svld1_gather_index (pg, table, svadd_x (pg, i, 2));
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order
+ polynomial and two Newton iterations. Greatest observed error is 1.79 ULP.
+ Errors repeat according to the exponent, for instance an error observed for
+ double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i
+ is an integer.
+ _ZGVsMxv_cbrt (0x0.3fffb8d4413f3p-1022) got 0x1.965f53b0e5d97p-342
+ want 0x1.965f53b0e5d95p-342. */
+svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t iax = svreinterpret_u64 (ax);
+ svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+
+ /* Subnormal, +/-0 and special values. */
+ svbool_t special = svcmpge (pg, svsub_x (pg, iax, d->tiny_bound), d->thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexp, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ svfloat64_t m = svreinterpret_f64 (svorr_x (
+ pg, svand_x (pg, svreinterpret_u64 (x), MantissaMask), HalfExp));
+ svint64_t e
+ = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, iax, 52)), d->exp_bias);
+
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+ for Newton iterations. */
+ svfloat64_t p
+ = sv_pairwise_poly_3_f64_x (pg, m, svmul_x (pg, m, m), d->poly);
+
+ /* Two iterations of Newton's method for iteratively approximating cbrt. */
+ svfloat64_t m_by_3 = svmul_x (pg, m, d->one_third);
+ svfloat64_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p,
+ d->two_thirds);
+ a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, a, a)), a, d->two_thirds);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ svfloat64_t eb3f = svmul_x (pg, svcvt_f64_x (pg, e), d->one_third);
+ svint64_t ey = svcvt_s64_x (pg, eb3f);
+ svint64_t em3 = svmls_x (pg, e, ey, 3);
+
+ svfloat64_t my = shifted_lookup (pg, d->table, em3);
+ my = svmul_x (pg, my, a);
+
+ /* Vector version of ldexp. */
+ svfloat64_t y = svscale_x (pg, my, ey);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)),
+ special);
+
+ /* Copy sign. */
+ return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/cbrtf_advsimd.c b/sysdeps/aarch64/fpu/cbrtf_advsimd.c
new file mode 100644
index 0000000000000000..27debb8b57c8c3e2
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrtf_advsimd.c
@@ -0,0 +1,123 @@
+/* Single-precision vector (AdvSIMD) cbrt function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+const static struct data
+{
+ float32x4_t poly[4], one_third;
+ float table[5];
+} data = {
+ .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with
+ FPMinimax. */
+ V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1),
+ V4 (0x1.2c74c2p-3) },
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+ .one_third = V4 (0x1.555556p-2f),
+};
+
+#define SignMask v_u32 (0x80000000)
+#define SmallestNormal v_u32 (0x00800000)
+#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal. */
+#define MantissaMask v_u32 (0x007fffff)
+#define HalfExp v_u32 (0x3f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special)
+{
+ return v_call_f32 (cbrtf, x, y, vmovl_u16 (special));
+}
+
+static inline float32x4_t
+shifted_lookup (const float *table, int32x4_t i)
+{
+ return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2],
+ table[i[3] + 2] };
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration
+ with initial guess obtained by a low-order polynomial. Greatest error
+ is 1.64 ULP. This is observed for every value where the mantissa is
+ 0x1.85a2aa and the exponent is a multiple of 3, for example:
+ _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1
+ want 0x1.267932p+1. */
+VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
+ /* Subnormal, +/-0 and special values. */
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexpf, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5));
+ int32x4_t e
+ = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126));
+
+ /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+ the less accurate the next stage of the algorithm needs to be. An order-4
+ polynomial is enough for one Newton iteration. */
+ float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly);
+
+ float32x4_t one_third = d->one_third;
+ float32x4_t two_thirds = vaddq_f32 (one_third, one_third);
+
+ /* One iteration of Newton's method for iteratively approximating cbrt. */
+ float32x4_t m_by_3 = vmulq_f32 (m, one_third);
+ float32x4_t a
+ = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third);
+ int32x4_t ey = vcvtq_s32_f32 (ef);
+ int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3)));
+
+ float32x4_t my = shifted_lookup (d->table, em3);
+ my = vmulq_f32 (my, a);
+
+ /* Vector version of ldexpf. */
+ float32x4_t y
+ = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23));
+ y = vmulq_f32 (y, my);
+
+ if (__glibc_unlikely (v_any_u16h (special)))
+ return special_case (x, vbslq_f32 (SignMask, x, y), special);
+
+ /* Copy sign. */
+ return vbslq_f32 (SignMask, x, y);
+}
+libmvec_hidden_def (V_NAME_F1 (cbrt))
+HALF_WIDTH_ALIAS_F1 (cbrt)
diff --git a/sysdeps/aarch64/fpu/cbrtf_sve.c b/sysdeps/aarch64/fpu/cbrtf_sve.c
new file mode 100644
index 0000000000000000..23c220c202244c1f
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrtf_sve.c
@@ -0,0 +1,122 @@
+/* Single-precision vector (SVE) cbrt function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+
+const static struct data
+{
+ float32_t poly[4];
+ float32_t table[5];
+ float32_t one_third, two_thirds;
+} data = {
+ /* Very rough approximation of cbrt(x) in [0.5, 1], generated with FPMinimax.
+ */
+ .poly = { 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1,
+ 0x1.2c74c2p-3, },
+ /* table[i] = 2^((i - 2) / 3). */
+ .table = { 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+ .one_third = 0x1.555556p-2f,
+ .two_thirds = 0x1.555556p-1f,
+};
+
+#define SmallestNormal 0x00800000
+#define Thresh 0x7f000000 /* asuint(INFINITY) - SmallestNormal. */
+#define MantissaMask 0x007fffff
+#define HalfExp 0x3f000000
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (cbrtf, x, y, special);
+}
+
+static inline svfloat32_t
+shifted_lookup (const svbool_t pg, const float32_t *table, svint32_t i)
+{
+ return svld1_gather_index (pg, table, svadd_x (pg, i, 2));
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration
+ with initial guess obtained by a low-order polynomial. Greatest error
+ is 1.64 ULP. This is observed for every value where the mantissa is
+ 0x1.85a2aa and the exponent is a multiple of 3, for example:
+ _ZGVsMxv_cbrtf (0x1.85a2aap+3) got 0x1.267936p+1
+ want 0x1.267932p+1. */
+svfloat32_t SV_NAME_F1 (cbrt) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+
+ /* Subnormal, +/-0 and special values. */
+ svbool_t special = svcmpge (pg, svsub_x (pg, iax, SmallestNormal), Thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexpf, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ svfloat32_t m = svreinterpret_f32 (svorr_x (
+ pg, svand_x (pg, svreinterpret_u32 (x), MantissaMask), HalfExp));
+ svint32_t e = svsub_x (pg, svreinterpret_s32 (svlsr_x (pg, iax, 23)), 126);
+
+ /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+ the less accurate the next stage of the algorithm needs to be. An order-4
+ polynomial is enough for one Newton iteration. */
+ svfloat32_t p
+ = sv_pairwise_poly_3_f32_x (pg, m, svmul_x (pg, m, m), d->poly);
+
+ /* One iteration of Newton's method for iteratively approximating cbrt. */
+ svfloat32_t m_by_3 = svmul_x (pg, m, d->one_third);
+ svfloat32_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p,
+ d->two_thirds);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ svfloat32_t ef = svmul_x (pg, svcvt_f32_x (pg, e), d->one_third);
+ svint32_t ey = svcvt_s32_x (pg, ef);
+ svint32_t em3 = svmls_x (pg, e, ey, 3);
+
+ svfloat32_t my = shifted_lookup (pg, d->table, em3);
+ my = svmul_x (pg, my, a);
+
+ /* Vector version of ldexpf. */
+ svfloat32_t y = svscale_x (pg, my, ey);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)),
+ special);
+
+ /* Copy sign. */
+ return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 417125be476cd75f..1877db3ac6932037 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -30,6 +30,7 @@ VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER (atanh_advsimd, _ZGVnN2v_atanh)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
+VPCS_VECTOR_WRAPPER (cbrt_advsimd, _ZGVnN2v_cbrt)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 31ebf18705f68856..b702f942dea0749f 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -49,6 +49,7 @@ SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER (atanh_sve, _ZGVsMxv_atanh)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
+SVE_VECTOR_WRAPPER (cbrt_sve, _ZGVsMxv_cbrt)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index dab0f1cfcb79a305..9cb451b4f045e625 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -30,6 +30,7 @@ VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER (atanhf_advsimd, _ZGVnN4v_atanhf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
+VPCS_VECTOR_WRAPPER (cbrtf_advsimd, _ZGVnN4v_cbrtf)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 2aa6cbcc28d69cf8..5b3dd22916d2a50d 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -49,6 +49,7 @@ SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER (atanhf_sve, _ZGVsMxv_atanhf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
+SVE_VECTOR_WRAPPER (cbrtf_sve, _ZGVsMxv_cbrtf)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 17723d0c9e2dfcf5..a67cd7cd7399c533 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -477,11 +477,19 @@ double: 4
float: 1
ldouble: 1
+Function: "cbrt_advsimd":
+double: 1
+float: 1
+
Function: "cbrt_downward":
double: 4
float: 1
ldouble: 1
+Function: "cbrt_sve":
+double: 1
+float: 1
+
Function: "cbrt_towardzero":
double: 3
float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 1184374efd25cfa6..89ac1dfa36279eb0 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -79,6 +79,8 @@ GLIBC_2.40 _ZGVnN2v_asinh F
GLIBC_2.40 _ZGVnN2v_asinhf F
GLIBC_2.40 _ZGVnN2v_atanh F
GLIBC_2.40 _ZGVnN2v_atanhf F
+GLIBC_2.40 _ZGVnN2v_cbrt F
+GLIBC_2.40 _ZGVnN2v_cbrtf F
GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
@@ -94,6 +96,7 @@ GLIBC_2.40 _ZGVnN2vv_hypotf F
GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_atanhf F
+GLIBC_2.40 _ZGVnN4v_cbrtf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erfcf F
GLIBC_2.40 _ZGVnN4v_erff F
@@ -106,6 +109,8 @@ GLIBC_2.40 _ZGVsMxv_asinh F
GLIBC_2.40 _ZGVsMxv_asinhf F
GLIBC_2.40 _ZGVsMxv_atanh F
GLIBC_2.40 _ZGVsMxv_atanhf F
+GLIBC_2.40 _ZGVsMxv_cbrt F
+GLIBC_2.40 _ZGVsMxv_cbrtf F
GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F
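
The assembly scheme spelled out in the comments above (shared by all four cbrt variants) condenses to: decompose |x| = m * 2^e, refine a rough guess with Newton steps a <- 2a/3 + m/(3a^2), then rebuild the exponent from the five-entry table of 2^((i - 2) / 3). A scalar sketch under those assumptions — a constant initial guess stands in for the patch's polynomial, and zeros, subnormals and non-finite inputs (handled by the scalar fallback in the real code) are omitted:

#include <math.h>

static const double table[5] = { /* table[i] = 2^((i - 2) / 3).  */
  0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
  0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0
};

static double
cbrt_sketch (double x)
{
  int e;
  double m = frexp (fabs (x), &e);    /* |x| = m * 2^e, m in [0.5, 1).  */

  double a = 0.7;                     /* Crude stand-in for the poly guess.  */
  for (int i = 0; i < 4; i++)         /* Newton iterations on a^3 = m.  */
    a = (2.0 / 3.0) * a + m / (3.0 * a * a);

  int ey = (int) nearbyint (e / 3.0); /* round (e / 3).  */
  int rem = e - 3 * ey;               /* Remainder, an integer in [-2, 2].  */
  double y = ldexp (a * table[rem + 2], ey);
  return copysign (y, x);             /* Reattach the sign of x.  */
}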

glibc-RHEL-118273-12.patch (new file, 2511 lines; diff suppressed because it is too large)

glibc-RHEL-118273-13.patch (new file, 319 lines)

commit 7900ac490db32f6bccff812733f00280dde34e27
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:32:53 2024 +0100
AArch64: Improve codegen in users of ADVSIMD expm1f helper
Rearrange operations so MOV is not necessary in reduction or around
the special-case handler. Reduce memory access by using more indexed
MLAs in polynomial.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
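
For reference, the expm1f scheme implemented by the v_expm1f_inline.h helper that this patch reshuffles: reduce x = j*ln2 + f with f in [-ln2/2, ln2/2], approximate expm1(f) = f + f^2*P(f) with a small polynomial, then reconstruct via t = 2^j as expm1(x) = p*t + (t - 1). A scalar sketch using the coefficients visible in the diff below — valid only away from the overflow/underflow bounds that the vector code checks first:

#include <math.h>

static float
expm1f_sketch (float x)
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f, ln2_lo = 0x1.7f7d1cp-20f;
  const float c[5] = { 0x1.fffffep-2f, 0x1.5554aep-3f, 0x1.555736p-5f,
                       0x1.12287cp-7f, 0x1.6b55a2p-10f };

  float j = nearbyintf (x * inv_ln2); /* j = round (x / ln2).  */
  float f = fmaf (j, -ln2_hi, x);     /* f = x - j*ln2, subtracted in  */
  f = fmaf (j, -ln2_lo, f);           /* two steps for extra precision.  */

  /* P(f) = c0 + c1*f + c2*f^2 + c3*f^3 + c4*f^4, by Horner.  */
  float p = c[4];
  for (int i = 3; i >= 0; i--)
    p = fmaf (p, f, c[i]);
  p = fmaf (f * f, p, f);             /* expm1(f) ~= f + f^2 * P(f).  */

  float t = ldexpf (1.0f, (int) j);   /* t = 2^j, exact.  */
  return fmaf (p, t, t - 1.0f);       /* expm1(x) = p*t + (t - 1).  */
}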
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
index a0616ec7542cbfce..8303ca296e030c2e 100644
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@@ -18,27 +18,18 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_expm1f_inline.h"
static const struct data
{
- float32x4_t poly[5];
- float invln2_and_ln2[4];
- float32x4_t shift;
- int32x4_t exponent_bias;
+ struct v_expm1f_data d;
#if WANT_SIMD_EXCEPT
uint32x4_t thresh;
#else
float32x4_t oflow_bound;
#endif
} data = {
- /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
- /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
- .shift = V4 (0x1.8p23f),
- .exponent_bias = V4 (0x3f800000),
+ .d = V_EXPM1F_DATA,
#if !WANT_SIMD_EXCEPT
/* Value above which expm1f(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
@@ -55,67 +46,38 @@ static const struct data
#define TinyBound v_u32 (0x34000000 << 1)
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
{
- return v_call_f32 (expm1f, x, y, special);
+ return v_call_f32 (
+ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
}
/* Single-precision vector exp(x) - 1 function.
- The maximum error is 1.51 ULP:
- _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
- want 0x1.e2fb94p-2. */
+ The maximum error is 1.62 ULP:
+ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
+ want 0x1.da9f44p-2. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
#if WANT_SIMD_EXCEPT
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint32x4_t special
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
- if (__glibc_unlikely (v_any_u32 (special)))
- x = v_zerofy_f32 (x, special);
#else
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
#endif
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- float32x4_t j
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
- int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float32x4_t p = v_horner_4_f32 (f, d->poly);
- p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
- float32x4_t t = vreinterpretq_f32_s32 (u);
-
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
- special);
+ return special_case (x, special, d);
/* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+ return expm1f_inline (x, &d->d);
}
libmvec_hidden_def (V_NAME_F1 (expm1))
HALF_WIDTH_ALIAS_F1 (expm1)
diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c
index 6bb7482dc28795c1..c6ed7598e7deca1b 100644
--- a/sysdeps/aarch64/fpu/sinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
@@ -23,15 +23,13 @@
static const struct data
{
struct v_expm1f_data expm1f_consts;
- uint32x4_t halff;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound, thresh;
#else
- uint32x4_t oflow_bound;
+ float32x4_t oflow_bound;
#endif
} data = {
.expm1f_consts = V_EXPM1F_DATA,
- .halff = V4 (0x3f000000),
#if WANT_SIMD_EXCEPT
/* 0x1.6a09e8p-32, below which expm1f underflows. */
.tiny_bound = V4 (0x2fb504f4),
@@ -39,14 +37,15 @@ static const struct data
.thresh = V4 (0x12fbbbb3),
#else
/* 0x1.61814ep+6, above which expm1f helper overflows. */
- .oflow_bound = V4 (0x42b0c0a7),
+ .oflow_bound = V4 (0x1.61814ep+6),
#endif
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+ uint32x4_t special)
{
- return v_call_f32 (sinhf, x, y, special);
+ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
}
/* Approximation for vector single-precision sinh(x) using expm1.
@@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
uint32x4_t ix = vreinterpretq_u32_f32 (x);
float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t sign = veorq_u32 (ix, iax);
- float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+ float32x4_t halfsign = vreinterpretq_f32_u32 (
+ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
#if WANT_SIMD_EXCEPT
- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+ uint32x4_t special = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
ax = v_zerofy_f32 (ax, special);
#else
- uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+ uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
#endif
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
@@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
/* Fall back to the scalar variant for any lanes that should trigger an
exception. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (t, halfsign), special);
+ return special_case (x, t, halfsign, special);
return vmulq_f32 (t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c
index 50defd6ef03926f4..3ced9b7a414c812c 100644
--- a/sysdeps/aarch64/fpu/tanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
@@ -28,13 +28,16 @@ static const struct data
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
.boring_bound = V4 (0x41102cb3),
.large_bound = V4 (0x7f800000),
- .onef = V4 (0x3f800000),
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+ float32x4_t q, uint32x4_t special)
{
- return v_call_f32 (tanhf, x, y, special);
+ return v_call_f32 (
+ tanhf, x,
+ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+ special);
}
/* Approximation for single-precision vector tanh(x), using a simplified
@@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t sign = veorq_u32 (ix, iax);
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
- float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+ /* expm1 exponent bias is 1.0f reinterpreted to int. */
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered properly, set all special and boring
@@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
- float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vbslq_f32 (is_boring, boring, y), special);
+ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+ special);
+
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
return vbslq_f32 (is_boring, boring, y);
}
libmvec_hidden_def (V_NAME_F1 (tanh))
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 59b552da6b74785e..1daedfdd51cfc54b 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -21,48 +21,47 @@
#define AARCH64_FPU_V_EXPM1F_INLINE_H
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "math_config.h"
struct v_expm1f_data
{
- float32x4_t poly[5];
- float invln2_and_ln2[4];
- float32x4_t shift;
+ float32x4_t c0, c2;
int32x4_t exponent_bias;
+ float c1, c3, inv_ln2, c4;
+ float ln2_hi, ln2_lo;
};
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
- log(2)/2]. Exponent bias is asuint(1.0f).
- invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */
+ log(2)/2]. Exponent bias is asuint(1.0f). */
#define V_EXPM1F_DATA \
{ \
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
- .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
}
static inline float32x4_t
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
{
- /* Helper routine for calculating exp(x) - 1.
- Copied from v_expm1f_1u6.c, with all special-case handling removed - the
- calling routine should handle special values if required. */
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+ float32x4_t lane_consts = vld1q_f32 (&d->c1);
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- float32x4_t j
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
+ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
+ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
+ f = vfmsq_lane_f32 (f, j, ln2, 1);
- /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
- Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
- Horner. */
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
float32x4_t f2 = vmulq_f32 (f, f);
float32x4_t f4 = vmulq_f32 (f2, f2);
- float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+ float32x4_t p = vfmaq_f32 (p01, f2, p23);
+ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
p = vfmaq_f32 (f, f2, p);
/* t = 2^i. */

glibc-RHEL-118273-14.patch Normal file

@@ -0,0 +1,495 @@
commit 5bc100bd4b7e00db3009ae93d25d303341545d23
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:32:14 2024 +0100
AArch64: Improve codegen in users of AdvSIMD log1pf helper
log1pf is quite register-intensive - use fewer registers for the
polynomial, and make various changes to shorten dependency chains in
parent routines. There is now no spilling with GCC 14. Accuracy moves
around a little - comments are adjusted accordingly, but this does not
require regen-ulps.
Use the helper in log1pf as well, instead of having separate
implementations. The more accurate polynomial means special-casing can
be simplified, and the shorter dependency chain avoids the usual dance
around v0, which is otherwise difficult.
There is a small duplication of vectors containing 1.0f (or 0x3f800000) -
GCC is not currently able to efficiently handle values which fit in FMOV
but not MOVI, and are reinterpreted to integer. There may be potential
for more optimisation if this is fixed.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Conflicts:
sysdeps/aarch64/fpu/log1pf_advsimd.c
(Fix up context to apply without out-of-scope dependency 751a5502)
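The algorithm the reworked helper implements is easiest to see in scalar form. Below is a minimal sketch (an illustration only: log1pf_sketch and the low-order stand-in polynomial are ours; the real helper uses a fitted degree-8 polynomial and NEON intrinsics): write x + 1 = (1 + m) * 2^k with the reduced m in [-0.25, 0.5], so log1p(x) = log1p(m) + k*ln2. Scaling x rather than m, and folding the 2^-k - 1 correction in via the always-normal factor s = 4 * 2^-k, is what keeps the vector version's dependency chain short.

/* Scalar sketch of the reduction the AdvSIMD log1pf helper vectorises.
   Illustration only, not the glibc code. */
#include <stdint.h>
#include <string.h>

static float
log1pf_sketch (float x)
{
  uint32_t xi, mi;
  float m = x + 1.0f;
  memcpy (&xi, &x, sizeof (xi));
  memcpy (&mi, &m, sizeof (mi));

  /* k (stored as k << 23) chosen so that (x + 1) * 2^-k is in [0.75, 1.5). */
  uint32_t kbits = (mi - 0x3f400000u) & 0xff800000u;

  /* Scale x, not m, by 2^-k via exponent manipulation, then add the exactly
     representable correction 2^-k - 1 = 0.25f * s - 1. */
  float m_scale, s;
  uint32_t m_scale_i = xi - kbits, si = 0x40800000u - kbits; /* s = 4*2^-k. */
  memcpy (&m_scale, &m_scale_i, sizeof (m_scale));
  memcpy (&s, &si, sizeof (s));
  float mr = m_scale + (0.25f * s - 1.0f); /* (x + 1) * 2^-k - 1. */

  /* Stand-in polynomial: log1p(mr) ~ mr - mr^2/2 + mr^3/3. */
  float p = mr - 0.5f * mr * mr + (1.0f / 3.0f) * mr * mr * mr;

  float kf = (float) ((int32_t) kbits >> 23); /* two's-complement k. */
  return p + kf * 0x1.62e43p-1f; /* + k * ln2. */
}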
diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
index 8916dcbf409922a9..004474acf9e9322b 100644
--- a/sysdeps/aarch64/fpu/acoshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -25,35 +25,32 @@ const static struct data
{
struct v_log1pf_data log1pf_consts;
uint32x4_t one;
- uint16x4_t thresh;
-} data = {
- .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
- .one = V4 (0x3f800000),
- .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
-};
+} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
+
+#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
- const struct v_log1pf_data d)
+ const struct v_log1pf_data *d)
{
return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
}
/* Vector approximation for single-precision acosh, based on log1p. Maximum
error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
- is 2.78 ULP:
- __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
- want 0x1.ef9ea2p-3.
+ is 3.00 ULP:
+ _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
+ want 0x1.ef0a7cp-4.
With exceptions disabled, we can compute u with a shorter dependency chain,
- which gives maximum error of 3.07 ULP:
- __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
- want 0x1.fbc7f4p-4. */
+ which gives maximum error of 3.22 ULP:
+ _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
+ want 0x1.fdcdd2p-5. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
#if WANT_SIMD_EXCEPT
/* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
@@ -64,15 +61,16 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
#else
- float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
- float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+ float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+ float32x4_t u
+ = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
#endif
float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, y, special, d->log1pf_consts);
- return log1pf_inline (y, d->log1pf_consts);
+ return special_case (x, y, special, &d->log1pf_consts);
+ return log1pf_inline (y, &d->log1pf_consts);
}
libmvec_hidden_def (V_NAME_F1 (acosh))
HALF_WIDTH_ALIAS_F1 (acosh)
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
index 09fd8a614305563d..eb789b91b600af52 100644
--- a/sysdeps/aarch64/fpu/asinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
@@ -20,16 +20,16 @@
#include "v_math.h"
#include "v_log1pf_inline.h"
-#define SignMask v_u32 (0x80000000)
-
const static struct data
{
struct v_log1pf_data log1pf_consts;
+ float32x4_t one;
uint32x4_t big_bound;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound;
#endif
} data = {
+ .one = V4 (1),
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
#if WANT_SIMD_EXCEPT
@@ -38,20 +38,27 @@ const static struct data
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
+ uint32x4_t special, const struct data *d)
{
- return v_call_f32 (asinhf, x, y, special);
+ return v_call_f32 (
+ asinhf, x,
+ vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
+ special);
}
/* Single-precision implementation of vector asinh(x), using vector log1p.
- Worst-case error is 2.66 ULP, at roughly +/-0.25:
- __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
+ Worst-case error is 2.59 ULP:
+ _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
+ want 0x1.d449c4p-3. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
- uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
- float32x4_t ax = vreinterpretq_f32_u32 (iax);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
float32x4_t special_arg = x;
#if WANT_SIMD_EXCEPT
@@ -68,13 +75,13 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
/* asinh(x) = log(x + sqrt(x * x + 1)).
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
float32x4_t d
- = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
- float32x4_t y = log1pf_inline (
- vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
+ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
- return vbslq_f32 (SignMask, x, y);
+ return special_case (special_arg, sign, y, special, dat);
+ return vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
}
libmvec_hidden_def (V_NAME_F1 (asinh))
HALF_WIDTH_ALIAS_F1 (asinh)
diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c
index ae488f7b54ddce26..818b6c92adcd48bb 100644
--- a/sysdeps/aarch64/fpu/atanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c
@@ -40,15 +40,17 @@ const static struct data
#define Half v_u32 (0x3f000000)
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
+ uint32x4_t special)
{
- return v_call_f32 (atanhf, x, y, special);
+ return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
+ vmulq_f32 (halfsign, y), special);
}
/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 3.08 ULP:
- __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
- want 0x1.ffcb82p-5. */
+ The maximum error is 2.93 ULP:
+ _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
+ want 0x1.f4dcf8p-5. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -68,11 +70,19 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
uint32x4_t special = vcgeq_u32 (iax, d->one);
#endif
- float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
- y = log1pf_inline (y, d->log1pf_consts);
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
+ vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
+ y = log1pf_inline (y, &d->log1pf_consts);
+ /* If exceptions not required, pass ax to special-case for shorter dependency
+ chain. If exceptions are required ax will have been zerofied, so have to
+ pass x. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (halfsign, y), special);
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special);
+#else
+ return special_case (ax, halfsign, y, special);
+#endif
return vmulq_f32 (halfsign, y);
}
libmvec_hidden_def (V_NAME_F1 (atanh))
diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
index dc15334a8537b1fc..f2d47962fe13fbdd 100644
--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
@@ -18,113 +18,78 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_log1pf_inline.h"
+
+#if WANT_SIMD_EXCEPT
const static struct data
{
- float32x4_t poly[8], ln2;
- uint32x4_t tiny_bound, minus_one, four, thresh;
- int32x4_t three_quarters;
+ uint32x4_t minus_one, thresh;
+ struct v_log1pf_data d;
} data = {
- .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
- (1, -0.5) are not stored as they can be generated more
- efficiently. */
- V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
- .ln2 = V4 (0x1.62e43p-1f),
- .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
- .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */
+ .d = V_LOG1PF_CONSTANTS_TABLE,
+ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
.minus_one = V4 (0xbf800000),
- .four = V4 (0x40800000),
- .three_quarters = V4 (0x3f400000)
};
-static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *p)
-{
- /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */
- float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
- float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
- float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
- float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
-
- float32x4_t m2 = vmulq_f32 (m, m);
- float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
- float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
- float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
-
- float32x4_t m4 = vmulq_f32 (m2, m2);
- float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
- return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
-}
+/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
+# define TinyBound v_u32 (0x34000000)
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
{
- return v_call_f32 (log1pf, x, y, special);
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
+ return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
}
-/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
- is roughly 2.02 ULP:
- log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.69 ULP:
+ _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
+ want 0x1.cfcbdcp-3. */
VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
uint32x4_t special_cases
- = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
+ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
vcgeq_u32 (ix, d->minus_one));
- float32x4_t special_arg = x;
-#if WANT_SIMD_EXCEPT
if (__glibc_unlikely (v_any_u32 (special_cases)))
- /* Side-step special lanes so fenv exceptions are not triggered
- inadvertently. */
- x = v_zerofy_f32 (x, special_cases);
-#endif
+ return special_case (x, special_cases, d);
- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
+ return log1pf_inline (x, &d->d);
+}
- float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+#else
- /* Choose k to scale x to the range [-1/4, 1/2]. */
- int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
- v_s32 (0xff800000));
- uint32x4_t ku = vreinterpretq_u32_s32 (k);
+const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
- /* Scale x by exponent manipulation. */
- float32x4_t m_scale
- = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp)
+{
+ return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
+}
- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
- m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.63 ULP:
+ _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
+ want 0x1.fdcb16p-3. */
+VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
+{
+ uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
+ vcaleq_f32 (x, v_f32 (0x1p127f)));
- /* Evaluate polynomial on the reduced interval. */
- float32x4_t p = eval_poly (m_scale, d->poly);
+ if (__glibc_unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases);
- /* The scale factor to be applied back at the end - by multiplying float(k)
- by 2^-23 we get the unbiased exponent of k. */
- float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
+ return log1pf_inline (x, ptr_barrier (&data));
+}
- /* Apply the scaling back. */
- float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
+#endif
- if (__glibc_unlikely (v_any_u32 (special_cases)))
- return special_case (special_arg, y, special_cases);
- return y;
-}
libmvec_hidden_def (V_NAME_F1 (log1p))
HALF_WIDTH_ALIAS_F1 (log1p)
diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h
index 643a6cdcfc498970..73e45a942e24a26f 100644
--- a/sysdeps/aarch64/fpu/v_log1pf_inline.h
+++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h
@@ -25,54 +25,81 @@
struct v_log1pf_data
{
- float32x4_t poly[8], ln2;
uint32x4_t four;
int32x4_t three_quarters;
+ float c0, c3, c5, c7;
+ float32x4_t c4, c6, c1, c2, ln2;
};
/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
(1, -0.5) are not stored as they can be generated more efficiently. */
#define V_LOG1PF_CONSTANTS_TABLE \
{ \
- .poly \
- = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
- .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
- .three_quarters = V4 (0x3f400000) \
+ .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
+ .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
+ .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
+ .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
}
static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *c)
+eval_poly (float32x4_t m, const struct v_log1pf_data *d)
{
- /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
- uses split Estrin, but this way reduces register pressure in the calling
- routine). */
- float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float32x4_t c0357 = vld1q_f32 (&d->c0);
+ float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
float32x4_t m2 = vmulq_f32 (m, m);
- q = vfmaq_f32 (m, m2, q);
- float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
+ float32x4_t p = vfmaq_f32 (p45, m2, p67);
+ p = vfmaq_f32 (p23, m2, p);
+ p = vfmaq_f32 (d->c1, m, p);
p = vmulq_f32 (m2, p);
- return vfmaq_f32 (q, m2, p);
+ p = vfmaq_f32 (m, m2, p);
+ return vfmaq_f32 (p, m2, q);
}
static inline float32x4_t
-log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
+log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
{
- /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
- special-case handling. See that file for details of the algorithm. */
+ /* Helper for calculating log(x + 1). */
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
v_s32 (0xff800000));
uint32x4_t ku = vreinterpretq_u32_s32 (k);
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+
+ /* Scale x by exponent manipulation. */
float32x4_t m_scale
= vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
- float32x4_t p = eval_poly (m_scale, d.poly);
+
+ /* Evaluate polynomial on the reduced interval. */
+ float32x4_t p = eval_poly (m_scale, d);
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
- return vfmaq_f32 (p, scale_back, d.ln2);
+
+ /* Apply the scaling back. */
+ return vfmaq_f32 (p, scale_back, d->ln2);
}
#endif

glibc-RHEL-118273-15.patch Normal file

@@ -0,0 +1,261 @@
commit a15b1394b5eba98ffe28a02a392b587e4fe13c0d
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:30:20 2024 +0100
AArch64: Improve codegen in SVE F32 logs
Reduce MOVPRFXs by using unpredicated (non-destructive) instructions
where possible. Similar to the recent change to AdvSIMD F32 logs,
adjust special-case arguments and bounds to allow for more optimal
register usage. For all 3 routines one MOVPRFX remains in the
reduction, which cannot be avoided as immediate AND and ASR are both
destructive.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
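The rearranged bound check is the key idea in all three routines below; here is a scalar sketch (the names OFF/LOWER/THRESH and logf_is_special are ours, for illustration). Since the reduction needs asuint(x) - OFF anyway, specials - zero, negatives, subnormals, inf and NaN - fall out of a single unsigned compare against bounds pre-adjusted by the same offset, with the subtraction deliberately wrapping around.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define OFF 0x3f2aaaabu /* asuint (2/3), the reduction offset. */
#define LOWER (0x00800000u - OFF) /* smallest normal minus OFF (wraps). */
#define THRESH 0x7f000000u /* asuint (inf) - smallest normal. */

static bool
logf_is_special (float x)
{
  uint32_t u_off;
  memcpy (&u_off, &x, sizeof (u_off));
  u_off -= OFF; /* already needed by the log reduction itself. */
  /* Equivalent to asuint (x) - 0x00800000 >= THRESH, but reuses u_off. */
  return u_off - LOWER >= THRESH;
}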
diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c
index bdbb49cd32feccb4..7913679f6795502a 100644
--- a/sysdeps/aarch64/fpu/log10f_sve.c
+++ b/sysdeps/aarch64/fpu/log10f_sve.c
@@ -24,6 +24,7 @@ static const struct data
float poly_0246[4];
float poly_1357[4];
float ln2, inv_ln10;
+ uint32_t off, lower;
} data = {
.poly_1357 = {
/* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -35,18 +36,23 @@ static const struct data
-0x1.0fc92cp-4f },
.ln2 = 0x1.62e43p-1f,
.inv_ln10 = 0x1.bcb7b2p-2f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min 0x00800000
-#define Max 0x7f800000
-#define Thres 0x7f000000 /* Max - Min. */
-#define Offset 0x3f2aaaab /* 0.666667. */
+#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. */
#define MantissaMask 0x007fffff
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (log10f, x, y, special);
+ return sv_call_f32 (
+ log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE log10f using the same algorithm and
@@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t ix = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
+
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- ix = svsub_x (pg, ix, Offset);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */
- ix = svand_x (pg, ix, MantissaMask);
- ix = svadd_x (pg, ix, Offset);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */
+ svuint32_t ix = svand_x (pg, u_off, MantissaMask);
+ ix = svadd_x (pg, ix, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);
/* y = log10(1+r) + n*log10(2)
log10(1+r) ~ r * InvLn(10) + P(r)
where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t r4 = svmul_x (pg, r2, r2);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2);
svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
@@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
hi = svmul_x (pg, hi, d->inv_ln10);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
- special);
- return svmla_x (pg, hi, r2, y);
+ return special_case (u_off, hi, r2, y, special);
+ return svmla_x (svptrue_b32 (), hi, r2, y);
}
diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c
index 5031c4248359295e..939d89bfb9a95a11 100644
--- a/sysdeps/aarch64/fpu/log2f_sve.c
+++ b/sysdeps/aarch64/fpu/log2f_sve.c
@@ -23,6 +23,7 @@ static const struct data
{
float poly_02468[5];
float poly_1357[4];
+ uint32_t off, lower;
} data = {
.poly_1357 = {
/* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -32,18 +33,23 @@ static const struct data
},
.poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
0x1.9d8ecap-3f, 0x1.9e495p-3f },
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thres (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
#define MantissaMask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (log2f, x, y, cmp);
+ return sv_call_f32 (
+ log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE log2f, using the same algorithm
@@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t u = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres);
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_x (pg, u, Off);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
- u = svand_x (pg, u, MantissaMask);
- u = svadd_x (pg, u, Off);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+ svuint32_t u = svand_x (pg, u_off, MantissaMask);
+ u = svadd_x (pg, u, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log2(1+r) + n. */
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
/* Evaluate polynomial using pairwise Horner scheme. */
svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
@@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
y = svmla_x (pg, q_01, r2, y);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special);
- return svmla_x (pg, n, r, y);
+ return special_case (u_off, n, r, y, special);
+ return svmla_x (svptrue_b32 (), n, r, y);
}
diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
index d64e810cfec9aa19..5b9324678d99455b 100644
--- a/sysdeps/aarch64/fpu/logf_sve.c
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@@ -24,6 +24,7 @@ static const struct data
float poly_0135[4];
float poly_246[3];
float ln2;
+ uint32_t off, lower;
} data = {
.poly_0135 = {
/* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
@@ -32,19 +33,24 @@ static const struct data
-0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
},
.poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
- .ln2 = 0x1.62e43p-1f
+ .ln2 = 0x1.62e43p-1f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thresh (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
#define Mask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (logf, x, y, cmp);
+ return sv_call_f32 (
+ logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE logf, using the same algorithm and
@@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t u = svreinterpret_u32 (x);
- svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_x (pg, u, Off);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
- u = svand_x (pg, u, Mask);
- u = svadd_x (pg, u, Off);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+
+ svuint32_t u = svand_x (pg, u_off, Mask);
+ u = svadd_x (pg, u, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log(1+r) + n*ln2. */
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
/* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */
svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
@@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
p = svmla_x (pg, r, n, d->ln2);
if (__glibc_unlikely (svptest_any (pg, cmp)))
- return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+ return special_case (u_off, p, r2, y, cmp);
return svmla_x (pg, p, r2, y);
}

glibc-RHEL-118273-16.patch Normal file

@@ -0,0 +1,467 @@
commit 7b8c134b5460ed933d610fa92ed1227372b68fdc
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:26:12 2024 +0100
AArch64: Improve codegen in SVE expf & related routines
Reduce MOV and MOVPRFX by improving special-case handling. Use inline
helper to duplicate the entire computation between the special- and
non-special case branches, removing the contention for z0 between x
and the return value.
Also rearrange some MLAs and MLSs - by making the multiplicand the
destination we can avoid a MOVPRFX in several cases. Also change which
constants go in the vector used for lanewise ops - the last lane is no
longer wasted.
Spotted that shift was incorrect in exp2f and exp10f, w.r.t. the
comment that explains it. Fixed - worst-case ULP for exp2f moves
around but it doesn't change significantly for either routine.
Worst-case error for coshf increases due to passing x to exp rather
than abs(x) - the comment is updated, but this does not require regen-ulps.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
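The expf scheme that sv_expf_inline now carries is, in scalar terms, the following (sketch only: expf_sketch and the degree-3 stand-in polynomial are ours; the SVE code uses a fitted degree-5 polynomial and obtains the scale from the FEXPA instruction rather than from integer shifts).

#include <math.h>
#include <stdint.h>
#include <string.h>

static float
expf_sketch (float x) /* assumes |x| below the overflow threshold. */
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f, ln2_lo = 0x1.7f7d1cp-20f;

  /* x = n*ln2 + r, with r in [-ln2/2, ln2/2]. */
  float n = roundf (x * inv_ln2);
  float r = x - n * ln2_hi; /* hi/lo split keeps r accurate. */
  r -= n * ln2_lo;

  /* scale = 2^n, built directly in the exponent field. */
  uint32_t si = (uint32_t) ((int32_t) n + 127) << 23;
  float scale;
  memcpy (&scale, &si, sizeof (scale));

  /* poly(r) ~ exp(r) - 1; degree-3 stand-in for the fitted polynomial. */
  float poly = r + 0.5f * r * r + (1.0f / 6.0f) * r * r * r;
  return scale + scale * poly; /* exp(x) = scale * (1 + poly(r)). */
}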
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
index e5d8a299c6aa7ceb..7ad6efa0fc218278 100644
--- a/sysdeps/aarch64/fpu/coshf_sve.c
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -23,37 +23,42 @@
static const struct data
{
struct sv_expf_data expf_consts;
- uint32_t special_bound;
+ float special_bound;
} data = {
.expf_consts = SV_EXPF_DATA,
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
- .special_bound = 0x42ad496c,
+ .special_bound = 0x1.5a92d8p+6,
};
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
+ svbool_t pg)
{
- return sv_call_f32 (coshf, x, y, pg);
+ return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e),
+ pg);
}
/* Single-precision vector cosh, using vector expf.
- Maximum error is 1.89 ULP:
- _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
- want 0x1.f00adcp+127. */
+ Maximum error is 2.77 ULP:
+ _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
+ want 0x1.e4594cp+2. */
svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svfloat32_t ax = svabs_x (pg, x);
- svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
+ svbool_t special = svacge (pg, x, d->special_bound);
- /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
- svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
- svfloat32_t half_t = svmul_x (pg, t, 0.5);
- svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.
+ Note that x is passed to exp here, rather than |x|. This is to avoid using
+ destructive unary ABS for better register usage. However it means the
+ routine is not exactly symmetrical, as the exp helper is slightly less
+ accurate in the negative range. */
+ svfloat32_t e = expf_inline (x, pg, &d->expf_consts);
+ svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5);
+ svfloat32_t half_over_e = svdivr_x (pg, e, 0.5);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+ return special_case (x, half_e, half_over_e, special);
- return svadd_x (pg, half_t, half_over_t);
+ return svadd_x (svptrue_b32 (), half_e, half_over_e);
}
diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c
index e09b2f3b2705515a..8aa3fa9c4335cfb8 100644
--- a/sysdeps/aarch64/fpu/exp10f_sve.c
+++ b/sysdeps/aarch64/fpu/exp10f_sve.c
@@ -18,74 +18,83 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f32.h"
-/* For x < -SpecialBound, the result is subnormal and not handled correctly by
+/* For x < -Thres, the result is subnormal and not handled correctly by
FEXPA. */
-#define SpecialBound 37.9
+#define Thres 37.9
static const struct data
{
- float poly[5];
- float shift, log10_2, log2_10_hi, log2_10_lo, special_bound;
+ float log2_10_lo, c0, c2, c4;
+ float c1, c3, log10_2;
+ float shift, log2_10_hi, thres;
} data = {
/* Coefficients generated using Remez algorithm with minimisation of relative
error.
rel error: 0x1.89dafa3p-24
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
maxerr: 0.52 +0.5 ulp. */
- .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f,
- 0x1.12b41ap-1f },
+ .c0 = 0x1.26bb16p+1f,
+ .c1 = 0x1.5350d2p+1f,
+ .c2 = 0x1.04744ap+1f,
+ .c3 = 0x1.2d8176p+0f,
+ .c4 = 0x1.12b41ap-1f,
/* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
- .shift = 0x1.903f8p17f,
+ .shift = 0x1.803f8p17f,
.log10_2 = 0x1.a934fp+1,
.log2_10_hi = 0x1.344136p-2,
.log2_10_lo = -0x1.ec10cp-27,
- .special_bound = SpecialBound,
+ .thres = Thres,
};
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+static inline svfloat32_t
+sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
{
- return sv_call_f32 (exp10f, x, y, special);
-}
-
-/* Single-precision SVE exp10f routine. Implements the same algorithm
- as AdvSIMD exp10f.
- Worst case error is 1.02 ULPs.
- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
- want 0x1.ba5f9cp-1. */
-svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
-{
- const struct data *d = ptr_barrier (&data);
/* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
- /* Load some constants in quad-word chunks to minimise memory access (last
- lane is wasted). */
- svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);
/* n = round(x/(log10(2)/N)). */
svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0);
- svfloat32_t n = svsub_x (pg, z, shift);
+ svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);
/* r = x - n*log10(2)/N. */
- svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1);
- r = svmls_lane (r, n, log10_2_and_inv, 2);
+ svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
+ r = svmls_lane (r, n, lane_consts, 0);
- svbool_t special = svacgt (pg, x, d->special_bound);
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t poly
- = svmla_x (pg, svmul_x (pg, r, d->poly[0]),
- sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2);
-
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (pg, scale, scale, poly), special);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
return svmla_x (pg, scale, scale, poly);
}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp10f routine. Implements the same algorithm
+ as AdvSIMD exp10f.
+ Worst case error is 1.02 ULPs.
+ _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
+ want 0x1.ba5f9cp-1. */
+svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (__glibc_unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp10f_inline (x, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
index 8a686e3e054cb7f5..c6216bed9e9e7538 100644
--- a/sysdeps/aarch64/fpu/exp2f_sve.c
+++ b/sysdeps/aarch64/fpu/exp2f_sve.c
@@ -24,54 +24,64 @@
static const struct data
{
- float poly[5];
+ float c0, c2, c4, c1, c3;
float shift, thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. */
- .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f,
- 0x1.59977ap-10f },
+ /* Coefficients copied from the polynomial in AdvSIMD variant. */
+ .c0 = 0x1.62e422p-1f,
+ .c1 = 0x1.ebf9bcp-3f,
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = 0x1.3ce9e4p-7f,
+ .c4 = 0x1.59977ap-10f,
/* 1.5*2^17 + 127. */
- .shift = 0x1.903f8p17f,
+ .shift = 0x1.803f8p17f,
/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
correctly by FEXPA. */
.thres = Thres,
};
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
- return sv_call_f32 (exp2f, x, y, special);
-}
-
-/* Single-precision SVE exp2f routine. Implements the same algorithm
- as AdvSIMD exp2f.
- Worst case error is 1.04 ULPs.
- SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
- want 0x1.ba7ebp+0. */
-svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+static inline svfloat32_t
+sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
{
- const struct data *d = ptr_barrier (&data);
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
- svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svadd_x (pg, x, shift);
- svfloat32_t n = svsub_x (pg, z, shift);
- svfloat32_t r = svsub_x (pg, x, n);
+ svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift);
+ svfloat32_t r = svsub_x (svptrue_b32 (), x, n);
- svbool_t special = svacgt (pg, x, d->thres);
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* Polynomial evaluation: poly(r) ~ exp2(r)-1.
Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for
coefficients 1 to 4, and apply most significant coefficient directly. */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1);
- svfloat32_t p0 = svmul_x (pg, r, d->poly[0]);
+ svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
+ svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
+ svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
svfloat32_t poly = svmla_x (pg, p0, r2, p14);
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (pg, scale, scale, poly), special);
-
return svmla_x (pg, scale, scale, poly);
}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp2f routine. Implements the same algorithm
+ as AdvSIMD exp2f.
+ Worst case error is 1.04 ULPs.
+ _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
+ want 0x1.ba6a64p-1. */
+svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (__glibc_unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp2f_inline (x, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
index 3ba79bc4f11a05f9..da93e01b87e0e890 100644
--- a/sysdeps/aarch64/fpu/expf_sve.c
+++ b/sysdeps/aarch64/fpu/expf_sve.c
@@ -18,33 +18,25 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
+#include "sv_expf_inline.h"
+
+/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+ correctly by FEXPA. */
+#define Thres 0x1.5d5e2ap+6f
static const struct data
{
- float poly[5];
- float inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ struct sv_expf_data d;
+ float thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. */
- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f,
- 0x1.0e4020p-7f },
- .inv_ln2 = 0x1.715476p+0f,
- .ln2_hi = 0x1.62e4p-1f,
- .ln2_lo = 0x1.7f7d1cp-20f,
- /* 1.5*2^17 + 127. */
- .shift = 0x1.903f8p17f,
- /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
- correctly by FEXPA. */
- .thres = 0x1.5d5e2ap+6f,
+ .d = SV_EXPF_DATA,
+ .thres = Thres,
};
-#define C(i) sv_f32 (d->poly[i])
-#define ExponentBias 0x3f800000
-
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
{
- return sv_call_f32 (expf, x, y, special);
+ return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special);
}
/* Optimised single-precision SVE exp function.
@@ -54,36 +46,8 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
-
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-
- /* Load some constants in quad-word chunks to minimise memory access (last
- lane is wasted). */
- svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2);
-
- /* n = round(x/(ln2/N)). */
- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0);
- svfloat32_t n = svsub_x (pg, z, d->shift);
-
- /* r = x - n*ln2/N. */
- svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1);
- r = svmls_lane (r, n, invln2_and_ln2, 2);
-
- /* scale = 2^(n/N). */
svbool_t is_special_case = svacgt (pg, x, d->thres);
- svfloat32_t scale = svexpa (svreinterpret_u32 (z));
-
- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
- svfloat32_t p34 = svmla_x (pg, C (3), C (4), r);
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_x (pg, r, C (0));
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
if (__glibc_unlikely (svptest_any (pg, is_special_case)))
- return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case);
-
- return svmla_x (pg, scale, scale, poly);
+ return special_case (x, is_special_case, &d->d);
+ return expf_inline (x, pg, &d->d);
}
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index 23963b5f8ec89ead..6166df65533555a6 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -24,19 +24,20 @@
struct sv_expf_data
{
- float poly[5];
- float inv_ln2, ln2_hi, ln2_lo, shift;
+ float c1, c3, inv_ln2;
+ float ln2_lo, c0, c2, c4;
+ float ln2_hi, shift;
};
/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
#define SV_EXPF_DATA \
{ \
- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
- 0x1.0e4020p-7f }, \
- \
- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
- .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
+ /* Coefficients copied from the polynomial in AdvSIMD variant. */ \
+ .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \
+ .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .shift = 0x1.803f8p17f, \
}
#define C(i) sv_f32 (d->poly[i])
@@ -47,26 +48,25 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- /* Load some constants in quad-word chunks to minimise memory access. */
- svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);
/* n = round(x/(ln2/N)). */
- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
+ svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
svfloat32_t n = svsub_x (pg, z, d->shift);
/* r = x - n*ln2/N. */
- svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
- r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
+ svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
+ r = svmls_lane (r, n, lane_consts, 0);
/* scale = 2^(n/N). */
- svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
- svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
- svfloat32_t r2 = svmul_f32_x (pg, r, r);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
+ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
svfloat32_t poly = svmla_x (pg, p0, r2, p14);
return svmla_x (pg, scale, scale, poly);

glibc-RHEL-118273-17.patch Normal file

@@ -0,0 +1,124 @@
commit 1cf29fbc5be23db775d1dfa6b332ded6e6554252
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Oct 28 14:58:35 2024 +0000
AArch64: Small optimisation in AdvSIMD erf and erfc
In both routines, reduce register pressure such that GCC 14 emits no
spills for erf and fewer spills for erfc. Also use a more efficient
comparison for the special case in erf.
Benchtests show erf improves by 6.4%, erfc by 1.0%.
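The main trick below is demoting paired constants from full vectors to adjacent scalars that are loaded as one quadword and consumed by indexed FMA. A minimal self-contained sketch (fma_pair_demo and the accumulator arguments are ours, for illustration - only the two constants come from the routine):

#include <arm_neon.h> /* AArch64 only. */

static const double pair[2] = { 0x1.1111111111111p-3,   /* 2/15. */
                                0x1.6c16c16c16c17p-5 }; /* 2/45. */

static float64x2_t
fma_pair_demo (float64x2_t p4, float64x2_t p5, float64x2_t r2)
{
  /* One register now holds both constants... */
  float64x2_t c = vld1q_f64 (pair);
  /* ...and indexed FMLA picks out each lane, so no per-constant vector
     has to stay live across the whole routine. */
  p4 = vfmaq_laneq_f64 (p4, r2, c, 0); /* p4 += r2 * 2/15. */
  p5 = vfmaq_laneq_f64 (p5, r2, c, 1); /* p5 += r2 * 2/45. */
  return vaddq_f64 (p4, p5);
}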
diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c
index 19cbb7d0f42eb4e2..c0116735e408066d 100644
--- a/sysdeps/aarch64/fpu/erf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erf_advsimd.c
@@ -22,19 +22,21 @@
static const struct data
{
float64x2_t third;
- float64x2_t tenth, two_over_five, two_over_fifteen;
- float64x2_t two_over_nine, two_over_fortyfive;
+ float64x2_t tenth, two_over_five, two_over_nine;
+ double two_over_fifteen, two_over_fortyfive;
float64x2_t max, shift;
+ uint64x2_t max_idx;
#if WANT_SIMD_EXCEPT
float64x2_t tiny_bound, huge_bound, scale_minus_one;
#endif
} data = {
+ .max_idx = V2 (768),
.third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
- .two_over_fifteen = V2 (0x1.1111111111111p-3),
+ .two_over_fifteen = 0x1.1111111111111p-3,
.tenth = V2 (-0x1.999999999999ap-4),
.two_over_five = V2 (-0x1.999999999999ap-2),
.two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
- .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
+ .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
.max = V2 (5.9921875), /* 6 - 1/128. */
.shift = V2 (0x1p45),
#if WANT_SIMD_EXCEPT
@@ -87,8 +89,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t a = vabsq_f64 (x);
/* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
to return expected results. */
- uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
- uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
+ uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
+ uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
#if WANT_SIMD_EXCEPT
/* |x| huge or tiny. */
@@ -115,7 +117,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
segfault. */
uint64x2_t i
= vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
- i = vbslq_u64 (a_le_max, i, v_u64 (768));
+ i = vbslq_u64 (a_le_max, i, dat->max_idx);
struct entry e = lookup (i);
float64x2_t r = vsubq_f64 (z, shift);
@@ -125,14 +127,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t d2 = vmulq_f64 (d, d);
float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t two_over_fifteen_and_fortyfive
+ = vld1q_f64 (&dat->two_over_fifteen);
+
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
float64x2_t p1 = r;
float64x2_t p2
= vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
- float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
+ two_over_fifteen_and_fortyfive, 0);
p4 = vfmsq_f64 (dat->tenth, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
+ two_over_fifteen_and_fortyfive, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
float64x2_t p34 = vfmaq_f64 (p3, d, p4);
diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
index f1b3bfe8304c73b5..2f2f755c46e71b58 100644
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@@ -24,8 +24,8 @@ static const struct data
{
uint64x2_t offset, table_scale;
float64x2_t max, shift;
- float64x2_t p20, p40, p41, p42;
- float64x2_t p51, p52;
+ float64x2_t p20, p40, p41, p51;
+ double p42, p52;
double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
@@ -41,9 +41,9 @@ static const struct data
.p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
.p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
.p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
- .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
+ .p42 = 0x1.1111111111111p-3, /* 2/15. */
.p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
- .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
+ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
/* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
.qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
.qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
@@ -157,9 +157,10 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
float64x2_t p1 = r;
float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
- float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
+ float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
p4 = vfmsq_f64 (dat->p40, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */

glibc-RHEL-118273-18.patch Normal file
File diff suppressed because it is too large.

glibc-RHEL-118273-19.patch Normal file

@@ -0,0 +1,461 @@
commit 13a7ef5999de56add448a24fefb0250236271a06
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Mon Dec 9 15:58:47 2024 +0000
AArch64: Improve codegen in users of ADVSIMD expm1 helper
Add an inline helper for expm1 and rearrange operations so a MOV
is not necessary in the reduction or around the special-case handler.
Reduce memory access by using more indexed MLAs in the polynomial.
Speedup on Neoverse V1 for expm1 (19%), sinh (8.5%), and tanh (7.5%).
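In scalar terms the helper computes the following (sketch only: expm1_sketch and the degree-3 stand-in are ours; the real helper uses the degree-12 polynomial from V_EXPM1_DATA): reduce with i = round(x/ln2) and f = x - i*ln2, approximate p ~ expm1(f), then assemble expm1(x) = 2^i*(p + 1) - 1 as p*t + (t - 1) with t = 2^i - the form that needs no MOV around the final FMA.

#include <math.h>
#include <stdint.h>
#include <string.h>

static double
expm1_sketch (double x) /* assumes |x| below the overflow threshold. */
{
  const double inv_ln2 = 0x1.71547652b82fep0;
  const double ln2_hi = 0x1.62e42fefa39efp-1;
  const double ln2_lo = 0x1.abc9e3b39803fp-56;

  double n = round (x * inv_ln2);
  double f = x - n * ln2_hi - n * ln2_lo; /* f in [-ln2/2, ln2/2]. */

  /* p ~ expm1(f); degree-3 stand-in for the fitted polynomial. */
  double p = f + 0.5 * f * f + (1.0 / 6.0) * f * f * f;

  double t; /* t = 2^i, exact. */
  uint64_t ti = (uint64_t) ((int64_t) n + 1023) << 52;
  memcpy (&t, &ti, sizeof (t));
  return p * t + (t - 1.0); /* 2^i * (p + 1) - 1. */
}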
diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
index 3db3b80c49292947..f2042db8bcc8466a 100644
--- a/sysdeps/aarch64/fpu/expm1_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
@@ -18,31 +18,18 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_expm1_inline.h"
static const struct data
{
- float64x2_t poly[11];
- float64x2_t invln2;
- double ln2[2];
- float64x2_t shift;
- int64x2_t exponent_bias;
+ struct v_expm1_data d;
#if WANT_SIMD_EXCEPT
uint64x2_t thresh, tiny_bound;
#else
float64x2_t oflow_bound;
#endif
} data = {
- /* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
- .invln2 = V2 (0x1.71547652b82fep0),
- .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
- .shift = V2 (0x1.8p52),
- .exponent_bias = V2 (0x3ff0000000000000),
+ .d = V_EXPM1_DATA,
#if WANT_SIMD_EXCEPT
/* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
compare. */
@@ -58,67 +45,36 @@ static const struct data
};
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, uint64x2_t special, const struct data *d)
{
- return v_call_f64 (expm1, x, y, special);
+ return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
+ special);
}
/* Double-precision vector exp(x) - 1 function.
- The maximum error observed error is 2.18 ULP:
- _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
- want 0x1.a8b9ea8d66e2p-2. */
+ The maximum observed error is 2.05 ULP:
+ _ZGVnN2v_expm1(0x1.634902eaff3adp-2) got 0x1.a8b636e2a9388p-2
+ want 0x1.a8b636e2a9386p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
-
#if WANT_SIMD_EXCEPT
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint64x2_t special
= vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
- if (__glibc_unlikely (v_any_u64 (special)))
- x = v_zerofy_f64 (x, special);
#else
/* Large input, NaNs and Infs. */
uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
#endif
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (n);
- float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
- float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
- f = vfmsq_laneq_f64 (f, n, ln2, 1);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t f8 = vmulq_f64 (f4, f4);
- float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
- float64x2_t t = vreinterpretq_f64_s64 (u);
-
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (vreinterpretq_f64_u64 (ix),
- vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t),
- special);
+ return special_case (x, special, d);
/* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+ return expm1_inline (x, &d->d);
}
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
index 3e3b76c502b01e16..7adf771517de2507 100644
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -18,72 +18,31 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_expm1_inline.h"
static const struct data
{
- float64x2_t poly[11], inv_ln2;
- double m_ln2[2];
- float64x2_t shift;
+ struct v_expm1_data d;
uint64x2_t halff;
- int64x2_t onef;
#if WANT_SIMD_EXCEPT
uint64x2_t tiny_bound, thresh;
#else
- uint64x2_t large_bound;
+ float64x2_t large_bound;
#endif
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
-
- .inv_ln2 = V2 (0x1.71547652b82fep0),
- .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
- .shift = V2 (0x1.8p52),
-
+ .d = V_EXPM1_DATA,
.halff = V2 (0x3fe0000000000000),
- .onef = V2 (0x3ff0000000000000),
#if WANT_SIMD_EXCEPT
/* 2^-26, below which sinh(x) rounds to x. */
.tiny_bound = V2 (0x3e50000000000000),
/* asuint(large_bound) - asuint(tiny_bound). */
.thresh = V2 (0x0230000000000000),
#else
-/* 2^9. expm1 helper overflows for large input. */
- .large_bound = V2 (0x4080000000000000),
+ /* 2^9. expm1 helper overflows for large input. */
+ .large_bound = V2 (0x1p+9),
#endif
};
-static inline float64x2_t
-expm1_inline (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
-
- /* Reduce argument:
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where i = round(x / ln2)
- and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
- float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (j);
-
- float64x2_t m_ln2 = vld1q_f64 (d->m_ln2);
- float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0);
- f = vfmaq_laneq_f64 (f, j, m_ln2, 1);
- /* Approximate expm1(f) using polynomial. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t f8 = vmulq_f64 (f4, f4);
- float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
- /* t = 2^i. */
- float64x2_t t = vreinterpretq_f64_u64 (
- vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef)));
- /* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
-}
-
static float64x2_t NOINLINE VPCS_ATTR
special_case (float64x2_t x)
{
@@ -92,23 +51,23 @@ special_case (float64x2_t x)
/* Approximation for vector double-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
- The greatest observed error is 2.57 ULP:
- _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
- want 0x1.ab34e59d678d9p-2. */
+ The greatest observed error is 2.52 ULP:
+ _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
+ want -0x1.ac2f05bb66fc9p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t ax = vabsq_f64 (x);
- uint64x2_t sign
- = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
- float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff));
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ float64x2_t halfsign = vreinterpretq_f64_u64 (
+ vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
#if WANT_SIMD_EXCEPT
uint64x2_t special = vcgeq_u64 (
vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
#else
- uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound);
+ uint64x2_t special = vcageq_f64 (x, d->large_bound);
#endif
/* Fall back to scalar variant for all lanes if any of them are special. */
@@ -118,7 +77,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
- float64x2_t t = expm1_inline (ax);
+ float64x2_t t = expm1_inline (ax, &d->d);
t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
return vmulq_f64 (t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/tanh_advsimd.c b/sysdeps/aarch64/fpu/tanh_advsimd.c
index 1da1dfa5dbe418b6..402ba9d8ad2478a8 100644
--- a/sysdeps/aarch64/fpu/tanh_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanh_advsimd.c
@@ -18,68 +18,30 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_expm1_inline.h"
static const struct data
{
- float64x2_t poly[11];
- float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
- uint64x2_t onef;
+ struct v_expm1_data d;
uint64x2_t thresh, tiny_bound;
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
-
- .inv_ln2 = V2 (0x1.71547652b82fep0),
- .ln2_hi = V2 (-0x1.62e42fefa39efp-1),
- .ln2_lo = V2 (-0x1.abc9e3b39803fp-56),
- .shift = V2 (0x1.8p52),
-
- .onef = V2 (0x3ff0000000000000),
+ .d = V_EXPM1_DATA,
.tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
/* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
.thresh = V2 (0x01f241bf835f9d5f),
};
-static inline float64x2_t
-expm1_inline (float64x2_t x, const struct data *d)
-{
- /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
- the scalar variant of tanh. */
-
- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (j);
- float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi);
- f = vfmaq_f64 (f, j, d->ln2_lo);
-
- /* Approximate expm1(f) using polynomial. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t p = vfmaq_f64 (
- f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly));
-
- /* t = 2 ^ i. */
- float64x2_t t = vreinterpretq_f64_u64 (
- vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef));
- /* expm1(x) = p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t);
-}
-
static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
+ uint64x2_t special)
{
- return v_call_f64 (tanh, x, y, special);
+ return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special);
}
/* Vector approximation for double-precision tanh(x), using a simplified
- version of expm1. The greatest observed error is 2.77 ULP:
- _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
- want -0x1.bd6a21a163624p-3. */
+ version of expm1. The greatest observed error is 2.70 ULP:
+ _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3
+ want -0x1.be5452a6459fbp-3. */
float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -100,10 +62,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
u = vaddq_f64 (u, u);
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- float64x2_t q = expm1_inline (u, d);
- float64x2_t qp2 = vaddq_f64 (q, v_f64 (2));
+ float64x2_t q = expm1_inline (u, &d->d);
+ float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, vdivq_f64 (q, qp2), special);
+ return special_case (x, q, qp2, special);
return vdivq_f64 (q, qp2);
}
diff --git a/sysdeps/aarch64/fpu/v_expm1_inline.h b/sysdeps/aarch64/fpu/v_expm1_inline.h
new file mode 100644
index 0000000000000000..a925183d4e5e4623
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_expm1_inline.h
@@ -0,0 +1,97 @@
+/* Double-precision inline helper for vector (Advanced SIMD) expm1 function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_EXPM1_INLINE_H
+#define AARCH64_FPU_V_EXPM1_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1_data
+{
+ float64x2_t c2, c4, c6, c8;
+ float64x2_t invln2;
+ int64x2_t exponent_bias;
+ double c1, c3, c5, c7, c9, c10;
+ double ln2[2];
+};
+
+/* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */
+#define V_EXPM1_DATA \
+ { \
+ .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \
+ .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \
+ .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \
+ .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \
+ .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \
+ .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \
+ .invln2 = V2 (0x1.71547652b82fep0), \
+ .exponent_bias = V2 (0x3ff0000000000000), \
+ }
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct v_expm1_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2));
+ int64x2_t i = vcvtq_s64_f64 (n);
+ float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
+ f = vfmsq_laneq_f64 (f, n, ln2, 1);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
+ float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
+ float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
+ float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
+ float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
+ p = vfmaq_f64 (p47, f4, p);
+ p = vfmaq_f64 (p03, f4, p);
+
+ p = vfmaq_f64 (f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+ float64x2_t t = vreinterpretq_f64_s64 (u);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 1daedfdd51cfc54b..c1fb88b5e027b322 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -21,7 +21,6 @@
#define AARCH64_FPU_V_EXPM1F_INLINE_H
#include "v_math.h"
-#include "math_config.h"
struct v_expm1f_data
{

glibc-RHEL-118273-2.patch Normal file (862 lines)

@@ -0,0 +1,862 @@
commit bdb5705b7bab618ed4445f4b17d4b1e4fbbf94a7
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:39 2024 +0000
aarch64/fpu: Add vector variants of cosh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
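The vector routines below get cosh from a single exponential. In scalar form
the identity they rely on is the following (a minimal sketch for illustration;
cosh_sketch is a hypothetical name, not part of the patch):

#include <math.h>

/* cosh(x) = (e^x + e^-x) / 2 = e^|x| / 2 + 1 / (2 * e^|x|), so one exp
   evaluation suffices; overflow of exp(|x|) is what special_bound in the
   vector code guards against.  */
static double
cosh_sketch (double x)
{
  double t = exp (fabs (x));
  return 0.5 * t + 0.5 / t;
}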
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 320b6ed43a9a454c..019c3a51880e2306 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -3,6 +3,7 @@ libmvec-supported-funcs = acos \
atan \
atan2 \
cos \
+ cosh \
erf \
exp \
exp10 \
@@ -32,7 +33,8 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
erf_data \
erff_data \
sv_erf_data \
- sv_erff_data
+ sv_erff_data \
+ v_exp_tail_data
endif
sve-cflags = -march=armv8-a+sve
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index d7b1e87191b66439..884b4b57f097635f 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -79,6 +79,11 @@ libmvec {
_ZGVsMxv_tan;
}
GLIBC_2.40 {
+ _ZGVnN2v_cosh;
+ _ZGVnN2v_coshf;
+ _ZGVnN4v_coshf;
+ _ZGVsMxv_cosh;
+ _ZGVsMxv_coshf;
_ZGVnN2v_erf;
_ZGVnN2v_erff;
_ZGVnN4v_erff;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index d8d88de2181569f9..c63b2948d4938b0d 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -21,6 +21,7 @@ libmvec_hidden_proto (V_NAME_F1(acos));
libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(cos));
+libmvec_hidden_proto (V_NAME_F1(cosh));
libmvec_hidden_proto (V_NAME_F1(erf));
libmvec_hidden_proto (V_NAME_F1(exp10));
libmvec_hidden_proto (V_NAME_F1(exp2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 71f53363a071126d..8ca55098706a54c2 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -49,6 +49,10 @@
# define __DECL_SIMD_cos __DECL_SIMD_aarch64
# undef __DECL_SIMD_cosf
# define __DECL_SIMD_cosf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_cosh
+# define __DECL_SIMD_cosh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_coshf
+# define __DECL_SIMD_coshf __DECL_SIMD_aarch64
# undef __DECL_SIMD_erf
# define __DECL_SIMD_erf __DECL_SIMD_aarch64
# undef __DECL_SIMD_erff
@@ -124,6 +128,7 @@ __vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
@@ -141,6 +146,7 @@ __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
@@ -163,6 +169,7 @@ __sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
@@ -180,6 +187,7 @@ __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c
new file mode 100644
index 0000000000000000..ec7b59637e973da9
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cosh_advsimd.c
@@ -0,0 +1,108 @@
+/* Double-precision vector (AdvSIMD) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+
+static const struct data
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2, ln2, shift, thres;
+ uint64x2_t index_mask, special_bound;
+} data = {
+ .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
+ V2 (0x1.5555576a59599p-5), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */
+ /* -ln2/N. */
+ .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64},
+ .shift = V2 (0x1.8p+52),
+ .thres = V2 (704.0),
+
+ .index_mask = V2 (0xff),
+ /* 0x1.6p9, above which exp overflows. */
+ .special_bound = V2 (0x4086000000000000),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from v_exp_tail, with no
+ special-case handling or tail. */
+static inline float64x2_t
+exp_inline (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* n = round(x/(ln2/N)). */
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n*ln2/N. */
+ float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
+ r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
+
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
+ uint64x2_t i = vandq_u64 (u, d->index_mask);
+
+ /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
+ float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r);
+ y = vfmaq_f64 (d->poly[0], y, r);
+ y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r);
+
+ /* s = 2^(n/N). */
+ u = v_lookup_u64 (__v_exp_tail_data, i);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ return vfmaq_f64 (s, y, s);
+}
+
+/* Approximation for vector double-precision cosh(x) using exp_inline.
+ cosh(x) = (exp(x) + exp(-x)) / 2.
+ The greatest observed error is in the scalar fall-back region, so is the
+ same as the scalar routine, 1.93 ULP:
+ _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
+ want 0x1.fdf28623ef923p+1021.
+
+ The greatest observed error in the non-special region is 1.54 ULP:
+ _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
+ want 0x1.f711dcb0c77b1p+7. */
+float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t special
+ = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound);
+
+ /* Up to the point that exp overflows, we can use it to calculate cosh by
+ exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
+ float64x2_t t = exp_inline (ax);
+ float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+ float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+
+ /* Fall back to scalar for any special cases. */
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, vaddq_f64 (half_t, half_over_t), special);
+
+ return vaddq_f64 (half_t, half_over_t);
+}
diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
new file mode 100644
index 0000000000000000..919f34604a452b4a
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -0,0 +1,105 @@
+/* Double-precision vector (SVE) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+static const struct data
+{
+ float64_t poly[3];
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ uint64_t index_mask, special_bound;
+} data = {
+ .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
+ 0x1.5555576a59599p-5, },
+
+ .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
+ /* -ln2/N. */
+ .ln2_hi = -0x1.62e42fefa39efp-9,
+ .ln2_lo = -0x1.abc9e3b39803f3p-64,
+ .shift = 0x1.8p+52,
+ .thres = 704.0,
+
+ .index_mask = 0xff,
+ /* 0x1.6p9, above which exp overflows. */
+ .special_bound = 0x4086000000000000,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
+ special-case handling or tail. */
+static inline svfloat64_t
+exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+{
+ /* Calculate exp(x). */
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
+ r = svmla_x (pg, r, n, d->ln2_lo);
+
+ svuint64_t u = svreinterpret_u64 (z);
+ svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
+ svuint64_t i = svand_x (pg, u, d->index_mask);
+
+ svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
+ y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
+ y = svmla_x (pg, sv_f64 (1.0), r, y);
+ y = svmul_x (pg, r, y);
+
+ /* s = 2^(n/N). */
+ u = svld1_gather_index (pg, __v_exp_tail_data, i);
+ svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
+
+ return svmla_x (pg, s, s, y);
+}
+
+/* Approximation for SVE double-precision cosh(x) using exp_inline.
+ cosh(x) = (exp(x) + exp(-x)) / 2.
+ The greatest observed error is in the scalar fall-back region, so is the
+ same as the scalar routine, 1.93 ULP:
+ _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
+ want 0x1.fd774e958236fp+1021.
+
+ The greatest observed error in the non-special region is 1.54 ULP:
+ _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
+ want 0x1.f5e2bb8d5c991p+8. */
+svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
+
+ /* Up to the point that exp overflows, we can use it to calculate cosh by
+ exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
+ svfloat64_t t = exp_inline (ax, pg, d);
+ svfloat64_t half_t = svmul_x (pg, t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
+
+ /* Fall back to scalar for any special cases. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+
+ return svadd_x (pg, half_t, half_over_t);
+}
diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c
new file mode 100644
index 0000000000000000..c1ab4923b826569b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/coshf_advsimd.c
@@ -0,0 +1,84 @@
+/* Single-precision vector (AdvSIMD) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_expf_inline.h"
+#include "v_math.h"
+
+static const struct data
+{
+ struct v_expf_data expf_consts;
+ uint32x4_t tiny_bound, special_bound;
+} data = {
+ .expf_consts = V_EXPF_DATA,
+ .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .special_bound = V4 (0x42ad496c),
+};
+
+#if !WANT_SIMD_EXCEPT
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (coshf, x, y, special);
+}
+#endif
+
+/* Single-precision vector cosh, using vector expf.
+ Maximum error is 2.38 ULP:
+ _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
+ want 0x1.6a4922p+4. */
+float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered correctly, fall back to the scalar
+ variant for all inputs if any input is a special value or above the bound
+ at which expf overflows. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return v_call_f32 (coshf, x, x, v_u32 (-1));
+
+ uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound);
+ /* If any input is tiny, avoid underflow exception by fixing tiny lanes of
+ input to 0, which will generate no exceptions. */
+ if (__glibc_unlikely (v_any_u32 (tiny)))
+ ax = v_zerofy_f32 (ax, tiny);
+#endif
+
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+ float32x4_t half_t = vmulq_n_f32 (t, 0.5);
+ float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
+
+#if WANT_SIMD_EXCEPT
+ if (__glibc_unlikely (v_any_u32 (tiny)))
+ return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
+#else
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, vaddq_f32 (half_t, half_over_t), special);
+#endif
+
+ return vaddq_f32 (half_t, half_over_t);
+}
+libmvec_hidden_def (V_NAME_F1 (cosh))
+HALF_WIDTH_ALIAS_F1 (cosh)
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
new file mode 100644
index 0000000000000000..e5d8a299c6aa7ceb
--- /dev/null
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -0,0 +1,59 @@
+/* Single-precision vector (SVE) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "sv_expf_inline.h"
+
+static const struct data
+{
+ struct sv_expf_data expf_consts;
+ uint32_t special_bound;
+} data = {
+ .expf_consts = SV_EXPF_DATA,
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .special_bound = 0x42ad496c,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+{
+ return sv_call_f32 (coshf, x, y, pg);
+}
+
+/* Single-precision vector cosh, using vector expf.
+ Maximum error is 1.89 ULP:
+ _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
+ want 0x1.f00adcp+127. */
+svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
+
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
+ svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
+ svfloat32_t half_t = svmul_x (pg, t, 0.5);
+ svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+
+ return svadd_x (pg, half_t, half_over_t);
+}
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
new file mode 100644
index 0000000000000000..23963b5f8ec89ead
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -0,0 +1,75 @@
+/* SVE helper for single-precision routines which depend on exp
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_SV_EXPF_INLINE_H
+#define AARCH64_FPU_SV_EXPF_INLINE_H
+
+#include "sv_math.h"
+
+struct sv_expf_data
+{
+ float poly[5];
+ float inv_ln2, ln2_hi, ln2_lo, shift;
+};
+
+/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
+ compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
+#define SV_EXPF_DATA \
+ { \
+ .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
+ 0x1.0e4020p-7f }, \
+ \
+ .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
+ .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
+ }
+
+#define C(i) sv_f32 (d->poly[i])
+
+static inline svfloat32_t
+expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
+{
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+
+ /* Load some constants in quad-word chunks to minimise memory access. */
+ svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
+
+ /* n = round(x/(ln2/N)). */
+ svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
+ svfloat32_t n = svsub_x (pg, z, d->shift);
+
+ /* r = x - n*ln2/N. */
+ svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
+ r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
+
+ /* scale = 2^(n/N). */
+ svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
+ svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
+ svfloat32_t r2 = svmul_f32_x (pg, r, r);
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+ svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+
+ return svmla_x (pg, scale, scale, poly);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 41fdb92d7ea6e707..b37cb7d5e9c0d96a 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
+VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf)
VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 8e3d64da420348a7..011f07d2c15b148f 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
+SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf)
SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 33ae92878f774ac3..35452991431e238a 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
+VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff)
VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index ac0464f196e7972f..bbc74ede88c9e6c8 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
+SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff)
SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
diff --git a/sysdeps/aarch64/fpu/v_exp_tail_data.c b/sysdeps/aarch64/fpu/v_exp_tail_data.c
new file mode 100644
index 0000000000000000..151e97c21bbc11ae
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_exp_tail_data.c
@@ -0,0 +1,110 @@
+/* Lookup table for high-precision exp(x, tail) function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "vecmath_config.h"
+
+/* 2^(j/N), j=0..N, N=2^8=256. */
+const uint64_t __v_exp_tail_data[] = {
+ 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
+ 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
+ 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
+ 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
+ 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
+ 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
+ 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
+ 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
+ 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
+ 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
+ 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
+ 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
+ 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
+ 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
+ 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
+ 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
+ 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
+ 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
+ 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
+ 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
+ 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
+ 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
+ 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
+ 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
+ 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
+ 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
+ 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
+ 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
+ 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
+ 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
+ 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
+ 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
+ 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
+ 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
+ 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
+ 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
+ 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
+ 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
+ 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
+ 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
+ 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
+ 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
+ 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
+ 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
+ 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
+ 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
+ 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
+ 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
+ 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
+ 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
+ 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
+ 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
+ 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
+ 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
+ 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
+ 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
+ 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
+ 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
+ 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
+ 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
+ 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
+ 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
+ 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
+ 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
+ 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
+ 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
+ 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
+ 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
+ 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
+ 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
+ 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
+ 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
+ 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
+ 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
+ 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
+ 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
+ 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
+ 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
+ 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
+ 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
+ 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
+ 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
+ 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
+ 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
+ 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
+ 0x3feff9d96b2a23d9,
+};
diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
new file mode 100644
index 0000000000000000..a3b0e32f9eb42021
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@@ -0,0 +1,71 @@
+/* Helper for single-precision AdvSIMD routines which depend on exp
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_EXPF_INLINE_H
+#define AARCH64_FPU_V_EXPF_INLINE_H
+
+#include "v_math.h"
+
+struct v_expf_data
+{
+ float32x4_t poly[5];
+ float32x4_t shift, invln2_and_ln2;
+};
+
+/* maxerr: 1.45358 +0.5 ulp. */
+#define V_EXPF_DATA \
+ { \
+ .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
+ V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
+ .shift = V4 (0x1.8p23f), \
+ .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ }
+
+#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
+#define C(i) d->poly[i]
+
+static inline float32x4_t
+v_expf_inline (float32x4_t x, const struct v_expf_data *d)
+{
+ /* Helper routine for calculating exp(x).
+ Copied from v_expf.c, with all special-case handling removed - the
+ calling routine should handle special values if required. */
+
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t n, r, z;
+ z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
+ n = vsubq_f32 (z, d->shift);
+ r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
+ r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+
+ /* Custom order-4 Estrin avoids building high order monomial. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p, q, poly;
+ p = vfmaq_f32 (C (1), C (0), r);
+ q = vfmaq_f32 (C (3), C (2), r);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (C (4), r);
+ poly = vfmaq_f32 (p, q, r2);
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
index 409c0c9bd9b85422..3f0b5f476433ca06 100644
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -59,6 +59,8 @@ extern const struct v_log_data
} table[1 << V_LOG_TABLE_BITS];
} __v_log_data attribute_hidden;
+#define V_EXP_TAIL_TABLE_BITS 8
+extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] attribute_hidden;
#define V_EXP_TABLE_BITS 7
extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] attribute_hidden;
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index f1103a245645476b..48d747ad5793be96 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -701,11 +701,19 @@ double: 2
float: 2
ldouble: 2
+Function: "cosh_advsimd":
+double: 2
+float: 2
+
Function: "cosh_downward":
double: 3
float: 1
ldouble: 3
+Function: "cosh_sve":
+double: 2
+float: 2
+
Function: "cosh_towardzero":
double: 3
float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 6193518fb001cc92..f66da42c3630bf48 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -73,8 +73,13 @@ GLIBC_2.39 _ZGVsMxv_tan F
GLIBC_2.39 _ZGVsMxv_tanf F
GLIBC_2.39 _ZGVsMxvv_atan2 F
GLIBC_2.39 _ZGVsMxvv_atan2f F
+GLIBC_2.40 _ZGVnN2v_cosh F
+GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
+GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
+GLIBC_2.40 _ZGVsMxv_cosh F
+GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F
GLIBC_2.40 _ZGVsMxv_erff F

glibc-RHEL-118273-20.patch Normal file (359 lines)

@@ -0,0 +1,359 @@
commit ca0c0d0f26fbf75b9cacc65122b457e8fdec40b8
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Mon Dec 9 15:55:39 2024 +0000
AArch64: Improve codegen in users of ADVSIMD log1p helper
Add inline helper for log1p and rearrange operations so MOV
is not necessary in reduction or around the special-case handler.
Reduce memory access by using more indexed MLAs in polynomial.
Speedup on Neoverse V1 for log1p (3.5%), acosh (7.5%) and atanh (10%).
Conflicts:
sysdeps/aarch64/fpu/log1p_advsimd.c
(Fixup context to apply without out-of-scope dependency 751a5502)
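For reference, the identity that lets atanh reuse the log1p helper, in scalar
form (an illustrative sketch; atanh_sketch is a hypothetical name, not part of
the patch):

#include <math.h>

/* atanh(x) = 0.5 * log ((1 + x) / (1 - x))
            = 0.5 * log1p (2x / (1 - x)) for |x| < 1.
   The vector routine computes this on |x| and restores the sign via the
   halfsign factor.  */
static double
atanh_sketch (double x)
{
  double ax = fabs (x);
  return copysign (0.5 * log1p (2.0 * ax / (1.0 - ax)), x);
}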
diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c
index c88283cf1191f4eb..a98f4a2e4d8cbf42 100644
--- a/sysdeps/aarch64/fpu/acosh_advsimd.c
+++ b/sysdeps/aarch64/fpu/acosh_advsimd.c
@@ -54,9 +54,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
#endif
- float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
- float64x2_t y;
- y = vaddq_f64 (x, v_f64 (1));
+ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
+ float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
y = vmulq_f64 (y, xm1);
y = vsqrtq_f64 (y);
y = vaddq_f64 (xm1, y);
diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c
index 3c3d0bd6ad41396d..eb9769aeac29cf15 100644
--- a/sysdeps/aarch64/fpu/atanh_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanh_advsimd.c
@@ -23,15 +23,19 @@
const static struct data
{
struct v_log1p_data log1p_consts;
- uint64x2_t one, half;
+ uint64x2_t one;
+ uint64x2_t sign_mask;
} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
.one = V2 (0x3ff0000000000000),
- .half = V2 (0x3fe0000000000000) };
+ .sign_mask = V2 (0x8000000000000000) };
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
+ uint64x2_t special, const struct data *d)
{
- return v_call_f64 (atanh, x, y, special);
+ y = log1p_inline (y, &d->log1p_consts);
+ return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
+ vmulq_f64 (halfsign, y), special);
}
/* Approximation for vector double-precision atanh(x) using modified log1p.
@@ -43,11 +47,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
+ float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
float64x2_t ax = vabsq_f64 (x);
uint64x2_t ia = vreinterpretq_u64_f64 (ax);
- uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
uint64x2_t special = vcgeq_u64 (ia, d->one);
- float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
#if WANT_SIMD_EXCEPT
ax = v_zerofy_f64 (ax, special);
@@ -55,10 +58,15 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
float64x2_t y;
y = vaddq_f64 (ax, ax);
- y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
- y = log1p_inline (y, &d->log1p_consts);
+ y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, vmulq_f64 (y, halfsign), special);
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special, d);
+#else
+ return special_case (ax, halfsign, y, special, d);
+#endif
+
+ y = log1p_inline (y, &d->log1p_consts);
return vmulq_f64 (y, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c
index ffc418fc9c24be28..9d18578ce6497787 100644
--- a/sysdeps/aarch64/fpu/log1p_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1p_advsimd.c
@@ -17,43 +17,26 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
const static struct data
{
- float64x2_t poly[19], ln2[2];
- uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one;
- int64x2_t one_top;
-} data = {
- /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2),
- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3),
- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3),
- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4),
- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4),
- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4),
- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4),
- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5),
- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4),
- V2 (-0x1.cfa7385bdb37ep-6) },
- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) },
- /* top32(asuint64(sqrt(2)/2)) << 32. */
- .hf_rt2_top = V2 (0x3fe6a09e00000000),
- /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
- .one_m_hf_rt2_top = V2 (0x00095f6200000000),
- .umask = V2 (0x000fffff00000000),
- .one_top = V2 (0x3ff),
- .inf = V2 (0x7ff0000000000000),
- .minus_one = V2 (0xbff0000000000000)
-};
+ struct v_log1p_data d;
+ uint64x2_t inf, minus_one;
+} data = { .d = V_LOG1P_CONSTANTS_TABLE,
+ .inf = V2 (0x7ff0000000000000),
+ .minus_one = V2 (0xbff0000000000000) };
#define BottomMask v_u64 (0xffffffff)
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, uint64x2_t cmp, const struct data *d)
{
- return v_call_f64 (log1p, x, y, special);
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
+ return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
}
/* Vector log1p approximation using polynomial on reduced interval. Routine is
@@ -66,64 +49,12 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
const struct data *d = ptr_barrier (&data);
uint64x2_t ix = vreinterpretq_u64_f64 (x);
uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
- uint64x2_t special = vcgeq_u64 (ia, d->inf);
-#if WANT_SIMD_EXCEPT
- special = vorrq_u64 (special,
- vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1))));
- if (__glibc_unlikely (v_any_u64 (special)))
- x = v_zerofy_f64 (x, special);
-#else
- special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1)));
-#endif
+ uint64x2_t special_cases
+ = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one));
- /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
- is in [sqrt(2)/2, sqrt(2)]):
- log1p(x) = k*log(2) + log1p(f).
+ if (__glibc_unlikely (v_any_u64 (special_cases)))
+ return special_case (x, special_cases, d);
- f may not be representable exactly, so we need a correction term:
- let m = round(1 + x), c = (1 + x) - m.
- c << m: at very small x, log1p(x) ~ x, hence:
- log(1+x) - log(m) ~ c/m.
-
- We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
-
- /* Obtain correctly scaled k by manipulation in the exponent.
- The scalar algorithm casts down to 32-bit at this point to calculate k and
- u_red. We stay in double-width to obtain f and k, using the same constants
- as the scalar algorithm but shifted left by 32. */
- float64x2_t m = vaddq_f64 (x, v_f64 (1));
- uint64x2_t mi = vreinterpretq_u64_f64 (m);
- uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
-
- int64x2_t ki
- = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
- float64x2_t k = vcvtq_f64_s64 (ki);
-
- /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
- uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
- uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
-
- /* Correction term c/m. */
- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
-
- /* Approximate log1p(x) on the reduced input using a polynomial. Because
- log1p(0)=0 we choose an approximation of the form:
- x + C0*x^2 + C1*x^3 + C2x^4 + ...
- Hence approximation has the form f + f^2 * P(f)
- where P(x) = C0 + C1*x + C2x^2 + ...
- Assembling this all correctly is dealt with at the final step. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
-
- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
- float64x2_t y = vaddq_f64 (ylo, yhi);
-
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p),
- special);
-
- return vfmaq_f64 (y, f2, p);
+ return log1p_inline (x, &d->d);
}
diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h
index 242e43b6eecc0b6e..834ff65adf34ed4a 100644
--- a/sysdeps/aarch64/fpu/v_log1p_inline.h
+++ b/sysdeps/aarch64/fpu/v_log1p_inline.h
@@ -21,29 +21,30 @@
#define AARCH64_FPU_V_LOG1P_INLINE_H
#include "v_math.h"
-#include "poly_advsimd_f64.h"
struct v_log1p_data
{
- float64x2_t poly[19], ln2[2];
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
int64x2_t one_top;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
+ double ln2[2];
};
/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
#define V_LOG1P_CONSTANTS_TABLE \
{ \
- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \
- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \
- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \
- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \
- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \
- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \
- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \
- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \
- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \
- V2 (-0x1.cfa7385bdb37ep-6) }, \
- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \
+ .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \
+ .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \
+ .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \
+ .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \
+ .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \
+ .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \
+ .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \
+ .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \
+ .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \
+ .c18 = -0x1.cfa7385bdb37ep-6, \
+ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \
.hf_rt2_top = V2 (0x3fe6a09e00000000), \
.one_m_hf_rt2_top = V2 (0x00095f6200000000), \
.umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
@@ -51,19 +52,45 @@ struct v_log1p_data
#define BottomMask v_u64 (0xffffffff)
+static inline float64x2_t
+eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1718 = vld1q_f64 (&d->c17);
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1);
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1);
+ p = vfmaq_f64 (p1415, m2, p);
+ p = vfmaq_f64 (p1213, m2, p);
+ p = vfmaq_f64 (p1011, m2, p);
+ p = vfmaq_f64 (p89, m2, p);
+ p = vfmaq_f64 (p67, m2, p);
+ p = vfmaq_f64 (p45, m2, p);
+ p = vfmaq_f64 (p23, m2, p);
+ return vfmaq_f64 (p01, m2, p);
+}
+
static inline float64x2_t
log1p_inline (float64x2_t x, const struct v_log1p_data *d)
{
- /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
- modifications:
+ /* Helper for calculating log(x + 1):
- No special-case handling - this should be dealt with by the caller.
- - Pairwise Horner polynomial evaluation for improved accuracy.
- Optionally simulate the shortcut for k=0, used in the scalar routine,
- using v_sel, for improved accuracy when the argument to log1p is close to
- 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
- the source of the caller before including this file.
- See v_log1pf_2u1.c for details of the algorithm. */
- float64x2_t m = vaddq_f64 (x, v_f64 (1));
+ using v_sel, for improved accuracy when the argument to log1p is close
+ to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1
+ in the source of the caller before including this file. */
+ float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
uint64x2_t mi = vreinterpretq_u64_f64 (m);
uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
@@ -74,14 +101,14 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d)
/* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
/* Correction term c/m. */
- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
#ifndef WANT_V_LOG1P_K0_SHORTCUT
-#error \
- "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+# error \
+ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
#elif WANT_V_LOG1P_K0_SHORTCUT
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
that the approximation is solely the polynomial. */
@@ -92,11 +119,12 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d)
/* Approximate log1p(f) on the reduced input using a polynomial. */
float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
+ float64x2_t p = eval_poly (f, f2, d);
/* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+ float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1);
+ float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0);
return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
}
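
The layout change above — even coefficients kept as splatted vectors, odd coefficients stored as adjacent scalar pairs — exists so that a single vld1q_f64 can feed two vfmaq_laneq_f64 instructions. A minimal standalone sketch of that scheme (made-up degree-3 coefficients, not the glibc polynomial; AArch64 only):

#include <arm_neon.h>
#include <stdio.h>

/* Odd coefficients kept adjacent so one LDR loads both (made-up values). */
static const double odd_coeffs[2] = { 0.5, 0.25 }; /* c1, c3 */

static float64x2_t
poly_eval (float64x2_t x)
{
  float64x2_t c0 = vdupq_n_f64 (1.0);       /* made-up even coefficients */
  float64x2_t c2 = vdupq_n_f64 (0.125);
  float64x2_t c13 = vld1q_f64 (odd_coeffs); /* one load feeds two FMAs */
  float64x2_t x2 = vmulq_f64 (x, x);
  /* Pairwise Horner: p(x) = (c0 + c1*x) + x^2*(c2 + c3*x). */
  float64x2_t p01 = vfmaq_laneq_f64 (c0, x, c13, 0);
  float64x2_t p23 = vfmaq_laneq_f64 (c2, x, c13, 1);
  return vfmaq_f64 (p01, x2, p23);
}

int
main (void)
{
  float64x2_t y = poly_eval (vdupq_n_f64 (0.5));
  printf ("%a\n", vgetq_lane_f64 (y, 0));
  return 0;
}

Because the lane index is an immediate, each pair of odd coefficients costs one load and no extra DUPs, which is the memory-access saving these codegen patches target.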

216
glibc-RHEL-118273-21.patch Normal file

@@ -0,0 +1,216 @@
commit 569cfaaf4984ae70b23c61ee28a609b5aef93fea
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Mon Dec 9 15:53:04 2024 +0000
AArch64: Improve codegen in AdvSIMD pow
Remove spurious ADRP. Improve memory access by shuffling constants and
using more indexed MLAs.
A few more optimisations with no impact on accuracy:
- force FMA contraction
- switch from shift-aided rint to the rint instruction
Between 1% and 5% throughput improvement on Neoverse V1, depending on the
benchmark.
diff --git a/sysdeps/aarch64/fpu/pow_advsimd.c b/sysdeps/aarch64/fpu/pow_advsimd.c
index 3c91e3e183599e3e..81e134ac2f0bd2f5 100644
--- a/sysdeps/aarch64/fpu/pow_advsimd.c
+++ b/sysdeps/aarch64/fpu/pow_advsimd.c
@@ -22,9 +22,6 @@
/* Defines parameters of the approximation and scalar fallback. */
#include "finite_pow.h"
-#define VecSmallExp v_u64 (SmallExp)
-#define VecThresExp v_u64 (ThresExp)
-
#define VecSmallPowX v_u64 (SmallPowX)
#define VecThresPowX v_u64 (ThresPowX)
#define VecSmallPowY v_u64 (SmallPowY)
@@ -32,36 +29,48 @@
static const struct data
{
- float64x2_t log_poly[6];
- float64x2_t exp_poly[3];
- float64x2_t ln2_hi, ln2_lo;
- float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n, small_powx;
uint64x2_t inf;
+ float64x2_t small_powx;
+ uint64x2_t offset, mask;
+ uint64x2_t mask_sub_0, mask_sub_1;
+ float64x2_t log_c0, log_c2, log_c4, log_c5;
+ double log_c1, log_c3;
+ double ln2_lo, ln2_hi;
+ uint64x2_t small_exp, thres_exp;
+ double ln2_lo_n, ln2_hi_n;
+ double inv_ln2_n, exp_c2;
+ float64x2_t exp_c0, exp_c1;
} data = {
+ /* Power threshold. */
+ .inf = V2 (0x7ff0000000000000),
+ .small_powx = V2 (0x1p-126),
+ .offset = V2 (Off),
+ .mask = V2 (0xfffULL << 52),
+ .mask_sub_0 = V2 (1ULL << 52),
+ .mask_sub_1 = V2 (52ULL << 52),
/* Coefficients copied from v_pow_log_data.c
relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
Coefficients are scaled to match the scaling during evaluation. */
- .log_poly
- = { V2 (0x1.555555555556p-2 * -2), V2 (-0x1.0000000000006p-2 * -2),
- V2 (0x1.999999959554ep-3 * 4), V2 (-0x1.555555529a47ap-3 * 4),
- V2 (0x1.2495b9b4845e9p-3 * -8), V2 (-0x1.0002b8b263fc3p-3 * -8) },
- .ln2_hi = V2 (0x1.62e42fefa3800p-1),
- .ln2_lo = V2 (0x1.ef35793c76730p-45),
+ .log_c0 = V2 (0x1.555555555556p-2 * -2),
+ .log_c1 = -0x1.0000000000006p-2 * -2,
+ .log_c2 = V2 (0x1.999999959554ep-3 * 4),
+ .log_c3 = -0x1.555555529a47ap-3 * 4,
+ .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8),
+ .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8),
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
/* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
(0.550 without fma) if |x| < ln2/512. */
- .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3),
- V2 (0x1.5555576a5adcep-5) },
- .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */
- .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */
- .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */
- .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45),
- .small_powx = V2 (0x1p-126),
- .inf = V2 (0x7ff0000000000000)
+ .exp_c0 = V2 (0x1.fffffffffffd4p-2),
+ .exp_c1 = V2 (0x1.5555571d6ef9p-3),
+ .exp_c2 = 0x1.5555576a5adcep-5,
+ .small_exp = V2 (0x3c90000000000000),
+ .thres_exp = V2 (0x03f0000000000000),
+ .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */
+ .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */
+ .ln2_lo_n = -0x1.c610ca86c3899p-45,
};
-#define A(i) data.log_poly[i]
-#define C(i) data.exp_poly[i]
-
/* This version implements an algorithm close to scalar pow but
- does not implement the trick in the exp's specialcase subroutine to avoid
double-rounding,
@@ -91,10 +100,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off));
- int64x2_t k
- = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52)));
+ uint64x2_t tmp = vsubq_u64 (ix, d->offset);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
float64x2_t kd = vcvtq_f64_s64 (k);
/* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
@@ -105,9 +113,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
/* k*Ln2 + log(c) + r. */
- float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi);
+ float64x2_t ln2 = vld1q_f64 (&d->ln2_lo);
+ float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1);
float64x2_t t2 = vaddq_f64 (t1, r);
- float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo);
+ float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0);
float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
/* Evaluation is optimized assuming superscalar pipelined execution. */
float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r);
@@ -118,9 +127,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
/* p = log1p(r) - r - A[0]*r*r. */
- float64x2_t a56 = vfmaq_f64 (A (4), r, A (5));
- float64x2_t a34 = vfmaq_f64 (A (2), r, A (3));
- float64x2_t a12 = vfmaq_f64 (A (0), r, A (1));
+ float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1);
+ float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5);
+ float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1);
+ float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0);
float64x2_t p = vfmaq_f64 (a34, ar2, a56);
p = vfmaq_f64 (a12, ar2, p);
p = vmulq_f64 (ar3, p);
@@ -140,28 +150,28 @@ exp_special_case (float64x2_t x, float64x2_t xtail)
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
static inline float64x2_t
-v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
+v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d)
{
/* Fallback to scalar exp_inline for all lanes if any lane
contains value of x s.t. |x| <= 2^-54 or >= 512. */
- uint64x2_t abstop
- = vshrq_n_u64 (vandq_u64 (vreinterpretq_u64_f64 (x), d->inf), 52);
- uint64x2_t uoflowx
- = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp);
+ uint64x2_t uoflowx = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp),
+ d->thres_exp);
if (__glibc_unlikely (v_any_u64 (uoflowx)))
- return exp_special_case (x, xtail);
+ return exp_special_case (x, vnegq_f64 (neg_xtail));
/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
/* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
- float64x2_t z = vmulq_f64 (d->inv_ln2_n, x);
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
- float64x2_t kd = vaddq_f64 (z, d->shift);
- uint64x2_t ki = vreinterpretq_u64_f64 (kd);
- kd = vsubq_f64 (kd, d->shift);
- float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n);
- r = vfmsq_f64 (r, kd, d->ln2_lo_n);
+ float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n);
+ float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0);
+ float64x2_t kd = vrndnq_f64 (z);
+ uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z));
+ float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n);
+ float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1);
+ r = vfmsq_laneq_f64 (r, kd, ln2_n, 0);
/* The code assumes 2^-200 < |xtail| < 2^-8/N. */
- r = vaddq_f64 (r, xtail);
+ r = vsubq_f64 (r, neg_xtail);
/* 2^(k/N) ~= scale. */
uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
@@ -170,8 +180,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
sbits = vaddq_u64 (sbits, top);
/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t tmp = vfmaq_f64 (C (1), r, C (2));
- tmp = vfmaq_f64 (C (0), r, tmp);
+ float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1);
+ tmp = vfmaq_f64 (d->exp_c0, r, tmp);
tmp = vfmaq_f64 (r, r2, tmp);
float64x2_t scale = vreinterpretq_f64_u64 (sbits);
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
@@ -230,8 +240,8 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
{
/* Normalize subnormal x so exponent becomes negative. */
uint64x2_t vix_norm = vreinterpretq_u64_f64 (
- vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (v_u64 (1ULL << 52)))));
- vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52));
+ vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0))));
+ vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1);
vix = vbslq_u64 (sub_x, vix_norm, vix);
}
}
@@ -242,8 +252,7 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
/* Vector Exp(y_loghi, y_loglo). */
float64x2_t vehi = vmulq_f64 (y, vhi);
- float64x2_t velo = vmulq_f64 (y, vlo);
float64x2_t vemi = vfmsq_f64 (vehi, y, vhi);
- velo = vsubq_f64 (velo, vemi);
- return v_exp_inline (vehi, velo, d);
+ float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo);
+ return v_exp_inline (vehi, neg_velo, d);
}
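
The "shift-aided rint" this patch drops, and the rint/convert instructions that replace it, can be contrasted in scalar code. A sketch, not the glibc code, assuming the default round-to-nearest mode and |z| < 2^51:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  double z = 123.6;

  /* Old idiom: shift-aided rint.  Adding 0x1.8p52 pushes the integer
     part of z into the low mantissa bits (valid for |z| < 2^51), so the
     rounded value and its integer bits come from one addition.  */
  double shift = 0x1.8p52;
  double kd = (z + shift) - shift;
  double biased = z + shift;
  uint64_t ki;
  memcpy (&ki, &biased, sizeof ki);   /* integer k in the low bits.  */

  /* New idiom: dedicated instructions.  The vector code uses vrndnq_f64
     and vcvtaq_s64_f64; rint/llrint are the closest scalar analogues in
     the default rounding mode.  */
  double kd2 = rint (z);
  long long ki2 = llrint (z);

  printf ("%g %g %llu %lld\n", kd, kd2,
          (unsigned long long) (ki & 0x7ff), ki2);
  return 0;
}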

501
glibc-RHEL-118273-22.patch Normal file

@@ -0,0 +1,501 @@
commit cff9648d0b50d19cdaf685f6767add040d4e1a8e
Author: Joana Cruz <Joana.Cruz@arm.com>
Date: Tue Dec 17 14:50:33 2024 +0000
AArch64: Improve codegen of AdvSIMD expf family
Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
Also use intrinsics instead of native operations.
expf: 3% improvement in the throughput microbenchmark on Neoverse V1;
exp2f: 5%, exp10f: 13%, coshf: 14%.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c
index c1ab4923b826569b..cd5c86652129ea9c 100644
--- a/sysdeps/aarch64/fpu/coshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/coshf_advsimd.c
@@ -23,19 +23,27 @@
static const struct data
{
struct v_expf_data expf_consts;
- uint32x4_t tiny_bound, special_bound;
+ uint32x4_t tiny_bound;
+ float32x4_t bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special_bound;
+#endif
} data = {
.expf_consts = V_EXPF_DATA,
.tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .bound = V4 (0x1.5a92d8p+6),
+#if WANT_SIMD_EXCEPT
.special_bound = V4 (0x42ad496c),
+#endif
};
#if !WANT_SIMD_EXCEPT
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
+ uint32x4_t special)
{
- return v_call_f32 (coshf, x, y, special);
+ return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
}
#endif
@@ -47,14 +55,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
-
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered correctly, fall back to the scalar
variant for all inputs if any input is a special value or above the bound
at which expf overflows. */
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
if (__glibc_unlikely (v_any_u32 (special)))
return v_call_f32 (coshf, x, x, v_u32 (-1));
@@ -63,10 +70,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
input to 0, which will generate no exceptions. */
if (__glibc_unlikely (v_any_u32 (tiny)))
ax = v_zerofy_f32 (ax, tiny);
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+#else
+ uint32x4_t special = vcageq_f32 (x, d->bound);
+ float32x4_t t = v_expf_inline (x, &d->expf_consts);
#endif
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
- float32x4_t t = v_expf_inline (ax, &d->expf_consts);
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
@@ -75,7 +85,7 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
#else
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vaddq_f32 (half_t, half_over_t), special);
+ return special_case (x, half_t, half_over_t, special);
#endif
return vaddq_f32 (half_t, half_over_t);
diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
index cf53e73290fcedb6..55d9cd83f2968ab9 100644
--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
@@ -18,16 +18,15 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
#define ScaleBound 192.0f
static const struct data
{
- float32x4_t poly[5];
- float log10_2_and_inv[4];
- float32x4_t shift;
-
+ float32x4_t c0, c1, c3;
+ float log10_2_high, log10_2_low, c2, c4;
+ float32x4_t inv_log10_2, special_bound;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh;
#endif
@@ -37,19 +36,24 @@ static const struct data
rel error: 0x1.89dafa3p-24
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
maxerr: 1.85943 +0.5 ulp. */
- .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
- V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
- .shift = V4 (0x1.8p23f),
-
- /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
- .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
+ .c0 = V4 (0x1.26bb16p+1f),
+ .c1 = V4 (0x1.5350d2p+1f),
+ .c2 = 0x1.04744ap+1f,
+ .c3 = V4 (0x1.2d8176p+0f),
+ .c4 = 0x1.12b41ap-1f,
+ .inv_log10_2 = V4 (0x1.a934fp+1),
+ .log10_2_high = 0x1.344136p-2,
+ .log10_2_low = 0x1.ec10cp-27,
+ /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+ .special_bound = V4 (126.0f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.scale_thresh = V4 (ScaleBound)
#endif
};
-#define ExponentBias v_u32 (0x3f800000)
-
#if WANT_SIMD_EXCEPT
# define SpecialBound 38.0f /* rint(log10(2^127)). */
@@ -67,17 +71,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
+# define SpecialBound 126.0f
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@@ -112,23 +114,23 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
- float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv);
- float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0);
- float32x4_t n = vsubq_f32 (z, d->shift);
- float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1);
- r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2);
- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
+ r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t poly
- = vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
- v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);
+ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
+ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
+ float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
+ float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c
index 69e0b193a1a91249..a4220da63c624490 100644
--- a/sysdeps/aarch64/fpu/exp2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c
@@ -21,24 +21,28 @@
static const struct data
{
- float32x4_t poly[5];
- uint32x4_t exponent_bias;
+ float32x4_t c1, c3;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
- float32x4_t special_bound, scale_thresh;
+ float32x4_t scale_thresh, special_bound;
#endif
+ float c0, c2, c4, zero;
} data = {
/* maxerr: 1.962 ulp. */
- .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
- V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
+ .c0 = 0x1.59977ap-10f,
+ .c1 = V4 (0x1.3ce9e4p-7f),
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = V4 (0x1.ebf9bcp-3f),
+ .c4 = 0x1.62e422p-1f,
.exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
#endif
};
-#define C(i) d->poly[i]
-
#if WANT_SIMD_EXCEPT
# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
@@ -55,16 +59,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@@ -80,13 +81,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly;
- uint32x4_t cmp, e;
#if WANT_SIMD_EXCEPT
/* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
- cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
float32x4_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
special_case to fix special lanes later. This is only necessary if fenv
@@ -95,23 +94,24 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
x = vbslq_f32 (cmp, v_f32 (1), x);
#endif
- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
- n = vrndaq_f32 (x);
- r = vsubq_f32 (x, n);
- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = vsubq_f32 (x, n);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t c024 = vld1q_f32 (&d->c0);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_laneq_f32 (r, c024, 2);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
index 5c9cb726205ece6e..70f137e2e5b46207 100644
--- a/sysdeps/aarch64/fpu/expf_advsimd.c
+++ b/sysdeps/aarch64/fpu/expf_advsimd.c
@@ -21,20 +21,25 @@
static const struct data
{
- float32x4_t poly[5];
- float32x4_t inv_ln2, ln2_hi, ln2_lo;
- uint32x4_t exponent_bias;
+ float32x4_t c1, c3, c4, inv_ln2;
+ float ln2_hi, ln2_lo, c0, c2;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
#endif
} data = {
/* maxerr: 1.45358 +0.5 ulp. */
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
+ .c0 = 0x1.0e4020p-7f,
+ .c1 = V4 (0x1.573e2ep-5f),
+ .c2 = 0x1.555e66p-3f,
+ .c3 = V4 (0x1.fffdb6p-2f),
+ .c4 = V4 (0x1.ffffecp-1f),
.inv_ln2 = V4 (0x1.715476p+0f),
- .ln2_hi = V4 (0x1.62e4p-1f),
- .ln2_lo = V4 (0x1.7f7d1cp-20f),
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
.exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
@@ -59,19 +64,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
+ // (s2 + p*s2)*s1 = s2(p+1)s1
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
/* Similar to r1 but avoids double rounding in the subnormal range. */
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
@@ -84,12 +87,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly;
- uint32x4_t cmp, e;
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
#if WANT_SIMD_EXCEPT
/* asuint(x) - TinyBound >= BigBound - TinyBound. */
- cmp = vcgeq_u32 (
+ uint32x4_t cmp = vcgeq_u32 (
vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
TinyBound),
SpecialBound);
@@ -103,22 +105,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
- r = vfmsq_f32 (x, n, d->ln2_hi);
- r = vfmsq_f32 (r, n, d->ln2_lo);
- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
index 08b06e0a6b34b4f4..eacd2af24161fe3a 100644
--- a/sysdeps/aarch64/fpu/v_expf_inline.h
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@@ -24,50 +24,45 @@
struct v_expf_data
{
- float32x4_t poly[5];
- float32x4_t shift;
- float invln2_and_ln2[4];
+ float ln2_hi, ln2_lo, c0, c2;
+ float32x4_t inv_ln2, c1, c3, c4;
+ /* asuint(1.0f). */
+ uint32x4_t exponent_bias;
};
/* maxerr: 1.45358 +0.5 ulp. */
#define V_EXPF_DATA \
{ \
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
- .shift = V4 (0x1.8p23f), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \
+ .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \
}
-#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
-#define C(i) d->poly[i]
-
static inline float32x4_t
v_expf_inline (float32x4_t x, const struct v_expf_data *d)
{
- /* Helper routine for calculating exp(x).
+ /* Helper routine for calculating exp(ax).
Copied from v_expf.c, with all special-case handling removed - the
calling routine should handle special values if required. */
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- float32x4_t n, r, z;
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0);
- n = vsubq_f32 (z, d->shift);
- r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1);
- r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2);
- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+ /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
/* Custom order-4 Estrin avoids building high order monomial. */
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t p, q, poly;
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
return vfmaq_f32 (scale, poly, scale);
}
-
#endif
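
The expf path above reduces x = n*ln2 + r with r in [-ln2/2, ln2/2], then computes exp(x) = 2^n * (1 + poly(r)), applying 2^n with integer arithmetic on the exponent field (the vshlq_n_u32/exponent_bias step); exp2f and exp10f do the analogous reduction in their own bases. A scalar sketch of the expf case (the polynomial is a Taylor stand-in, not the real minimax coefficients; valid for |n| < 126):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float
expf_sketch (float x)
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f, ln2_lo = 0x1.7f7d1cp-20f;

  float n = rintf (x * inv_ln2);
  float r = x - n * ln2_hi;           /* two-step subtraction keeps r accurate */
  r = r - n * ln2_lo;

  /* Stand-in for the degree-4 minimax poly: exp(r) - 1 ~ r + r^2/2 + r^3/6. */
  float poly = r + 0.5f * r * r + (1.0f / 6.0f) * r * r * r;

  /* scale = 2^n via exponent-field arithmetic, valid for |n| < 126. */
  uint32_t e = ((uint32_t) (int32_t) n) << 23;
  uint32_t sbits = e + 0x3f800000u;   /* exponent_bias = asuint(1.0f) */
  float scale;
  memcpy (&scale, &sbits, sizeof scale);

  return scale + poly * scale;        /* 2^n * (1 + poly(r)) */
}

int
main (void)
{
  printf ("%a vs %a\n", expf_sketch (1.0f), expf (1.0f));
  return 0;
}

When n falls outside the exponent-arithmetic range, the routines above split 2^n into s1*s2 in special_case instead; the sketch omits that path.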

362
glibc-RHEL-118273-23.patch Normal file

@@ -0,0 +1,362 @@
commit 91c1fadba338752bf514cd4cca057b27b1b10eed
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Fri Jan 3 19:09:05 2025 +0000
AArch64: Improve codegen for SVE log1pf users
Reduce memory access by using lanewise MLA and reduce the number of MOVPRFXs.
Move the log1pf implementation to an inline helper function.
Speedup on Neoverse V1 for log1pf (10%), acoshf (-1%), atanhf (2%), asinhf (2%).
Conflicts:
sysdeps/aarch64/fpu/log1pf_sve.c
(Fixup context to apply without out-of-scope dependency 751a5502)
diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c
index 2110894e629500be..491365e24d692f0f 100644
--- a/sysdeps/aarch64/fpu/acoshf_sve.c
+++ b/sysdeps/aarch64/fpu/acoshf_sve.c
@@ -17,23 +17,26 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+#include "sv_math.h"
+#include "sv_log1pf_inline.h"
+
#define One 0x3f800000
#define Thres 0x20000000 /* asuint(0x1p64) - One. */
-#include "sv_log1pf_inline.h"
-
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special)
{
+ svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f);
+ svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ());
return sv_call_f32 (acoshf, x, y, special);
}
/* Single-precision SVE acosh(x) routine. Implements the same algorithm as
vector acoshf and log1p.
- Maximum error is 2.78 ULPs:
- SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4
- want 0x1.f45b3cp-4. */
+ Maximum error is 2.47 ULPs:
+ SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4
+ want 0x1.e435a2p-4. */
svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
{
svuint32_t ix = svreinterpret_u32 (x);
@@ -41,9 +44,9 @@ svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
- svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
+ svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u));
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
- return y;
+ return special_case (xm1, tmp, special);
+ return sv_log1pf_inline (tmp, pg);
}
diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c
index d85c3a685c0b83ff..b7f253bf32fb9478 100644
--- a/sysdeps/aarch64/fpu/asinhf_sve.c
+++ b/sysdeps/aarch64/fpu/asinhf_sve.c
@@ -20,20 +20,23 @@
#include "sv_math.h"
#include "sv_log1pf_inline.h"
-#define BigBound (0x5f800000) /* asuint(0x1p64). */
+#define BigBound 0x5f800000 /* asuint(0x1p64). */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special)
{
+ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+ y = svreinterpret_f32 (
+ svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y)));
return sv_call_f32 (asinhf, x, y, special);
}
/* Single-precision SVE asinh(x) routine. Implements the same algorithm as
vector asinhf and log1p.
- Maximum error is 2.48 ULPs:
- SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
- want 0x1.ffbbb8p-4. */
+ Maximum error is 1.92 ULPs:
+ SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2
+ want -0x1.fd0bc8p-2. */
svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
{
svfloat32_t ax = svabs_x (pg, x);
@@ -49,8 +52,6 @@ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
= sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (
- x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))),
- special);
+ return special_case (iax, sign, y, special);
return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y)));
}
diff --git a/sysdeps/aarch64/fpu/atanhf_sve.c b/sysdeps/aarch64/fpu/atanhf_sve.c
index dae83041ef7157f0..2d3005bbc88393ec 100644
--- a/sysdeps/aarch64/fpu/atanhf_sve.c
+++ b/sysdeps/aarch64/fpu/atanhf_sve.c
@@ -17,21 +17,25 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+#include "sv_math.h"
#include "sv_log1pf_inline.h"
#define One (0x3f800000)
#define Half (0x3f000000)
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign,
+ svfloat32_t y, svbool_t special)
{
+ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+ y = svmul_x (svptrue_b32 (), halfsign, y);
return sv_call_f32 (atanhf, x, y, special);
}
/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 2.28 ULP:
- _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
- want 0x1.ffbbb6p-5. */
+ The maximum error is 1.99 ULP:
+ _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5
+ want 0x1.f1f4f6p-5. */
svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
{
svfloat32_t ax = svabs_x (pg, x);
@@ -48,7 +52,7 @@ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
y = sv_log1pf_inline (y, pg);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmul_x (pg, halfsign, y), special);
+ return special_case (iax, sign, halfsign, y, special);
return svmul_x (pg, halfsign, y);
}
diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
index f645cc997e430bcb..4f17c44e2d96039a 100644
--- a/sysdeps/aarch64/fpu/log1pf_sve.c
+++ b/sysdeps/aarch64/fpu/log1pf_sve.c
@@ -18,30 +18,13 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f32.h"
-
-static const struct data
-{
- float poly[8];
- float ln2, exp_bias;
- uint32_t four, three_quarters;
-} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as
- this can be fmov-ed directly instead of including it in
- the main load-and-mla polynomial schedule. */
- 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
- 0x1.abcb6p-4f, -0x1.6f0d5ep-5f},
- .ln2 = 0x1.62e43p-1f,
- .exp_bias = 0x1p-23f,
- .four = 0x40800000,
- .three_quarters = 0x3f400000};
-
-#define SignExponentMask 0xff800000
+#include "sv_log1pf_inline.h"
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t special)
{
- return sv_call_f32 (log1pf, x, y, special);
+ return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()),
+ special);
}
/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
@@ -50,51 +33,12 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
want 0x1.9f323ep-2. */
svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
{
- const struct data *d = ptr_barrier (&data);
/* x < -1, Inf/Nan. */
svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000);
special = svorn_z (pg, special, svcmpge (pg, x, -1));
- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
- svfloat32_t m = svadd_x (pg, x, 1);
-
- /* Choose k to scale x to the range [-1/4, 1/2]. */
- svint32_t k
- = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
- sv_s32 (SignExponentMask));
-
- /* Scale x by exponent manipulation. */
- svfloat32_t m_scale = svreinterpret_f32 (
- svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
-
- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
- m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25));
-
- /* Evaluate polynomial on reduced interval. */
- svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale),
- ms4 = svmul_x (pg, ms2, ms2);
- svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly);
- p = svmad_x (pg, m_scale, p, -0.5);
- p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
-
- /* The scale factor to be applied back at the end - by multiplying float(k)
- by 2^-23 we get the unbiased exponent of k. */
- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias);
-
- /* Apply the scaling back. */
- svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2);
-
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
+ return special_case (x, special);
- return y;
+ return sv_log1pf_inline (x, pg);
}
diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
index b94b2da055a6c59b..850297d61556740c 100644
--- a/sysdeps/aarch64/fpu/sv_log1pf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
@@ -22,55 +22,76 @@
#include "sv_math.h"
#include "vecmath_config.h"
-#include "poly_sve_f32.h"
+
+#define SignExponentMask 0xff800000
static const struct sv_log1pf_data
{
- float32_t poly[9];
- float32_t ln2;
- float32_t scale_back;
+ float c0, c2, c4, c6;
+ float c1, c3, c5, c7;
+ float ln2, exp_bias, quarter;
+ uint32_t four, three_quarters;
} sv_log1pf_data = {
- /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
- .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
- -0x1.6f0d5ep-5f },
- .scale_back = 0x1.0p-23f,
- .ln2 = 0x1.62e43p-1f,
+ /* Do not store first term of polynomial, which is -0.5, as
+ this can be fmov-ed directly instead of including it in
+ the main load-and-mla polynomial schedule. */
+ .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f,
+ .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f,
+ .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f,
+ .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000,
+ .three_quarters = 0x3f400000,
};
-static inline svfloat32_t
-eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg)
-{
- svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1]));
- svfloat32_t m2 = svmul_x (pg, m, m);
- svfloat32_t q = svmla_x (pg, m, m2, p_12);
- svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2);
- p = svmul_x (pg, m2, p);
-
- return svmla_x (pg, q, m2, p);
-}
-
static inline svfloat32_t
sv_log1pf_inline (svfloat32_t x, svbool_t pg)
{
const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);
- svfloat32_t m = svadd_x (pg, x, 1.0f);
-
- svint32_t ks = svsub_x (pg, svreinterpret_s32 (m),
- svreinterpret_s32 (svdup_f32 (0.75f)));
- ks = svand_x (pg, ks, 0xff800000);
- svuint32_t k = svreinterpret_u32 (ks);
- svfloat32_t s = svreinterpret_f32 (
- svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k));
-
- svfloat32_t m_scale
- = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k));
- m_scale
- = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
- svfloat32_t p = eval_poly (m_scale, d->poly, pg);
- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back);
- return svmla_x (pg, p, scale_back, d->ln2);
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
+ svfloat32_t m = svadd_x (pg, x, 1);
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
+ svint32_t k
+ = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
+ sv_s32 (SignExponentMask));
+
+ /* Scale x by exponent manipulation. */
+ svfloat32_t m_scale = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
+ svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2);
+ m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2));
+
+ /* Evaluate polynomial on reduced interval. */
+ svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale);
+
+ svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1);
+ svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0);
+ svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1);
+ svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2);
+ svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3);
+
+ svfloat32_t p = svmla_x (pg, p45, p67, ms2);
+ p = svmla_x (pg, p23, p, ms2);
+ p = svmla_x (pg, p01, p, ms2);
+
+ p = svmad_x (pg, m_scale, p, -0.5);
+ p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
+ svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1);
+ return svmla_lane_f32 (p, scale_back, fconst, 0);
}
#endif
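
The reduction in sv_log1pf_inline writes x + 1 = (1 + m) * 2^k with m in [-0.25, 0.5], extracting k purely with integer arithmetic on the float bit pattern. A scalar sketch of that reduction (stand-in polynomial, no special-case handling, not the glibc code):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t asu (float f) { uint32_t u; memcpy (&u, &f, 4); return u; }
static float asf (uint32_t u) { float f; memcpy (&f, &u, 4); return f; }

static float
log1pf_sketch (float x)
{
  float m = x + 1.0f;

  /* Choose k from the sign+exponent bits of m - 0.75 (as bits). */
  int32_t k = (int32_t) ((asu (m) - 0x3f400000u) & 0xff800000u);

  /* m_scale = x / 2^k via exponent manipulation. */
  float m_scale = asf (asu (x) - (uint32_t) k);

  /* s = 4 / 2^k as a normalised float; folding in s/4 - 1 turns m_scale
     into (x + 1)/2^k - 1, i.e. the reduced m. */
  float s = asf (0x40800000u - (uint32_t) k);
  m_scale = m_scale + (-1.0f + s * 0.25f);

  /* Stand-in for the degree-8 minimax polynomial on [-0.25, 0.5]. */
  float p = m_scale - 0.5f * m_scale * m_scale
            + (1.0f / 3.0f) * m_scale * m_scale * m_scale;

  /* float(k) * 2^-23 recovers the unbiased exponent of k. */
  float scale_back = (float) k * 0x1p-23f;
  return p + scale_back * 0x1.62e43p-1f;   /* + k*log(2) */
}

int
main (void)
{
  printf ("%a vs %a\n", log1pf_sketch (3.0f), log1pf (3.0f));
  return 0;
}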

258
glibc-RHEL-118273-24.patch Normal file

@@ -0,0 +1,258 @@
commit 140b985e5a2071000122b3cb63ebfe88cf21dd29
Author: Luna Lamb <luna.lamb@arm.com>
Date: Fri Jan 3 19:00:12 2025 +0000
AArch64: Improve codegen in AdvSIMD asinh
Improves memory access and removes spills.
Load the polynomial evaluation coefficients into 2 vectors and use lanewise
MLAs. Reduces MOVs 6->3, LDR 11->5, STR/STP 2->0, ADRP 3->2.
diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
index 6207e7da9531f48d..2739f98b390edca7 100644
--- a/sysdeps/aarch64/fpu/asinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@@ -20,41 +20,71 @@
#include "v_math.h"
#include "poly_advsimd_f64.h"
-#define A(i) v_f64 (__v_log_data.poly[i])
-#define N (1 << V_LOG_TABLE_BITS)
-#define IndexMask (N - 1)
-
const static struct data
{
- float64x2_t poly[18];
- uint64x2_t off, huge_bound, abs_mask;
- float64x2_t ln2, tiny_bound;
+ uint64x2_t huge_bound, abs_mask, off, mask;
+#if WANT_SIMD_EXCEPT
+ float64x2_t tiny_bound;
+#endif
+ float64x2_t lc0, lc2;
+ double lc1, lc3, ln2, lc4;
+
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
+ double c1, c3, c5, c7, c9, c11, c13, c15;
+
} data = {
- .off = V2 (0x3fe6900900000000),
- .ln2 = V2 (0x1.62e42fefa39efp-1),
- .huge_bound = V2 (0x5fe0000000000000),
+
+#if WANT_SIMD_EXCEPT
.tiny_bound = V2 (0x1p-26),
- .abs_mask = V2 (0x7fffffffffffffff),
+#endif
/* Even terms of polynomial s.t. asinh(x) is approximated by
asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
- .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4),
- V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6),
- V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6),
- V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7),
- V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7),
- V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8),
- V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9),
- V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12),
- V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) },
+
+ .c0 = V2 (-0x1.55555555554a7p-3),
+ .c1 = 0x1.3333333326c7p-4,
+ .c2 = V2 (-0x1.6db6db68332e6p-5),
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c4 = V2 (-0x1.6e8b8b654a621p-6),
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c6 = V2 (-0x1.c9871d10885afp-7),
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c8 = V2 (-0x1.3ddca533e9f54p-7),
+ .c9 = 0x1.0becef748dafcp-7,
+ .c10 = V2 (-0x1.b90c7099dd397p-8),
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c12 = V2 (-0x1.d217026a669ecp-9),
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c14 = V2 (-0x1.e0f37daef9127p-11),
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c16 = V2 (-0x1.021a48685e287p-14),
+ .c17 = V2 (0x1.93d4ba83d34dap-18),
+
+ .lc0 = V2 (-0x1.ffffffffffff7p-2),
+ .lc1 = 0x1.55555555170d4p-2,
+ .lc2 = V2 (-0x1.0000000399c27p-2),
+ .lc3 = 0x1.999b2e90e94cap-3,
+ .lc4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+
+ .off = V2 (0x3fe6900900000000),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ .mask = V2 (0xfffULL << 52),
};
static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
+ uint64x2_t special)
{
+ /* Copy sign. */
+ y = vbslq_f64 (abs_mask, y, x);
return v_call_f64 (asinh, x, y, special);
}
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
struct entry
{
float64x2_t invc;
@@ -76,27 +106,34 @@ lookup (uint64x2_t i)
}
static inline float64x2_t
-log_inline (float64x2_t x, const struct data *d)
+log_inline (float64x2_t xm, const struct data *d)
{
- /* Double-precision vector log, copied from ordinary vector log with some
- cosmetic modification and special-cases removed. */
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint64x2_t tmp = vsubq_u64 (ix, d->off);
- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
- uint64x2_t iz
- = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52)));
+
+ uint64x2_t u = vreinterpretq_u64_f64 (xm);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
- struct entry e = lookup (tmp);
+
+ struct entry e = lookup (u_off);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
float64x2_t kd = vcvtq_f64_s64 (k);
- float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+
+ /* hi = r + log(c) + k*Ln2. */
+ float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2);
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->lc1);
float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t y = vfmaq_f64 (A (2), A (3), r);
- float64x2_t p = vfmaq_f64 (A (0), A (1), r);
- y = vfmaq_f64 (y, A (4), r2);
- y = vfmaq_f64 (p, y, r2);
- y = vfmaq_f64 (hi, y, r2);
- return y;
+ float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1);
+ y = vfmaq_f64 (p, r2, y);
+ return vfmaq_f64 (hi, y, r2);
}
/* Double-precision implementation of vector asinh(x).
@@ -106,23 +143,24 @@ log_inline (float64x2_t x, const struct data *d)
asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
= sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
where log(x) is an optimized log approximation, and P(x) is a polynomial
- shared with the scalar routine. The greatest observed error 3.29 ULP, in
+ shared with the scalar routine. The greatest observed error 2.79 ULP, in
|x| >= 1:
- __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
- want 0x1.ffffcfd0e2352p-1. */
+ _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
+ want 0x1.ffffd003219ddp-1. */
VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
-
float64x2_t ax = vabsq_f64 (x);
- uint64x2_t iax = vreinterpretq_u64_f64 (ax);
uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
- uint64x2_t special = vcgeq_u64 (iax, d->huge_bound);
#if WANT_SIMD_EXCEPT
+ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
+ uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
special = vorrq_u64 (special, tiny);
+#else
+ uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
#endif
/* Option 1: |x| >= 1.
@@ -147,19 +185,45 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
overflow, and tiny lanes, which will underflow, by setting them to 0. They
will be fixed later, either by selecting x or falling back to the scalar
special-case. The largest observed error in this region is 1.47 ULPs:
- __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
- want 0x1.c1d6bf874019cp-1. */
+ _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+ want 0x1.c1d6bf874019cp-1. */
float64x2_t option_2 = v_f64 (0);
+
if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1))))
{
+
#if WANT_SIMD_EXCEPT
ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
#endif
- float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2),
- z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2),
- z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8);
- float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly);
- option_2 = vfmaq_f64 (ax, p, x3);
+ float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
+ /* Order-17 Pairwise Horner scheme. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1);
+ float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17);
+
+ float64x2_t p = vfmaq_f64 (p1415, z2, p1617);
+ p = vfmaq_f64 (p1213, z2, p);
+ p = vfmaq_f64 (p1011, z2, p);
+ p = vfmaq_f64 (p89, z2, p);
+
+ p = vfmaq_f64 (p67, z2, p);
+ p = vfmaq_f64 (p45, z2, p);
+
+ p = vfmaq_f64 (p23, z2, p);
+
+ p = vfmaq_f64 (p01, z2, p);
+ option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
#if WANT_SIMD_EXCEPT
option_2 = vbslq_f64 (tiny, x, option_2);
#endif
@@ -167,10 +231,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
/* Choose the right option for each lane. */
float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
- /* Copy sign. */
- y = vbslq_f64 (d->abs_mask, y, x);
-
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, y, special);
- return y;
+ {
+ return special_case (x, y, d->abs_mask, special);
+ }
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
}
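Note: the "Order-17 Pairwise Horner scheme" in the hunk above pairs adjacent coefficients with one FMA each and then folds the pairs together through powers of the squared argument, shortening the dependency chain relative to plain Horner. A minimal scalar sketch of the same idea (a toy degree-5 polynomial with a hypothetical coefficient array, not the patch's degree-17 case):

static double
pairwise_horner_5 (double x, const double c[6])
{
  double x2 = x * x;
  /* Independent FMAs: all three pairs can issue in parallel.  */
  double p01 = c[0] + x * c[1];
  double p23 = c[2] + x * c[3];
  double p45 = c[4] + x * c[5];
  /* Fold the pairs together with x^2.  */
  return p01 + x2 * (p23 + x2 * p45);
}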
221
glibc-RHEL-118273-25.patch Normal file
@@ -0,0 +1,221 @@
commit f86b4cf87581cf1e45702b07880679ffa0b1f47a
Author: Luna Lamb <luna.lamb@arm.com>
Date: Fri Jan 3 20:15:17 2025 +0000
AArch64: Improve codegen in SVE expm1f and users
Use unpredicated muls, use absolute compare and improve memory access.
Expm1f, sinhf and tanhf show 7%, 5% and 1% improvement in throughput
microbenchmark on Neoverse V1.
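As a standalone illustration of the first point (a hedged sketch, not patch code): when every lane of the result is consumed, passing an all-true predicate to the _x intrinsic forms frees the compiler to pick the unpredicated FMUL encoding and drop the MOVPRFX that a governing predicate can otherwise force:

#include <arm_sve.h>

/* Governed by pg: may cost an extra MOVPRFX.  */
svfloat32_t square_pred (svbool_t pg, svfloat32_t f)
{
  return svmul_x (pg, f, f);
}

/* All-true predicate: eligible for the unpredicated FMUL form.  */
svfloat32_t square_unpred (svfloat32_t f)
{
  return svmul_x (svptrue_b32 (), f, f);
}

The "absolute compare" part corresponds to the svacgt/svaclt calls visible in the tanhf hunk below, which fold the fabs into the comparison itself.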
diff --git a/sysdeps/aarch64/fpu/expm1f_sve.c b/sysdeps/aarch64/fpu/expm1f_sve.c
index 7c852125cdbd0a2b..05a66400d477b819 100644
--- a/sysdeps/aarch64/fpu/expm1f_sve.c
+++ b/sysdeps/aarch64/fpu/expm1f_sve.c
@@ -18,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f32.h"
/* Largest value of x for which expm1(x) should round to -1. */
#define SpecialBound 0x1.5ebc4p+6f
@@ -28,20 +27,17 @@ static const struct data
/* These 4 are grouped together so they can be loaded as one quadword, then
used with _lane forms of svmla/svmls. */
float c2, c4, ln2_hi, ln2_lo;
- float c0, c1, c3, inv_ln2, special_bound, shift;
+ float c0, inv_ln2, c1, c3, special_bound;
} data = {
/* Generated using fpminimax. */
.c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3,
.c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7,
- .c4 = 0x1.6b55a2p-10,
+ .c4 = 0x1.6b55a2p-10, .inv_ln2 = 0x1.715476p+0f,
+ .special_bound = SpecialBound, .ln2_lo = 0x1.7f7d1cp-20f,
+ .ln2_hi = 0x1.62e4p-1f,
- .special_bound = SpecialBound, .shift = 0x1.8p23f,
- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,
- .ln2_lo = 0x1.7f7d1cp-20f,
};
-#define C(i) sv_f32 (d->c##i)
-
static svfloat32_t NOINLINE
special_case (svfloat32_t x, svbool_t pg)
{
@@ -71,9 +67,8 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
- j = svsub_x (pg, j, d->shift);
- svint32_t i = svcvt_s32_x (pg, j);
+ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2);
+ j = svrinta_x (pg, j);
svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
f = svmls_lane (f, j, lane_constants, 3);
@@ -83,17 +78,17 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
- svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
svfloat32_t p = svmla_x (pg, p12, f2, p34);
- p = svmla_x (pg, C (0), f, p);
+
+ p = svmla_x (pg, sv_f32 (d->c0), f, p);
p = svmla_x (pg, f, f2, p);
/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
- svfloat32_t t = svreinterpret_f32 (
- svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000));
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
+ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
}
diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c
index 6c204b57a2aa18d3..50dd386774b005ca 100644
--- a/sysdeps/aarch64/fpu/sinhf_sve.c
+++ b/sysdeps/aarch64/fpu/sinhf_sve.c
@@ -63,5 +63,5 @@ svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
if (__glibc_unlikely (svptest_any (pg, special)))
return special_case (x, svmul_x (pg, t, halfsign), special);
- return svmul_x (pg, t, halfsign);
+ return svmul_x (svptrue_b32 (), t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/sv_expm1f_inline.h b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
index 5b7245122294e1b4..e46ddda5437dc826 100644
--- a/sysdeps/aarch64/fpu/sv_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
@@ -27,21 +27,18 @@ struct sv_expm1f_data
/* These 4 are grouped together so they can be loaded as one quadword, then
used with _lane forms of svmla/svmls. */
float32_t c2, c4, ln2_hi, ln2_lo;
- float32_t c0, c1, c3, inv_ln2, shift;
+ float c0, inv_ln2, c1, c3, special_bound;
};
/* Coefficients generated using fpminimax. */
#define SV_EXPM1F_DATA \
{ \
- .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \
- .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .inv_ln2 = 0x1.715476p+0f, \
+ .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, \
\
- .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
- .ln2_lo = 0x1.7f7d1cp-20f, \
+ .c4 = 0x1.6b55a2p-10, .ln2_lo = 0x1.7f7d1cp-20f, .ln2_hi = 0x1.62e4p-1f, \
}
-#define C(i) sv_f32 (d->c##i)
-
static inline svfloat32_t
expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
{
@@ -55,9 +52,8 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
- j = svsub_x (pg, j, d->shift);
- svint32_t i = svcvt_s32_x (pg, j);
+ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2);
+ j = svrinta_x (pg, j);
svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
f = svmls_lane (f, j, lane_constants, 3);
@@ -67,18 +63,18 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
- svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
svfloat32_t p = svmla_x (pg, p12, f2, p34);
- p = svmla_x (pg, C (0), f, p);
+ p = svmla_x (pg, sv_f32 (d->c0), f, p);
p = svmla_x (pg, f, f2, p);
/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
- svfloat32_t t = svscale_x (pg, sv_f32 (1), i);
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
+ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
}
#endif
diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c
index 0b94523cf5074200..80dd679346f13f37 100644
--- a/sysdeps/aarch64/fpu/tanhf_sve.c
+++ b/sysdeps/aarch64/fpu/tanhf_sve.c
@@ -19,20 +19,27 @@
#include "sv_expm1f_inline.h"
+/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). */
+#define BoringBound 0x1.205966p+3f
+
static const struct data
{
struct sv_expm1f_data expm1f_consts;
- uint32_t boring_bound, onef;
+ uint32_t onef, special_bound;
+ float boring_bound;
} data = {
.expm1f_consts = SV_EXPM1F_DATA,
- /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
- .boring_bound = 0x41102cb3,
.onef = 0x3f800000,
+ .special_bound = 0x7f800000,
+ .boring_bound = BoringBound,
};
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring,
+ svfloat32_t boring, svfloat32_t q, svbool_t special)
{
+ svfloat32_t y
+ = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0)));
return sv_call_f32 (tanhf, x, y, special);
}
@@ -47,15 +54,16 @@ svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
svfloat32_t ax = svabs_x (pg, x);
svuint32_t iax = svreinterpret_u32 (ax);
svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
- svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound);
svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
-
- svbool_t special = svcmpgt (pg, iax, 0x7f800000);
+ svbool_t special = svcmpgt (pg, iax, d->special_bound);
+ svbool_t is_boring = svacgt (pg, x, d->boring_bound);
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts);
- svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+ svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg,
+ &d->expm1f_consts);
+
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svsel_f32 (is_boring, boring, y), special);
+ return special_case (x, pg, is_boring, boring, q, special);
+ svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
return svsel_f32 (is_boring, boring, y);
}
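For reference, the q/(q+2) form computed above is standard algebra on the identity quoted in the comment (since e^{2x} = expm1(2x) + 1):

\tanh(x) \;=\; \frac{e^{2x} - 1}{e^{2x} + 1}
        \;=\; \frac{\operatorname{expm1}(2x)}{\operatorname{expm1}(2x) + 2}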
125
glibc-RHEL-118273-26.patch Normal file
@@ -0,0 +1,125 @@
commit d3f2b71ef1d146137a25dd1367d97a14fac341c6
Author: Yury Khrustalev <yury.khrustalev@arm.com>
Date: Tue Nov 26 11:38:30 2024 +0000
aarch64: Fix tests not compatible with targets supporting GCS
- Add GCS marking to some of the tests when target supports GCS
- Fix tst-ro-dynamic-mod.map linker script to avoid removing
GNU properties
- Add header with macros for GNU properties
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
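For orientation, the NT_GNU_PROPERTY_TYPE_0 note that the GNU_PROPERTY macro below assembles has the following layout (a descriptive sketch only; the struct and field names are illustrative, while the values mirror the macro's .word directives):

/* One AArch64 feature property, 8-byte aligned, as emitted below.  */
struct gnu_property_note
{
  unsigned int n_namesz;  /* 4: sizeof "GNU" including the NUL.  */
  unsigned int n_descsz;  /* 16: size of the single property.  */
  unsigned int n_type;    /* 5: NT_GNU_PROPERTY_TYPE_0.  */
  char n_name[4];         /* "GNU".  */
  unsigned int pr_type;   /* FEATURE_1_AND (0xc0000000).  */
  unsigned int pr_datasz; /* 4.  */
  unsigned int pr_data;   /* e.g. FEATURE_1_BTI | FEATURE_1_PAC | FEATURE_1_GCS.  */
  unsigned int pr_pad;    /* 0: pads the descriptor to 8 bytes.  */
};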
diff --git a/elf/tst-asm-helper.h b/elf/tst-asm-helper.h
new file mode 100644
index 0000000000000000..6f91ac2ddc54d3f9
--- /dev/null
+++ b/elf/tst-asm-helper.h
@@ -0,0 +1,49 @@
+/* Test header that defines macros for GNU properties that need to be
+ used in some test assembly files where sysdep.h cannot be included
+ for some reason.
+ Copyright (C) 2024-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+#define FEATURE_1_GCS 4
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
+
+/* Add GNU property note with the supported features to all asm code
+ where sysdep.h is included. */
+#if HAVE_AARCH64_BTI && HAVE_AARCH64_PAC_RET
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC|FEATURE_1_GCS)
+#elif HAVE_AARCH64_BTI
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_GCS)
+#endif
diff --git a/elf/tst-big-note-lib.S b/elf/tst-big-note-lib.S
index 5eb1e03cfbe2cee8..cfd31137e85a1335 100644
--- a/elf/tst-big-note-lib.S
+++ b/elf/tst-big-note-lib.S
@@ -20,6 +20,8 @@
On a typical Linux system with 8MiB "ulimit -s", that was enough
to trigger stack overflow in open_verify. */
+#include "tst-asm-helper.h"
+
#define NOTE_SIZE 8*1024*1024
.pushsection .note.big,"a"
diff --git a/elf/tst-ro-dynamic-mod.map b/elf/tst-ro-dynamic-mod.map
index 2fe4a2998cddd587..2a158480c07d9691 100644
--- a/elf/tst-ro-dynamic-mod.map
+++ b/elf/tst-ro-dynamic-mod.map
@@ -3,14 +3,13 @@ SECTIONS
. = SIZEOF_HEADERS;
.dynamic : { *(.dynamic) } :text :dynamic
.rodata : { *(.data*) *(.bss*) } :text
- /DISCARD/ : {
- *(.note.gnu.property)
- }
- .note : { *(.note.*) } :text :note
+ .note : { *(.note) } :text :note
+ .note.gnu.property : { *(.note.gnu.property) } :text :gnu_property
}
PHDRS
{
text PT_LOAD FLAGS(5) FILEHDR PHDRS;
dynamic PT_DYNAMIC FLAGS(4);
note PT_NOTE FLAGS(4);
+ gnu_property PT_GNU_PROPERTY FLAGS(4);
}
diff --git a/sysdeps/aarch64/tst-vpcs-mod.S b/sysdeps/aarch64/tst-vpcs-mod.S
index 19b01c3c3859e13b..b3b5824eda1fb076 100644
--- a/sysdeps/aarch64/tst-vpcs-mod.S
+++ b/sysdeps/aarch64/tst-vpcs-mod.S
@@ -17,6 +17,8 @@
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */
+#include "tst-asm-helper.h"
+
.variant_pcs vpcs_call
.global vpcs_call
.type vpcs_call, %function
@@ -121,7 +123,7 @@ vpcs_call_regs:
/* Emulate a BL using B, but save x30 before the branch. */
adr x30, .L_return_addr
stp x30, x29, [x1, 240]
- b vpcs_call
+ bl vpcs_call
.L_return_addr:
/* Restore callee-saved registers. */
241
glibc-RHEL-118273-27.patch Normal file
@@ -0,0 +1,241 @@
commit 95e807209b680257a9afe81a507754f1565dbb4d
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Thu Feb 13 18:03:04 2025 +0000
AArch64: Improve codegen for SVE powf
Improve memory access with indexed/unpredicated instructions.
Eliminate register spills. Speedup on Neoverse V1: 3%.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
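A recurring micro-optimisation in this patch and the next few, shown here as a standalone sketch (hypothetical helper names): multiplications by a power of two are strength-reduced to an add or a shift, avoiding the predicated integer MUL:

#include <arm_sve.h>

/* Before: predicated MUL by the constant 2.  */
svuint32_t twice_mul (svbool_t pg, svuint32_t i)
{
  return svmul_x (pg, i, 2u);
}

/* After: the same value as a plain ADD ...  */
svuint32_t twice_add (svbool_t pg, svuint32_t i)
{
  return svadd_x (pg, i, i);
}

/* ... or, with an all-true predicate, as an unpredicated LSL.  */
svuint32_t twice_lsl (svuint32_t i)
{
  return svlsl_x (svptrue_b32 (), i, 1);
}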
diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c
index 4f6a142325ae719b..08d7019a1855ff3c 100644
--- a/sysdeps/aarch64/fpu/powf_sve.c
+++ b/sysdeps/aarch64/fpu/powf_sve.c
@@ -26,7 +26,6 @@
#define Tlogc __v_powf_data.logc
#define Texp __v_powf_data.scale
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
-#define Shift 0x1.8p52
#define Norm 0x1p23f /* 0x4b000000. */
/* Overall ULP error bound for pow is 2.6 ulp
@@ -36,7 +35,7 @@ static const struct data
double log_poly[4];
double exp_poly[3];
float uflow_bound, oflow_bound, small_bound;
- uint32_t sign_bias, sign_mask, subnormal_bias, off;
+ uint32_t sign_bias, subnormal_bias, off;
} data = {
/* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of
V_POWF_EXP2_N. */
@@ -53,7 +52,6 @@ static const struct data
.small_bound = 0x1p-126f,
.off = 0x3f35d000,
.sign_bias = SignBias,
- .sign_mask = 0x80000000,
.subnormal_bias = 0x0b800000, /* 23 << 23. */
};
@@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x)
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint32_t i)
{
- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
+ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
2u * 0x7f800000 - 1);
}
@@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z)
}
/* Scalar fallback for special case routines with custom signature. */
-static inline svfloat32_t
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
{
+ /* Special cases of x or y: zero, inf and nan. */
+ svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
+ svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
+ svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
+
svbool_t p = svpfirst (cmp, svpfalse ());
while (svptest_any (cmp, p))
{
@@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
/* Polynomial to approximate log1p(r)/ln2. */
svfloat64_t logx = A (0);
- logx = svmla_x (pg, A (1), r, logx);
- logx = svmla_x (pg, A (2), r, logx);
- logx = svmla_x (pg, A (3), r, logx);
- logx = svmla_x (pg, y0, r, logx);
+ logx = svmad_x (pg, r, logx, A (1));
+ logx = svmad_x (pg, r, logx, A (2));
+ logx = svmad_x (pg, r, logx, A (3));
+ logx = svmad_x (pg, r, logx, y0);
*pylogx = svmul_x (pg, y, logx);
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
- svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- kd = svsub_x (pg, kd, Shift);
+ svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
+ svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));
r = svsub_x (pg, *pylogx, kd);
/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
- svuint64_t t
- = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
- svuint64_t ski = svadd_x (pg, ki, sign_bias);
- t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
+ svuint64_t t = svld1_gather_index (
+ svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
+ svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
+ t = svadd_x (svptrue_b64 (), t,
+ svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
svfloat64_t s = svreinterpret_f64 (t);
svfloat64_t p = C (0);
p = svmla_x (pg, C (1), p, r);
p = svmla_x (pg, C (2), p, r);
- p = svmla_x (pg, s, p, svmul_x (pg, s, r));
+ p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));
return p;
}
@@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
{
const svbool_t ptrue = svptrue_b64 ();
- /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
- order to perform core computation in double precision. */
+ /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
+ * in order to perform core computation in double precision. */
const svbool_t pg_lo = svunpklo (pg);
const svbool_t pg_hi = svunpkhi (pg);
- svfloat64_t y_lo = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
- svfloat64_t y_hi = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
- svfloat32_t z = svreinterpret_f32 (iz);
- svfloat64_t z_lo = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
- svfloat64_t z_hi = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
+ svfloat64_t y_lo
+ = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+ svfloat64_t y_hi
+ = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+ svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
+ svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
svuint64_t i_lo = svunpklo (i);
svuint64_t i_hi = svunpkhi (i);
svint64_t k_lo = svunpklo (k);
@@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
/* Implementation of SVE powf.
Provides the same accuracy as AdvSIMD powf, since it relies on the same
algorithm. The theoretical maximum error is under 2.60 ULPs.
- Maximum measured error is 2.56 ULPs:
- SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
- want 0x1.fd4b06p+127. */
+ Maximum measured error is 2.57 ULPs:
+ SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
+ want 0x1.fff862p+127. */
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
@@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
svuint32_t viy0 = svreinterpret_u32 (y);
/* Negative x cases. */
- svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
- svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
+ svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));
/* Set sign_bias and ix depending on sign of x and nature of y. */
- svbool_t yisnotint_xisneg = svpfalse_b ();
+ svbool_t yint_or_xpos = pg;
svuint32_t sign_bias = sv_u32 (0);
svuint32_t vix = vix0;
if (__glibc_unlikely (svptest_any (pg, xisneg)))
{
/* Determine nature of y. */
- yisnotint_xisneg = svisnotint (xisneg, y);
- svbool_t yisint_xisneg = svisint (xisneg, y);
+ yint_or_xpos = svisint (xisneg, y);
svbool_t yisodd_xisneg = svisodd (xisneg, y);
/* ix set to abs(ix) if y is integer. */
- vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
+ vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
/* Set to SignBias if x is negative and y is odd. */
sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
}
@@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
svbool_t cmp = svorr_z (pg, xspecial, yspecial);
/* Small cases of x: |x| < 0x1p-126. */
- svbool_t xsmall = svaclt (pg, x, d->small_bound);
- if (__glibc_unlikely (svptest_any (pg, xsmall)))
+ svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
+ if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
{
/* Normalize subnormal x so exponent becomes negative. */
svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
@@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
vix = svsel (xsmall, vix_norm, vix);
}
/* Part of core computation carried in working precision. */
- svuint32_t tmp = svsub_x (pg, vix, d->off);
- svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
- V_POWF_LOG2_N - 1);
- svuint32_t top = svand_x (pg, tmp, 0xff800000);
- svuint32_t iz = svsub_x (pg, vix, top);
- svint32_t k
- = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
-
- /* Compute core in extended precision and return intermediate ylogx results to
- handle cases of underflow and underflow in exp. */
+ svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
+ svuint32_t i = svand_x (
+ yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+ V_POWF_LOG2_N - 1);
+ svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
+ svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
+ svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
+ (23 - V_POWF_EXP2_TABLE_BITS));
+
+ /* Compute core in extended precision and return intermediate ylogx results
+ * to handle cases of underflow and underflow in exp. */
svfloat32_t ylogx;
- svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
+ svfloat32_t ret
+ = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
/* Handle exp special cases of underflow and overflow. */
- svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
+ svuint32_t sign
+ = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
svfloat32_t ret_oflow
- = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
+ = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
svfloat32_t ret_uflow = svreinterpret_f32 (sign);
- ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
- ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
+ ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
+ ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);
/* Cases of finite y and finite negative x. */
- ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
+ ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
- if (__glibc_unlikely (svptest_any (pg, cmp)))
- return sv_call_powf_sc (x, y, ret, cmp);
+ if (__glibc_unlikely (svptest_any (cmp, cmp)))
+ return sv_call_powf_sc (x, y, ret);
return ret;
}
401
glibc-RHEL-118273-28.patch Normal file
@@ -0,0 +1,401 @@
commit 0b195651db3ae793187c7dd6d78b5a7a8da9d5e6
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Thu Feb 13 18:02:01 2025 +0000
AArch64: Improve codegen for SVE pow
Move constants to struct. Improve memory access with indexed/unpredicated
instructions. Eliminate register spills. Speedup on Neoverse V1: 24%.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
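The key pattern behind moving the constants into a struct, sketched with a hypothetical two-member pair: members laid out adjacently are fetched with a single LD1RQ broadcast and consumed through the _lane FMLA forms, so each pair of constants costs one load and no per-constant MOV:

#include <arm_sve.h>

struct pair { double ln2_hi, ln2_lo; }; /* must be adjacent in memory */

static svfloat64_t
mla_pair (svfloat64_t acc, svfloat64_t k, const struct pair *d)
{
  /* Broadcast {ln2_hi, ln2_lo} into every 128-bit segment.  */
  svfloat64_t ln2 = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
  acc = svmla_lane_f64 (acc, k, ln2, 0); /* acc += k * ln2_hi */
  return svmla_lane_f64 (acc, k, ln2, 1); /* acc += k * ln2_lo */
}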
diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c
index 4c0bf8956c584be7..4242d22a491ed17e 100644
--- a/sysdeps/aarch64/fpu/pow_sve.c
+++ b/sysdeps/aarch64/fpu/pow_sve.c
@@ -44,19 +44,18 @@
/* Data is defined in v_pow_log_data.c. */
#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
-#define A __v_pow_log_data.poly
#define Off 0x3fe6955500000000
/* Data is defined in v_pow_exp_data.c. */
#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
-#define C __v_pow_exp_data.poly
#define SmallExp 0x3c9 /* top12(0x1p-54). */
#define BigExp 0x408 /* top12(512.). */
#define ThresExp 0x03f /* BigExp - SmallExp. */
#define HugeExp 0x409 /* top12(1024.). */
/* Constants associated with pow. */
+#define SmallBoundX 0x1p-126
#define SmallPowX 0x001 /* top12(0x1p-126). */
#define BigPowX 0x7ff /* top12(INFINITY). */
#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
@@ -64,6 +63,31 @@
#define BigPowY 0x43e /* top12(0x1.749p62). */
#define ThresPowY 0x080 /* BigPowY - SmallPowY. */
+static const struct data
+{
+ double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo;
+ double log_c1, log_c3, log_c5, off;
+ double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo;
+ double exp_c0, exp_c1;
+} data = {
+ .log_c0 = -0x1p-1,
+ .log_c1 = -0x1.555555555556p-1,
+ .log_c2 = 0x1.0000000000006p-1,
+ .log_c3 = 0x1.999999959554ep-1,
+ .log_c4 = -0x1.555555529a47ap-1,
+ .log_c5 = -0x1.2495b9b4845e9p0,
+ .log_c6 = 0x1.0002b8b263fc3p0,
+ .off = Off,
+ .exp_c0 = 0x1.fffffffffffd4p-2,
+ .exp_c1 = 0x1.5555571d6ef9p-3,
+ .exp_c2 = 0x1.5555576a5adcep-5,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP,
+ .ln2_over_n_hi = 0x1.62e42fefc0000p-9,
+ .ln2_over_n_lo = -0x1.c610ca86c3899p-45,
+};
+
/* Check if x is an integer. */
static inline svbool_t
sv_isint (svbool_t pg, svfloat64_t x)
@@ -82,7 +106,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x)
static inline svbool_t
sv_isodd (svbool_t pg, svfloat64_t x)
{
- svfloat64_t y = svmul_x (pg, x, 0.5);
+ svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5);
return sv_isnotint (pg, y);
}
@@ -121,7 +145,7 @@ zeroinfnan (uint64_t i)
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint64_t i)
{
- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1),
+ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
2 * asuint64 (INFINITY) - 1);
}
@@ -174,16 +198,17 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
additional 15 bits precision. IX is the bit representation of x, but
normalized in the subnormal range using the sign bit for the exponent. */
static inline svfloat64_t
-sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
+sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
+ const struct data *d)
{
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_x (pg, ix, Off);
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
sv_u64 (N_LOG - 1));
svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52)));
+ svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52));
svfloat64_t z = svreinterpret_f64 (iz);
svfloat64_t kd = svcvt_f64_x (pg, k);
@@ -199,40 +224,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
|z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */
svfloat64_t r = svmad_x (pg, z, invc, -1.0);
/* k*Ln2 + log(c) + r. */
- svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi);
+
+ svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0);
svfloat64_t t2 = svadd_x (pg, t1, r);
- svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo);
+ svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1);
svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);
/* Evaluation is optimized assuming superscalar pipelined execution. */
- svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */
- svfloat64_t ar2 = svmul_x (pg, r, ar);
- svfloat64_t ar3 = svmul_x (pg, r, ar2);
+
+ svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0);
+ svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0);
+ svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar);
+ svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2);
/* k*Ln2 + log(c) + r + A[0]*r*r. */
svfloat64_t hi = svadd_x (pg, t2, ar2);
- svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r);
+ svfloat64_t lo3 = svmls_x (pg, ar2, ar, r);
svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
/* p = log1p(r) - r - A[0]*r*r. */
/* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
A[6])))). */
- svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]);
- svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]);
- svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]);
+
+ svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4);
+ svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1);
+ svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0);
+ svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1);
svfloat64_t p = svmla_x (pg, a34, ar2, a56);
p = svmla_x (pg, a12, ar2, p);
- p = svmul_x (pg, ar3, p);
+ p = svmul_x (svptrue_b64 (), ar3, p);
svfloat64_t lo = svadd_x (
- pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
+ pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
svfloat64_t y = svadd_x (pg, hi, lo);
*tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
return y;
}
+static inline svfloat64_t
+sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
+ svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits,
+ svuint64_t *ki, const struct data *d)
+{
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
+ svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2);
+ svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0);
+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+ svfloat64_t kd = svrinta_x (pg, z);
+ *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd));
+
+ svfloat64_t ln2_over_n_hilo
+ = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi);
+ svfloat64_t r = x;
+ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0);
+ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1);
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r = svadd_x (pg, r, xtail);
+ /* 2^(k/N) ~= scale. */
+ svuint64_t idx = svand_x (pg, *ki, N_EXP - 1);
+ svuint64_t top
+ = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
+ *sbits = svadd_x (pg, *sbits, top);
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1);
+ *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp);
+ *tmp = svmla_x (pg, r, r2, *tmp);
+ svfloat64_t scale = svreinterpret_f64 (*sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ z = svmla_x (pg, scale, scale, *tmp);
+ return z;
+}
+
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
static inline svfloat64_t
sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
- svuint64_t sign_bias)
+ svuint64_t sign_bias, const struct data *d)
{
/* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
and other cases of large values of x (scale * (1 + TMP) oflow). */
@@ -240,73 +310,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
/* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */
svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);
- /* Conditions special, uflow and oflow are all expressed as uoflow &&
- something, hence do not bother computing anything if no lane in uoflow is
- true. */
- svbool_t special = svpfalse_b ();
- svbool_t uflow = svpfalse_b ();
- svbool_t oflow = svpfalse_b ();
+ svfloat64_t tmp;
+ svuint64_t sbits, ki;
if (__glibc_unlikely (svptest_any (pg, uoflow)))
{
+ svfloat64_t z
+ = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
+
/* |x| is tiny (|x| <= 0x1p-54). */
- uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
+ svbool_t uflow
+ = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
uflow = svand_z (pg, uoflow, uflow);
/* |x| is huge (|x| >= 1024). */
- oflow = svcmpge (pg, abstop, HugeExp);
+ svbool_t oflow = svcmpge (pg, abstop, HugeExp);
oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
+
/* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
- or underflow. */
- special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+ or underflow. */
+ svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+
+ /* Update result with special and large cases. */
+ z = sv_call_specialcase (tmp, sbits, ki, z, special);
+
+ /* Handle underflow and overflow. */
+ svbool_t x_is_neg = svcmplt (pg, x, 0);
+ svuint64_t sign_mask
+ = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
+ svfloat64_t res_uoflow
+ = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
+ res_uoflow = svreinterpret_f64 (
+ svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
+ /* Avoid spurious underflow for tiny x. */
+ svfloat64_t res_spurious_uflow
+ = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
+
+ z = svsel (oflow, res_uoflow, z);
+ z = svsel (uflow, res_spurious_uflow, z);
+ return z;
}
- /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
- /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
- svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2);
- /* z - kd is in [-1, 1] in non-nearest rounding modes. */
- svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift);
- svfloat64_t kd = svadd_x (pg, z, shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- kd = svsub_x (pg, kd, shift);
- svfloat64_t r = x;
- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi);
- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo);
- /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
- r = svadd_x (pg, r, xtail);
- /* 2^(k/N) ~= scale. */
- svuint64_t idx = svand_x (pg, ki, N_EXP - 1);
- svuint64_t top
- = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
- /* This is only a valid scale when -1023*N < k < 1024*N. */
- svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
- sbits = svadd_x (pg, sbits, top);
- /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]);
- tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp);
- tmp = svmla_x (pg, r, r2, tmp);
- svfloat64_t scale = svreinterpret_f64 (sbits);
- /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
- is no spurious underflow here even without fma. */
- z = svmla_x (pg, scale, scale, tmp);
-
- /* Update result with special and large cases. */
- if (__glibc_unlikely (svptest_any (pg, special)))
- z = sv_call_specialcase (tmp, sbits, ki, z, special);
-
- /* Handle underflow and overflow. */
- svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63);
- svbool_t x_is_neg = svcmpne (pg, sign_bit, 0);
- svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
- svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
- res_uoflow = svreinterpret_f64 (
- svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
- z = svsel (oflow, res_uoflow, z);
- /* Avoid spurious underflow for tiny x. */
- svfloat64_t res_spurious_uflow
- = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
- z = svsel (uflow, res_spurious_uflow, z);
-
- return z;
+ return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
}
static inline double
@@ -341,47 +384,39 @@ pow_sc (double x, double y)
svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
/* This preamble handles special case conditions used in the final scalar
fallbacks. It also updates ix and sign_bias, that are used in the core
computation too, i.e., exp( y * log (x) ). */
svuint64_t vix0 = svreinterpret_u64 (x);
svuint64_t viy0 = svreinterpret_u64 (y);
- svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52);
/* Negative x cases. */
- svuint64_t sign_bit = svlsr_m (pg, vix0, 63);
- svbool_t xisneg = svcmpeq (pg, sign_bit, 1);
+ svbool_t xisneg = svcmplt (pg, x, 0);
/* Set sign_bias and ix depending on sign of x and nature of y. */
- svbool_t yisnotint_xisneg = svpfalse_b ();
+ svbool_t yint_or_xpos = pg;
svuint64_t sign_bias = sv_u64 (0);
svuint64_t vix = vix0;
- svuint64_t vtopx1 = vtopx0;
if (__glibc_unlikely (svptest_any (pg, xisneg)))
{
/* Determine nature of y. */
- yisnotint_xisneg = sv_isnotint (xisneg, y);
- svbool_t yisint_xisneg = sv_isint (xisneg, y);
+ yint_or_xpos = sv_isint (xisneg, y);
svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
/* ix set to abs(ix) if y is integer. */
- vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff);
- vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff);
+ vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff);
/* Set to SignBias if x is negative and y is odd. */
sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
}
- /* Special cases of x or y: zero, inf and nan. */
- svbool_t xspecial = sv_zeroinfnan (pg, vix0);
- svbool_t yspecial = sv_zeroinfnan (pg, viy0);
- svbool_t special = svorr_z (pg, xspecial, yspecial);
-
/* Small cases of x: |x| < 0x1p-126. */
- svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff);
- svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX);
- if (__glibc_unlikely (svptest_any (pg, xsmall)))
+ svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX);
+ if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
{
/* Normalize subnormal x so exponent becomes negative. */
- svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0);
+ svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52);
+ svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0);
svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
@@ -391,20 +426,24 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
/* y_hi = log(ix, &y_lo). */
svfloat64_t vlo;
- svfloat64_t vhi = sv_log_inline (pg, vix, &vlo);
+ svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d);
/* z = exp(y_hi, y_lo, sign_bias). */
- svfloat64_t vehi = svmul_x (pg, y, vhi);
- svfloat64_t velo = svmul_x (pg, y, vlo);
- svfloat64_t vemi = svmls_x (pg, vehi, y, vhi);
- velo = svsub_x (pg, velo, vemi);
- svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias);
+ svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi);
+ svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi);
+ svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo);
+ svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d);
/* Cases of finite y and finite negative x. */
- vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz);
+ vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan ("")));
+
+ /* Special cases of x or y: zero, inf and nan. */
+ svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0);
+ svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0);
+ svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial);
/* Cases of zero/inf/nan x or y. */
- if (__glibc_unlikely (svptest_any (pg, special)))
+ if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
vz = sv_call2_f64 (pow_sc, x, y, vz, special);
return vz;

45
glibc-RHEL-118273-29.patch Normal file
@@ -0,0 +1,45 @@
commit f5ff34cb3c75ec1061c75bb9188b3c1176426947
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Thu Feb 13 18:00:50 2025 +0000
AArch64: Improve codegen for SVE erfcf
Reduce number of MOV/MOVPRFXs and use unpredicated FMUL.
Replace MUL with LSL. Speedup on Neoverse V1: 6%.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c
index ecacb933aca40855..e4869263e31e18bc 100644
--- a/sysdeps/aarch64/fpu/erfcf_sve.c
+++ b/sysdeps/aarch64/fpu/erfcf_sve.c
@@ -76,7 +76,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
- i = svmul_x (pg, i, 2);
+ i = svlsl_x (svptrue_b32 (), i, 1);
const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
svfloat32_t erfcr = svld1_gather_index (pg, p, i);
svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
@@ -84,15 +84,15 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
/* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
svfloat32_t r = svsub_x (pg, z, shift);
svfloat32_t d = svsub_x (pg, a, r);
- svfloat32_t d2 = svmul_x (pg, d, d);
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
- svfloat32_t third = svdup_lane (coeffs, 0);
svfloat32_t p1 = r;
- svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
- svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
+ svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1);
+ svfloat32_t p3
+ = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);
873
glibc-RHEL-118273-3.patch Normal file
@@ -0,0 +1,873 @@
commit b09fee1d21650428a6a3335408a46ebe1165d30d
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:40 2024 +0000
aarch64/fpu: Add vector variants of acosh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
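The log1p-based formulation used in all four implementations below rests on a standard rewrite (noted here for reference):

\operatorname{acosh}(x) \;=\; \ln\bigl(x + \sqrt{x^2 - 1}\bigr)
  \;=\; \operatorname{log1p}\bigl(u + \sqrt{u\,(u + 2)}\bigr),
\qquad u = x - 1,

which is exactly the xm1 + sqrt(xm1 * (x + 1)) argument handed to log1p in each file.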
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 019c3a51880e2306..2e5bbb5a07f4c9b0 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,4 +1,5 @@
libmvec-supported-funcs = acos \
+ acosh \
asin \
atan \
atan2 \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 884b4b57f097635f..60e1cdeacec3f77e 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -79,6 +79,11 @@ libmvec {
_ZGVsMxv_tan;
}
GLIBC_2.40 {
+ _ZGVnN2v_acosh;
+ _ZGVnN2v_acoshf;
+ _ZGVnN4v_acoshf;
+ _ZGVsMxv_acosh;
+ _ZGVsMxv_acoshf;
_ZGVnN2v_cosh;
_ZGVnN2v_coshf;
_ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c
new file mode 100644
index 0000000000000000..c88283cf1191f4eb
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acosh_advsimd.c
@@ -0,0 +1,67 @@
+/* Double-precision vector (Advanced SIMD) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_V_LOG1P_K0_SHORTCUT 1
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one, thresh;
+} data = {
+ .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .thresh = V2 (0x1ff0000000000000) /* asuint64(0x1p511) - asuint64(1). */
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special,
+ const struct v_log1p_data *d)
+{
+ return v_call_f64 (acosh, x, log1p_inline (y, d), special);
+}
+
+/* Vector approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.02 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ _ZGVnN2v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5
+ want 0x1.f2d6d823bc9e2p-5. */
+VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh);
+ float64x2_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ if (__glibc_unlikely (v_any_u64 (special)))
+ x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
+#endif
+
+ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
+ float64x2_t y;
+ y = vaddq_f64 (x, v_f64 (1));
+ y = vmulq_f64 (y, xm1);
+ y = vsqrtq_f64 (y);
+ y = vaddq_f64 (xm1, y);
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (special_arg, y, special, &d->log1p_consts);
+ return log1p_inline (y, &d->log1p_consts);
+}
diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c
new file mode 100644
index 0000000000000000..3e4faaa5ca686c18
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acosh_sve.c
@@ -0,0 +1,51 @@
+/* Double-precision vector (SVE) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_SV_LOG1P_K0_SHORTCUT 1
+#include "sv_log1p_inline.h"
+
+#define One (0x3ff0000000000000)
+#define Thres (0x1ff0000000000000) /* asuint64 (0x1p511) - One. */
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (acosh, x, y, special);
+}
+
+/* SVE approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.19 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
+ want 0x1.ed23399f51373p-2. */
+svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
+{
+ /* (ix - One) >= (BigBound - One). */
+ svuint64_t ix = svreinterpret_u64 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
+
+ svfloat64_t xm1 = svsub_x (pg, x, 1.0);
+ svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0));
+ svfloat64_t y = svadd_x (pg, xm1, svsqrt_x (pg, u));
+
+ /* Fall back to scalar routine for special lanes. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, sv_log1p_inline (y, pg), special);
+ return sv_log1p_inline (y, pg);
+}
diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
new file mode 100644
index 0000000000000000..8916dcbf409922a9
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -0,0 +1,78 @@
+/* Single-precision vector (Advanced SIMD) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_log1pf_inline.h"
+
+#define SquareLim 0x1p64
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+ uint16x4_t thresh;
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .one = V4 (0x3f800000),
+ .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
+ const struct v_log1pf_data d)
+{
+ return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
+}
+
+/* Vector approximation for single-precision acosh, based on log1p. Maximum
+ error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
+ is 2.78 ULP:
+ __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
+ want 0x1.ef9ea2p-3.
+ With exceptions disabled, we can compute u with a shorter dependency chain,
+ which gives maximum error of 3.07 ULP:
+ __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
+ want 0x1.fbc7f4p-4. */
+
+VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+
+#if WANT_SIMD_EXCEPT
+ /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
+ only xm1 to calculate u, as operating on x will trigger invalid for NaN.
+ Widening sign-extend special predicate in order to mask with it. */
+ uint32x4_t p
+ = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special)));
+ float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
+ float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
+#else
+ float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
+ float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+#endif
+
+ float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
+
+ if (__glibc_unlikely (v_any_u16h (special)))
+ return special_case (x, y, special, d->log1pf_consts);
+ return log1pf_inline (y, d->log1pf_consts);
+}
+libmvec_hidden_def (V_NAME_F1 (acosh))
+HALF_WIDTH_ALIAS_F1 (acosh)
diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c
new file mode 100644
index 0000000000000000..2110894e629500be
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acoshf_sve.c
@@ -0,0 +1,49 @@
+/* Single-precision vector (SVE) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define One 0x3f800000
+#define Thres 0x20000000 /* asuint(0x1p64) - One. */
+
+#include "sv_log1pf_inline.h"
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (acoshf, x, y, special);
+}
+
+/* Single-precision SVE acosh(x) routine. Implements the same algorithm as
+ vector acoshf and log1p.
+
+ Maximum error is 2.78 ULPs:
+ SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4
+ want 0x1.f45b3cp-4. */
+svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
+{
+ svuint32_t ix = svreinterpret_u32 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
+
+ svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
+ svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
+ svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, y, special);
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index c63b2948d4938b0d..22fec4de77395e60 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
libmvec_hidden_proto (V_NAME_F1(acos));
+libmvec_hidden_proto (V_NAME_F1(acosh));
libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(cos));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 8ca55098706a54c2..841330956c102ff1 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -33,6 +33,10 @@
# define __DECL_SIMD_acos __DECL_SIMD_aarch64
# undef __DECL_SIMD_acosf
# define __DECL_SIMD_acosf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_acosh
+# define __DECL_SIMD_acosh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_acoshf
+# define __DECL_SIMD_acoshf __DECL_SIMD_aarch64
# undef __DECL_SIMD_asin
# define __DECL_SIMD_asin __DECL_SIMD_aarch64
# undef __DECL_SIMD_asinf
@@ -125,6 +129,7 @@ typedef __SVBool_t __sv_bool_t;
__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
@@ -143,6 +148,7 @@ __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
@@ -166,6 +172,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
__sv_f32_t _ZGVsMxvv_atan2f (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_acoshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
@@ -184,6 +191,7 @@ __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
__sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_acosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h
new file mode 100644
index 0000000000000000..da019674f94dbac7
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h
@@ -0,0 +1,109 @@
+/* Helper for double-precision SVE routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_SV_LOG1P_INLINE_H
+#define AARCH64_FPU_SV_LOG1P_INLINE_H
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct sv_log1p_data
+{
+ double poly[19], ln2[2];
+ uint64_t hf_rt2_top;
+ uint64_t one_m_hf_rt2_top;
+ uint32_t bottom_mask;
+ int64_t one_top;
+} sv_log1p_data = {
+ /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1].
+ */
+ .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
+ 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
+ -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
+ 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
+ -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
+ 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
+ -0x1.cfa7385bdb37ep-6 },
+ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 },
+ .hf_rt2_top = 0x3fe6a09e00000000,
+ .one_m_hf_rt2_top = 0x00095f6200000000,
+ .bottom_mask = 0xffffffff,
+ .one_top = 0x3ff
+};
+
+static inline svfloat64_t
+sv_log1p_inline (svfloat64_t x, const svbool_t pg)
+{
+ /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which
+ differs from v_log1p_2u5.c by:
+ - No special-case handling - this should be dealt with by the caller.
+ - Pairwise Horner polynomial evaluation for improved accuracy.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+ using svsel, for improved accuracy when the argument to log1p is close
+ to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1
+ in the source of the caller before including this file.
+ See sv_log1p_2u1.c for details of the algorithm. */
+ const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data);
+ svfloat64_t m = svadd_x (pg, x, 1);
+ svuint64_t mi = svreinterpret_u64 (m);
+ svuint64_t u = svadd_x (pg, mi, d->one_m_hf_rt2_top);
+
+ svint64_t ki
+ = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), d->one_top);
+ svfloat64_t k = svcvt_f64_x (pg, ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ svuint64_t utop
+ = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hf_rt2_top);
+ svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, d->bottom_mask));
+ svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1);
+
+ /* Correction term c/m. */
+ svfloat64_t c = svsub_x (pg, x, svsub_x (pg, m, 1));
+ svfloat64_t cm;
+
+#ifndef WANT_SV_LOG1P_K0_SHORTCUT
+#error \
+ "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_SV_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ svbool_t knot0 = svcmpne (pg, k, 0);
+ cm = svdiv_z (knot0, c, m);
+ if (__glibc_likely (!svptest_all (pg, knot0)))
+ {
+ f = svsel (knot0, f, x);
+ }
+#else
+ /* No shortcut. */
+ cm = svdiv_x (pg, c, m);
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
+ svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]);
+ svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]);
+
+ return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
+}
+
+#endif
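
The reduction above is easier to follow in scalar form. Below is a minimal
sketch of the same bit manipulation, not part of the patch: the constants are
copied from the table above, and libm's log1p stands in for the degree-19
polynomial.

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Scalar sketch of sv_log1p_inline's reduction: write 1 + x as
   m ~= 2^k * (1 + f) with 1 + f in [sqrt(2)/2, sqrt(2)), so that
   log1p(x) ~= k*ln2 + log1p(f) + c/m,
   where c = x - (m - 1) recovers the rounding error of m = 1 + x.  */
static double
log1p_sketch (double x)
{
  double m = x + 1.0;
  uint64_t mi;
  memcpy (&mi, &m, sizeof mi);
  uint64_t u = mi + 0x00095f6200000000; /* one_m_hf_rt2_top.  */
  int64_t k = (int64_t) (u >> 52) - 0x3ff;
  /* Splice the reduced exponent onto m's mantissa bits.  */
  uint64_t u_red = ((u & 0x000fffff00000000) + 0x3fe6a09e00000000)
                   | (mi & 0xffffffff);
  double f;
  memcpy (&f, &u_red, sizeof f);
  f -= 1.0;
  double cm = (x - (m - 1.0)) / m; /* Correction term c/m.  */
  return (double) k * 0x1.62e42fefa39efp-1 + log1p (f) + cm;
}
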
diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
new file mode 100644
index 0000000000000000..b94b2da055a6c59b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
@@ -0,0 +1,76 @@
+/* Helper for single-precision SVE routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_SV_LOG1PF_INLINE_H
+#define AARCH64_FPU_SV_LOG1PF_INLINE_H
+
+#include "sv_math.h"
+#include "vecmath_config.h"
+#include "poly_sve_f32.h"
+
+static const struct sv_log1pf_data
+{
+ float32_t poly[9];
+ float32_t ln2;
+ float32_t scale_back;
+} sv_log1pf_data = {
+ /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
+ .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
+ -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
+ -0x1.6f0d5ep-5f },
+ .scale_back = 0x1.0p-23f,
+ .ln2 = 0x1.62e43p-1f,
+};
+
+static inline svfloat32_t
+eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg)
+{
+ svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1]));
+ svfloat32_t m2 = svmul_x (pg, m, m);
+ svfloat32_t q = svmla_x (pg, m, m2, p_12);
+ svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2);
+ p = svmul_x (pg, m2, p);
+
+ return svmla_x (pg, q, m2, p);
+}
+
+static inline svfloat32_t
+sv_log1pf_inline (svfloat32_t x, svbool_t pg)
+{
+ const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);
+
+ svfloat32_t m = svadd_x (pg, x, 1.0f);
+
+ svint32_t ks = svsub_x (pg, svreinterpret_s32 (m),
+ svreinterpret_s32 (svdup_f32 (0.75f)));
+ ks = svand_x (pg, ks, 0xff800000);
+ svuint32_t k = svreinterpret_u32 (ks);
+ svfloat32_t s = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k));
+
+ svfloat32_t m_scale
+ = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k));
+ m_scale
+ = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
+ svfloat32_t p = eval_poly (m_scale, d->poly, pg);
+ svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back);
+ return svmla_x (pg, p, scale_back, d->ln2);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index b37cb7d5e9c0d96a..f4ce1d70096888aa 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -24,6 +24,7 @@
#define VEC_TYPE float64x2_t
VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
+VPCS_VECTOR_WRAPPER (acosh_advsimd, _ZGVnN2v_acosh)
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 011f07d2c15b148f..0e973cc9d7ade813 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -43,6 +43,7 @@
}
SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
+SVE_VECTOR_WRAPPER (acosh_sve, _ZGVsMxv_acosh)
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 35452991431e238a..0ce026b5ea96a064 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -24,6 +24,7 @@
#define VEC_TYPE float32x4_t
VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
+VPCS_VECTOR_WRAPPER (acoshf_advsimd, _ZGVnN4v_acoshf)
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index bbc74ede88c9e6c8..398b7373e800cd5b 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -43,6 +43,7 @@
}
SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
+SVE_VECTOR_WRAPPER (acoshf_sve, _ZGVsMxv_acoshf)
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h
new file mode 100644
index 0000000000000000..242e43b6eecc0b6e
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_log1p_inline.h
@@ -0,0 +1,103 @@
+/* Helper for double-precision Advanced SIMD routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_LOG1P_INLINE_H
+#define AARCH64_FPU_V_LOG1P_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+struct v_log1p_data
+{
+ float64x2_t poly[19], ln2[2];
+ uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
+ int64x2_t one_top;
+};
+
+/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
+#define V_LOG1P_CONSTANTS_TABLE \
+ { \
+ .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \
+ V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \
+ V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \
+ V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \
+ V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \
+ V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \
+ V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \
+ V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \
+ V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \
+ V2 (-0x1.cfa7385bdb37ep-6) }, \
+ .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \
+ .hf_rt2_top = V2 (0x3fe6a09e00000000), \
+ .one_m_hf_rt2_top = V2 (0x00095f6200000000), \
+ .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
+ }
+
+#define BottomMask v_u64 (0xffffffff)
+
+static inline float64x2_t
+log1p_inline (float64x2_t x, const struct v_log1p_data *d)
+{
+ /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
+ modifications:
+ - No special-case handling - this should be dealt with by the caller.
+ - Pairwise Horner polynomial evaluation for improved accuracy.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+ using v_sel, for improved accuracy when the argument to log1p is close to
+ 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
+ the source of the caller before including this file.
+ See v_log1pf_2u1.c for details of the algorithm. */
+ float64x2_t m = vaddq_f64 (x, v_f64 (1));
+ uint64x2_t mi = vreinterpretq_u64_f64 (m);
+ uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
+
+ int64x2_t ki
+ = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
+ float64x2_t k = vcvtq_f64_s64 (ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
+ uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
+
+ /* Correction term c/m. */
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
+
+#ifndef WANT_V_LOG1P_K0_SHORTCUT
+#error \
+ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_V_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ uint64x2_t k0 = vceqzq_f64 (k);
+ cm = v_zerofy_f64 (cm, k0);
+ f = vbslq_f64 (k0, x, f);
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
+ float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
+ float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
+ return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h
new file mode 100644
index 0000000000000000..643a6cdcfc498970
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h
@@ -0,0 +1,78 @@
+/* Helper for single-precision Advanced SIMD routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_LOG1PF_INLINE_H
+#define AARCH64_FPU_V_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+struct v_log1pf_data
+{
+ float32x4_t poly[8], ln2;
+ uint32x4_t four;
+ int32x4_t three_quarters;
+};
+
+/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
+ (1, -0.5) are not stored as they can be generated more efficiently. */
+#define V_LOG1PF_CONSTANTS_TABLE \
+ { \
+ .poly \
+ = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
+ V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
+ V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
+ }
+
+static inline float32x4_t
+eval_poly (float32x4_t m, const float32x4_t *c)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
+ uses split Estrin, but this way reduces register pressure in the calling
+ routine). */
+ float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
+ float32x4_t m2 = vmulq_f32 (m, m);
+ q = vfmaq_f32 (m, m2, q);
+ float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
+ p = vmulq_f32 (m2, p);
+ return vfmaq_f32 (q, m2, p);
+}
+
+static inline float32x4_t
+log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
+{
+ /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+ special-case handling. See that file for details of the algorithm. */
+ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+ int32x4_t k
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
+ v_s32 (0xff800000));
+ uint32x4_t ku = vreinterpretq_u32_s32 (k);
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
+ float32x4_t m_scale
+ = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+ m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+ float32x4_t p = eval_poly (m_scale, d.poly);
+ float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
+ return vfmaq_f32 (p, scale_back, d.ln2);
+}
+
+#endif
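
The single-precision variant above packs the same idea into exponent-mask
tricks. A scalar rendering, illustrative only and not part of the patch
(libm's log1pf stands in for the polynomial):

#include <math.h>
#include <stdint.h>
#include <string.h>

static uint32_t asuint (float f) { uint32_t u; memcpy (&u, &f, 4); return u; }
static float asfloat (uint32_t u) { float f; memcpy (&f, &u, 4); return f; }

/* Scalar sketch of log1pf_inline: choose k = e * 2^23 so that
   (1 + x) * 2^-e lands near [0.75, 1.5), then
   log1pf(x) ~= e*ln2 + log1pf((1 + x) * 2^-e - 1).  */
static float
log1pf_sketch (float x)
{
  float m = x + 1.0f;
  uint32_t k = (asuint (m) - asuint (0.75f)) & 0xff800000;
  float s = asfloat (asuint (4.0f) - k);      /* s = 2^(2-e).  */
  float m_scale = asfloat (asuint (x) - k)    /* x * 2^-e ...  */
                  + (0.25f * s - 1.0f);       /* ... + (2^-e - 1).  */
  float e = (float) (int32_t) k * 0x1.0p-23f;
  return log1pf (m_scale) + e * 0x1.62e43p-1f;
}
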
diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h
index d4d78bc4027abebb..12824fce8c698cf4 100644
--- a/sysdeps/aarch64/fpu/v_math.h
+++ b/sysdeps/aarch64/fpu/v_math.h
@@ -108,6 +108,11 @@ v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
p[2] ? f (x1[2], x2[2]) : y[2],
p[3] ? f (x1[3], x2[3]) : y[3] };
}
+static inline float32x4_t
+v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
+{
+ return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
+}
static inline float64x2_t
v_f64 (double x)
@@ -167,5 +172,10 @@ v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
return (float64x2_t){ p[0] ? f (x1[0], x2[0]) : y[0],
p[1] ? f (x1[1], x2[1]) : y[1] };
}
+static inline float64x2_t
+v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
+{
+ return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
+}
#endif
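
The new v_zerofy helpers clear selected lanes with a single BIC on the bit
pattern; this is how the k0 shortcut above zeroes the correction term. A
small standalone usage example (assumes an AArch64 compiler with NEON):

#include <arm_neon.h>
#include <stdio.h>

/* Same definition as the v_zerofy_f64 added above.  */
static float64x2_t
v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
{
  return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
}

int
main (void)
{
  float64x2_t cm = vsetq_lane_f64 (-2.0, vdupq_n_f64 (3.5), 1);
  float64x2_t k = vsetq_lane_f64 (1.0, vdupq_n_f64 (0.0), 1);
  /* Zero the correction term in lanes where k == 0.  */
  float64x2_t y = v_zerofy_f64 (cm, vceqzq_f64 (k));
  /* Prints "0 -2": lane 0 (k == 0) is cleared, lane 1 is kept.  */
  printf ("%g %g\n", vgetq_lane_f64 (y, 0), vgetq_lane_f64 (y, 1));
  return 0;
}
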
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 48d747ad5793be96..1646cdbdd22d93d9 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -34,11 +34,19 @@ double: 2
float: 2
ldouble: 4
+Function: "acosh_advsimd":
+double: 2
+float: 2
+
Function: "acosh_downward":
double: 2
float: 2
ldouble: 3
+Function: "acosh_sve":
+double: 2
+float: 2
+
Function: "acosh_towardzero":
double: 2
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index f66da42c3630bf48..f5aaa519f2c8663e 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -73,12 +73,17 @@ GLIBC_2.39 _ZGVsMxv_tan F
GLIBC_2.39 _ZGVsMxv_tanf F
GLIBC_2.39 _ZGVsMxvv_atan2 F
GLIBC_2.39 _ZGVsMxvv_atan2f F
+GLIBC_2.40 _ZGVnN2v_acosh F
+GLIBC_2.40 _ZGVnN2v_acoshf F
GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
+GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
+GLIBC_2.40 _ZGVsMxv_acosh F
+GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F

303
glibc-RHEL-118273-30.patch Normal file

@@ -0,0 +1,303 @@
commit c0ff447edf19bd4630fe79adf5e8b896405b059f
Author: Luna Lamb <luna.lamb@arm.com>
Date: Thu Feb 13 17:54:46 2025 +0000
Aarch64: Improve codegen in SVE exp and users, and update expf_inline
Use unpredicated muls and improve memory access.
7%, 3% and 1% improvement in the throughput microbenchmark on Neoverse V1,
for exp, exp2 and cosh respectively.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
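
The recurring change below replaces svmul_x (pg, ...) with
svmul_x (svptrue_b64 (), ...). A minimal sketch of the two shapes, compiled
with -march=armv8-a+sve (illustrative, not part of the patch):

#include <arm_sve.h>

/* With a runtime predicate, compilers typically keep the predicated FMUL
   (and may need a MOVPRFX), even though the _x form makes inactive lanes
   don't-care.  */
svfloat64_t
square_predicated (svbool_t pg, svfloat64_t r)
{
  return svmul_x (pg, r, r);
}

/* With an explicit all-true predicate, the unpredicated FMUL encoding can
   be used, which is the point of this patch.  */
svfloat64_t
square_unpredicated (svfloat64_t r)
{
  return svmul_x (svptrue_b64 (), r, r);
}
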
diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
index 919f34604a452b4a..e375dd8a3407feb2 100644
--- a/sysdeps/aarch64/fpu/cosh_sve.c
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -23,7 +23,7 @@ static const struct data
{
float64_t poly[3];
float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
- uint64_t index_mask, special_bound;
+ uint64_t special_bound;
} data = {
.poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
0x1.5555576a59599p-5, },
@@ -35,14 +35,16 @@ static const struct data
.shift = 0x1.8p+52,
.thres = 704.0,
- .index_mask = 0xff,
/* 0x1.6p9, above which exp overflows. */
.special_bound = 0x4086000000000000,
};
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special)
{
+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
+ svfloat64_t y = svadd_x (pg, half_t, half_over_t);
return sv_call_f64 (cosh, x, y, special);
}
@@ -60,12 +62,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
svuint64_t u = svreinterpret_u64 (z);
svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
- svuint64_t i = svand_x (pg, u, d->index_mask);
+ svuint64_t i = svand_x (svptrue_b64 (), u, 0xff);
svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
y = svmla_x (pg, sv_f64 (1.0), r, y);
- y = svmul_x (pg, r, y);
+ y = svmul_x (svptrue_b64 (), r, y);
/* s = 2^(n/N). */
u = svld1_gather_index (pg, __v_exp_tail_data, i);
@@ -94,12 +96,12 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
/* Up to the point that exp overflows, we can use it to calculate cosh by
exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
svfloat64_t t = exp_inline (ax, pg, d);
- svfloat64_t half_t = svmul_x (pg, t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
/* Fall back to scalar for any special cases. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+ return special_case (x, pg, t, special);
+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
return svadd_x (pg, half_t, half_over_t);
}
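
For reference, the identity these cosh changes rely on, in scalar form (a
sketch only; the vector routine computes exp via exp_inline and defers
special cases to the fallback above):

#include <math.h>

/* For |x| below the overflow threshold, cosh(x) = t/2 + 0.5/t with
   t = exp(|x|); svdivr_x (pg, t, 0.5) is the 0.5/t term.  */
static double
cosh_sketch (double x)
{
  double t = exp (fabs (x));
  return 0.5 * t + 0.5 / t;
}
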
diff --git a/sysdeps/aarch64/fpu/exp10_sve.c b/sysdeps/aarch64/fpu/exp10_sve.c
index ddf64708cb1773cd..bfd3fb9e1948a3b8 100644
--- a/sysdeps/aarch64/fpu/exp10_sve.c
+++ b/sysdeps/aarch64/fpu/exp10_sve.c
@@ -18,21 +18,23 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
#define SpecialBound 307.0 /* floor (log10 (2^1023)). */
static const struct data
{
- double poly[5];
+ double c1, c3, c2, c4, c0;
double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound;
} data = {
/* Coefficients generated using Remez algorithm.
rel error: 0x1.9fcb9b3p-60
abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ]
max ulp err 0.52 +0.5. */
- .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1,
- 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 },
+ .c0 = 0x1.26bb1bbb55516p1,
+ .c1 = 0x1.53524c73cd32ap1,
+ .c2 = 0x1.0470591daeafbp1,
+ .c3 = 0x1.2bd77b1361ef6p0,
+ .c4 = 0x1.142b5d54e9621p-1,
/* 1.5*2^46+1023. This value is further explained below. */
.shift = 0x1.800000000ffc0p+46,
.log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */
@@ -70,9 +72,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, d->scale_thres);
- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
return svsel (p_cmp, r1, r0);
}
@@ -103,11 +105,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg)
comes at significant performance cost. */
svuint64_t u = svreinterpret_u64 (z);
svfloat64_t scale = svexpa (u);
-
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
/* Approximate exp10(r) using polynomial. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2,
- sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1));
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ svfloat64_t p14 = svmla_x (pg, p12, p34, r2);
+
+ svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14);
/* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound
multiplication may overflow, so use special case routine. */
diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c
index 22848ebfa5ac21d8..5dfb77cdbc2f6a51 100644
--- a/sysdeps/aarch64/fpu/exp2_sve.c
+++ b/sysdeps/aarch64/fpu/exp2_sve.c
@@ -18,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
#define N (1 << V_EXP_TABLE_BITS)
@@ -27,15 +26,15 @@
static const struct data
{
- double poly[4];
+ double c0, c2;
+ double c1, c3;
double shift, big_bound, uoflow_bound;
} data = {
/* Coefficients are computed using Remez algorithm with
minimisation of the absolute error. */
- .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5,
- 0x1.3b2abf5571ad8p-7 },
- .shift = 0x1.8p52 / N,
- .uoflow_bound = UOFlowBound,
+ .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3,
+ .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7,
+ .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound,
.big_bound = BigBound,
};
@@ -67,9 +66,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
return svsel (p_cmp, r1, r0);
}
@@ -99,11 +98,14 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
/* Approximate exp2(r) using polynomial. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly);
- svfloat64_t y = svmul_x (pg, r, p);
-
+ /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+ svfloat64_t p = svmla_x (pg, p01, p23, r2);
+ svfloat64_t y = svmul_x (svptrue_b64 (), r, p);
/* Assemble exp2(x) = exp2(r) * scale. */
if (__glibc_unlikely (svptest_any (pg, special)))
return special_case (pg, scale, y, kd, d);
diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c
index aabaaa1d61dbab27..b2421d493f2e119f 100644
--- a/sysdeps/aarch64/fpu/exp_sve.c
+++ b/sysdeps/aarch64/fpu/exp_sve.c
@@ -21,12 +21,15 @@
static const struct data
{
- double poly[4];
+ double c0, c2;
+ double c1, c3;
double ln2_hi, ln2_lo, inv_ln2, shift, thres;
+
} data = {
- .poly = { /* ulp error: 0.53. */
- 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5,
- 0x1.1111266d28935p-7 },
+ .c0 = 0x1.fffffffffdbcdp-2,
+ .c1 = 0x1.555555555444cp-3,
+ .c2 = 0x1.555573c6a9f7dp-5,
+ .c3 = 0x1.1111266d28935p-7,
.ln2_hi = 0x1.62e42fefa3800p-1,
.ln2_lo = 0x1.ef35793c76730p-45,
/* 1/ln2. */
@@ -36,7 +39,6 @@ static const struct data
.thres = 704.0,
};
-#define C(i) sv_f64 (d->poly[i])
#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
@@ -56,20 +58,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
svuint64_t b
= svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
- /* Set s1 to generate overflow depending on sign of exponent n. */
- svfloat64_t s1 = svreinterpret_f64 (
- svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */
- /* Offset s to avoid overflow in final result if n is below threshold. */
+ /* Set s1 to generate overflow depending on sign of exponent n,
+ ie. s1 = 0x70...0 - b. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold.
+ ie. s2 = as_u64 (s) - 0x3010...0 + b. */
svfloat64_t s2 = svreinterpret_f64 (
- svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2),
- b)); /* as_u64 (s) - 0x3010...0 + b. */
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, 1280.0);
- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
return svsel (p_cmp, r1, r0);
}
@@ -103,16 +105,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
svuint64_t u = svreinterpret_u64 (z);
svfloat64_t n = svsub_x (pg, z, d->shift);
-
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
/* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */
svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
svfloat64_t r = svmls_lane (x, n, ln2, 0);
r = svmls_lane (r, n, ln2, 1);
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t p01 = svmla_x (pg, C (0), C (1), r);
- svfloat64_t p23 = svmla_x (pg, C (2), C (3), r);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
svfloat64_t y = svmla_x (pg, r, p04, r2);
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index 6166df65533555a6..75781fb4ddcb9790 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -61,7 +61,7 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
/* scale = 2^(n/N). */
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */
svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
@@ -71,5 +71,4 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
return svmla_x (pg, scale, scale, poly);
}
-
#endif

194
glibc-RHEL-118273-31.patch Normal file

@@ -0,0 +1,194 @@
commit 8f0e7fe61e0a2ad5ed777933703ce09053810ec4
Author: Luna Lamb <luna.lamb@arm.com>
Date: Thu Feb 13 17:52:09 2025 +0000
Aarch64: Improve codegen in SVE asinh
Use unpredicated muls, use lanewise MLAs, and improve memory access.
1% regression in throughput microbenchmark on Neoverse V1.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
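
The key idiom in this patch is packing coefficient pairs so that one svld1rq
broadcast feeds two lanewise multiply-adds. A minimal sketch of one
pairwise-Horner step under that layout (coefficients c1/c3 copied from the
diff below; the even coefficients are passed in):

#include <arm_sve.h>

/* c1 and c3 adjacent, so svld1rq broadcasts both into each 128-bit
   segment at once.  */
static const double c13[2] = { 0x1.3333333326c7p-4, 0x1.f1c71b26fb40dp-6 };

/* One pairwise-Horner step: (e0 + c1*x2) + x4 * (e1 + c3*x2).  */
svfloat64_t
pairwise_step (svbool_t pg, svfloat64_t x2, svfloat64_t x4,
               svfloat64_t e0, svfloat64_t e1)
{
  svfloat64_t c = svld1rq (svptrue_b64 (), c13);
  svfloat64_t p01 = svmla_lane (e0, x2, c, 0); /* e0 + x2 * c1  */
  svfloat64_t p23 = svmla_lane (e1, x2, c, 1); /* e1 + x2 * c3  */
  return svmla_x (pg, p01, x4, p23);
}
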
diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c
index 28dc5c458750bac4..fe8715e06c92ac51 100644
--- a/sysdeps/aarch64/fpu/asinh_sve.c
+++ b/sysdeps/aarch64/fpu/asinh_sve.c
@@ -18,36 +18,49 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
#define SignMask (0x8000000000000000)
#define One (0x3ff0000000000000)
#define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). */
+#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1)
static const struct data
{
- double poly[18];
- double ln2, p3, p1, p4, p0, p2;
- uint64_t n;
- uint64_t off;
+ double even_coeffs[9];
+ double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17;
+ uint64_t off, mask;
} data = {
- /* Polynomial generated using Remez on [2^-26, 1]. */
- .poly
- = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
- 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
- -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
- 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
- -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
- 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 },
+ /* Polynomial generated using Remez on [2^-26, 1]. */
+ .even_coeffs ={
+ -0x1.55555555554a7p-3,
+ -0x1.6db6db68332e6p-5,
+ -0x1.6e8b8b654a621p-6,
+ -0x1.c9871d10885afp-7,
+ -0x1.3ddca533e9f54p-7,
+ -0x1.b90c7099dd397p-8,
+ -0x1.d217026a669ecp-9,
+ -0x1.e0f37daef9127p-11,
+ -0x1.021a48685e287p-14, },
+
+ .c1 = 0x1.3333333326c7p-4,
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c9 = 0x1.0becef748dafcp-7,
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c17 = 0x1.93d4ba83d34dap-18,
+
.ln2 = 0x1.62e42fefa39efp-1,
.p0 = -0x1.ffffffffffff7p-2,
.p1 = 0x1.55555555170d4p-2,
.p2 = -0x1.0000000399c27p-2,
.p3 = 0x1.999b2e90e94cap-3,
.p4 = -0x1.554e550bd501ep-3,
- .n = 1 << V_LOG_TABLE_BITS,
- .off = 0x3fe6900900000000
+ .off = 0x3fe6900900000000,
+ .mask = 0xfffULL << 52,
};
static svfloat64_t NOINLINE
@@ -64,11 +77,10 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
of the algorithm used. */
svuint64_t ix = svreinterpret_u64 (x);
- svuint64_t tmp = svsub_x (pg, ix, d->off);
- svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)),
- (d->n - 1) << 1);
- svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+ svuint64_t i_off = svsub_x (pg, ix, d->off);
+ svuint64_t i
+ = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask);
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask));
svfloat64_t z = svreinterpret_f64 (iz);
svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
@@ -78,14 +90,14 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1);
svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
- svfloat64_t kd = svcvt_f64_x (pg, k);
+ svfloat64_t kd
+ = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52));
svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0);
- svfloat64_t r2 = svmul_x (pg, r, r);
-
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1);
-
svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0);
+
y = svmla_lane (y, r2, p1_p4, 1);
y = svmla_x (pg, p, r2, y);
y = svmla_x (pg, hi, r2, y);
@@ -111,7 +123,6 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
svuint64_t iax = svbic_x (pg, ix, SignMask);
svuint64_t sign = svand_x (pg, ix, SignMask);
svfloat64_t ax = svreinterpret_f64 (iax);
-
svbool_t ge1 = svcmpge (pg, iax, One);
svbool_t special = svcmpge (pg, iax, Thres);
@@ -120,7 +131,7 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
svfloat64_t option_1 = sv_f64 (0);
if (__glibc_likely (svptest_any (pg, ge1)))
{
- svfloat64_t x2 = svmul_x (pg, ax, ax);
+ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
option_1 = __sv_log_inline (
svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg);
}
@@ -130,21 +141,53 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
The largest observed error in this region is 1.51 ULPs:
_ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
want 0x1.c1e649ee2681dp-1. */
+
svfloat64_t option_2 = sv_f64 (0);
if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1))))
{
- svfloat64_t x2 = svmul_x (pg, ax, ax);
- svfloat64_t x4 = svmul_x (pg, x2, x2);
- svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly);
- option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
+ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
+ svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2);
+ /* Order-17 Pairwise Horner scheme. */
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
+ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
+ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
+
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1);
+ svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0);
+ svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1);
+ svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0);
+ svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1);
+ svfloat64_t p1213
+ = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0);
+ svfloat64_t p1415
+ = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1);
+ svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17);
+
+ svfloat64_t p = svmla_x (pg, p1415, x4, p1617);
+ p = svmla_x (pg, p1213, x4, p);
+ p = svmla_x (pg, p1011, x4, p);
+ p = svmla_x (pg, p89, x4, p);
+
+ p = svmla_x (pg, p67, x4, p);
+ p = svmla_x (pg, p45, x4, p);
+
+ p = svmla_x (pg, p23, x4, p);
+
+ p = svmla_x (pg, p01, x4, p);
+
+ option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax));
}
- /* Choose the right option for each lane. */
- svfloat64_t y = svsel (ge1, option_1, option_2);
-
if (__glibc_unlikely (svptest_any (pg, special)))
return special_case (
- x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)),
+ x,
+ svreinterpret_f64 (sveor_x (
+ pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)),
special);
+
+ /* Choose the right option for each lane. */
+ svfloat64_t y = svsel (ge1, option_1, option_2);
return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
}

531
glibc-RHEL-118273-32.patch Normal file

@@ -0,0 +1,531 @@
commit ce2f26a22e6b6f5c108d156afd9b43a452bb024c
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Tue Dec 31 18:07:36 2024 +0000
AArch64: Remove PTR_ARG/SIZE_ARG defines
This series removes various ILP32 defines that are no longer needed.
Remove PTR_ARG/SIZE_ARG.
Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
Conflicts:
sysdeps/aarch64/dl-start.S
(Fixup context to apply without out-of-scope dependency 01f52b11de)
sysdeps/aarch64/multiarch/memcpy_thunderx.S
(Dropped by upstream commit e162ab2)
sysdeps/aarch64/multiarch/memcpy_oryon1.S
(Skipped: file from 4dc83cac is out-of-scope)
sysdeps/aarch64/multiarch/memset_oryon1.S
(Skipped: file from 2f1f7a5f is out-of-scope)
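
For context: under the ILP32 ABI, pointer and size arguments arrive as
32-bit values whose upper halves must be zeroed before 64-bit address
arithmetic; on LP64 the macros expanded to nothing, which is why every use
can simply be deleted. Their definition was approximately the following
(from memory, illustrative):

/* sysdeps/aarch64/sysdep.h (approximate): writing a W register
   zero-extends into the corresponding X register.  */
#ifdef __ILP32__
# define PTR_ARG(n)  mov w##n, w##n
# define SIZE_ARG(n) mov w##n, w##n
#else
# define PTR_ARG(n)
# define SIZE_ARG(n)
#endif
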
diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 7b6add751e6bd96b..452ba0da6d788ce8 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -47,8 +47,6 @@ ENTRY (__longjmp)
cfi_offset(d14, JB_D14<<3)
cfi_offset(d15, JB_D15<<3)
- PTR_ARG (0)
-
#if IS_IN(libc)
/* Disable ZA state of SME in libc.a and libc.so, but not in ld.so. */
# if HAVE_AARCH64_PAC_RET
diff --git a/sysdeps/aarch64/__mtag_tag_region.S b/sysdeps/aarch64/__mtag_tag_region.S
index 22e8d8b75372c8aa..90ac17ced4801f21 100644
--- a/sysdeps/aarch64/__mtag_tag_region.S
+++ b/sysdeps/aarch64/__mtag_tag_region.S
@@ -40,9 +40,6 @@
#define zva_val x4
ENTRY (__libc_mtag_tag_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
add dstend, dstin, count
cmp count, 96
diff --git a/sysdeps/aarch64/__mtag_tag_zero_region.S b/sysdeps/aarch64/__mtag_tag_zero_region.S
index 566698e9146e7da8..e975a2f8bdb85ae0 100644
--- a/sysdeps/aarch64/__mtag_tag_zero_region.S
+++ b/sysdeps/aarch64/__mtag_tag_zero_region.S
@@ -40,9 +40,6 @@
#define zva_val x4
ENTRY (__libc_mtag_tag_zero_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
add dstend, dstin, count
cmp count, 96
diff --git a/sysdeps/aarch64/dl-start.S b/sysdeps/aarch64/dl-start.S
index d645484e79858013..b7ac6c31432e07c9 100644
--- a/sysdeps/aarch64/dl-start.S
+++ b/sysdeps/aarch64/dl-start.S
@@ -26,7 +26,6 @@ ENTRY (_start)
mov x30, #0
mov x0, sp
- PTR_ARG (0)
bl _dl_start
/* Returns user entry point in x0. */
mov PTR_REG (21), PTR_REG (0)
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 9b253b39dd1d9d46..0aeaf64edd2594f1 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -75,7 +75,6 @@
.align 2
_dl_tlsdesc_return:
BTI_C
- PTR_ARG (0)
ldr PTR_REG (0), [x0, #PTR_SIZE]
RET
cfi_endproc
@@ -99,7 +98,6 @@ _dl_tlsdesc_undefweak:
BTI_C
str x1, [sp, #-16]!
cfi_adjust_cfa_offset (16)
- PTR_ARG (0)
ldr PTR_REG (0), [x0, #PTR_SIZE]
mrs x1, tpidr_el0
sub PTR_REG (0), PTR_REG (0), PTR_REG (1)
@@ -145,7 +143,6 @@ _dl_tlsdesc_undefweak:
.align 2
_dl_tlsdesc_dynamic:
BTI_C
- PTR_ARG (0)
/* Save just enough registers to support fast path, if we fall
into slow path we will save additional registers. */
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index a9fa40519c78b7df..7173c7fafa7d6eb5 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -57,8 +57,6 @@
exactly which byte matched. */
ENTRY (MEMCHR)
- PTR_ARG (0)
- SIZE_ARG (2)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 5afa79494bf9cb7f..68dfa604f4b1bd43 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -44,10 +44,6 @@
ENTRY (memcmp)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp limit, 16
b.lo L(less16)
ldp data1, data3, [src1]
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index f21c21d3f2a21d89..fba93faeba52447f 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -70,10 +70,6 @@
from the end. */
ENTRY (MEMCPY)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
add srcend, src, count
add dstend, dstin, count
cmp count, 128
@@ -187,10 +183,6 @@ libc_hidden_builtin_def (MEMCPY)
ENTRY (MEMMOVE)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
add srcend, src, count
add dstend, dstin, count
cmp count, 128
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index c5274f5ebf595268..1bd3e230ca197581 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -55,8 +55,6 @@
exactly which byte matched. */
ENTRY (__memrchr)
- PTR_ARG (0)
- SIZE_ARG (2)
add end, srcin, cntin
sub endm1, end, 1
bic src, endm1, 15
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 71814d0b2f6dd3a7..496ad332882a7e3d 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -40,9 +40,6 @@
#define dstend2 x5
ENTRY (MEMSET)
- PTR_ARG (0)
- SIZE_ARG (2)
-
dup v0.16B, valw
cmp count, 16
b.lo L(set_small)
diff --git a/sysdeps/aarch64/multiarch/memchr_nosimd.S b/sysdeps/aarch64/multiarch/memchr_nosimd.S
index 0a65139b0810e95b..b47059de1ee61f71 100644
--- a/sysdeps/aarch64/multiarch/memchr_nosimd.S
+++ b/sysdeps/aarch64/multiarch/memchr_nosimd.S
@@ -60,9 +60,6 @@
ENTRY (__memchr_nosimd)
- PTR_ARG (0)
- SIZE_ARG (2)
-
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(none_chr)
diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
index d826aafd80ed7b0b..fa693f7c3a5c28a3 100644
--- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -96,10 +96,6 @@
ENTRY (__memcpy_a64fx)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cntb vlen
cmp n, vlen, lsl 1
b.hi L(copy_small)
@@ -236,10 +232,6 @@ END (__memcpy_a64fx)
ENTRY_ALIGN (__memmove_a64fx, 4)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
/* Fast case for up to 2 vectors. */
cntb vlen
cmp n, vlen, lsl 1
diff --git a/sysdeps/aarch64/multiarch/memcpy_mops.S b/sysdeps/aarch64/multiarch/memcpy_mops.S
index b094af3d22bc4aeb..2c426f008e699101 100644
--- a/sysdeps/aarch64/multiarch/memcpy_mops.S
+++ b/sysdeps/aarch64/multiarch/memcpy_mops.S
@@ -26,10 +26,6 @@
*/
ENTRY (__memcpy_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */
.inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */
diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S
index 3ce49d79ecdb94e0..26375b47174f1ba8 100644
--- a/sysdeps/aarch64/multiarch/memcpy_sve.S
+++ b/sysdeps/aarch64/multiarch/memcpy_sve.S
@@ -61,10 +61,6 @@
.arch armv8.2-a+sve
ENTRY (__memcpy_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp count, 128
b.hi L(copy_long)
cntb vlen
@@ -144,10 +140,6 @@ END (__memcpy_sve)
ENTRY (__memmove_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp count, 128
b.hi L(move_long)
cntb vlen
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index 5d8438a82ea2a3be..02ea27f356fe8ea1 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -67,10 +67,6 @@
ENTRY (__memmove_thunderx)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
sub tmp1, dstin, src
cmp count, 96
ccmp tmp1, count, 2, hi
diff --git a/sysdeps/aarch64/multiarch/memmove_mops.S b/sysdeps/aarch64/multiarch/memmove_mops.S
index 7df0d22454ead375..229fccd9d5a7abd2 100644
--- a/sysdeps/aarch64/multiarch/memmove_mops.S
+++ b/sysdeps/aarch64/multiarch/memmove_mops.S
@@ -26,10 +26,6 @@
*/
ENTRY (__memmove_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */
.inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index 2e6d882fc931a882..9ea329a82ae7d0f6 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -48,8 +48,6 @@
#define BTI_C
ENTRY (__memset_a64fx)
- PTR_ARG (0)
- SIZE_ARG (2)
cntb vector_length
dup z0.b, valw
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 6d714ed0e1b396ef..5c33280e0f8bf85a 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -28,9 +28,6 @@
ENTRY (__memset_emag)
- PTR_ARG (0)
- SIZE_ARG (2)
-
bfi valw, valw, 8, 8
bfi valw, valw, 16, 16
bfi val, val, 32, 32
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
index 7b215501376cbe03..93f3bfb8cf7238a5 100644
--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -28,9 +28,6 @@
ENTRY (__memset_kunpeng)
- PTR_ARG (0)
- SIZE_ARG (2)
-
dup v0.16B, valw
add dstend, dstin, count
diff --git a/sysdeps/aarch64/multiarch/memset_mops.S b/sysdeps/aarch64/multiarch/memset_mops.S
index e879c81ab2d047b1..f13a0b561078137e 100644
--- a/sysdeps/aarch64/multiarch/memset_mops.S
+++ b/sysdeps/aarch64/multiarch/memset_mops.S
@@ -26,9 +26,6 @@
*/
ENTRY (__memset_mops)
- PTR_ARG (0)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x19c10443 /* setp [x3]!, x2!, x1 */
.inst 0x19c14443 /* setm [x3]!, x2!, x1 */
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 67dcc94adc587928..3118cd00663b0b25 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -87,7 +87,6 @@
character, return the length, if not, continue in the main loop. */
ENTRY (__strlen_asimd)
- PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
cmp tmp1, MIN_PAGE_SIZE - 32
b.hi L(page_cross)
diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S
index 43fdb1b2fb1b7b78..92dc34e3e9a2650c 100644
--- a/sysdeps/aarch64/setjmp.S
+++ b/sysdeps/aarch64/setjmp.S
@@ -34,8 +34,6 @@ END (_setjmp)
libc_hidden_def (_setjmp)
ENTRY (__sigsetjmp)
- PTR_ARG (0)
-
1:
stp x19, x20, [x0, #JB_X19<<3]
stp x21, x22, [x0, #JB_X21<<3]
diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index ca4c99e6bf9ac960..bc57283361e172ab 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -52,7 +52,6 @@
If it is not a multiple of 4, there was no match. */
ENTRY (strchr)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index e1a1c7eb4383e0f6..09e092bf5f847a7f 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -51,7 +51,6 @@
exactly which byte matched. */
ENTRY (__strchrnul)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S
index 47f6fb1448c464bf..7bf87073be304e0f 100644
--- a/sysdeps/aarch64/strcmp.S
+++ b/sysdeps/aarch64/strcmp.S
@@ -62,8 +62,6 @@
NUL too in big-endian, byte-reverse the data before the NUL check. */
ENTRY(strcmp)
- PTR_ARG (0)
- PTR_ARG (1)
sub off2, src2, src1
mov zeroones, REP8_01
and tmp, src1, 7
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index 705354060055a45e..62fb0248fa5a7ba3 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -69,8 +69,6 @@
exactly which byte matched. */
ENTRY (STRCPY)
- PTR_ARG (0)
- PTR_ARG (1)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index 352fb40d3abbb44b..0d10b6efb7b31e54 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -49,7 +49,6 @@
identifies the first zero byte. */
ENTRY (STRLEN)
- PTR_ARG (0)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index e4fb3506a80756b3..2a2264c0e5427225 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -49,8 +49,6 @@
identifies the first zero byte. */
ENTRY (__strnlen)
- PTR_ARG (0)
- SIZE_ARG (1)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
index e52c9b275347978c..402bce444ef3bb28 100644
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@@ -55,7 +55,6 @@
if the relevant byte matched the NUL end of string. */
ENTRY (strrchr)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
movi vrepmask.16b, 0x33
diff --git a/sysdeps/unix/sysv/linux/aarch64/clone.S b/sysdeps/unix/sysv/linux/aarch64/clone.S
index 0e7ee24e68c85377..fed19acc2f78351f 100644
--- a/sysdeps/unix/sysv/linux/aarch64/clone.S
+++ b/sysdeps/unix/sysv/linux/aarch64/clone.S
@@ -33,12 +33,6 @@
*/
.text
ENTRY(__clone)
- PTR_ARG (0)
- PTR_ARG (1)
- PTR_ARG (3)
- PTR_ARG (4)
- PTR_ARG (5)
- PTR_ARG (6)
/* Save args for the child. */
mov x10, x0
mov x11, x2
diff --git a/sysdeps/unix/sysv/linux/aarch64/clone3.S b/sysdeps/unix/sysv/linux/aarch64/clone3.S
index 92d69a5430518cbc..9b00b6b8853e9b8b 100644
--- a/sysdeps/unix/sysv/linux/aarch64/clone3.S
+++ b/sysdeps/unix/sysv/linux/aarch64/clone3.S
@@ -36,10 +36,6 @@
.text
ENTRY(__clone3)
- PTR_ARG (0)
- PTR_ARG (1)
- PTR_ARG (3)
- PTR_ARG (4)
/* Save args for the child. */
mov x10, x0 /* cl_args */
mov x11, x2 /* func */
diff --git a/sysdeps/unix/sysv/linux/aarch64/getcontext.S b/sysdeps/unix/sysv/linux/aarch64/getcontext.S
index e5b69c9a82b7a448..862bd67aa484ae1a 100644
--- a/sysdeps/unix/sysv/linux/aarch64/getcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/getcontext.S
@@ -30,7 +30,6 @@
.text
ENTRY(__getcontext)
- PTR_ARG (0)
/* The saved context will return to the getcontext() call point
with a return value of 0 */
str xzr, [x0, oX0 + 0 * SZREG]
diff --git a/sysdeps/unix/sysv/linux/aarch64/setcontext.S b/sysdeps/unix/sysv/linux/aarch64/setcontext.S
index ba659438c564dc3b..8c072781cdf98c2b 100644
--- a/sysdeps/unix/sysv/linux/aarch64/setcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/setcontext.S
@@ -34,7 +34,6 @@
.text
ENTRY (__setcontext)
- PTR_ARG (0)
/* Save a copy of UCP. */
mov x9, x0
diff --git a/sysdeps/unix/sysv/linux/aarch64/swapcontext.S b/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
index f049140d35b79ba6..7000f220368bb094 100644
--- a/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
@@ -27,7 +27,6 @@
.text
ENTRY(__swapcontext)
- PTR_ARG (0)
/* Set the value returned when swapcontext() returns in this context.
And set up x1 to become the return address of the caller, so we
can return there with a normal RET instead of an indirect jump. */

113
glibc-RHEL-118273-33.patch Normal file

@@ -0,0 +1,113 @@
commit cf56eb28fa277d9dbb301654682ca89f71c30a48
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Tue Mar 18 17:07:31 2025 +0000
AArch64: Optimize algorithm in users of SVE expf helper
The polynomial order was unnecessarily high; reducing it unlocks multiple
optimizations.
Max error for the new SVE expf is 0.88 +0.5 ULP.
Max error for the new SVE coshf is 2.56 +0.5 ULP.
Performance improvement on Neoverse V1: expf (30%), coshf (26%).
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
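
In other words, FEXPA already supplies 2^n exactly, so once |r| is reduced
to at most ln2/128 the quadratic r + 0.5*r^2 reaches the quoted accuracy. A
condensed sketch of the resulting kernel (constants copied from the diff
below; special-case handling elided, illustrative only):

#include <arm_sve.h>

svfloat32_t
expf_sketch (svfloat32_t x, svbool_t pg)
{
  /* z = x/ln2 + shift.  The shift (1.5*2^17 + 127) quantizes n to
     multiples of 1/64 and pre-biases the bits that FEXPA consumes.  */
  svfloat32_t z = svmla_x (pg, svdup_f32 (0x1.803f8p17f), x, 0x1.715476p+0f);
  svfloat32_t n = svsub_x (pg, z, 0x1.803f8p17f);
  /* r = x - n*ln2, in two steps for accuracy; |r| <= ln2/128.  */
  svfloat32_t r = svmls_x (pg, x, n, 0x1.62e4p-1f);
  r = svmls_x (pg, r, n, 0x1.7f7d1cp-20f);
  /* scale = 2^n via FEXPA; a short polynomial is now sufficient.  */
  svfloat32_t scale = svexpa (svreinterpret_u32 (z));
  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
  svfloat32_t poly = svmla_x (pg, r, r2, 0.5f);
  return svmla_x (pg, scale, scale, poly);
}
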
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
index 7ad6efa0fc218278..508c0790ee89e0cd 100644
--- a/sysdeps/aarch64/fpu/coshf_sve.c
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -39,9 +39,9 @@ special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
}
/* Single-precision vector cosh, using vector expf.
- Maximum error is 2.77 ULP:
- _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
- want 0x1.e4594cp+2. */
+ Maximum error is 2.56 +0.5 ULP:
+ _ZGVsMxv_coshf(-0x1.5b40f4p+1) got 0x1.e47748p+2
+ want 0x1.e4774ep+2. */
svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
index da93e01b87e0e890..aee86a203379efb3 100644
--- a/sysdeps/aarch64/fpu/expf_sve.c
+++ b/sysdeps/aarch64/fpu/expf_sve.c
@@ -40,9 +40,9 @@ special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
}
/* Optimised single-precision SVE exp function.
- Worst-case error is 1.04 ulp:
- SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
- want 0x1.ba74bap+4. */
+ Worst-case error is 0.88 +0.50 ULP:
+ _ZGVsMxv_expf(-0x1.bba276p-6) got 0x1.f25288p-1
+ want 0x1.f2528ap-1. */
svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index 75781fb4ddcb9790..01fbb4d4c046eb3b 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -24,50 +24,40 @@
struct sv_expf_data
{
- float c1, c3, inv_ln2;
- float ln2_lo, c0, c2, c4;
- float ln2_hi, shift;
+ float ln2_hi, ln2_lo, c1, null;
+ float inv_ln2, shift;
};
-/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
+/* Shift is 1.5*2^17 + 127. */
#define SV_EXPF_DATA \
{ \
- /* Coefficients copied from the polynomial in AdvSIMD variant. */ \
- .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \
- .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \
- .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
- .shift = 0x1.803f8p17f, \
+ .c1 = 0.5f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
+ .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
}
-#define C(i) sv_f32 (d->poly[i])
-
static inline svfloat32_t
expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
{
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_hi);
/* n = round(x/(ln2/N)). */
svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
svfloat32_t n = svsub_x (pg, z, d->shift);
/* r = x - n*ln2/N. */
- svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
+ svfloat32_t r = x;
r = svmls_lane (r, n, lane_consts, 0);
+ r = svmls_lane (r, n, lane_consts, 1);
/* scale = 2^(n/N). */
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
- /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */
- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ /* poly(r) = exp(r) - 1 ~= r + 0.5 r^2. */
svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+ svfloat32_t poly = svmla_lane (r, r2, lane_consts, 2);
return svmla_x (pg, scale, scale, poly);
}

217
glibc-RHEL-118273-34.patch Normal file

@@ -0,0 +1,217 @@
commit 4352e2cc934b2874dba37397157bf890fcee455a
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Fri Mar 28 14:27:45 2025 -0300
aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
When libgcc is built with pac-ret, it needs to authenticate the
unwinding frame based on CFI information. The _dl_tlsdesc_dynamic
function uses a custom calling convention, where it is responsible for
saving and restoring all registers it might use (even volatile ones).
The pac-ret support added by 1be3d6eb823d8b952fa54b7bbc90cbecb8981380
covered only the slow path, but the fast path also emits DWARF
register rule instructions (cfi_adjust_cfa_offset) since it needs to
save/restore some auxiliary registers. This does not seem to be fully
supported by either libgcc or the AArch64 ABI [1].
Instead, move paciasp/autiasp to the function prologue/epilogue so they
are used on both the fast and slow paths.
I also corrected the _dl_tlsdesc_dynamic comment description; it had
been copied from the i386 implementation without any adjustment.
Checked on aarch64-linux-gnu with a toolchain built with
--enable-standard-branch-protection on a system with pac-ret
support.
[1] https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst#id1
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
Conflicts:
sysdeps/unix/sysv/linux/aarch64/Makefile
(Fixup context to apply without out-of-scope dependency f4d00dd60d)
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 0aeaf64edd2594f1..36195c956855e024 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -119,20 +119,19 @@ _dl_tlsdesc_undefweak:
object referenced by the argument.
ptrdiff_t
- __attribute__ ((__regparm__ (1)))
_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
{
struct tlsdesc_dynamic_arg *td = tdp->arg;
- dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
+ dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer() + TCBHEAD_DTV);
if (__builtin_expect (td->gen_count <= dtv[0].counter
&& (dtv[td->tlsinfo.ti_module].pointer.val
!= TLS_DTV_UNALLOCATED),
1))
return dtv[td->tlsinfo.ti_module].pointer.val
+ td->tlsinfo.ti_offset
- - __thread_pointer;
+ - __thread_pointer();
- return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+ return __tls_get_addr (&td->tlsinfo) - __thread_pointer();
}
*/
@@ -142,7 +141,12 @@ _dl_tlsdesc_undefweak:
cfi_startproc
.align 2
_dl_tlsdesc_dynamic:
+# if HAVE_AARCH64_PAC_RET
+ PACIASP
+ cfi_window_save
+# else
BTI_C
+# endif
/* Save just enough registers to support fast path, if we fall
into slow path we will save additional registers. */
@@ -173,6 +177,10 @@ _dl_tlsdesc_dynamic:
1:
ldp x3, x4, [sp, #16]
ldp x1, x2, [sp], #32
+# if HAVE_AARCH64_PAC_RET
+ AUTIASP
+ cfi_window_save
+# endif
cfi_adjust_cfa_offset (-32)
RET
2:
@@ -182,10 +190,6 @@ _dl_tlsdesc_dynamic:
/* Save the remaining registers that we must treat as caller save. */
cfi_restore_state
-# if HAVE_AARCH64_PAC_RET
- PACIASP
- cfi_window_save
-# endif
# define NSAVEXREGPAIRS 8
stp x29, x30, [sp,#-16*NSAVEXREGPAIRS]!
cfi_adjust_cfa_offset (16*NSAVEXREGPAIRS)
@@ -236,10 +240,6 @@ _dl_tlsdesc_dynamic:
cfi_adjust_cfa_offset (-16*NSAVEXREGPAIRS)
cfi_restore (x29)
cfi_restore (x30)
-# if HAVE_AARCH64_PAC_RET
- AUTIASP
- cfi_window_save
-# endif
b 1b
cfi_endproc
.size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
diff --git a/sysdeps/unix/sysv/linux/aarch64/Makefile b/sysdeps/unix/sysv/linux/aarch64/Makefile
index 40b9a2e5dea1ea89..607a0c56d8dfad8d 100644
--- a/sysdeps/unix/sysv/linux/aarch64/Makefile
+++ b/sysdeps/unix/sysv/linux/aarch64/Makefile
@@ -1,3 +1,16 @@
+ifeq ($(subdir),elf)
+tests += \
+ tst-tlsdesc-pac \
+ # tests
+modules-names += \
+ tst-tlsdesc-pac-mod \
+ # modules-names
+
+LDFLAGS-tst-tlsdesc-pac = -rdynamic
+
+$(objpfx)tst-tlsdesc-pac.out: $(objpfx)tst-tlsdesc-pac-mod.so
+endif
+
ifeq ($(subdir),misc)
sysdep_headers += sys/elf.h
endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac-mod.c b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac-mod.c
new file mode 100644
index 0000000000000000..d34c8beda9b1986d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac-mod.c
@@ -0,0 +1,27 @@
+/* AArch64 tests for unwinding TLSDESC (BZ 32612)
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+_Thread_local int foo;
+/* Make the TLS segment large enough to trigger _dl_tlsdesc_dynamic. */
+_Thread_local int foobar[1000];
+
+void
+bar (void)
+{
+ foo = 1;
+}
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac.c b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac.c
new file mode 100644
index 0000000000000000..24d656aafc2784b4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac.c
@@ -0,0 +1,48 @@
+/* AArch64 tests for unwinding TLSDESC (BZ 32612)
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include <unwind.h>
+#include <support/xdlfcn.h>
+
+static _Unwind_Reason_Code
+unwind_callback (struct _Unwind_Context* context, void* closure)
+{
+ return _URC_NO_REASON;
+}
+
+/* Assume that TLS variable from tst-tlsdesc-pac-mod.so will trigger
+ the slow-path that allocates the required memory with malloc. */
+void *
+malloc (size_t s)
+{
+ _Unwind_Backtrace (unwind_callback, NULL);
+ return calloc (1, s);
+}
+
+static int
+do_test (void)
+{
+ void *h = xdlopen ("tst-tlsdesc-pac-mod.so", RTLD_LAZY);
+ void (*func)(void) = xdlsym (h, "bar");
+ func ();
+
+ return 0;
+}
+
+#include <support/test-driver.c>


@ -0,0 +1,76 @@
commit 691edbdf7727466ba87e27a8eeae1c3bc5824ef5
Author: Yury Khrustalev <yury.khrustalev@arm.com>
Date: Thu May 8 13:53:38 2025 +0100
aarch64: fix unwinding in longjmp
Previously, longjmp() on aarch64 was using CFI directives around the
call to __libc_arm_za_disable() after CFA was redefined at the start
of longjmp(). This may result in unwinding issues. Move the call and
surrounding CFI directives to the beginning of longjmp().
Suggested-by: Wilco Dijkstra <wilco.dijkstra@arm.com>
diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 452ba0da6d788ce8..30b36cb25d921795 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -24,28 +24,6 @@
/* __longjmp(jmpbuf, val) */
ENTRY (__longjmp)
- cfi_def_cfa(x0, 0)
- cfi_offset(x19, JB_X19<<3)
- cfi_offset(x20, JB_X20<<3)
- cfi_offset(x21, JB_X21<<3)
- cfi_offset(x22, JB_X22<<3)
- cfi_offset(x23, JB_X23<<3)
- cfi_offset(x24, JB_X24<<3)
- cfi_offset(x25, JB_X25<<3)
- cfi_offset(x26, JB_X26<<3)
- cfi_offset(x27, JB_X27<<3)
- cfi_offset(x28, JB_X28<<3)
- cfi_offset(x29, JB_X29<<3)
- cfi_offset(x30, JB_LR<<3)
-
- cfi_offset( d8, JB_D8<<3)
- cfi_offset( d9, JB_D9<<3)
- cfi_offset(d10, JB_D10<<3)
- cfi_offset(d11, JB_D11<<3)
- cfi_offset(d12, JB_D12<<3)
- cfi_offset(d13, JB_D13<<3)
- cfi_offset(d14, JB_D14<<3)
- cfi_offset(d15, JB_D15<<3)
#if IS_IN(libc)
/* Disable ZA state of SME in libc.a and libc.so, but not in ld.so. */
@@ -69,6 +47,29 @@ ENTRY (__longjmp)
# endif
#endif
+ cfi_def_cfa (x0, 0)
+ cfi_offset (x19, JB_X19<<3)
+ cfi_offset (x20, JB_X20<<3)
+ cfi_offset (x21, JB_X21<<3)
+ cfi_offset (x22, JB_X22<<3)
+ cfi_offset (x23, JB_X23<<3)
+ cfi_offset (x24, JB_X24<<3)
+ cfi_offset (x25, JB_X25<<3)
+ cfi_offset (x26, JB_X26<<3)
+ cfi_offset (x27, JB_X27<<3)
+ cfi_offset (x28, JB_X28<<3)
+ cfi_offset (x29, JB_X29<<3)
+ cfi_offset (x30, JB_LR<<3)
+
+ cfi_offset ( d8, JB_D8<<3)
+ cfi_offset ( d9, JB_D9<<3)
+ cfi_offset (d10, JB_D10<<3)
+ cfi_offset (d11, JB_D11<<3)
+ cfi_offset (d12, JB_D12<<3)
+ cfi_offset (d13, JB_D13<<3)
+ cfi_offset (d14, JB_D14<<3)
+ cfi_offset (d15, JB_D15<<3)
+
ldp x19, x20, [x0, #JB_X19<<3]
ldp x21, x22, [x0, #JB_X21<<3]
ldp x23, x24, [x0, #JB_X23<<3]


@ -0,0 +1,29 @@
commit aa18367c1169700f610565eba8acf3e08429fcf5
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu May 29 15:08:15 2025 +0000
AArch64: Improve enabling of SVE for libmvec
When a -mcpu option is present in CFLAGS, GCC can report errors while building libmvec.
Fix this by overriding both -mcpu and -march with a generic variant that has SVE enabled.
Also tune for a modern SVE core.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index be8541f6496d6688..aa547b21df5f41d9 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -49,8 +49,11 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
v_powf_data
endif
-sve-cflags = -march=armv8-a+sve
+# Enable SVE for building libmvec. Since CFLAGS may contain a -mcpu or -march,
+# add a generic -mcpu and -march with SVE enabled. Also use a tune for a modern
+# SVE core.
+sve-cflags = -mcpu=generic+sve -march=armv8-a+sve -mtune=neoverse-v2
ifeq ($(build-mathvec),yes)
bench-libmvec = $(addprefix float-advsimd-,$(float-advsimd-funcs)) \


@ -0,0 +1,24 @@
commit 09795c5612c630db605886dfd55dbf56f381d128
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Fri Jun 6 13:15:30 2025 +0000
AArch64: Fix builderror with GCC 12.1/12.2
Early versions of GCC 12 didn't support -mtune=neoverse-v2, so use
-mtune=neoverse-v1 instead.
Reported-by: Yury Khrustalev <yury.khrustalev@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index aa547b21df5f41d9..c8a6fb4628d13aec 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -53,7 +53,7 @@ endif
# add a generic -mcpu and -march with SVE enabled. Also use a tune for a modern
# SVE core.
-sve-cflags = -mcpu=generic+sve -march=armv8-a+sve -mtune=neoverse-v2
+sve-cflags = -mcpu=generic+sve -march=armv8-a+sve -mtune=neoverse-v1
ifeq ($(build-mathvec),yes)
bench-libmvec = $(addprefix float-advsimd-,$(float-advsimd-funcs)) \

188
glibc-RHEL-118273-38.patch Normal file

@ -0,0 +1,188 @@
commit 6849c5b791edd216f2ec3fdbe4d138bc69b9b333
Author: Luna Lamb <luna.lamb@arm.com>
Date: Wed Jun 18 16:12:19 2025 +0000
AArch64: Improve codegen SVE log1p helper
Improve codegen by packing coefficients.
4% and 2% improvements in the throughput microbenchmark on Neoverse V1 for acosh
and atanh, respectively.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c
index 3e4faaa5ca686c18..78ebcffbb5737641 100644
--- a/sysdeps/aarch64/fpu/acosh_sve.c
+++ b/sysdeps/aarch64/fpu/acosh_sve.c
@@ -30,10 +30,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
}
/* SVE approximation for double-precision acosh, based on log1p.
- The largest observed error is 3.19 ULP in the region where the
+ The largest observed error is 3.14 ULP in the region where the
argument to log1p falls in the k=0 interval, i.e. x close to 1:
- SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
- want 0x1.ed23399f51373p-2. */
+ SV_NAME_D1 (acosh)(0x1.1e80ed12f0ad1p+0) got 0x1.ef0cee7c33ce1p-2
+ want 0x1.ef0cee7c33ce4p-2. */
svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
{
/* (ix - One) >= (BigBound - One). */
diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c
index 7a52728d70f6d226..a4803e5c1305379e 100644
--- a/sysdeps/aarch64/fpu/atanh_sve.c
+++ b/sysdeps/aarch64/fpu/atanh_sve.c
@@ -30,7 +30,7 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
}
/* SVE approximation for double-precision atanh, based on log1p.
- The greatest observed error is 2.81 ULP:
+ The greatest observed error is 3.3 ULP:
_ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
want 0x1.ffd8ff31b501cp-6. */
svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
@@ -42,7 +42,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
/* It is special if iax >= 1. */
-// svbool_t special = svcmpge (pg, iax, One);
svbool_t special = svacge (pg, x, 1.0);
/* Computation is performed based on the following sequence of equality:
diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h
index da019674f94dbac7..a9ecd75d19e95d39 100644
--- a/sysdeps/aarch64/fpu/sv_log1p_inline.h
+++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h
@@ -21,11 +21,12 @@
#define AARCH64_FPU_SV_LOG1P_INLINE_H
#include "sv_math.h"
-#include "poly_sve_f64.h"
static const struct sv_log1p_data
{
- double poly[19], ln2[2];
+ double c0, c2, c4, c6, c8, c10, c12, c14, c16;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
+ double ln2_lo, ln2_hi;
uint64_t hf_rt2_top;
uint64_t one_m_hf_rt2_top;
uint32_t bottom_mask;
@@ -33,15 +34,30 @@ static const struct sv_log1p_data
} sv_log1p_data = {
/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1].
*/
- .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
- 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
- -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
- 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
- -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
- 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
- -0x1.cfa7385bdb37ep-6 },
- .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 },
+ .c0 = -0x1.ffffffffffffbp-2,
+ .c1 = 0x1.55555555551a9p-2,
+ .c2 = -0x1.00000000008e3p-2,
+ .c3 = 0x1.9999999a32797p-3,
+ .c4 = -0x1.555555552fecfp-3,
+ .c5 = 0x1.249248e071e5ap-3,
+ .c6 = -0x1.ffffff8bf8482p-4,
+ .c7 = 0x1.c71c8f07da57ap-4,
+ .c8 = -0x1.9999ca4ccb617p-4,
+ .c9 = 0x1.7459ad2e1dfa3p-4,
+ .c10 = -0x1.554d2680a3ff2p-4,
+ .c11 = 0x1.3b4c54d487455p-4,
+ .c12 = -0x1.2548a9ffe80e6p-4,
+ .c13 = 0x1.0f389a24b2e07p-4,
+ .c14 = -0x1.eee4db15db335p-5,
+ .c15 = 0x1.e95b494d4a5ddp-5,
+ .c16 = -0x1.15fdf07cb7c73p-4,
+ .c17 = 0x1.0310b70800fcfp-4,
+ .c18 = -0x1.cfa7385bdb37ep-6,
+ .ln2_lo = 0x1.62e42fefa3800p-1,
+ .ln2_hi = 0x1.ef35793c76730p-45,
+ /* top32(asuint64(sqrt(2)/2)) << 32. */
.hf_rt2_top = 0x3fe6a09e00000000,
+ /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
.one_m_hf_rt2_top = 0x00095f6200000000,
.bottom_mask = 0xffffffff,
.one_top = 0x3ff
@@ -51,14 +67,14 @@ static inline svfloat64_t
sv_log1p_inline (svfloat64_t x, const svbool_t pg)
{
/* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which
- differs from v_log1p_2u5.c by:
+ differs from advsimd/log1p.c by:
- No special-case handling - this should be dealt with by the caller.
- Pairwise Horner polynomial evaluation for improved accuracy.
- Optionally simulate the shortcut for k=0, used in the scalar routine,
using svsel, for improved accuracy when the argument to log1p is close
to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1
in the source of the caller before including this file.
- See sv_log1p_2u1.c for details of the algorithm. */
+ See sve/log1p.c for details of the algorithm. */
const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data);
svfloat64_t m = svadd_x (pg, x, 1);
svuint64_t mi = svreinterpret_u64 (m);
@@ -79,7 +95,7 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
svfloat64_t cm;
#ifndef WANT_SV_LOG1P_K0_SHORTCUT
-#error \
+#error \
"Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
#elif WANT_SV_LOG1P_K0_SHORTCUT
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
@@ -96,14 +112,46 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
#endif
/* Approximate log1p(f) on the reduced input using a polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly);
+ svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f),
+ f4 = svmul_x (svptrue_b64 (), f2, f2),
+ f8 = svmul_x (svptrue_b64 (), f4, f4),
+ f16 = svmul_x (svptrue_b64 (), f8, f8);
+
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
+ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
+ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
+ svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17);
+
+ /* Order-18 Estrin scheme. */
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1);
+ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0);
+ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1);
+
+ svfloat64_t p03 = svmla_x (pg, p01, f2, p23);
+ svfloat64_t p47 = svmla_x (pg, p45, f2, p67);
+ svfloat64_t p07 = svmla_x (pg, p03, f4, p47);
+
+ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0);
+ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1);
+ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0);
+ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1);
+
+ svfloat64_t p811 = svmla_x (pg, p89, f2, p1011);
+ svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415);
+ svfloat64_t p815 = svmla_x (pg, p811, f4, p1215);
+
+ svfloat64_t p015 = svmla_x (pg, p07, f8, p815);
+ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0);
+ svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1);
+ svfloat64_t p = svmla_x (pg, p015, f16, p1618);
/* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
- svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]);
- svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]);
+ svfloat64_t ln2_lo_hi = svld1rq (svptrue_b64 (), &d->ln2_lo);
+ svfloat64_t ylo = svmla_lane (cm, k, ln2_lo_hi, 0);
+ svfloat64_t yhi = svmla_lane (f, k, ln2_lo_hi, 1);
- return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
+ return svmad_x (pg, p, f2, svadd_x (pg, ylo, yhi));
}
-
#endif
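
For reference, the Estrin scheme introduced above can be illustrated with a minimal scalar sketch (plain C; estrin_7 and the order-7 polynomial are hypothetical, not code from the patch). Each level combines independent FMAs on x, x^2 and x^4, so the dependency chain is logarithmic in the degree rather than linear as with Horner; in the SVE version the packed coefficient pairs are loaded once with svld1rq and consumed via svmla_lane.

/* Hypothetical order-7 Estrin evaluation of
   c[0] + c[1]*x + ... + c[7]*x^7.  */
static double
estrin_7 (double x, const double c[8])
{
  double x2 = x * x;
  double x4 = x2 * x2;
  double p01 = c[0] + c[1] * x; /* These four products are  */
  double p23 = c[2] + c[3] * x; /* independent of each other.  */
  double p45 = c[4] + c[5] * x;
  double p67 = c[6] + c[7] * x;
  double p03 = p01 + p23 * x2;
  double p47 = p45 + p67 * x2;
  return p03 + p47 * x4;
}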

583
glibc-RHEL-118273-39.patch Normal file

@ -0,0 +1,583 @@
commit dee22d2a81ab59afc165fb6dcb45d723f13582a0
Author: Dylan Fleming <Dylan.Fleming@arm.com>
Date: Wed Jun 18 16:19:22 2025 +0000
AArch64: Optimise SVE FP64 Hyperbolics
Rework the SVE FP64 hyperbolics to use the SVE FEXPA
instruction.
Also update the special-case handling for large
inputs to be entirely vectorised.
Performance improvements on Neoverse V1:
cosh_sve: 19% for |x| < 709, 5x otherwise
sinh_sve: 24% for |x| < 709, 5.9x otherwise
tanh_sve: 12% for |x| < 19, 9x otherwise
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
index e375dd8a3407feb2..3561893ae614e2ea 100644
--- a/sysdeps/aarch64/fpu/cosh_sve.c
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -21,71 +21,99 @@
static const struct data
{
- float64_t poly[3];
- float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ double c0, c2;
+ double c1, c3;
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift;
uint64_t special_bound;
} data = {
- .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
- 0x1.5555576a59599p-5, },
-
- .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
- /* -ln2/N. */
- .ln2_hi = -0x1.62e42fefa39efp-9,
- .ln2_lo = -0x1.abc9e3b39803f3p-64,
- .shift = 0x1.8p+52,
- .thres = 704.0,
-
- /* 0x1.6p9, above which exp overflows. */
- .special_bound = 0x4086000000000000,
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1.fffffffffdbcdp-2,
+ .c1 = 0x1.555555555444cp-3,
+ .c2 = 0x1.555573c6a9f7dp-5,
+ .c3 = 0x1.1111266d28935p-7,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ /* 1/ln2. */
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022. */
+
+ /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows. */
+ .special_bound = 0x40862e37e7d8ba72,
};
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special)
-{
- svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
- svfloat64_t y = svadd_x (pg, half_t, half_over_t);
- return sv_call_f64 (cosh, x, y, special);
-}
-
-/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
- special-case handling or tail. */
+/* Helper for approximating exp(x)/2.
+ Functionally identical to FEXPA exp(x), but an adjustment in
+ the shift value which leads to a reduction in the exponent of scale by 1,
+ thus halving the result at no cost. */
static inline svfloat64_t
-exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
{
/* Calculate exp(x). */
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
svfloat64_t n = svsub_x (pg, z, d->shift);
- svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
- r = svmla_x (pg, r, n, d->ln2_lo);
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
- svuint64_t u = svreinterpret_u64 (z);
- svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
- svuint64_t i = svand_x (svptrue_b64 (), u, 0xff);
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
- svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
- y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
- y = svmla_x (pg, sv_f64 (1.0), r, y);
- y = svmul_x (svptrue_b64 (), r, y);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+ svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
+ svfloat64_t p = svmla_x (pg, r, p04, r2);
- /* s = 2^(n/N). */
- u = svld1_gather_index (pg, __v_exp_tail_data, i);
- svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
+ svfloat64_t scale = svexpa (u);
- return svmla_x (pg, s, s, y);
+ return svmla_x (pg, scale, scale, p);
+}
+
+/* Vectorised special case to handle values past where exp_inline overflows.
+ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
+ the valid range of inputs, and returns inf for anything past that. */
+static svfloat64_t NOINLINE
+special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t,
+ const struct data *d)
+{
+ /* Finish fast path to compute values for non-special cases. */
+ svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25);
+ svfloat64_t y = svadd_x (pg, t, inv_twoexp);
+
+ /* Halves input value, and then check if any cases
+ are still going to overflow. */
+ ax = svmul_x (special, ax, 0.5);
+ svbool_t is_safe
+ = svcmplt (special, svreinterpret_u64 (ax), d->special_bound);
+
+ /* Computes exp(x/2), and sets any overflowing lanes to inf. */
+ svfloat64_t half_exp = exp_over_two_inline (special, ax, d);
+ half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY));
+
+ /* Construct special case cosh(x) = (exp(x/2)^2)/2. */
+ svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2);
+ svfloat64_t special_y = svmul_x (special, exp, half_exp);
+
+ /* Select correct return values for special and non-special cases. */
+ special_y = svsel (special, special_y, y);
+
+ /* Ensure an input of nan is correctly propagated. */
+ svbool_t is_nan
+ = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000));
+ return svsel (is_nan, ax, svsel (special, special_y, y));
}
/* Approximation for SVE double-precision cosh(x) using exp_inline.
cosh(x) = (exp(x) + exp(-x)) / 2.
- The greatest observed error is in the scalar fall-back region, so is the
- same as the scalar routine, 1.93 ULP:
- _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
- want 0x1.fd774e958236fp+1021.
-
- The greatest observed error in the non-special region is 1.54 ULP:
- _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
- want 0x1.f5e2bb8d5c991p+8. */
+ The greatest observed error in special case region is 2.66 + 0.5 ULP:
+ _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023
+ want 0x1.f9b2d3d22399bp+1023
+
+ The greatest observed error in the non-special region is 1.01 + 0.5 ULP:
+ _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3
+ want 0x1.890b225657f82p+3. */
svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
@@ -94,14 +122,13 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
/* Up to the point that exp overflows, we can use it to calculate cosh by
- exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
- svfloat64_t t = exp_inline (ax, pg, d);
+ (exp(|x|)/2 + 1) / (2 * exp(|x|)). */
+ svfloat64_t half_exp = exp_over_two_inline (pg, ax, d);
- /* Fall back to scalar for any special cases. */
+ /* Falls back to entirely standalone vectorized special case. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, pg, t, special);
+ return special_case (pg, special, ax, half_exp, d);
- svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
- return svadd_x (pg, half_t, half_over_t);
+ svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25);
+ return svadd_x (pg, half_exp, inv_twoexp);
}
diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c
index df5f6c8c06e5b173..ac7b306018bda613 100644
--- a/sysdeps/aarch64/fpu/sinh_sve.c
+++ b/sysdeps/aarch64/fpu/sinh_sve.c
@@ -18,90 +18,153 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
static const struct data
{
- float64_t poly[11];
- float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift;
uint64_t halff;
- int64_t onef;
- uint64_t large_bound;
+ double c2, c4;
+ double inv_ln2;
+ double ln2_hi, ln2_lo;
+ double c0, c1, c3;
+ double shift, special_bound, bound;
+ uint64_t expm1_data[20];
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .inv_ln2 = 0x1.71547652b82fep0,
- .m_ln2_hi = -0x1.62e42fefa39efp-1,
- .m_ln2_lo = -0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
-
+ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
+ .expm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
+ },
+
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
.halff = 0x3fe0000000000000,
- .onef = 0x3ff0000000000000,
- /* 2^9. expm1 helper overflows for large input. */
- .large_bound = 0x4080000000000000,
+ .special_bound = 0x1.62e37e7d8ba72p+9, /* ln(2^(1024 - 1/128)). */
+ .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64. */
};
+/* A specialised FEXPA expm1 that is only valid for positive inputs and
+ has no special cases. Based off the full FEXPA expm1 implementated for
+ _ZGVsMxv_expm1, with a slightly modified file to keep sinh under 3.5ULP. */
static inline svfloat64_t
-expm1_inline (svfloat64_t x, svbool_t pg)
+expm1_inline (svbool_t pg, svfloat64_t x)
{
const struct data *d = ptr_barrier (&data);
- /* Reduce argument:
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where i = round(x / ln2)
- and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
- svfloat64_t j
- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
- svint64_t i = svcvt_s64_x (pg, j);
- svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi);
- f = svmla_x (pg, f, j, d->m_ln2_lo);
- /* Approximate expm1(f) using polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t f8 = svmul_x (pg, f4, f4);
- svfloat64_t p
- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
- /* t = 2^i. */
- svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
- /* expm1(x) ~= p * t + (t - 1). */
- return svmla_x (pg, svsub_x (pg, t, 1.0), p, t);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+
+ /* We want to construct expm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA. */
+ svbool_t is_small = svaclt (pg, x, d->bound);
+
+ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */
+ svuint64_t base_idx = svand_x (pg, u, 0x1f);
+
+ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */
+ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
+ svuint64_t scalem1_lookup
+ = svld1_gather_index (is_small, d->expm1_data, base_idx);
+
+ /* Select the appropriate scale - 1 value based on x. */
+ svfloat64_t scalem1
+ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
+
+ /* return expm1 = scale - 1 + (scale * poly). */
+ return svmla_x (pg, scalem1, scale, p);
}
+/* Vectorised special case to handle values past where exp_inline overflows.
+ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
+ the valid range of inputs, and returns inf for anything past that. */
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svbool_t pg)
+special_case (svbool_t pg, svbool_t special, svfloat64_t ax,
+ svfloat64_t halfsign, const struct data *d)
{
- return sv_call_f64 (sinh, x, x, pg);
+ /* Halves input value, and then check if any cases
+ are still going to overflow. */
+ ax = svmul_x (special, ax, 0.5);
+ svbool_t is_safe = svaclt (special, ax, d->special_bound);
+
+ svfloat64_t t = expm1_inline (pg, ax);
+
+ /* Finish fastpass to compute values for non-special cases. */
+ svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+ y = svmul_x (pg, y, halfsign);
+
+ /* Computes special lane, and set remaining overflow lanes to inf. */
+ svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign);
+ svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t);
+
+ svuint64_t signed_inf
+ = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign),
+ sv_u64 (0x7ff0000000000000));
+ special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf));
+
+ /* Join resulting vectors together and return. */
+ return svsel (special, special_y, y);
}
-/* Approximation for SVE double-precision sinh(x) using expm1.
- sinh(x) = (exp(x) - exp(-x)) / 2.
- The greatest observed error is 2.57 ULP:
- _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2
- want 0x1.ab929fc64bd63p-2. */
+/* Approximation for SVE double-precision sinh(x) using FEXPA expm1.
+ Uses sinh(x) = e^2x - 1 / 2e^x, rewritten for accuracy.
+ The greatest observed error in the non-special region is 2.63 + 0.5 ULP:
+ _ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2
+ want 0x1.c3587faf97b09p-2
+
+ The greatest observed error in the special region is 2.65 + 0.5 ULP:
+ _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023
+ want 0x1.fffd30eea0063p+1023. */
svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacge (pg, x, d->special_bound);
svfloat64_t ax = svabs_x (pg, x);
svuint64_t sign
= sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
- svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound);
-
/* Fall back to scalar variant for all lanes if any are special. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, pg);
+ return special_case (pg, special, ax, halfsign, d);
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
- svfloat64_t t = expm1_inline (ax, pg);
+ svfloat64_t t = expm1_inline (pg, ax);
t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
return svmul_x (pg, t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c
index d25e011cea305094..805669845d09e098 100644
--- a/sysdeps/aarch64/fpu/tanh_sve.c
+++ b/sysdeps/aarch64/fpu/tanh_sve.c
@@ -18,83 +18,117 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
static const struct data
{
- float64_t poly[11];
- float64_t inv_ln2, ln2_hi, ln2_lo, shift;
- uint64_t thresh, tiny_bound;
+ double ln2_hi, ln2_lo;
+ double c2, c4;
+ double c0, c1, c3;
+ double two_over_ln2, shift;
+ uint64_t tiny_bound;
+ double large_bound, fexpa_bound;
+ uint64_t e2xm1_data[20];
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .inv_ln2 = 0x1.71547652b82fep0,
- .ln2_hi = -0x1.62e42fefa39efp-1,
- .ln2_lo = -0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
-
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .two_over_ln2 = 0x1.71547652b82fep+1,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
.tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */
- /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
- .thresh = 0x01f241bf835f9d5f,
+ .large_bound = 0x1.30fc1931f09cap+4, /* arctanh(1 - 2^-54). */
+ .fexpa_bound = 0x1.a56ef8ec924ccp-4, /* 19/64 * ln2/2. */
+ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
+ .e2xm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
+ },
};
+/* An expm1 inspired, FEXPA based helper function that returns an
+ accurate estimate for e^2x - 1. With no special case or support for
+ negative inputs of x. */
static inline svfloat64_t
-expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
-{
- /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
- the scalar variant of tanh. */
-
- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- svfloat64_t j
- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
- svint64_t i = svcvt_s64_x (pg, j);
- svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
- f = svmla_x (pg, f, j, d->ln2_lo);
-
- /* Approximate expm1(f) using polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t p = svmla_x (
- pg, f, f2,
- sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
-
- /* t = 2 ^ i. */
- svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
- /* expm1(x) = p * t + (t - 1). */
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
-}
-
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+e2xm1_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
{
- return sv_call_f64 (tanh, x, y, special);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->two_over_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ /* r = x - n * ln2/2, r is in [-ln2/(2N), ln2/(2N)]. */
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t r = svadd_x (pg, x, x);
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+
+ /* We want to construct e2xm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA. */
+ svbool_t is_small = svaclt (pg, x, d->fexpa_bound);
+
+ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */
+ svuint64_t base_idx = svand_x (pg, u, 0x1f);
+
+ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */
+ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
+ svuint64_t scalem1_lookup
+ = svld1_gather_index (is_small, d->e2xm1_data, base_idx);
+
+ /* Select the appropriate scale - 1 value based on x. */
+ svfloat64_t scalem1
+ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
+ return svmla_x (pg, scalem1, scale, p);
}
-/* SVE approximation for double-precision tanh(x), using a simplified
- version of expm1. The greatest observed error is 2.77 ULP:
- _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
- want -0x1.bd6a21a163624p-3. */
+/* SVE approximation for double-precision tanh(x), using a modified version of
+ FEXPA expm1 to calculate e^2x - 1.
+ The greatest observed error is 2.79 + 0.5 ULP:
+ _ZGVsMxv_tanh (0x1.fff868eb3c223p-9) got 0x1.fff7be486cae6p-9
+ want 0x1.fff7be486cae9p-9. */
svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
+ svbool_t large = svacge (pg, x, d->large_bound);
- /* Trigger special-cases for tiny, boring and infinity/NaN. */
- svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh);
+ /* We can use tanh(x) = (e^2x - 1) / (e^2x + 1) to approximate tanh.
+ As an additional optimisation, we can ensure more accurate values of e^x
+ by only using positive inputs. So we calculate tanh(|x|), and restore the
+ sign of the input before returning. */
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t sign_bit
+ = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
- svfloat64_t u = svadd_x (pg, x, x);
+ svfloat64_t p = e2xm1_inline (pg, ax, d);
+ svfloat64_t q = svadd_x (pg, p, 2);
- /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- svfloat64_t q = expm1_inline (u, pg, d);
- svfloat64_t qp2 = svadd_x (pg, q, 2);
+ /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly
+ rounded, at this point we can return 1 directly, with sign correction.
+ This will also act as a guard against our approximation overflowing. */
+ svfloat64_t y = svsel (large, sv_f64 (1.0), svdiv_x (pg, p, q));
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svdiv_x (pg, q, qp2), special);
- return svdiv_x (pg, q, qp2);
+ return svreinterpret_f64 (svorr_x (pg, sign_bit, svreinterpret_u64 (y)));
}
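
For reference, the identities driving the hyperbolic rework can be illustrated with a minimal scalar sketch (plain C; cosh_sketch and its use of exp/fabs are hypothetical stand-ins — the patched helper bakes the division by two into FEXPA's shift constant, and sinh/tanh use the analogous expm1-based forms, with a small scale-1 table where FEXPA's scale is too close to 1 for the subtraction to be accurate):

#include <math.h>

/* Hypothetical scalar model of the vectorised cosh above.  */
static double
cosh_sketch (double x)
{
  double ax = fabs (x);
  if (ax < 0x1.62e42fefa39efp+9) /* exp (ax) stays finite.  */
    {
      double half_exp = 0.5 * exp (ax);  /* exp(|x|) / 2.  */
      return half_exp + 0.25 / half_exp; /* + 1 / (2 exp(|x|)).  */
    }
  /* Past the bound, halve the input: exp(x) = exp(x/2)^2, which
     doubles the usable range and yields inf beyond it.  */
  double half = exp (0.5 * ax);
  return (0.5 * half) * half; /* exp(|x|) / 2, or inf.  */
}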

673
glibc-RHEL-118273-4.patch Normal file

@ -0,0 +1,673 @@
commit 81406ea3c5b5ad19e307302c13dd642785b47948
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:41 2024 +0000
aarch64/fpu: Add vector variants of asinh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 2e5bbb5a07f4c9b0..d474f2969dd05c26 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,6 +1,7 @@
libmvec-supported-funcs = acos \
acosh \
asin \
+ asinh \
atan \
atan2 \
cos \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 60e1cdeacec3f77e..08ea15efaec959fb 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -84,6 +84,11 @@ libmvec {
_ZGVnN4v_acoshf;
_ZGVsMxv_acosh;
_ZGVsMxv_acoshf;
+ _ZGVnN2v_asinh;
+ _ZGVnN2v_asinhf;
+ _ZGVnN4v_asinhf;
+ _ZGVsMxv_asinh;
+ _ZGVsMxv_asinhf;
_ZGVnN2v_cosh;
_ZGVnN2v_coshf;
_ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 22fec4de77395e60..1e80721c9f73ba12 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -20,6 +20,7 @@
libmvec_hidden_proto (V_NAME_F1(acos));
libmvec_hidden_proto (V_NAME_F1(acosh));
libmvec_hidden_proto (V_NAME_F1(asin));
+libmvec_hidden_proto (V_NAME_F1(asinh));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(cos));
libmvec_hidden_proto (V_NAME_F1(cosh));
diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
new file mode 100644
index 0000000000000000..544a52f6515d3201
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@@ -0,0 +1,171 @@
+/* Double-precision vector (Advanced SIMD) asinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+#define A(i) v_f64 (__v_log_data.poly[i])
+#define N (1 << V_LOG_TABLE_BITS)
+
+const static struct data
+{
+ float64x2_t poly[18];
+ uint64x2_t off, huge_bound, abs_mask;
+ float64x2_t ln2, tiny_bound;
+} data = {
+ .off = V2 (0x3fe6900900000000),
+ .ln2 = V2 (0x1.62e42fefa39efp-1),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .tiny_bound = V2 (0x1p-26),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ /* Even terms of polynomial s.t. asinh(x) is approximated by
+ asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
+ Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
+ .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4),
+ V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6),
+ V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6),
+ V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7),
+ V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7),
+ V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8),
+ V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9),
+ V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12),
+ V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) },
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (asinh, x, y, special);
+}
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ float64x2_t e0 = vld1q_f64 (
+ &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
+ float64x2_t e1 = vld1q_f64 (
+ &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
+ return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) };
+}
+
+static inline float64x2_t
+log_inline (float64x2_t x, const struct data *d)
+{
+ /* Double-precision vector log, copied from ordinary vector log with some
+ cosmetic modification and special-cases removed. */
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t tmp = vsubq_u64 (ix, d->off);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz
+ = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52)));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+ struct entry e = lookup (tmp);
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+ float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_f64 (A (2), A (3), r);
+ float64x2_t p = vfmaq_f64 (A (0), A (1), r);
+ y = vfmaq_f64 (y, A (4), r2);
+ y = vfmaq_f64 (p, y, r2);
+ y = vfmaq_f64 (hi, y, r2);
+ return y;
+}
+
+/* Double-precision implementation of vector asinh(x).
+ asinh is very sensitive around 1, so it is impractical to devise a single
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
+ Instead we use two different algorithms:
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
+ shared with the scalar routine. The greatest observed error 3.29 ULP, in
+ |x| >= 1:
+ __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
+ want 0x1.ffffcfd0e2352p-1. */
+VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
+
+ uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
+ uint64x2_t special = vcgeq_u64 (iax, d->huge_bound);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
+ special = vorrq_u64 (special, tiny);
+#endif
+
+ /* Option 1: |x| >= 1.
+ Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)).
+ If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
+ overflow, by setting special lanes to 1. These will be fixed later. */
+ float64x2_t option_1 = v_f64 (0);
+ if (__glibc_likely (v_any_u64 (gt1)))
+ {
+#if WANT_SIMD_EXCEPT
+ float64x2_t xm = v_zerofy_f64 (ax, special);
+#else
+ float64x2_t xm = ax;
+#endif
+ option_1 = log_inline (
+ vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
+ }
+
+ /* Option 2: |x| < 1.
+ Compute asinh(x) using a polynomial.
+ If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
+ overflow, and tiny lanes, which will underflow, by setting them to 0. They
+ will be fixed later, either by selecting x or falling back to the scalar
+ special-case. The largest observed error in this region is 1.47 ULPs:
+ __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+ want 0x1.c1d6bf874019cp-1. */
+ float64x2_t option_2 = v_f64 (0);
+ if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1))))
+ {
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
+#endif
+ float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2),
+ z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2),
+ z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8);
+ float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly);
+ option_2 = vfmaq_f64 (ax, p, x3);
+#if WANT_SIMD_EXCEPT
+ option_2 = vbslq_f64 (tiny, x, option_2);
+#endif
+ }
+
+ /* Choose the right option for each lane. */
+ float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
+ /* Copy sign. */
+ y = vbslq_f64 (d->abs_mask, y, x);
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, y, special);
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c
new file mode 100644
index 0000000000000000..28dc5c458750bac4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinh_sve.c
@@ -0,0 +1,150 @@
+/* Double-precision vector (SVE) asinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+#define SignMask (0x8000000000000000)
+#define One (0x3ff0000000000000)
+#define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). */
+
+static const struct data
+{
+ double poly[18];
+ double ln2, p3, p1, p4, p0, p2;
+ uint64_t n;
+ uint64_t off;
+
+} data = {
+ /* Polynomial generated using Remez on [2^-26, 1]. */
+ .poly
+ = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
+ 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
+ -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
+ 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
+ -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
+ 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 },
+ .ln2 = 0x1.62e42fefa39efp-1,
+ .p0 = -0x1.ffffffffffff7p-2,
+ .p1 = 0x1.55555555170d4p-2,
+ .p2 = -0x1.0000000399c27p-2,
+ .p3 = 0x1.999b2e90e94cap-3,
+ .p4 = -0x1.554e550bd501ep-3,
+ .n = 1 << V_LOG_TABLE_BITS,
+ .off = 0x3fe6900900000000
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (asinh, x, y, special);
+}
+
+static inline svfloat64_t
+__sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
+{
+ /* Double-precision SVE log, copied from SVE log implementation with some
+ cosmetic modification and special-cases removed. See that file for details
+ of the algorithm used. */
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
+ svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)),
+ (d->n - 1) << 1);
+ svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+ svfloat64_t z = svreinterpret_f64 (iz);
+
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+ svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
+
+ svfloat64_t ln2_p3 = svld1rq (svptrue_b64 (), &d->ln2);
+ svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1);
+
+ svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
+ svfloat64_t kd = svcvt_f64_x (pg, k);
+
+ svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0);
+ svfloat64_t r2 = svmul_x (pg, r, r);
+
+ svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1);
+
+ svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0);
+ y = svmla_lane (y, r2, p1_p4, 1);
+ y = svmla_x (pg, p, r2, y);
+ y = svmla_x (pg, hi, r2, y);
+ return y;
+}
+
+/* Double-precision implementation of SVE asinh(x).
+ asinh is very sensitive around 1, so it is impractical to devise a single
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
+ Instead we use two different algorithms:
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
+ shared with the scalar routine. The greatest observed error 2.51 ULP, in
+ |x| >= 1:
+ _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1
+ want 0x1.e3181c43b0f39p-1. */
+svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t iax = svbic_x (pg, ix, SignMask);
+ svuint64_t sign = svand_x (pg, ix, SignMask);
+ svfloat64_t ax = svreinterpret_f64 (iax);
+
+ svbool_t ge1 = svcmpge (pg, iax, One);
+ svbool_t special = svcmpge (pg, iax, Thres);
+
+ /* Option 1: |x| >= 1.
+ Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). */
+ svfloat64_t option_1 = sv_f64 (0);
+ if (__glibc_likely (svptest_any (pg, ge1)))
+ {
+ svfloat64_t x2 = svmul_x (pg, ax, ax);
+ option_1 = __sv_log_inline (
+ svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg);
+ }
+
+ /* Option 2: |x| < 1.
+ Compute asinh(x) using a polynomial.
+ The largest observed error in this region is 1.51 ULPs:
+ _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
+ want 0x1.c1e649ee2681dp-1. */
+ svfloat64_t option_2 = sv_f64 (0);
+ if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1))))
+ {
+ svfloat64_t x2 = svmul_x (pg, ax, ax);
+ svfloat64_t x4 = svmul_x (pg, x2, x2);
+ svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly);
+ option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
+ }
+
+ /* Choose the right option for each lane. */
+ svfloat64_t y = svsel (ge1, option_1, option_2);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)),
+ special);
+ return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
new file mode 100644
index 0000000000000000..09fd8a614305563d
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
@@ -0,0 +1,80 @@
+/* Single-precision vector (Advanced SIMD) asinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "v_log1pf_inline.h"
+
+#define SignMask v_u32 (0x80000000)
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t big_bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (asinhf, x, y, special);
+}
+
+/* Single-precision implementation of vector asinh(x), using vector log1p.
+ Worst-case error is 2.66 ULP, at roughly +/-0.25:
+ __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
+VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+ uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
+ float32x4_t ax = vreinterpretq_f32_u32 (iax);
+ uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ float32x4_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ /* Sidestep tiny and large values to avoid inadvertently triggering
+ under/overflow. */
+ special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
+ if (__glibc_unlikely (v_any_u32 (special)))
+ {
+ ax = v_zerofy_f32 (ax, special);
+ x = v_zerofy_f32 (x, special);
+ }
+#endif
+
+ /* asinh(x) = log(x + sqrt(x * x + 1)).
+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
+ float32x4_t d
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
+ float32x4_t y = log1pf_inline (
+ vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
+
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
+ return vbslq_f32 (SignMask, x, y);
+}
+libmvec_hidden_def (V_NAME_F1 (asinh))
+HALF_WIDTH_ALIAS_F1 (asinh)
diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c
new file mode 100644
index 0000000000000000..d85c3a685c0b83ff
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinhf_sve.c
@@ -0,0 +1,56 @@
+/* Single-precision vector (SVE) asinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "sv_log1pf_inline.h"
+
+#define BigBound (0x5f800000) /* asuint(0x1p64). */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (asinhf, x, y, special);
+}
+
+/* Single-precision SVE asinh(x) routine. Implements the same algorithm as
+ vector asinhf and log1p.
+
+ Maximum error is 2.48 ULPs:
+ SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
+ want 0x1.ffbbb8p-4. */
+svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
+{
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+ svbool_t special = svcmpge (pg, iax, BigBound);
+
+ /* asinh(x) = log(x + sqrt(x * x + 1)).
+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
+ svfloat32_t ax2 = svmul_x (pg, ax, ax);
+ svfloat32_t d = svadd_x (pg, svsqrt_x (pg, svadd_x (pg, ax2, 1.0f)), 1.0f);
+ svfloat32_t y
+ = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))),
+ special);
+ return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y)));
+}
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 841330956c102ff1..eb2af35b27757fc6 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -41,6 +41,10 @@
# define __DECL_SIMD_asin __DECL_SIMD_aarch64
# undef __DECL_SIMD_asinf
# define __DECL_SIMD_asinf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_asinh
+# define __DECL_SIMD_asinh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_asinhf
+# define __DECL_SIMD_asinhf __DECL_SIMD_aarch64
# undef __DECL_SIMD_atan
# define __DECL_SIMD_atan __DECL_SIMD_aarch64
# undef __DECL_SIMD_atanf
@@ -131,6 +135,7 @@ __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
@@ -150,6 +155,7 @@ __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
@@ -174,6 +180,7 @@ __sv_f32_t _ZGVsMxvv_atan2f (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_acoshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
@@ -193,6 +200,7 @@ __sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index f4ce1d70096888aa..3d7177c32dcd77a6 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -26,6 +26,7 @@
VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
VPCS_VECTOR_WRAPPER (acosh_advsimd, _ZGVnN2v_acosh)
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
+VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 0e973cc9d7ade813..b88a2afe5c1198c0 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -45,6 +45,7 @@
SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
SVE_VECTOR_WRAPPER (acosh_sve, _ZGVsMxv_acosh)
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
+SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 0ce026b5ea96a064..533655402d3f3737 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -26,6 +26,7 @@
VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
VPCS_VECTOR_WRAPPER (acoshf_advsimd, _ZGVnN4v_acoshf)
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
+VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 398b7373e800cd5b..f7b673e3358e7d82 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -45,6 +45,7 @@
SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
SVE_VECTOR_WRAPPER (acoshf_sve, _ZGVsMxv_acoshf)
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
+SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 1646cdbdd22d93d9..b916e422432014c2 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -90,11 +90,19 @@ double: 2
float: 2
ldouble: 4
+Function: "asinh_advsimd":
+double: 1
+float: 2
+
Function: "asinh_downward":
double: 3
float: 3
ldouble: 4
+Function: "asinh_sve":
+double: 1
+float: 2
+
Function: "asinh_towardzero":
double: 2
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index f5aaa519f2c8663e..f288afdfdd9c8757 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -75,15 +75,20 @@ GLIBC_2.39 _ZGVsMxvv_atan2 F
GLIBC_2.39 _ZGVsMxvv_atan2f F
GLIBC_2.40 _ZGVnN2v_acosh F
GLIBC_2.40 _ZGVnN2v_acoshf F
+GLIBC_2.40 _ZGVnN2v_asinh F
+GLIBC_2.40 _ZGVnN2v_asinhf F
GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
GLIBC_2.40 _ZGVnN4v_acoshf F
+GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
GLIBC_2.40 _ZGVsMxv_acosh F
GLIBC_2.40 _ZGVsMxv_acoshf F
+GLIBC_2.40 _ZGVsMxv_asinh F
+GLIBC_2.40 _ZGVsMxv_asinhf F
GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F
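With the declarations and ABI entries above in place, the new symbols can be reached through compiler auto-vectorization or called directly. A minimal direct-call sketch (the prototype follows bits/math-vector.h above; the aarch64_vector_pcs attribute spelling and the -lmvec link step are assumptions about the toolchain setup):

    #include <arm_neon.h>
    #include <stdio.h>

    /* Vector PCS declaration matching the header above.  */
    __attribute__ ((aarch64_vector_pcs))
    float32x4_t _ZGVnN4v_asinhf (float32x4_t);

    int
    main (void)
    {
      float32x4_t x = vdupq_n_f32 (0.5f);
      float32x4_t y = _ZGVnN4v_asinhf (x); /* four asinhf (0.5f) results */
      printf ("%f\n", (double) vgetq_lane_f32 (y, 0));
      return 0;
    }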
521
glibc-RHEL-118273-40.patch Normal file
@@ -0,0 +1,521 @@
commit 1e3d1ddf977ecd653de8d0d10eb083d80ac21cf3
Author: Dylan Fleming <Dylan.Fleming@arm.com>
Date: Wed Jun 18 16:17:12 2025 +0000
AArch64: Optimize SVE exp functions
Improve performance of SVE exps by making better use
of the SVE FEXPA instruction.
Performance improvement on Neoverse V1:
exp2_sve: 21%
exp2f_sve: 24%
exp10f_sve: 23%
expm1_sve: 25%
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c
index 8aa3fa9c4335cfb8..0a4c26450601a1db 100644
--- a/sysdeps/aarch64/fpu/exp10f_sve.c
+++ b/sysdeps/aarch64/fpu/exp10f_sve.c
@@ -19,26 +19,19 @@
#include "sv_math.h"
-/* For x < -Thres, the result is subnormal and not handled correctly by
- FEXPA. */
-#define Thres 37.9
+/* For x < -Thres (-log10(2^126)), the result is subnormal and not handled
+ correctly by FEXPA. */
+#define Thres 0x1.2f702p+5
static const struct data
{
- float log2_10_lo, c0, c2, c4;
- float c1, c3, log10_2;
- float shift, log2_10_hi, thres;
+ float log10_2, log2_10_hi, log2_10_lo, c1;
+ float c0, shift, thres;
} data = {
/* Coefficients generated using Remez algorithm with minimisation of relative
- error.
- rel error: 0x1.89dafa3p-24
- abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
- maxerr: 0.52 +0.5 ulp. */
- .c0 = 0x1.26bb16p+1f,
- .c1 = 0x1.5350d2p+1f,
- .c2 = 0x1.04744ap+1f,
- .c3 = 0x1.2d8176p+0f,
- .c4 = 0x1.12b41ap-1f,
+ error. */
+ .c0 = 0x1.26bb62p1,
+ .c1 = 0x1.53524cp1,
/* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
.shift = 0x1.803f8p17f,
.log10_2 = 0x1.a934fp+1,
@@ -53,28 +46,23 @@ sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
/* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
-
- svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log10_2);
/* n = round(x/(log10(2)/N)). */
svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
- svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);
+ svfloat32_t z = svmla_lane (shift, x, lane_consts, 0);
+ svfloat32_t n = svsub_x (pg, z, shift);
/* r = x - n*log10(2)/N. */
- svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
- r = svmls_lane (r, n, lane_consts, 0);
+ svfloat32_t r = x;
+ r = svmls_lane (r, n, lane_consts, 1);
+ r = svmls_lane (r, n, lane_consts, 2);
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
- svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
+ svfloat32_t poly = svmla_lane (sv_f32 (d->c0), r, lane_consts, 3);
+ poly = svmul_x (pg, poly, r);
return svmla_x (pg, scale, scale, poly);
}
@@ -85,11 +73,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d)
special);
}
-/* Single-precision SVE exp10f routine. Implements the same algorithm
- as AdvSIMD exp10f.
- Worst case error is 1.02 ULPs.
- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
- want 0x1.ba5f9cp-1. */
+/* Single-precision SVE exp10f routine. Based on the FEXPA instruction.
+ Worst case error is 1.10 ULP.
+ _ZGVsMxv_exp10f (0x1.cc76dep+3) got 0x1.be0172p+47
+ want 0x1.be017p+47. */
svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
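The shift constants in these routines (0x1.8p52 for plain FP64 reduction, 0x1.803f8p17f and 0x1.800000000ffc0p+46 for the FEXPA variants, which fold in an extra bias) all rely on the same rounding trick: adding a large power-of-two constant forces round(x) into the low mantissa bits. A scalar sketch of the basic idea, assuming round-to-nearest mode:

    /* z = x + 1.5*2^52 leaves round(x) in the low bits of z's mantissa,
       so z - Shift recovers the nearest integer without a convert.
       Valid for |x| well below 2^51; must not be compiled with
       -ffast-math, which could fold the two operations away.  */
    static double
    round_via_shift (double x)
    {
      const double Shift = 0x1.8p52;
      double z = x + Shift;
      return z - Shift;
    }

The FEXPA-based routines additionally reinterpret z as an integer so its low bits can feed the instruction directly.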
diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c
index 5dfb77cdbc2f6a51..ed11423e45059133 100644
--- a/sysdeps/aarch64/fpu/exp2_sve.c
+++ b/sysdeps/aarch64/fpu/exp2_sve.c
@@ -19,23 +19,21 @@
#include "sv_math.h"
-#define N (1 << V_EXP_TABLE_BITS)
-
#define BigBound 1022
#define UOFlowBound 1280
static const struct data
{
- double c0, c2;
- double c1, c3;
+ double c2, c4;
+ double c0, c1, c3;
double shift, big_bound, uoflow_bound;
} data = {
/* Coefficients are computed using Remez algorithm with
minimisation of the absolute error. */
- .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3,
- .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7,
- .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound,
- .big_bound = BigBound,
+ .c0 = 0x1.62e42fefa39efp-1, .c1 = 0x1.ebfbdff82a31bp-3,
+ .c2 = 0x1.c6b08d706c8a5p-5, .c3 = 0x1.3b2ad2ff7d2f3p-7,
+ .c4 = 0x1.5d8761184beb3p-10, .shift = 0x1.800000000ffc0p+46,
+ .uoflow_bound = UOFlowBound, .big_bound = BigBound,
};
#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
@@ -64,50 +62,52 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
/* |n| > 1280 => 2^(n) overflows. */
- svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
+ svbool_t p_cmp = svacle (pg, n, d->uoflow_bound);
svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
- return svsel (p_cmp, r1, r0);
+ return svsel (p_cmp, r0, r1);
}
/* Fast vector implementation of exp2.
- Maximum measured error is 1.65 ulp.
- _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
- want 0x1.f8db0d4df721dp-1. */
+ Maximum measured error is 0.52 + 0.5 ulp.
+ _ZGVsMxv_exp2 (0x1.3b72ad5b701bfp-1) got 0x1.8861641b49e08p+0
+ want 0x1.8861641b49e07p+0. */
svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svbool_t no_big_scale = svacle (pg, x, d->big_bound);
- svbool_t special = svnot_z (pg, no_big_scale);
-
- /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */
- svfloat64_t shift = sv_f64 (d->shift);
- svfloat64_t kd = svadd_x (pg, x, shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- /* kd = k/N. */
- kd = svsub_x (pg, kd, shift);
- svfloat64_t r = svsub_x (pg, x, kd);
-
- /* scale ~= 2^(k/N). */
- svuint64_t idx = svand_x (pg, ki, N - 1);
- svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx);
- /* This is only a valid scale when -1023*N < k < 1024*N. */
- svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
- svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
-
- svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
- /* Approximate exp2(r) using polynomial. */
- /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+ svbool_t special = svacge (pg, x, d->big_bound);
+
+ svfloat64_t z = svadd_x (svptrue_b64 (), x, d->shift);
+ svfloat64_t n = svsub_x (svptrue_b64 (), z, d->shift);
+ svfloat64_t r = svsub_x (svptrue_b64 (), x, n);
+
+ svfloat64_t scale = svexpa (svreinterpret_u64 (z));
+
svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
- svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
- svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
- svfloat64_t p = svmla_x (pg, p01, p23, r2);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ /* Approximate exp2(r) using polynomial. */
+ /* y = exp2(r) - 1 ~= r * (C0 + C1 r + C2 r^2 + C3 r^3 + C4 r^4). */
+ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ svfloat64_t p = svmla_x (pg, p12, p34, r2);
+ p = svmad_x (pg, p, r, d->c0);
svfloat64_t y = svmul_x (svptrue_b64 (), r, p);
+
/* Assemble exp2(x) = exp2(r) * scale. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (pg, scale, y, kd, d);
+ {
+ /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+ special case function so needs to be copied.
+ e = sign bit of u << 46. */
+ svuint64_t e = svand_x (pg, svlsl_x (pg, svreinterpret_u64 (z), 46),
+ 0x8000000000000000);
+ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+ return special_case (pg, scale, y, n, d);
+ }
+
return svmla_x (pg, scale, scale, y);
}
diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
index c6216bed9e9e7538..cf01820288f1855c 100644
--- a/sysdeps/aarch64/fpu/exp2f_sve.c
+++ b/sysdeps/aarch64/fpu/exp2f_sve.c
@@ -18,21 +18,17 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f32.h"
#define Thres 0x1.5d5e2ap+6f
static const struct data
{
- float c0, c2, c4, c1, c3;
- float shift, thres;
+ float c0, c1, shift, thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant. */
- .c0 = 0x1.62e422p-1f,
- .c1 = 0x1.ebf9bcp-3f,
- .c2 = 0x1.c6bd32p-5f,
- .c3 = 0x1.3ce9e4p-7f,
- .c4 = 0x1.59977ap-10f,
+ /* Coefficients generated using Remez algorithm with minimisation of relative
+ error. */
+ .c0 = 0x1.62e485p-1,
+ .c1 = 0x1.ebfbe0p-3,
/* 1.5*2^17 + 127. */
.shift = 0x1.803f8p17f,
/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
@@ -51,16 +47,8 @@ sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
- /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
- Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for
- coefficients 1 to 4, and apply most significant coefficient directly. */
- svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
- svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
- svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
- svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+ svfloat32_t poly = svmla_x (pg, sv_f32 (d->c0), r, sv_f32 (d->c1));
+ poly = svmul_x (svptrue_b32 (), poly, r);
return svmla_x (pg, scale, scale, poly);
}
@@ -72,11 +60,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d)
special);
}
-/* Single-precision SVE exp2f routine. Implements the same algorithm
- as AdvSIMD exp2f.
- Worst case error is 1.04 ULPs.
- _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
- want 0x1.ba6a64p-1. */
+/* Single-precision SVE exp2f routine, based on the FEXPA instruction.
+ Worst case error is 1.09 ULPs.
+ _ZGVsMxv_exp2f (0x1.9a2a94p-1) got 0x1.be1054p+0
+ want 0x1.be1052p+0. */
svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c
index c933cf9c0eb2406b..4c35e0341d34aee0 100644
--- a/sysdeps/aarch64/fpu/expm1_sve.c
+++ b/sysdeps/aarch64/fpu/expm1_sve.c
@@ -18,82 +18,164 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
-#define SpecialBound 0x1.62b7d369a5aa9p+9
-#define ExponentBias 0x3ff0000000000000
+#define FexpaBound 0x1.4cb5ecef28adap-3 /* 15*ln2/64. */
+#define SpecialBound 0x1.628c2855bfaddp+9 /* ln(2^(1023 + 1/128)). */
static const struct data
{
- double poly[11];
- double shift, inv_ln2, special_bound;
- /* To be loaded in one quad-word. */
+ double c2, c4;
+ double inv_ln2;
double ln2_hi, ln2_lo;
+ double c0, c1, c3;
+ double shift, thres;
+ uint64_t expm1_data[32];
} data = {
- /* Generated using fpminimax. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
- 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .special_bound = SpecialBound,
- .inv_ln2 = 0x1.71547652b82fep0,
- .ln2_hi = 0x1.62e42fefa39efp-1,
- .ln2_lo = 0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
+ /* Table emulating FEXPA - 1, for values of FEXPA close to 1.
+ The table holds values of 2^(i/64) - 1, computed in arbitrary precision.
+ The first half of the table stores values associated to i from 0 to 15.
+ The second half of the table stores values associated to i from 0 to -15. */
+ .expm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x0000000000000000, 0xbfc331751ec3a814, 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7,
+ 0xbfbf332113d56b1f, 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424,
+ 0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a, 0xbfaafd11874c009e,
+ 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89, 0xbf95f134923757f3, 0xbf860f9f985bc9f4,
+ },
+
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
+ .thres = SpecialBound,
};
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t pg)
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+static NOINLINE svfloat64_t
+special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p,
+ svfloat64_t n)
{
- return sv_call_f64 (expm1, x, y, pg);
+ /* s=2^n may overflow, break it up into s=s1*s2,
+ such that exp = s + s*y can be computed as s1*(s2+s2*y)
+ and s1*s1 overflows only if n>0. */
+
+ /* If n<=0 then set b to 0x6, 0 otherwise. */
+ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+ svuint64_t b
+ = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
+
+ /* Set s1 to generate overflow depending on sign of exponent n,
+ ie. s1 = 0x70...0 - b. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold.
+ ie. s2 = as_u64 (s) - 0x3010...0 + b. */
+ svfloat64_t s2 = svreinterpret_f64 (
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+ /* |n| > 1280 => 2^(n) overflows. */
+ svbool_t p_cmp = svacgt (pg, n, 1280.0);
+
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
+ svfloat64_t r2 = svmla_x (pg, s2, s2, p);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
+
+ svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */
+ return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0));
}
-/* Double-precision vector exp(x) - 1 function.
- The maximum error observed error is 2.18 ULP:
- _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
- want 0x1.a8b9ea8d66e2p-2. */
+/* FEXPA based SVE expm1 algorithm.
+ Maximum measured error is 2.81 + 0.5 ULP:
+ _ZGVsMxv_expm1 (0x1.974060e619bfp-3) got 0x1.c290e5858bb53p-3
+ want 0x1.c290e5858bb5p-3. */
svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- /* Large, Nan/Inf. */
- svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
-
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- svfloat64_t shift = sv_f64 (d->shift);
- svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift);
- svint64_t i = svcvt_s64_x (pg, n);
- svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
- svfloat64_t f = svmls_lane (x, n, ln2, 0);
- f = svmls_lane (f, n, ln2, 1);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t f8 = svmul_x (pg, f4, f4);
- svfloat64_t p
- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias);
- svfloat64_t t = svreinterpret_f64 (u);
-
- /* expm1(x) ~= p * t + (t - 1). */
- svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svbool_t special = svacgt (pg, x, d->thres);
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+ /* r = x - n * ln2, r is in [-ln2/128, ln2/128]. */
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+ svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0));
+
+ /* We want to construct expm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA.
+
+ This bound is based upon the table size:
+ Bound = (TableSize-1/64) * ln2.
+ The current bound is based upon a table size of 16. */
+ svbool_t is_small = svaclt (pg, x, FexpaBound);
+
+ if (svptest_any (pg, is_small))
+ {
+ /* Index via the input of FEXPA, but we only care about the lower 4 bits.
+ */
+ svuint64_t base_idx = svand_x (pg, u, 0xf);
+
+ /* We can use the sign of x as a fifth bit to account for the asymmetry
+ of e^x around 0. */
+ svuint64_t signBit
+ = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4);
+ svuint64_t idx = svorr_x (pg, base_idx, signBit);
+
+ /* Lookup values for scale - 1 for small x. */
+ svfloat64_t lookup = svreinterpret_f64 (
+ svld1_gather_index (is_small, d->expm1_data, idx));
+
+ /* Select the appropriate scale - 1 value based on x. */
+ scalem1 = svsel (is_small, lookup, scalem1);
+ }
+
+ svfloat64_t y = svmla_x (pg, scalem1, scale, p);
+
+ /* FEXPA returns nan for large inputs so we special case those. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ {
+ /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+ special case function so needs to be copied.
+ e = sign bit of u << 46. */
+ svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+ /* Copy sign to s. */
+ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+ return special_case (pg, y, scale, p, n);
+ }
+
+ /* return expm1 = (scale - 1) + (scale * poly). */
return y;
}
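The lookup table above exists because scale - 1 cancels catastrophically when scale is close to 1. The effect is easy to reproduce in scalar code (an illustration of the problem, not the library algorithm):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = 0x1p-30;
      /* exp(x) is barely above 1.0, so subtracting 1.0 cancels most
         significant bits of the result ... */
      printf ("naive: %a\n", exp (x) - 1.0);
      /* ... while expm1 keeps full precision (~ x + x*x/2).  */
      printf ("expm1: %a\n", expm1 (x));
      return 0;
    }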
@@ -0,0 +1,49 @@
commit aac077645a645bba0d67f3250e82017c539d0f4b
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Wed Aug 20 17:41:50 2025 +0000
AArch64: Fix SVE powf routine [BZ #33299]
Fix a bug in predicate logic introduced in last change.
A slight performance improvement from relying on all true
predicates during conversion from single to double.
This fixes BZ #33299.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c
index 08d7019a1855ff3c..33bba96054cf4cc8 100644
--- a/sysdeps/aarch64/fpu/powf_sve.c
+++ b/sysdeps/aarch64/fpu/powf_sve.c
@@ -223,15 +223,15 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
const svbool_t ptrue = svptrue_b64 ();
/* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
- * in order to perform core computation in double precision. */
+ in order to perform core computation in double precision. */
const svbool_t pg_lo = svunpklo (pg);
const svbool_t pg_hi = svunpkhi (pg);
- svfloat64_t y_lo
- = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
- svfloat64_t y_hi
- = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
- svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
- svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
+ svfloat64_t y_lo = svcvt_f64_x (
+ ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+ svfloat64_t y_hi = svcvt_f64_x (
+ ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+ svfloat64_t z_lo = svcvt_f64_x (ptrue, svreinterpret_f32 (svunpklo (iz)));
+ svfloat64_t z_hi = svcvt_f64_x (ptrue, svreinterpret_f32 (svunpkhi (iz)));
svuint64_t i_lo = svunpklo (i);
svuint64_t i_hi = svunpkhi (i);
svint64_t k_lo = svunpklo (k);
@@ -312,7 +312,7 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
(23 - V_POWF_EXP2_TABLE_BITS));
/* Compute core in extended precision and return intermediate ylogx results
- * to handle cases of underflow and underflow in exp. */
+ to handle cases of underflow and overflow in exp. */
svfloat32_t ylogx;
svfloat32_t ret
= sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
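Schematically, the fixed conversion pattern widens each single-precision vector into two double-precision half-vectors; an all-true predicate can govern the converts because lanes outside pg are masked later by pg_lo/pg_hi. A sketch mirroring the hunk above (requires an SVE-enabled target):

    #include <arm_sve.h>

    static void
    widen_f32 (svfloat32_t v, svfloat64_t *lo, svfloat64_t *hi)
    {
      const svbool_t all = svptrue_b64 ();
      /* svunpklo/svunpkhi spread the 32-bit lanes across 64-bit
         containers; the converts then run unpredicated.  */
      *lo = svcvt_f64_x (all,
                         svreinterpret_f32 (svunpklo (svreinterpret_u32 (v))));
      *hi = svcvt_f64_x (all,
                         svreinterpret_f32 (svunpkhi (svreinterpret_u32 (v))));
    }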
174
glibc-RHEL-118273-42.patch Normal file
@@ -0,0 +1,174 @@
commit e20ca759af46fbb7eae20c52b857e7636eb50e1b
Author: remph <lhr@disroot.org>
Date: Thu Sep 4 12:53:56 2025 +0000
AArch64: add optimised strspn/strcspn
Requires Neon (aka. Advanced SIMD). Looks up 16 characters at a time,
for a 2-3x performance improvement, and a ~30% speedup on the strtok &
strsep benchtests, as tested on Cortex A-{53,72}.
Signed-off-by: remph <lhr@disroot.org>
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/strcspn.S b/sysdeps/aarch64/strcspn.S
new file mode 100644
index 0000000000000000..f2a69e9856cba04c
--- /dev/null
+++ b/sysdeps/aarch64/strcspn.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCSPN 1
+#include "strspn.S"
diff --git a/sysdeps/aarch64/strspn.S b/sysdeps/aarch64/strspn.S
new file mode 100644
index 0000000000000000..edbb705b15991e39
--- /dev/null
+++ b/sysdeps/aarch64/strspn.S
@@ -0,0 +1,146 @@
+/* Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STRCSPN
+# define STRSPN strcspn
+# define SBT orr /* SBT -- `set bit' */
+#else
+# define STRSPN strspn
+# define SBT bic
+#endif
+
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+# define LS_BK lsr
+#else
+# define LS_FW lsr
+# define LS_BK lsl
+#endif
+
+#define og_s x0
+#define set x1 /* ACCEPT for strspn, REJECT for strcspn */
+
+#define byte_i x3
+#define bits_i x4
+#define one x6
+
+#define syndrome x5
+#define s x6
+
+#define vbyte_i v1.16b
+#define vbits_i v2.16b
+#define table v4.16b-v5.16b
+#define table_a v4
+#define table_b v5
+#define sevens v7.16b
+
+ENTRY(STRSPN)
+ ldrb w2, [set]
+ cbz w2, L(early)
+#ifdef USE_AS_STRCSPN
+ ldrb w3, [set, 1]
+ cbz w3, L(early)
+#endif
+
+ /* Table has ones for bytes to reject and zeros for bytes to accept */
+ mov one, 1
+#ifdef USE_AS_STRCSPN
+ stp one, xzr, [sp, -32]!
+ .cfi_def_cfa_offset 32
+ stp xzr, xzr, [sp, 16]
+#else
+ mvni v0.4s, 0
+ stp q0, q0, [sp, -32]!
+ .cfi_def_cfa_offset 32
+#endif
+
+ .p2align 4
+L(fill_table):
+ lsr byte_i, x2, 6 /* x2 / 64 */
+ lsl bits_i, one, x2 /* x2 % 64 implicitly */
+ ldrb w2, [set, 1]!
+ ldr x5, [sp, byte_i, lsl 3]
+ SBT x5, x5, bits_i
+ str x5, [sp, byte_i, lsl 3]
+ cbnz w2, L(fill_table)
+
+ ld1 {table_a.2d-table_b.2d}, [sp], 32
+ .cfi_def_cfa_offset 0
+ ubfiz syndrome, og_s, 2, 4 /* Bottom 4 bits, times 4 to count nibbles */
+ and s, og_s, -16 /* Round S down to 16-byte boundary */
+ movi sevens, 7
+ /* Bias the syndrome to mask off these nibbles */
+ mov x8, -1
+ LS_BK syndrome, x8, syndrome
+ mvn syndrome, syndrome
+
+L(loop):
+ ldr q0, [s], 16
+ ushr vbyte_i, v0.16b, 3
+ bic vbits_i, sevens, v0.16b
+ tbl v0.16b, {table}, vbyte_i
+ /* Bring the relevant bit to the MSB of each byte */
+ sshl v0.16b, v0.16b, vbits_i
+ /* Set every bit of each byte to its MSB */
+ cmlt v0.16b, v0.16b, 0
+ /* Bytes->nibbles */
+ shrn v0.8b, v0.8h, 4
+ fmov x2, d0
+ bic syndrome, x2, syndrome
+ cbz syndrome, L(loop)
+
+#ifndef __AARCH64EB__
+ rbit syndrome, syndrome
+#endif
+ sub s, s, 16
+ clz syndrome, syndrome
+ sub x0, s, og_s
+ add x0, x0, syndrome, lsr 2
+ ret
+
+ .balign 8 /* For strspn, which has only 2 instructions here */
+L(early):
+#ifdef USE_AS_STRCSPN
+ /* strlen(set) < 2: call strchrnul(s, *set) and get its offset from S */
+ stp fp, lr, [sp, -32]!
+ .cfi_def_cfa_offset 32
+ .cfi_offset fp, -32
+ .cfi_offset lr, -24
+ str x19, [sp, 16]
+ .cfi_offset 19, -16
+ mov w1, w2
+ mov fp, sp
+ mov x19, x0
+ bl __strchrnul
+ sub x0, x0, x19
+ ldr x19, [sp, 16]
+ ldp fp, lr, [sp], 32
+ .cfi_restore lr
+ .cfi_restore fp
+ .cfi_restore 19
+ .cfi_def_cfa_offset 0
+#else
+ mov w0, 0
+#endif
+ ret
+END(STRSPN)
+
+#undef set
+libc_hidden_def(STRSPN)
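The scalar shape of the bitmap technique used above is a 256-bit table with one bit per byte value; the assembly then replaces the byte-at-a-time probe with TBL/SSHL/CMLT over 16 bytes at once. A rough C reference (note it uses the opposite bit convention to the assembly, which marks bytes to reject):

    #include <stddef.h>

    static size_t
    strspn_ref (const char *s, const char *accept)
    {
      unsigned char bitmap[32] = { 0 }; /* one bit per byte value */
      for (; *accept != '\0'; accept++)
        {
          unsigned char c = (unsigned char) *accept;
          bitmap[c >> 3] |= (unsigned char) (1 << (c & 7));
        }
      /* The NUL bit is never set, so the scan stops at the terminator.  */
      size_t n = 0;
      while (bitmap[(unsigned char) s[n] >> 3]
             & (1 << ((unsigned char) s[n] & 7)))
        n++;
      return n;
    }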
@@ -0,0 +1,93 @@
commit aebaeb2c330482171340e966f7f33fac884a27f4
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu Sep 18 14:24:47 2025 +0000
AArch64: Update math-vector-fortran.h
Update math-vector-fortran.h with the latest set of math functions
and sort by name.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
diff --git a/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h b/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
index 92e15f0d6a758258..161f43d20c51e252 100644
--- a/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
+++ b/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
@@ -15,33 +15,74 @@
! You should have received a copy of the GNU Lesser General Public
! License along with the GNU C Library; if not, see
! <https://www.gnu.org/licenses/>.
+
!GCC$ builtin (acos) attributes simd (notinbranch)
!GCC$ builtin (acosf) attributes simd (notinbranch)
+!GCC$ builtin (acosh) attributes simd (notinbranch)
+!GCC$ builtin (acoshf) attributes simd (notinbranch)
+!GCC$ builtin (acospi) attributes simd (notinbranch)
+!GCC$ builtin (acospif) attributes simd (notinbranch)
!GCC$ builtin (asin) attributes simd (notinbranch)
!GCC$ builtin (asinf) attributes simd (notinbranch)
+!GCC$ builtin (asinh) attributes simd (notinbranch)
+!GCC$ builtin (asinhf) attributes simd (notinbranch)
+!GCC$ builtin (asinpi) attributes simd (notinbranch)
+!GCC$ builtin (asinpif) attributes simd (notinbranch)
!GCC$ builtin (atan) attributes simd (notinbranch)
-!GCC$ builtin (atanf) attributes simd (notinbranch)
!GCC$ builtin (atan2) attributes simd (notinbranch)
!GCC$ builtin (atan2f) attributes simd (notinbranch)
+!GCC$ builtin (atan2pi) attributes simd (notinbranch)
+!GCC$ builtin (atan2pif) attributes simd (notinbranch)
+!GCC$ builtin (atanf) attributes simd (notinbranch)
+!GCC$ builtin (atanh) attributes simd (notinbranch)
+!GCC$ builtin (atanhf) attributes simd (notinbranch)
+!GCC$ builtin (atanpi) attributes simd (notinbranch)
+!GCC$ builtin (atanpif) attributes simd (notinbranch)
+!GCC$ builtin (cbrt) attributes simd (notinbranch)
+!GCC$ builtin (cbrtf) attributes simd (notinbranch)
!GCC$ builtin (cos) attributes simd (notinbranch)
!GCC$ builtin (cosf) attributes simd (notinbranch)
+!GCC$ builtin (cosh) attributes simd (notinbranch)
+!GCC$ builtin (coshf) attributes simd (notinbranch)
+!GCC$ builtin (cospi) attributes simd (notinbranch)
+!GCC$ builtin (cospif) attributes simd (notinbranch)
+!GCC$ builtin (erf) attributes simd (notinbranch)
+!GCC$ builtin (erfc) attributes simd (notinbranch)
+!GCC$ builtin (erfcf) attributes simd (notinbranch)
+!GCC$ builtin (erff) attributes simd (notinbranch)
!GCC$ builtin (exp) attributes simd (notinbranch)
-!GCC$ builtin (expf) attributes simd (notinbranch)
!GCC$ builtin (exp10) attributes simd (notinbranch)
!GCC$ builtin (exp10f) attributes simd (notinbranch)
+!GCC$ builtin (exp10m1) attributes simd (notinbranch)
+!GCC$ builtin (exp10m1f) attributes simd (notinbranch)
!GCC$ builtin (exp2) attributes simd (notinbranch)
!GCC$ builtin (exp2f) attributes simd (notinbranch)
+!GCC$ builtin (exp2m1) attributes simd (notinbranch)
+!GCC$ builtin (exp2m1f) attributes simd (notinbranch)
+!GCC$ builtin (expf) attributes simd (notinbranch)
!GCC$ builtin (expm1) attributes simd (notinbranch)
!GCC$ builtin (expm1f) attributes simd (notinbranch)
+!GCC$ builtin (hypot) attributes simd (notinbranch)
+!GCC$ builtin (hypotf) attributes simd (notinbranch)
!GCC$ builtin (log) attributes simd (notinbranch)
-!GCC$ builtin (logf) attributes simd (notinbranch)
!GCC$ builtin (log10) attributes simd (notinbranch)
!GCC$ builtin (log10f) attributes simd (notinbranch)
!GCC$ builtin (log1p) attributes simd (notinbranch)
!GCC$ builtin (log1pf) attributes simd (notinbranch)
!GCC$ builtin (log2) attributes simd (notinbranch)
!GCC$ builtin (log2f) attributes simd (notinbranch)
+!GCC$ builtin (logf) attributes simd (notinbranch)
+!GCC$ builtin (pow) attributes simd (notinbranch)
+!GCC$ builtin (powf) attributes simd (notinbranch)
!GCC$ builtin (sin) attributes simd (notinbranch)
!GCC$ builtin (sinf) attributes simd (notinbranch)
+!GCC$ builtin (sinh) attributes simd (notinbranch)
+!GCC$ builtin (sinhf) attributes simd (notinbranch)
+!GCC$ builtin (sinpi) attributes simd (notinbranch)
+!GCC$ builtin (sinpif) attributes simd (notinbranch)
!GCC$ builtin (tan) attributes simd (notinbranch)
!GCC$ builtin (tanf) attributes simd (notinbranch)
+!GCC$ builtin (tanh) attributes simd (notinbranch)
+!GCC$ builtin (tanhf) attributes simd (notinbranch)
+!GCC$ builtin (tanpi) attributes simd (notinbranch)
+!GCC$ builtin (tanpif) attributes simd (notinbranch)
@@ -0,0 +1,97 @@
commit 6c22823da57aa5218f717f569c04c9573c0448c5
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu Nov 6 18:26:54 2025 +0000
AArch64: Fix instability in AdvSIMD tan
Previously, the presence of special cases in one lane could affect the
results in other lanes due to unconditional scalar fallback. The old
WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
been removed from AOR, making it easier to spot and fix this. 4%
improvement in throughput with GCC 14 on Neoverse V1. This bug is
present as far back as 2.39 (where tan was first introduced).
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index d56a102dd17a3463..c6a5a17126674d7d 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -25,9 +25,7 @@ static const struct data
float64x2_t poly[9];
double half_pi[2];
float64x2_t two_over_pi, shift;
-#if !WANT_SIMD_EXCEPT
float64x2_t range_val;
-#endif
} data = {
/* Coefficients generated using FPMinimax. */
.poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
@@ -38,20 +36,17 @@ static const struct data
.half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
.two_over_pi = V2 (0x1.45f306dc9c883p-1),
.shift = V2 (0x1.8p52),
-#if !WANT_SIMD_EXCEPT
.range_val = V2 (0x1p23),
-#endif
};
#define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */
#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */
-#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */
/* Special cases (fall back to scalar calls). */
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t n, float64x2_t d, uint64x2_t special)
{
- return v_call_f64 (tan, x, x, v_u64 (-1));
+ return v_call_f64 (tan, x, vdivq_f64 (n, d), special);
}
/* Vector approximation for double-precision tan.
@@ -65,14 +60,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
very large inputs. Fall back to scalar routine for all lanes if any are
too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
tiny input to avoid underflow. */
-#if WANT_SIMD_EXCEPT
- uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
- /* iax - tiny_bound > range_val - tiny_bound. */
- uint64x2_t special
- = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x);
-#endif
/* q = nearest integer to 2 * x / pi. */
float64x2_t q
@@ -81,9 +68,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
/* Use q to reduce x to r in [-pi/4, pi/4], by:
r = x - q * pi/2, in extended precision. */
- float64x2_t r = x;
float64x2_t half_pi = vld1q_f64 (dat->half_pi);
- r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+ float64x2_t r = vfmsq_laneq_f64 (x, q, half_pi, 0);
r = vfmsq_laneq_f64 (r, q, half_pi, 1);
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
formula. */
@@ -114,12 +100,13 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
-#if !WANT_SIMD_EXCEPT
uint64x2_t special = vcageq_f64 (x, dat->range_val);
+ float64x2_t swap = vbslq_f64 (no_recip, n, vnegq_f64 (d));
+ d = vbslq_f64 (no_recip, d, n);
+ n = swap;
+
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x);
-#endif
+ return special_case (x, n, d, special);
- return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
- vbslq_f64 (no_recip, d, n));
+ return vdivq_f64 (n, d);
}
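The structural fix is visible in special_case: the scalar routine now replaces only the lanes flagged as special, while all other lanes keep the vector result. The lane-merging helper behaves roughly like this sketch (an approximation of what v_call_f64 does, not the exact glibc internals):

    #include <arm_neon.h>
    #include <stdint.h>

    static float64x2_t
    call_lanewise (double (*f) (double), float64x2_t x, float64x2_t y,
                   uint64x2_t special)
    {
      double xs[2], ys[2];
      uint64_t m[2];
      vst1q_f64 (xs, x);
      vst1q_f64 (ys, y);
      vst1q_u64 (m, special);
      for (int i = 0; i < 2; i++)
        if (m[i]) /* only special lanes take the scalar path */
          ys[i] = f (xs[i]);
      return vld1q_f64 (ys);
    }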
@@ -0,0 +1,88 @@
commit e45af510bc816e860c8e2e1d4a652b4fe15c4b34
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu Nov 6 18:29:33 2025 +0000
AArch64: Fix instability in AdvSIMD sinh
Previously, the presence of special cases in one lane could affect the
results in other lanes due to unconditional scalar fallback. The old
WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
been removed from AOR, making it easier to spot and fix
this. No measured change in performance. This patch applies cleanly as
far back as 2.41, however there are conflicts with 2.40 where sinh was
first introduced.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
index 7adf771517de2507..66504cdee84ee77e 100644
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -24,36 +24,26 @@ static const struct data
{
struct v_expm1_data d;
uint64x2_t halff;
-#if WANT_SIMD_EXCEPT
- uint64x2_t tiny_bound, thresh;
-#else
float64x2_t large_bound;
-#endif
} data = {
.d = V_EXPM1_DATA,
.halff = V2 (0x3fe0000000000000),
-#if WANT_SIMD_EXCEPT
- /* 2^-26, below which sinh(x) rounds to x. */
- .tiny_bound = V2 (0x3e50000000000000),
- /* asuint(large_bound) - asuint(tiny_bound). */
- .thresh = V2 (0x0230000000000000),
-#else
/* 2^9. expm1 helper overflows for large input. */
.large_bound = V2 (0x1p+9),
-#endif
};
static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t t, float64x2_t halfsign,
+ uint64x2_t special)
{
- return v_call_f64 (sinh, x, x, v_u64 (-1));
+ return v_call_f64 (sinh, x, vmulq_f64 (t, halfsign), special);
}
/* Approximation for vector double-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
The greatest observed error is 2.52 ULP:
- _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
- want -0x1.ac2f05bb66fc9p-2. */
+ _ZGVnN2v_sinh(0x1.9f6ff2ab6fb19p-2) got 0x1.aaed83a3153ccp-2
+ want 0x1.aaed83a3153c9p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -63,21 +53,16 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
float64x2_t halfsign = vreinterpretq_f64_u64 (
vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
-#if WANT_SIMD_EXCEPT
- uint64x2_t special = vcgeq_u64 (
- vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
-#else
uint64x2_t special = vcageq_f64 (x, d->large_bound);
-#endif
-
- /* Fall back to scalar variant for all lanes if any of them are special. */
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x);
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
float64x2_t t = expm1_inline (ax, &d->d);
t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, t, halfsign, special);
+
return vmulq_f64 (t, halfsign);
}
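The rearrangement in the code above follows from t = expm1(|x|): since e^|x| = 1 + t and e^-|x| = 1/(1 + t), the difference collapses to t + t/(t + 1). A scalar sketch of the same construction:

    #include <math.h>

    static double
    sinh_ref (double x)
    {
      /* sinh(x) = (e^x - e^-x) / 2 = (t + t/(t + 1)) / 2 with
         t = expm1 (|x|); accurate even for tiny |x|.  */
      double t = expm1 (fabs (x));
      double r = 0.5 * (t + t / (t + 1.0));
      return copysign (r, x);
    }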
475
glibc-RHEL-118273-5.patch Normal file
@@ -0,0 +1,475 @@
commit 8b679205286e7874f0b04187c0bc787632168aa2
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Wed Apr 3 12:13:53 2024 +0100
aarch64/fpu: Add vector variants of atanh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index d474f2969dd05c26..4c878e590681becc 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -3,6 +3,7 @@ libmvec-supported-funcs = acos \
asin \
asinh \
atan \
+ atanh \
atan2 \
cos \
cosh \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 08ea15efaec959fb..092949dc96d55624 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -89,6 +89,11 @@ libmvec {
_ZGVnN4v_asinhf;
_ZGVsMxv_asinh;
_ZGVsMxv_asinhf;
+ _ZGVnN2v_atanh;
+ _ZGVnN2v_atanhf;
+ _ZGVnN4v_atanhf;
+ _ZGVsMxv_atanh;
+ _ZGVsMxv_atanhf;
_ZGVnN2v_cosh;
_ZGVnN2v_coshf;
_ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 1e80721c9f73ba12..afbb01e191b917a4 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -22,6 +22,7 @@ libmvec_hidden_proto (V_NAME_F1(acosh));
libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(asinh));
libmvec_hidden_proto (V_NAME_F1(atan));
+libmvec_hidden_proto (V_NAME_F1(atanh));
libmvec_hidden_proto (V_NAME_F1(cos));
libmvec_hidden_proto (V_NAME_F1(cosh));
libmvec_hidden_proto (V_NAME_F1(erf));
diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c
new file mode 100644
index 0000000000000000..3c3d0bd6ad41396d
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanh_advsimd.c
@@ -0,0 +1,64 @@
+/* Double-precision vector (Advanced SIMD) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one, half;
+} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .half = V2 (0x3fe0000000000000) };
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (atanh, x, y, special);
+}
+
+/* Approximation for vector double-precision atanh(x) using modified log1p.
+ The greatest observed error is 3.31 ULP:
+ _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+ want 0x1.ffd8ff31b501cp-6. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (ax);
+ uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
+ uint64x2_t special = vcgeq_u64 (ia, d->one);
+ float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
+
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, special);
+#endif
+
+ float64x2_t y;
+ y = vaddq_f64 (ax, ax);
+ y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
+ y = log1p_inline (y, &d->log1p_consts);
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, vmulq_f64 (y, halfsign), special);
+ return vmulq_f64 (y, halfsign);
+}
diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c
new file mode 100644
index 0000000000000000..7a52728d70f6d226
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanh_sve.c
@@ -0,0 +1,59 @@
+/* Double-precision vector (SVE) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_SV_LOG1P_K0_SHORTCUT 0
+#include "sv_log1p_inline.h"
+
+#define One (0x3ff0000000000000)
+#define Half (0x3fe0000000000000)
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (atanh, x, y, special);
+}
+
+/* SVE approximation for double-precision atanh, based on log1p.
+ The greatest observed error is 2.81 ULP:
+ _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+ want 0x1.ffd8ff31b501cp-6. */
+svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
+{
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t iax = svreinterpret_u64 (ax);
+ svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+ svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
+
+ /* It is special if iax >= 1. */
+// svbool_t special = svcmpge (pg, iax, One);
+ svbool_t special = svacge (pg, x, 1.0);
+
+ /* Computation is performed based on the following sequence of equality:
+ (1+x)/(1-x) = 1 + 2x/(1-x). */
+ svfloat64_t y;
+ y = svadd_x (pg, ax, ax);
+ y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1), ax));
+ /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
+ y = sv_log1p_inline (y, pg);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, halfsign, y), special);
+ return svmul_x (pg, halfsign, y);
+}
diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c
new file mode 100644
index 0000000000000000..ae488f7b54ddce26
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c
@@ -0,0 +1,79 @@
+/* Single-precision vector (Advanced SIMD) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "v_log1pf_inline.h"
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .one = V4 (0x3f800000),
+#if WANT_SIMD_EXCEPT
+ /* 0x1p-12, below which atanhf(x) rounds to x. */
+ .tiny_bound = V4 (0x39800000),
+#endif
+};
+
+#define AbsMask v_u32 (0x7fffffff)
+#define Half v_u32 (0x3f000000)
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (atanhf, x, y, special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+ The maximum error is 3.08 ULP:
+ __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
+ want 0x1.ffcb82p-5. */
+VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special
+ = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound));
+ /* Side-step special cases by setting those lanes to 0, which will trigger no
+ exceptions. These will be fixed up later. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ ax = v_zerofy_f32 (ax, special);
+#else
+ uint32x4_t special = vcgeq_u32 (iax, d->one);
+#endif
+
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
+ y = log1pf_inline (y, d->log1pf_consts);
+
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, vmulq_f32 (halfsign, y), special);
+ return vmulq_f32 (halfsign, y);
+}
+libmvec_hidden_def (V_NAME_F1 (atanh))
+HALF_WIDTH_ALIAS_F1 (atanh)
diff --git a/sysdeps/aarch64/fpu/atanhf_sve.c b/sysdeps/aarch64/fpu/atanhf_sve.c
new file mode 100644
index 0000000000000000..dae83041ef7157f0
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanhf_sve.c
@@ -0,0 +1,54 @@
+/* Single-precision vector (SVE) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_log1pf_inline.h"
+
+#define One (0x3f800000)
+#define Half (0x3f000000)
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (atanhf, x, y, special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+ The maximum error is 2.28 ULP:
+ _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
+ want 0x1.ffbbb6p-5. */
+svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
+{
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+ svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, Half));
+ svbool_t special = svcmpge (pg, iax, One);
+
+ /* Computation is performed based on the following sequence of equality:
+ * (1+x)/(1-x) = 1 + 2x/(1-x). */
+ svfloat32_t y = svadd_x (pg, ax, ax);
+ y = svdiv_x (pg, y, svsub_x (pg, sv_f32 (1), ax));
+ /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
+ y = sv_log1pf_inline (y, pg);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, halfsign, y), special);
+
+ return svmul_x (pg, halfsign, y);
+}
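Both atanh variants rest on the algebra noted in their comments: (1 + x)/(1 - x) = 1 + 2x/(1 - x), hence atanh(x) = 0.5 * log1p(2x/(1 - x)). A scalar sketch for reference:

    #include <math.h>

    static float
    atanhf_ref (float x)
    {
      /* atanh(x) = 0.5 * ln ((1 + x)/(1 - x))
                  = 0.5 * log1p (2x / (1 - x)),  for |x| < 1.  */
      float ax = fabsf (x);
      float y = log1pf ((ax + ax) / (1.0f - ax));
      return copysignf (0.5f * y, x);
    }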
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index eb2af35b27757fc6..ab7a8f74548854b9 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -49,6 +49,10 @@
# define __DECL_SIMD_atan __DECL_SIMD_aarch64
# undef __DECL_SIMD_atanf
# define __DECL_SIMD_atanf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_atanh
+# define __DECL_SIMD_atanh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_atanhf
+# define __DECL_SIMD_atanhf __DECL_SIMD_aarch64
# undef __DECL_SIMD_atan2
# define __DECL_SIMD_atan2 __DECL_SIMD_aarch64
# undef __DECL_SIMD_atan2f
@@ -137,6 +141,7 @@ __vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
@@ -157,6 +162,7 @@ __vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
@@ -182,6 +188,7 @@ __sv_f32_t _ZGVsMxv_acoshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_atanhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
@@ -202,6 +209,7 @@ __sv_f64_t _ZGVsMxv_acosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_atanh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 3d7177c32dcd77a6..a01aa99c16740631 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (acosh_advsimd, _ZGVnN2v_acosh)
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
+VPCS_VECTOR_WRAPPER (atanh_advsimd, _ZGVnN2v_atanh)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index b88a2afe5c1198c0..83cb3ad5d0e4d056 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (acosh_sve, _ZGVsMxv_acosh)
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
+SVE_VECTOR_WRAPPER (atanh_sve, _ZGVsMxv_atanh)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 533655402d3f3737..831d4d755272d616 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (acoshf_advsimd, _ZGVnN4v_acoshf)
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
+VPCS_VECTOR_WRAPPER (atanhf_advsimd, _ZGVnN4v_atanhf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index f7b673e3358e7d82..96fd612c3e76f6dc 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (acoshf_sve, _ZGVsMxv_acoshf)
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
+SVE_VECTOR_WRAPPER (atanhf_sve, _ZGVsMxv_atanhf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index b916e422432014c2..7c2e43d3dc5bbc13 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -173,11 +173,19 @@ double: 2
float: 2
ldouble: 4
+Function: "atanh_advsimd":
+double: 1
+float: 1
+
Function: "atanh_downward":
double: 3
float: 3
ldouble: 4
+Function: "atanh_sve":
+double: 2
+float: 1
+
Function: "atanh_towardzero":
double: 2
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index f288afdfdd9c8757..ce42372a3a276832 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -77,18 +77,23 @@ GLIBC_2.40 _ZGVnN2v_acosh F
GLIBC_2.40 _ZGVnN2v_acoshf F
GLIBC_2.40 _ZGVnN2v_asinh F
GLIBC_2.40 _ZGVnN2v_asinhf F
+GLIBC_2.40 _ZGVnN2v_atanh F
+GLIBC_2.40 _ZGVnN2v_atanhf F
GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
+GLIBC_2.40 _ZGVnN4v_atanhf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
GLIBC_2.40 _ZGVsMxv_acosh F
GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_asinh F
GLIBC_2.40 _ZGVsMxv_asinhf F
+GLIBC_2.40 _ZGVsMxv_atanh F
+GLIBC_2.40 _ZGVsMxv_atanhf F
GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F

glibc-RHEL-118273-6.patch Normal file

@@ -0,0 +1,758 @@
commit eedbbca0bf3adf3c45aff6c4e128bae3a5562675
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Wed Apr 3 12:15:41 2024 +0100
aarch64/fpu: Add vector variants of sinh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 4c878e590681becc..fb5f3a365b27fdf3 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -17,6 +17,7 @@ libmvec-supported-funcs = acos \
log1p \
log2 \
sin \
+ sinh \
tan
float-advsimd-funcs = $(libmvec-supported-funcs)
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 092949dc96d55624..4774b3efeacf59fb 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -104,5 +104,10 @@ libmvec {
_ZGVnN4v_erff;
_ZGVsMxv_erf;
_ZGVsMxv_erff;
+ _ZGVnN2v_sinh;
+ _ZGVnN2v_sinhf;
+ _ZGVnN4v_sinhf;
+ _ZGVsMxv_sinh;
+ _ZGVsMxv_sinhf;
}
}
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index afbb01e191b917a4..7d9445d5c0c0c2a8 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -35,5 +35,6 @@ libmvec_hidden_proto (V_NAME_F1(log1p));
libmvec_hidden_proto (V_NAME_F1(log2));
libmvec_hidden_proto (V_NAME_F1(log));
libmvec_hidden_proto (V_NAME_F1(sin));
+libmvec_hidden_proto (V_NAME_F1(sinh));
libmvec_hidden_proto (V_NAME_F1(tan));
libmvec_hidden_proto (V_NAME_F2(atan2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index ab7a8f74548854b9..1e9b76cf41916365 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -105,6 +105,10 @@
# define __DECL_SIMD_sin __DECL_SIMD_aarch64
# undef __DECL_SIMD_sinf
# define __DECL_SIMD_sinf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_sinh
+# define __DECL_SIMD_sinh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_sinhf
+# define __DECL_SIMD_sinhf __DECL_SIMD_aarch64
# undef __DECL_SIMD_tan
# define __DECL_SIMD_tan __DECL_SIMD_aarch64
# undef __DECL_SIMD_tanf
@@ -154,6 +158,7 @@ __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
@@ -175,6 +180,7 @@ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
# undef __ADVSIMD_VEC_MATH_SUPPORTED
@@ -201,6 +207,7 @@ __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
__sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
@@ -222,6 +229,7 @@ __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t);
# undef __SVE_VEC_MATH_SUPPORTED
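
These declarations are what let a vectorizing compiler route scalar math calls to libmvec; a sketch of typical use (the exact flags are toolchain-dependent and given here as an assumption):

#include <math.h>

/* With the __DECL_SIMD declarations above, GCC may replace the scalar
   sinh call in this loop with _ZGVnN2v_sinh (AdvSIMD) or _ZGVsMxv_sinh
   (SVE), e.g. when built with -O2 -ftree-loop-vectorize -fno-math-errno.  */
void
sinh_array (double *restrict y, const double *restrict x, int n)
{
  for (int i = 0; i < n; i++)
    y[i] = sinh (x[i]);
}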
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
new file mode 100644
index 0000000000000000..fa3723b10c15eb29
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -0,0 +1,121 @@
+/* Double-precision vector (Advanced SIMD) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+ float64x2_t poly[11];
+ float64x2_t inv_ln2, m_ln2, shift;
+ uint64x2_t halff;
+ int64x2_t onef;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t tiny_bound, thresh;
+#else
+ uint64x2_t large_bound;
+#endif
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+ V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+ V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+ V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+ V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep0),
+ .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
+ .shift = V2 (0x1.8p52),
+
+ .halff = V2 (0x3fe0000000000000),
+ .onef = V2 (0x3ff0000000000000),
+#if WANT_SIMD_EXCEPT
+ /* 2^-26, below which sinh(x) rounds to x. */
+ .tiny_bound = V2 (0x3e50000000000000),
+ /* asuint(large_bound) - asuint(tiny_bound). */
+ .thresh = V2 (0x0230000000000000),
+#else
+/* 2^9. expm1 helper overflows for large input. */
+ .large_bound = V2 (0x4080000000000000),
+#endif
+};
+
+static inline float64x2_t
+expm1_inline (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Reduce argument:
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where i = round(x / ln2)
+ and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
+ float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
+ int64x2_t i = vcvtq_s64_f64 (j);
+ float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0);
+ f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1);
+ /* Approximate expm1(f) using polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t f8 = vmulq_f64 (f4, f4);
+ float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
+ /* t = 2^i. */
+ float64x2_t t = vreinterpretq_f64_u64 (
+ vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef)));
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x)
+{
+ return v_call_f64 (sinh, x, x, v_u64 (-1));
+}
+
+/* Approximation for vector double-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The greatest observed error is 2.57 ULP:
+ _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
+ want 0x1.ab34e59d678d9p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t sign
+ = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
+ float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff));
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t special = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
+#else
+ uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound);
+#endif
+
+ /* Fall back to scalar variant for all lanes if any of them are special. */
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x);
+
+ /* Up to the point that expm1 overflows, we can use it to calculate sinh
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ float64x2_t t = expm1_inline (ax);
+ t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+ return vmulq_f64 (t, halfsign);
+}
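
The "slight rearrangement" is worth spelling out: with t = expm1(|x|), e^|x| = t + 1 and e^-|x| = 1/(t + 1), so sinh(|x|) = (t + t/(t + 1))/2, which keeps accuracy for small |x| where exp(x) - exp(-x) would cancel. A scalar sketch (sinh_via_expm1 is hypothetical):

#include <math.h>

static double
sinh_via_expm1 (double x)
{
  double t = expm1 (fabs (x));
  /* (e^|x| - e^-|x|) / 2 == (t + t / (t + 1)) / 2.  */
  return copysign (0.5 * (t + t / (t + 1.0)), x);
}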
diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c
new file mode 100644
index 0000000000000000..df5f6c8c06e5b173
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinh_sve.c
@@ -0,0 +1,107 @@
+/* Double-precision vector (SVE) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ float64_t poly[11];
+ float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift;
+ uint64_t halff;
+ int64_t onef;
+ uint64_t large_bound;
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
+ 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
+ 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+ .inv_ln2 = 0x1.71547652b82fep0,
+ .m_ln2_hi = -0x1.62e42fefa39efp-1,
+ .m_ln2_lo = -0x1.abc9e3b39803fp-56,
+ .shift = 0x1.8p52,
+
+ .halff = 0x3fe0000000000000,
+ .onef = 0x3ff0000000000000,
+ /* 2^9. expm1 helper overflows for large input. */
+ .large_bound = 0x4080000000000000,
+};
+
+static inline svfloat64_t
+expm1_inline (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Reduce argument:
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where i = round(x / ln2)
+ and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
+ svfloat64_t j
+ = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
+ svint64_t i = svcvt_s64_x (pg, j);
+ svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi);
+ f = svmla_x (pg, f, j, d->m_ln2_lo);
+ /* Approximate expm1(f) using polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t f4 = svmul_x (pg, f2, f2);
+ svfloat64_t f8 = svmul_x (pg, f4, f4);
+ svfloat64_t p
+ = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
+ /* t = 2^i. */
+ svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return svmla_x (pg, svsub_x (pg, t, 1.0), p, t);
+}
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svbool_t pg)
+{
+ return sv_call_f64 (sinh, x, x, pg);
+}
+
+/* Approximation for SVE double-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The greatest observed error is 2.57 ULP:
+ _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2
+ want 0x1.ab929fc64bd63p-2. */
+svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t sign
+ = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
+ svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
+
+ svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound);
+
+ /* Fall back to scalar variant for all lanes if any are special. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, pg);
+
+ /* Up to the point that expm1 overflows, we can use it to calculate sinh
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ svfloat64_t t = expm1_inline (ax, pg);
+ t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+ return svmul_x (pg, t, halfsign);
+}
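
Note the one real divergence between the two variants: the scaling t = 2^i. SVE has svscale (the FSCALE instruction) for this, while the AdvSIMD version builds the value by shifting i into the exponent field and adding the bit pattern of 1.0. A scalar sketch of that bit trick (valid only while i + 1023 stays a normal exponent):

#include <stdint.h>
#include <string.h>

static double
pow2i (int64_t i)
{
  /* (i << 52) + bits(1.0) adds i to the biased exponent of 1.0.  */
  uint64_t bits = ((uint64_t) i << 52) + 0x3ff0000000000000ULL;
  double t;
  memcpy (&t, &bits, sizeof t);
  return t;
}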
diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c
new file mode 100644
index 0000000000000000..6bb7482dc28795c1
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
@@ -0,0 +1,88 @@
+/* Single-precision vector (Advanced SIMD) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+ uint32x4_t halff;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound, thresh;
+#else
+ uint32x4_t oflow_bound;
+#endif
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+ .halff = V4 (0x3f000000),
+#if WANT_SIMD_EXCEPT
+ /* 0x1.6a09e8p-32, below which expm1f underflows. */
+ .tiny_bound = V4 (0x2fb504f4),
+ /* asuint(oflow_bound) - asuint(tiny_bound). */
+ .thresh = V4 (0x12fbbbb3),
+#else
+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
+ .oflow_bound = V4 (0x42b0c0a7),
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (sinhf, x, y, special);
+}
+
+/* Approximation for vector single-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The maximum error is 2.26 ULP:
+ _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+ want 0x1.e469e4p-4. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t sign = veorq_u32 (ix, iax);
+ float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+ ax = v_zerofy_f32 (ax, special);
+#else
+ uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+#endif
+
+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+ using a slight rearrangement of the definition of sinh. This allows us
+ to retain acceptable accuracy for very small inputs. */
+ float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
+ t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+
+ /* Fall back to the scalar variant for any lanes that should trigger an
+ exception. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, vmulq_f32 (t, halfsign), special);
+
+ return vmulq_f32 (t, halfsign);
+}
+libmvec_hidden_def (V_NAME_F1 (sinh))
+HALF_WIDTH_ALIAS_F1 (sinh)
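
The WANT_SIMD_EXCEPT path above folds both range checks into one compare. Since thresh = asuint(oflow_bound) - asuint(tiny_bound), the unsigned subtraction wraps tiny inputs around to huge values, so a single comparison flags tiny, huge and NaN lanes together. Scalar sketch (is_special is hypothetical):

#include <stdint.h>

/* True when ix < tiny or ix >= tiny + thresh (wraparound compare).  */
static int
is_special (uint32_t ix, uint32_t tiny, uint32_t thresh)
{
  return (uint32_t) (ix - tiny) >= thresh;
}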
diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c
new file mode 100644
index 0000000000000000..6c204b57a2aa18d3
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinhf_sve.c
@@ -0,0 +1,67 @@
+/* Single-precision vector (SVE) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_expm1f_inline.h"
+#include "sv_math.h"
+
+static const struct data
+{
+ struct sv_expm1f_data expm1f_consts;
+ uint32_t halff, large_bound;
+} data = {
+ .expm1f_consts = SV_EXPM1F_DATA,
+ .halff = 0x3f000000,
+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
+ .large_bound = 0x42b0c0a7,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+{
+ return sv_call_f32 (sinhf, x, y, pg);
+}
+
+/* Approximation for SVE single-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The maximum error is 2.26 ULP:
+ _ZGVsMxv_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+ want 0x1.e469e4p-4. */
+svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t sign
+ = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+ svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
+
+ svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->large_bound);
+
+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ svfloat32_t t = expm1f_inline (ax, pg, &d->expm1f_consts);
+ t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+
+ /* Fall back to the scalar variant for any lanes which would cause
+ expm1f to overflow. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, t, halfsign), special);
+
+ return svmul_x (pg, t, halfsign);
+}
diff --git a/sysdeps/aarch64/fpu/sv_expm1f_inline.h b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
new file mode 100644
index 0000000000000000..5b7245122294e1b4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
@@ -0,0 +1,84 @@
+/* Single-precision inline helper for vector (SVE) expm1 function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_SV_EXPM1F_INLINE_H
+#define AARCH64_FPU_SV_EXPM1F_INLINE_H
+
+#include "sv_math.h"
+
+struct sv_expm1f_data
+{
+ /* These 4 are grouped together so they can be loaded as one quadword, then
+ used with _lane forms of svmla/svmls. */
+ float32_t c2, c4, ln2_hi, ln2_lo;
+ float32_t c0, c1, c3, inv_ln2, shift;
+};
+
+/* Coefficients generated using fpminimax. */
+#define SV_EXPM1F_DATA \
+ { \
+ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ \
+ .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
+ .ln2_lo = 0x1.7f7d1cp-20f, \
+ }
+
+#define C(i) sv_f32 (d->c##i)
+
+static inline svfloat32_t
+expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
+{
+ /* This vector is reliant on layout of data - it contains constants
+ that can be used with _lane forms of svmla/svmls. Values are:
+ [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */
+ svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
+ j = svsub_x (pg, j, d->shift);
+ svint32_t i = svcvt_s32_x (pg, j);
+
+ svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
+ f = svmls_lane (f, j, lane_constants, 3);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p = svmla_x (pg, p12, f2, p34);
+ p = svmla_x (pg, C (0), f, p);
+ p = svmla_x (pg, f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ svfloat32_t t = svscale_x (pg, sv_f32 (1), i);
+ return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
+
+#endif
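
The layout comment above is the whole point of the struct: svld1rq broadcasts one 128-bit quadword across the (possibly longer) SVE vector, so every 128-bit segment holds { c2, c4, ln2_hi, ln2_lo } and the _lane forms can pick any of the four without further loads. A minimal sketch of the pattern (fma_with_lane is hypothetical):

#include <arm_sve.h>

static svfloat32_t
fma_with_lane (svfloat32_t acc, svfloat32_t x, const float *four_consts)
{
  /* Replicate four packed constants into every 128-bit segment.  */
  svfloat32_t lanes = svld1rq (svptrue_b32 (), four_consts);
  /* acc + x * four_consts[0], with no extra scalar loads in the loop.  */
  return svmla_lane (acc, x, lanes, 0);
}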
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index a01aa99c16740631..1a57b22c3a92f1e1 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -42,4 +42,5 @@ VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
+VPCS_VECTOR_WRAPPER (sinh_advsimd, _ZGVnN2v_sinh)
VPCS_VECTOR_WRAPPER (tan_advsimd, _ZGVnN2v_tan)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 83cb3ad5d0e4d056..0c9858f6b74aaef6 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -61,4 +61,5 @@ SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
+SVE_VECTOR_WRAPPER (sinh_sve, _ZGVsMxv_sinh)
SVE_VECTOR_WRAPPER (tan_sve, _ZGVsMxv_tan)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 831d4d755272d616..4758490c6fc40fda 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -42,4 +42,5 @@ VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
+VPCS_VECTOR_WRAPPER (sinhf_advsimd, _ZGVnN4v_sinhf)
VPCS_VECTOR_WRAPPER (tanf_advsimd, _ZGVnN4v_tanf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 96fd612c3e76f6dc..7c04f07bbee84777 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -61,4 +61,5 @@ SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
+SVE_VECTOR_WRAPPER (sinhf_sve, _ZGVsMxv_sinhf)
SVE_VECTOR_WRAPPER (tanf_sve, _ZGVsMxv_tanf)
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
new file mode 100644
index 0000000000000000..337ccfbfab555c97
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -0,0 +1,73 @@
+/* Single-precision inline helper for vector (Advanced SIMD) expm1 function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_EXPM1F_INLINE_H
+#define AARCH64_FPU_V_EXPM1F_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+struct v_expm1f_data
+{
+ float32x4_t poly[5];
+ float32x4_t invln2_and_ln2, shift;
+ int32x4_t exponent_bias;
+};
+
+/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+ log(2)/2]. Exponent bias is asuint(1.0f).
+ invln2_and_ln2 stores constants: invln2, ln2_hi, ln2_lo, 0. */
+#define V_EXPM1F_DATA \
+ { \
+ .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
+ V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
+ .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
+ .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ }
+
+static inline float32x4_t
+expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1.
+ Copied from v_expm1f_1u6.c, with all special-case handling removed - the
+ calling routine should handle special values if required. */
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ float32x4_t j = vsubq_f32 (
+ vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+ int32x4_t i = vcvtq_s32_f32 (j);
+ float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
+ f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
+
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+ Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
+ Horner. */
+ float32x4_t f2 = vmulq_f32 (f, f);
+ float32x4_t f4 = vmulq_f32 (f2, f2);
+ float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
+ p = vfmaq_f32 (f, f2, p);
+
+ /* t = 2^i. */
+ int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+ float32x4_t t = vreinterpretq_f32_s32 (u);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+}
+
+#endif
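
The comment in expm1f_inline mentions Estrin versus Horner: Horner chains every FMA serially, while Estrin pairs terms so independent FMAs can issue in parallel, at the cost of computing f^2 (and f^4) up front. A scalar sketch of the same degree-4 polynomial both ways (names hypothetical):

static float
horner4 (float f, const float *p)
{
  return p[0] + f * (p[1] + f * (p[2] + f * (p[3] + f * p[4])));
}

static float
estrin4 (float f, const float *p)
{
  float f2 = f * f;
  float p01 = p[0] + f * p[1];   /* independent of p23 below */
  float p23 = p[2] + f * p[3];
  return p01 + f2 * p23 + (f2 * f2) * p[4];
}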
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 7c2e43d3dc5bbc13..fec0972081af734a 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1441,11 +1441,19 @@ double: 2
float: 2
ldouble: 2
+Function: "sinh_advsimd":
+double: 2
+float: 1
+
Function: "sinh_downward":
double: 3
float: 3
ldouble: 3
+Function: "sinh_sve":
+double: 2
+float: 1
+
Function: "sinh_towardzero":
double: 3
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index ce42372a3a276832..1db5ba61d64067a2 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -83,11 +83,14 @@ GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
+GLIBC_2.40 _ZGVnN2v_sinh F
+GLIBC_2.40 _ZGVnN2v_sinhf F
GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_atanhf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
+GLIBC_2.40 _ZGVnN4v_sinhf F
GLIBC_2.40 _ZGVsMxv_acosh F
GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_asinh F
@@ -98,3 +101,5 @@ GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F
GLIBC_2.40 _ZGVsMxv_erff F
+GLIBC_2.40 _ZGVsMxv_sinh F
+GLIBC_2.40 _ZGVsMxv_sinhf F

glibc-RHEL-118273-7.patch Normal file

@@ -0,0 +1,624 @@
commit 3d3a4fb8e4fe854a0bbb3df9c26ba482c10a7e22
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:44 2024 +0000
aarch64/fpu: Add vector variants of tanh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in
index 5a690023e9a675cb..4584c5e498ab7194 100644
--- a/math/auto-libm-test-in
+++ b/math/auto-libm-test-in
@@ -7747,7 +7747,7 @@ tan min_subnorm
tan -min_subnorm
tanh 0
-tanh -0
+tanh -0 no-mathvec
tanh 0.75
tanh -0.75
tanh 1.0
diff --git a/math/auto-libm-test-out-tanh b/math/auto-libm-test-out-tanh
index 8b9427c917f3b388..19ce2e7b9355963d 100644
--- a/math/auto-libm-test-out-tanh
+++ b/math/auto-libm-test-out-tanh
@@ -23,31 +23,31 @@ tanh 0
= tanh tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok
= tanh towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok
= tanh upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok
-tanh -0
-= tanh downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
+tanh -0 no-mathvec
+= tanh downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
tanh 0.75
= tanh downward binary32 0xcp-4 : 0xa.2991fp-4 : inexact-ok
= tanh tonearest binary32 0xcp-4 : 0xa.2991fp-4 : inexact-ok
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index fb5f3a365b27fdf3..e5f418ae4274edb2 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -18,7 +18,8 @@ libmvec-supported-funcs = acos \
log2 \
sin \
sinh \
- tan
+ tan \
+ tanh
float-advsimd-funcs = $(libmvec-supported-funcs)
double-advsimd-funcs = $(libmvec-supported-funcs)
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 4774b3efeacf59fb..4dbf3d32441dd43a 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -109,5 +109,10 @@ libmvec {
_ZGVnN4v_sinhf;
_ZGVsMxv_sinh;
_ZGVsMxv_sinhf;
+ _ZGVnN2v_tanh;
+ _ZGVnN2v_tanhf;
+ _ZGVnN4v_tanhf;
+ _ZGVsMxv_tanh;
+ _ZGVsMxv_tanhf;
}
}
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 7d9445d5c0c0c2a8..4ff191c324050b42 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -37,4 +37,5 @@ libmvec_hidden_proto (V_NAME_F1(log));
libmvec_hidden_proto (V_NAME_F1(sin));
libmvec_hidden_proto (V_NAME_F1(sinh));
libmvec_hidden_proto (V_NAME_F1(tan));
+libmvec_hidden_proto (V_NAME_F1(tanh));
libmvec_hidden_proto (V_NAME_F2(atan2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 1e9b76cf41916365..585e022082d62a5d 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -113,6 +113,10 @@
# define __DECL_SIMD_tan __DECL_SIMD_aarch64
# undef __DECL_SIMD_tanf
# define __DECL_SIMD_tanf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_tanh
+# define __DECL_SIMD_tanh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_tanhf
+# define __DECL_SIMD_tanhf __DECL_SIMD_aarch64
#endif
#if __GNUC_PREREQ(9, 0)
@@ -160,6 +164,7 @@ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
@@ -182,6 +187,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t);
# undef __ADVSIMD_VEC_MATH_SUPPORTED
#endif /* __ADVSIMD_VEC_MATH_SUPPORTED */
@@ -209,6 +215,7 @@ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_tanhf (__sv_f32_t, __sv_bool_t);
__sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
@@ -231,6 +238,7 @@ __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_tanh (__sv_f64_t, __sv_bool_t);
# undef __SVE_VEC_MATH_SUPPORTED
#endif /* __SVE_VEC_MATH_SUPPORTED */
diff --git a/sysdeps/aarch64/fpu/tanh_advsimd.c b/sysdeps/aarch64/fpu/tanh_advsimd.c
new file mode 100644
index 0000000000000000..1da1dfa5dbe418b6
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanh_advsimd.c
@@ -0,0 +1,109 @@
+/* Double-precision vector (Advanced SIMD) tanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+ float64x2_t poly[11];
+ float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+ uint64x2_t onef;
+ uint64x2_t thresh, tiny_bound;
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+ V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+ V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+ V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+ V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep0),
+ .ln2_hi = V2 (-0x1.62e42fefa39efp-1),
+ .ln2_lo = V2 (-0x1.abc9e3b39803fp-56),
+ .shift = V2 (0x1.8p52),
+
+ .onef = V2 (0x3ff0000000000000),
+ .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
+ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
+ .thresh = V2 (0x01f241bf835f9d5f),
+};
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
+ the scalar variant of tanh. */
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
+ int64x2_t i = vcvtq_s64_f64 (j);
+ float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi);
+ f = vfmaq_f64 (f, j, d->ln2_lo);
+
+ /* Approximate expm1(f) using polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t p = vfmaq_f64 (
+ f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly));
+
+ /* t = 2 ^ i. */
+ float64x2_t t = vreinterpretq_f64_u64 (
+ vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef));
+ /* expm1(x) = p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t);
+}
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (tanh, x, y, special);
+}
+
+/* Vector approximation for double-precision tanh(x), using a simplified
+ version of expm1. The greatest observed error is 2.77 ULP:
+ _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
+ want -0x1.bd6a21a163624p-3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ float64x2_t u = x;
+
+ /* Trigger special-cases for tiny, boring and infinity/NaN. */
+ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
+#if WANT_SIMD_EXCEPT
+ /* To trigger fp exceptions correctly, set special lanes to a neutral value.
+ They will be fixed up later by the special-case handler. */
+ if (__glibc_unlikely (v_any_u64 (special)))
+ u = v_zerofy_f64 (u, special);
+#endif
+
+ u = vaddq_f64 (u, u);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ float64x2_t q = expm1_inline (u, d);
+ float64x2_t qp2 = vaddq_f64 (q, v_f64 (2));
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, vdivq_f64 (q, qp2), special);
+ return vdivq_f64 (q, qp2);
+}
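
Again the identity deserves a line of its own: with q = expm1(2x), tanh(x) = (e^2x - 1)/(e^2x + 1) = q/(q + 2). A scalar sketch (tanh_via_expm1 is hypothetical; the special-case path covers the large |x| where 2x would overflow expm1):

#include <math.h>

static double
tanh_via_expm1 (double x)
{
  double q = expm1 (2 * x);
  return q / (q + 2.0);
}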
diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c
new file mode 100644
index 0000000000000000..d25e011cea305094
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanh_sve.c
@@ -0,0 +1,100 @@
+/* Double-precision vector (SVE) tanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ float64_t poly[11];
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift;
+ uint64_t thresh, tiny_bound;
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
+ 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
+ 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+ .inv_ln2 = 0x1.71547652b82fep0,
+ .ln2_hi = -0x1.62e42fefa39efp-1,
+ .ln2_lo = -0x1.abc9e3b39803fp-56,
+ .shift = 0x1.8p52,
+
+ .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */
+ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
+ .thresh = 0x01f241bf835f9d5f,
+};
+
+static inline svfloat64_t
+expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
+ the scalar variant of tanh. */
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ svfloat64_t j
+ = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
+ svint64_t i = svcvt_s64_x (pg, j);
+ svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
+ f = svmla_x (pg, f, j, d->ln2_lo);
+
+ /* Approximate expm1(f) using polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t f4 = svmul_x (pg, f2, f2);
+ svfloat64_t p = svmla_x (
+ pg, f, f2,
+ sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
+
+ /* t = 2 ^ i. */
+ svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
+ /* expm1(x) = p * t + (t - 1). */
+ return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (tanh, x, y, special);
+}
+
+/* SVE approximation for double-precision tanh(x), using a simplified
+ version of expm1. The greatest observed error is 2.77 ULP:
+ _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
+ want -0x1.bd6a21a163624p-3. */
+svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
+
+ /* Trigger special-cases for tiny, boring and infinity/NaN. */
+ svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh);
+
+ svfloat64_t u = svadd_x (pg, x, x);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ svfloat64_t q = expm1_inline (u, pg, d);
+ svfloat64_t qp2 = svadd_x (pg, q, 2);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svdiv_x (pg, q, qp2), special);
+ return svdiv_x (pg, q, qp2);
+}
diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c
new file mode 100644
index 0000000000000000..50defd6ef03926f4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
@@ -0,0 +1,76 @@
+/* Single-precision vector (Advanced SIMD) tanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+ uint32x4_t boring_bound, large_bound, onef;
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+ /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
+ .boring_bound = V4 (0x41102cb3),
+ .large_bound = V4 (0x7f800000),
+ .onef = V4 (0x3f800000),
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified
+ version of expm1f. The maximum error is 2.58 ULP:
+ _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
+ want 0x1.f9ba08p-5. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t sign = veorq_u32 (ix, iax);
+ uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered properly, set all special and boring
+ lanes to 0, which will trigger no exceptions, and fix them up later. */
+ uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound),
+ vcltq_u32 (iax, v_u32 (0x34000000)));
+ x = v_zerofy_f32 (x, is_boring);
+ if (__glibc_unlikely (v_any_u32 (special)))
+ x = v_zerofy_f32 (x, special);
+#else
+ uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
+#endif
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (vreinterpretq_f32_u32 (ix),
+ vbslq_f32 (is_boring, boring, y), special);
+ return vbslq_f32 (is_boring, boring, y);
+}
+libmvec_hidden_def (V_NAME_F1 (tanh))
+HALF_WIDTH_ALIAS_F1 (tanh)
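
The "boring" selection above encodes the fact that for |x| > 0x1.205966p+3 (about 9.01), tanhf(x) is +/-1 after rounding, so those lanes skip the polynomial entirely. Scalar equivalent of the boring path (sketch):

#include <math.h>

static float
tanhf_boring (float x)
{
  return copysignf (1.0f, x);
}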
diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c
new file mode 100644
index 0000000000000000..0b94523cf5074200
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanhf_sve.c
@@ -0,0 +1,61 @@
+/* Single-precision vector (SVE) tanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_expm1f_inline.h"
+
+static const struct data
+{
+ struct sv_expm1f_data expm1f_consts;
+ uint32_t boring_bound, onef;
+} data = {
+ .expm1f_consts = SV_EXPM1F_DATA,
+ /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
+ .boring_bound = 0x41102cb3,
+ .onef = 0x3f800000,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision SVE tanh(x), using a simplified
+ version of expm1f. The maximum error is 2.57 ULP:
+ _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5
+ want 0x1.fb71aap-5. */
+svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+ svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound);
+ svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
+
+ svbool_t special = svcmpgt (pg, iax, 0x7f800000);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts);
+ svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svsel_f32 (is_boring, boring, y), special);
+ return svsel_f32 (is_boring, boring, y);
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 1a57b22c3a92f1e1..7aeda880bd885ce5 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -44,3 +44,4 @@ VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
VPCS_VECTOR_WRAPPER (sinh_advsimd, _ZGVnN2v_sinh)
VPCS_VECTOR_WRAPPER (tan_advsimd, _ZGVnN2v_tan)
+VPCS_VECTOR_WRAPPER (tanh_advsimd, _ZGVnN2v_tanh)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 0c9858f6b74aaef6..95f1ec52221ba626 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -63,3 +63,4 @@ SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
SVE_VECTOR_WRAPPER (sinh_sve, _ZGVsMxv_sinh)
SVE_VECTOR_WRAPPER (tan_sve, _ZGVsMxv_tan)
+SVE_VECTOR_WRAPPER (tanh_sve, _ZGVsMxv_tanh)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 4758490c6fc40fda..bd6800e91c64136f 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -44,3 +44,4 @@ VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
VPCS_VECTOR_WRAPPER (sinhf_advsimd, _ZGVnN4v_sinhf)
VPCS_VECTOR_WRAPPER (tanf_advsimd, _ZGVnN4v_tanf)
+VPCS_VECTOR_WRAPPER (tanhf_advsimd, _ZGVnN4v_tanhf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 7c04f07bbee84777..35ca305fddb7366c 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -63,3 +63,4 @@ SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
SVE_VECTOR_WRAPPER (sinhf_sve, _ZGVsMxv_sinhf)
SVE_VECTOR_WRAPPER (tanf_sve, _ZGVsMxv_tanf)
+SVE_VECTOR_WRAPPER (tanhf_sve, _ZGVsMxv_tanhf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index fec0972081af734a..8398b7bc7749808d 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1496,11 +1496,19 @@ double: 2
float: 2
ldouble: 2
+Function: "tanh_advsimd":
+double: 2
+float: 2
+
Function: "tanh_downward":
double: 3
float: 3
ldouble: 4
+Function: "tanh_sve":
+double: 2
+float: 2
+
Function: "tanh_towardzero":
double: 2
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 1db5ba61d64067a2..396082f6a7981686 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -85,12 +85,15 @@ GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
GLIBC_2.40 _ZGVnN2v_sinh F
GLIBC_2.40 _ZGVnN2v_sinhf F
+GLIBC_2.40 _ZGVnN2v_tanh F
+GLIBC_2.40 _ZGVnN2v_tanhf F
GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_atanhf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
GLIBC_2.40 _ZGVnN4v_sinhf F
+GLIBC_2.40 _ZGVnN4v_tanhf F
GLIBC_2.40 _ZGVsMxv_acosh F
GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_asinh F
@@ -103,3 +106,5 @@ GLIBC_2.40 _ZGVsMxv_erf F
GLIBC_2.40 _ZGVsMxv_erff F
GLIBC_2.40 _ZGVsMxv_sinh F
GLIBC_2.40 _ZGVsMxv_sinhf F
+GLIBC_2.40 _ZGVsMxv_tanh F
+GLIBC_2.40 _ZGVsMxv_tanhf F

5115
glibc-RHEL-118273-8.patch Normal file

File diff suppressed because it is too large

348
glibc-RHEL-118273-9.patch Normal file

@@ -0,0 +1,348 @@
commit 90a6ca8b28bf34e361e577e526e1b0f4c39a32a5
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu May 2 16:43:13 2024 +0100
aarch64: Fix AdvSIMD libmvec routines for big-endian
Previously many routines used * to load from vector types stored
in the data table. This is emitted as ldr, which byte-swaps the
entire vector register, and causes bugs for big-endian when not
all lanes contain the same value. When a vector is to be used
this way, it has been replaced with an array and the load with an
explicit ld1 intrinsic, which byte-swaps only within lanes.
As well, many routines previously used non-standard GCC syntax
for vector operations such as indexing into vectors types with []
and assembling vectors using {}. This syntax should not be mixed
with ACLE, as the former does not respect endianness whereas the
latter does. Such examples have been replaced with, for instance,
vcombine_* and vgetq_lane* intrinsics. Helpers which only use the
GCC syntax, such as the v_call helpers, do not need changing as
they do not use intrinsics.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
Conflicts:
sysdeps/aarch64/fpu/exp10f_advsimd.c
sysdeps/aarch64/fpu/expm1_advsimd.c
sysdeps/aarch64/fpu/expm1f_advsimd.c
sysdeps/aarch64/fpu/log10_advsimd.c
sysdeps/aarch64/fpu/log2_advsimd.c
sysdeps/aarch64/fpu/log_advsimd.c
sysdeps/aarch64/fpu/tan_advsimd.c
sysdeps/aarch64/fpu/tanf_advsimd.c
(Already backported by glibc-upstream-2.39-151.patch)
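
To make the hazard described in the commit message concrete, here is a hypothetical sketch (tab, bad_load and good_load are illustrative names, not from the patch). Dereferencing a pointer to a vector type is emitted as ldr, which on big-endian byte-swaps the entire register and so exchanges lane contents when the lanes differ; vld1q_f64 swaps bytes only within each 64-bit lane:

#include <arm_neon.h>

static const double tab[2] = { 1.0, 2.0 };

/* Compiled to ldr: on big-endian the whole 128-bit register is
   byte-swapped, so the two lanes come back exchanged.  */
static float64x2_t
bad_load (void)
{
  return *(const float64x2_t *) tab;
}

/* Compiled to ld1: byte-swapping is confined to each 64-bit lane, so
   lane 0 holds tab[0] and lane 1 holds tab[1] on either endianness.  */
static float64x2_t
good_load (void)
{
  return vld1q_f64 (tab);
}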
diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
index 544a52f6515d3201..6207e7da9531f48d 100644
--- a/sysdeps/aarch64/fpu/asinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@@ -22,6 +22,7 @@
#define A(i) v_f64 (__v_log_data.poly[i])
#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
const static struct data
{
@@ -63,11 +64,15 @@ struct entry
static inline struct entry
lookup (uint64x2_t i)
{
- float64x2_t e0 = vld1q_f64 (
- &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
- float64x2_t e1 = vld1q_f64 (
- &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
- return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) };
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
}
static inline float64x2_t
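
The "power of 2" comment in lookup is the usual bit trick: for unsigned n and N a power of two, n % N == (n & (N - 1)), because N - 1 is a mask of the low log2(N) bits. A tiny self-check (N = 128 is an illustrative stand-in for 1 << V_LOG_TABLE_BITS, whose value is outside this hunk):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const uint64_t N = 128;	/* any power of two */
  for (uint64_t n = 0; n < 4096; n++)
    assert (n % N == (n & (N - 1)));
  return 0;
}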
diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c
index ec7b59637e973da9..4bee734f00bd6a9b 100644
--- a/sysdeps/aarch64/fpu/cosh_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosh_advsimd.c
@@ -22,7 +22,9 @@
static const struct data
{
float64x2_t poly[3];
- float64x2_t inv_ln2, ln2, shift, thres;
+ float64x2_t inv_ln2;
+ double ln2[2];
+ float64x2_t shift, thres;
uint64x2_t index_mask, special_bound;
} data = {
.poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
@@ -58,8 +60,9 @@ exp_inline (float64x2_t x)
float64x2_t n = vsubq_f64 (z, d->shift);
/* r = x - n*ln2/N. */
- float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
- r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
+ float64x2_t ln2 = vld1q_f64 (d->ln2);
+ float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
+ r = vfmaq_laneq_f64 (r, n, ln2, 1);
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
uint64x2_t i = vandq_u64 (u, d->index_mask);
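
The pair of fused multiply-adds on the loaded ln2 vector is a split-constant (Cody-Waite style) reduction: reading the code, the two lanes hold a high part and a low-order correction of the reduction constant (the stored values, including their signs, are outside this hunk), so that

\[
  r = \bigl(x + n\,c_{\mathrm{hi}}\bigr) + n\,c_{\mathrm{lo}},
  \qquad c_{\mathrm{hi}} + c_{\mathrm{lo}} \approx -\frac{\ln 2}{N},
\]

and the rounding error of the first step is cancelled by the second, keeping the reduced argument r accurate beyond plain double rounding.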
diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c
index 3e70cbc025248a05..19cbb7d0f42eb4e2 100644
--- a/sysdeps/aarch64/fpu/erf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erf_advsimd.c
@@ -56,8 +56,8 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
- float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])),
- e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1]));
+ float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
+ e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
e.erf = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
index 548f21a3d68d68d2..f1b3bfe8304c73b5 100644
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@@ -26,7 +26,7 @@ static const struct data
float64x2_t max, shift;
float64x2_t p20, p40, p41, p42;
float64x2_t p51, p52;
- float64x2_t qr5, qr6, qr7, qr8, qr9;
+ double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
#endif
@@ -68,8 +68,10 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
- float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])),
- e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1]));
+ float64x2_t e1
+ = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
+ float64x2_t e2
+ = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
e.erfc = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
@@ -161,16 +163,19 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
- float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0));
- p6 = vmulq_laneq_f64 (p6, dat->qr5, 1);
- float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0));
- p7 = vmulq_laneq_f64 (p7, dat->qr6, 1);
- float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0));
- p8 = vmulq_laneq_f64 (p8, dat->qr7, 1);
- float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0));
- p9 = vmulq_laneq_f64 (p9, dat->qr8, 1);
- float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0));
- p10 = vmulq_laneq_f64 (p10, dat->qr9, 1);
+ float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6),
+ qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8),
+ qr9 = vld1q_f64 (dat->qr9);
+ float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0));
+ p6 = vmulq_laneq_f64 (p6, qr5, 1);
+ float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0));
+ p7 = vmulq_laneq_f64 (p7, qr6, 1);
+ float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0));
+ p8 = vmulq_laneq_f64 (p8, qr7, 1);
+ float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0));
+ p9 = vmulq_laneq_f64 (p9, qr8, 1);
+ float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0));
+ p10 = vmulq_laneq_f64 (p10, qr9, 1);
/* Compute polynomial in d using pairwise Horner scheme. */
float64x2_t p90 = vfmaq_f64 (p9, d, p10);
float64x2_t p78 = vfmaq_f64 (p7, d, p8);
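
Written out, the recurrence encoded by the qr pairs (lane 0 of each qr vector supplies Q_{i+1} and lane 1 supplies R_{i+1}, as the laneq intrinsics above show):

\[
  p_{i+2} = \bigl(p_i + r\,Q_{i+1}\,p_{i+1}\bigr)\,R_{i+1},
  \qquad i = 4,\dots,8,
\]

which produces p6 through p10 from the seed values p4 and p5.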
diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c
index 30b9e48dd40d80a0..ca5bc3ab33c92f83 100644
--- a/sysdeps/aarch64/fpu/erfcf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c
@@ -23,7 +23,8 @@ static const struct data
{
uint32x4_t offset, table_scale;
float32x4_t max, shift;
- float32x4_t coeffs, third, two_over_five, tenth;
+ float coeffs[4];
+ float32x4_t third, two_over_five, tenth;
#if WANT_SIMD_EXCEPT
float32x4_t uflow_bound;
#endif
@@ -37,7 +38,7 @@ static const struct data
.shift = V4 (0x1p17f),
/* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
fmas. */
- .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
+ .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
.third = V4 (0x1.555556p-2f),
.two_over_five = V4 (-0x1.99999ap-2f),
.tenth = V4 (-0x1.99999ap-4f),
@@ -60,12 +61,16 @@ static inline struct entry
lookup (uint32x4_t i)
{
struct entry e;
- float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
- float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
- float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
- float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
- float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
- float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+ float32x2_t t0
+ = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
+ float32x2_t t1
+ = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
+ float32x2_t t2
+ = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
+ float32x2_t t3
+ = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
+ float32x4_t e1 = vcombine_f32 (t0, t1);
+ float32x4_t e2 = vcombine_f32 (t2, t3);
e.erfc = vuzp1q_f32 (e1, e2);
e.scale = vuzp2q_f32 (e1, e2);
return e;
@@ -140,10 +145,11 @@ float32x4_t NOINLINE V_NAME_F1 (erfc) (float32x4_t x)
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p1 = r;
- float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
+ float32x4_t coeffs = vld1q_f32 (dat->coeffs);
+ float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1);
float32x4_t p3
- = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
- float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
+ = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0));
+ float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2);
p4 = vfmsq_f32 (dat->tenth, r2, p4);
float32x4_t y = vfmaq_f32 (p3, d, p4);
diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c
index c44644a71cffbb62..f2fe6ff236a6ec07 100644
--- a/sysdeps/aarch64/fpu/erff_advsimd.c
+++ b/sysdeps/aarch64/fpu/erff_advsimd.c
@@ -47,12 +47,12 @@ static inline struct entry
lookup (uint32x4_t i)
{
struct entry e;
- float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
- float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
- float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
- float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
- float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
- float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+ float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
+ float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
+ float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
+ float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
+ float32x4_t e1 = vcombine_f32 (t0, t1);
+ float32x4_t e2 = vcombine_f32 (t2, t3);
e.erf = vuzp1q_f32 (e1, e2);
e.scale = vuzp2q_f32 (e1, e2);
return e;
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
index fa3723b10c15eb29..3e3b76c502b01e16 100644
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -22,8 +22,9 @@
static const struct data
{
- float64x2_t poly[11];
- float64x2_t inv_ln2, m_ln2, shift;
+ float64x2_t poly[11], inv_ln2;
+ double m_ln2[2];
+ float64x2_t shift;
uint64x2_t halff;
int64x2_t onef;
#if WANT_SIMD_EXCEPT
@@ -40,7 +41,7 @@ static const struct data
V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
.inv_ln2 = V2 (0x1.71547652b82fep0),
- .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
+ .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
.shift = V2 (0x1.8p52),
.halff = V2 (0x3fe0000000000000),
@@ -67,8 +68,10 @@ expm1_inline (float64x2_t x)
and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
int64x2_t i = vcvtq_s64_f64 (j);
- float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0);
- f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1);
+
+ float64x2_t m_ln2 = vld1q_f64 (d->m_ln2);
+ float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0);
+ f = vfmaq_laneq_f64 (f, j, m_ln2, 1);
/* Approximate expm1(f) using polynomial. */
float64x2_t f2 = vmulq_f64 (f, f);
float64x2_t f4 = vmulq_f64 (f2, f2);
diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
index a3b0e32f9eb42021..08b06e0a6b34b4f4 100644
--- a/sysdeps/aarch64/fpu/v_expf_inline.h
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@@ -25,7 +25,8 @@
struct v_expf_data
{
float32x4_t poly[5];
- float32x4_t shift, invln2_and_ln2;
+ float32x4_t shift;
+ float invln2_and_ln2[4];
};
/* maxerr: 1.45358 +0.5 ulp. */
@@ -50,10 +51,11 @@ v_expf_inline (float32x4_t x, const struct v_expf_data *d)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
float32x4_t n, r, z;
- z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
+ float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
+ z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0);
n = vsubq_f32 (z, d->shift);
- r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
- r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
+ r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1);
+ r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
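
Stated explicitly, the reduction in v_expf_inline is the standard base-2 split (a reading of the code: lane 0 of invln2_and_ln2 holds 1/ln 2 and lanes 1-2 a high/low split of ln 2; the table values themselves are outside this hunk):

\[
  x = n \ln 2 + r \;\Longrightarrow\; e^{x} = 2^{\,n} e^{r},
  \qquad r \in \Bigl[-\frac{\ln 2}{2}, \frac{\ln 2}{2}\Bigr],
\]

with e^r approximated by the short polynomial and 2^n reconstructed by adding n directly to the exponent field (the vshlq_n_u32 / ExponentBias lines).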
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 337ccfbfab555c97..59b552da6b74785e 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -26,7 +26,8 @@
struct v_expm1f_data
{
float32x4_t poly[5];
- float32x4_t invln2_and_ln2, shift;
+ float invln2_and_ln2[4];
+ float32x4_t shift;
int32x4_t exponent_bias;
};
@@ -49,11 +50,12 @@ expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
calling routine should handle special values if required. */
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float32x4_t j = vsubq_f32 (
- vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+ float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
+ float32x4_t j
+ = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
+ float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
+ f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses