aarch64: Add GLIBC_2.40 vector functions and performance fixes (RHEL-118273)
This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix builderror with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273
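Editor's note (not part of this change): a minimal sketch of how the new GLIBC_2.40 libmvec entry points are reached. It assumes a GCC recent enough to know the aarch64 libmvec ABI; scalar calls to functions declared SIMD-capable in bits/math-vector.h can then be auto-vectorized into the _ZGV* variants when fast-math-style options are enabled and libmvec is linked.

/* Hypothetical build line: gcc -Ofast example.c -lmvec -lm
   The loop below may be vectorized so that hypot () is replaced by the
   AdvSIMD variant _ZGVnN2vv_hypot added by this update.  */
#include <math.h>

void
distances (double *restrict r, const double *x, const double *y, int n)
{
  for (int i = 0; i < n; i++)
    r[i] = hypot (x[i], y[i]);
}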
This commit is contained in: parent 7361fbbfab, commit 9dd92cac18.

glibc-RHEL-118273-1.patch (new file, 4742 lines): diff suppressed because it is too large.
glibc-RHEL-118273-10.patch (new file, 514 lines):

commit 157f89fa3d616729c8d7797168a9b3eaaa6ebf6e
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date:   Tue Apr 30 13:49:58 2024 +0100

    aarch64/fpu: Add vector variants of hypot

    Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index e8af35099d7b9f8f..06657782a1ee7106 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -13,6 +13,7 @@ libmvec-supported-funcs = acos \
 exp10 \
 exp2 \
 expm1 \
+ hypot \
 log \
 log10 \
 log1p \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 3cb1b82bd2785a4b..aedae9457b148983 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -109,6 +109,11 @@ libmvec {
 _ZGVnN4v_erfcf;
 _ZGVsMxv_erfc;
 _ZGVsMxv_erfcf;
+ _ZGVnN4vv_hypotf;
+ _ZGVnN2vv_hypotf;
+ _ZGVnN2vv_hypot;
+ _ZGVsMxvv_hypotf;
+ _ZGVsMxvv_hypot;
 _ZGVnN2v_sinh;
 _ZGVnN2v_sinhf;
 _ZGVnN4v_sinhf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 383c4369729a3452..a8889a92fd041585 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -31,6 +31,7 @@ libmvec_hidden_proto (V_NAME_F1(exp10));
 libmvec_hidden_proto (V_NAME_F1(exp2));
 libmvec_hidden_proto (V_NAME_F1(exp));
 libmvec_hidden_proto (V_NAME_F1(expm1));
+libmvec_hidden_proto (V_NAME_F2(hypot));
 libmvec_hidden_proto (V_NAME_F1(log10));
 libmvec_hidden_proto (V_NAME_F1(log1p));
 libmvec_hidden_proto (V_NAME_F1(log2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index e29b2d1c09273969..ca3017733959702f 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -89,6 +89,10 @@
 # define __DECL_SIMD_expm1 __DECL_SIMD_aarch64
 # undef __DECL_SIMD_expm1f
 # define __DECL_SIMD_expm1f __DECL_SIMD_aarch64
+# undef __DECL_SIMD_hypot
+# define __DECL_SIMD_hypot __DECL_SIMD_aarch64
+# undef __DECL_SIMD_hypotf
+# define __DECL_SIMD_hypotf __DECL_SIMD_aarch64
 # undef __DECL_SIMD_log
 # define __DECL_SIMD_log __DECL_SIMD_aarch64
 # undef __DECL_SIMD_logf
@@ -162,6 +166,7 @@ __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
@@ -186,6 +191,7 @@ __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
@@ -215,6 +221,7 @@ __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxvv_hypotf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
@@ -239,6 +246,7 @@ __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxvv_hypot (__sv_f64_t, __sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/hypot_advsimd.c b/sysdeps/aarch64/fpu/hypot_advsimd.c
new file mode 100644
index 0000000000000000..e4e279fa0c362336
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypot_advsimd.c
@@ -0,0 +1,97 @@
+/* Double-precision vector (Advanced SIMD) hypot function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+  uint64x2_t tiny_bound, thres;
+} data = {
+  .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511).  */
+  .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound.  */
+};
+#else
+static const struct data
+{
+  uint64x2_t tiny_bound;
+  uint32x4_t thres;
+} data = {
+  .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969).  */
+  .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound.  */
+};
+#endif
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum,
+              uint32x2_t special)
+{
+  return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special));
+}
+
+/* Vector implementation of double-precision hypot.
+   Maximum error observed is 1.21 ULP:
+   _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222)
+    got 0x1.6a1b19400964ep-204
+   want 0x1.6a1b19400964dp-204.  */
+#if WANT_SIMD_EXCEPT
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  float64x2_t ax = vabsq_f64 (x);
+  float64x2_t ay = vabsq_f64 (y);
+
+  uint64x2_t ix = vreinterpretq_u64_f64 (ax);
+  uint64x2_t iy = vreinterpretq_u64_f64 (ay);
+
+  /* Extreme values, NaNs, and infinities should be handled by the scalar
+     fallback for correct flag handling.  */
+  uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres);
+  uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres);
+  ax = v_zerofy_f64 (ax, specialx);
+  ay = v_zerofy_f64 (ay, specialy);
+  uint32x2_t special = vaddhn_u64 (specialx, specialy);
+
+  float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay);
+
+  if (__glibc_unlikely (v_any_u32h (special)))
+    return special_case (x, y, sqsum, special);
+
+  return vsqrtq_f64 (sqsum);
+}
+#else
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y);
+
+  uint32x2_t special = vcge_u32 (
+      vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
+      vget_low_u32 (d->thres));
+
+  if (__glibc_unlikely (v_any_u32h (special)))
+    return special_case (x, y, sqsum, special);
+
+  return vsqrtq_f64 (sqsum);
+}
+#endif
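Editor's note (not part of the patch): the special-case test above relies on unsigned wraparound to classify a value with a single compare. In the default build the check is applied to the squared sum; with WANT_SIMD_EXCEPT it is applied per operand with tighter bounds. A scalar sketch of the trick, using the non-except double-precision constants:

#include <math.h>
#include <stdint.h>
#include <string.h>

static int
needs_fallback (double v)
{
  uint64_t iv;
  double av = fabs (v);
  memcpy (&iv, &av, sizeof iv);   /* asuint64 (|v|).  */
  /* Subtracting the tiny bound wraps subnormals and zero around to huge
     values, and inf/NaN sit at or above the threshold, so one unsigned
     compare flags both underflow and overflow/invalid lanes.  */
  return iv - 0x0360000000000000ULL >= 0x7c90000000000000ULL;
}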
diff --git a/sysdeps/aarch64/fpu/hypot_sve.c b/sysdeps/aarch64/fpu/hypot_sve.c
new file mode 100644
index 0000000000000000..74417040acb2f32f
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypot_sve.c
@@ -0,0 +1,54 @@
+/* Double-precision vector (SVE) hypot function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+static const struct data
+{
+  uint64_t tiny_bound, thres;
+} data = {
+  .tiny_bound = 0x0c80000000000000, /* asuint (0x1p-102).  */
+  .thres = 0x7300000000000000, /* asuint (inf) - tiny_bound.  */
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t sqsum, svfloat64_t x, svfloat64_t y, svbool_t pg,
+              svbool_t special)
+{
+  return sv_call2_f64 (hypot, x, y, svsqrt_x (pg, sqsum), special);
+}
+
+/* SVE implementation of double-precision hypot.
+   Maximum error observed is 1.21 ULP:
+   _ZGVsMxvv_hypot (-0x1.6a22d0412cdd3p+352, 0x1.d3d89bd66fb1ap+330)
+    got 0x1.6a22d0412cfp+352
+   want 0x1.6a22d0412cf01p+352.  */
+svfloat64_t SV_NAME_D2 (hypot) (svfloat64_t x, svfloat64_t y, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svfloat64_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y);
+
+  svbool_t special = svcmpge (
+      pg, svsub_x (pg, svreinterpret_u64 (sqsum), d->tiny_bound), d->thres);
+
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    return special_case (sqsum, x, y, pg, special);
+  return svsqrt_x (pg, sqsum);
+}
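Editor's note (not part of the patch): the SVE variants can also be called directly with an explicit predicate, although ordinarily the compiler generates these calls itself. A hypothetical tail-predicated loop, assuming the declarations that this patch adds to bits/math-vector.h (repeated here with ACLE types):

#include <arm_sve.h>

svfloat64_t _ZGVsMxvv_hypot (svfloat64_t, svfloat64_t, svbool_t);

void
hypot_arrays (double *r, const double *x, const double *y, int64_t n)
{
  for (int64_t i = 0; i < n; i += svcntd ())
    {
      svbool_t pg = svwhilelt_b64_s64 (i, n);   /* Tail predication.  */
      svfloat64_t q = _ZGVsMxvv_hypot (svld1_f64 (pg, x + i),
                                       svld1_f64 (pg, y + i), pg);
      svst1_f64 (pg, r + i, q);
    }
}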
diff --git a/sysdeps/aarch64/fpu/hypotf_advsimd.c b/sysdeps/aarch64/fpu/hypotf_advsimd.c
new file mode 100644
index 0000000000000000..34818b021abce1b7
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypotf_advsimd.c
@@ -0,0 +1,98 @@
+/* Single-precision vector (Advanced SIMD) hypot function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+  uint32x4_t tiny_bound, thres;
+} data = {
+  .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63).  */
+  .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound.  */
+};
+#else
+static const struct data
+{
+  uint32x4_t tiny_bound;
+  uint16x8_t thres;
+} data = {
+  .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102).  */
+  .thres = V8 (0x7300), /* asuint (inf) - tiny_bound.  */
+};
+#endif
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
+              uint16x4_t special)
+{
+  return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special));
+}
+
+/* Vector implementation of single-precision hypot.
+   Maximum error observed is 1.21 ULP:
+   _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13
+                                                    want 0x1.6a41dp-13.  */
+#if WANT_SIMD_EXCEPT
+
+float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  float32x4_t ax = vabsq_f32 (x);
+  float32x4_t ay = vabsq_f32 (y);
+
+  uint32x4_t ix = vreinterpretq_u32_f32 (ax);
+  uint32x4_t iy = vreinterpretq_u32_f32 (ay);
+
+  /* Extreme values, NaNs, and infinities should be handled by the scalar
+     fallback for correct flag handling.  */
+  uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres);
+  uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres);
+  ax = v_zerofy_f32 (ax, specialx);
+  ay = v_zerofy_f32 (ay, specialy);
+  uint16x4_t special = vaddhn_u32 (specialx, specialy);
+
+  float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay);
+
+  if (__glibc_unlikely (v_any_u16h (special)))
+    return special_case (x, y, sqsum, special);
+
+  return vsqrtq_f32 (sqsum);
+}
+#else
+
+float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);
+
+  uint16x4_t special = vcge_u16 (
+      vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
+      vget_low_u16 (d->thres));
+
+  if (__glibc_unlikely (v_any_u16h (special)))
+    return special_case (x, y, sqsum, special);
+
+  return vsqrtq_f32 (sqsum);
+}
+#endif
+libmvec_hidden_def (V_NAME_F2 (hypot))
+HALF_WIDTH_ALIAS_F2 (hypot)
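Editor's note (not part of the patch): the vaddhn/vsubhn calls above are narrowing instructions that keep only the high half of each lane. Two full-width compare masks collapse into a single 64-bit D register, which the v_any_u32h/v_any_u16h helpers can test with one scalar move instead of per-lane extracts. A sketch of the idea, with a hypothetical helper name:

#include <arm_neon.h>

static inline int
any_lane_set (uint32x2_t mask)   /* Analogous to v_any_u32h.  */
{
  /* Read the whole 64-bit half-register at once; any set bit in any
     narrowed lane makes the scalar non-zero.  */
  return vget_lane_u64 (vreinterpret_u64_u32 (mask), 0) != 0;
}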
diff --git a/sysdeps/aarch64/fpu/hypotf_sve.c b/sysdeps/aarch64/fpu/hypotf_sve.c
new file mode 100644
index 0000000000000000..3a403de66eb091f4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypotf_sve.c
@@ -0,0 +1,48 @@
+/* Single-precision vector (SVE) hypot function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+#define TinyBound 0x0c800000 /* asuint (0x1p-102).  */
+#define Thres 0x73000000     /* 0x70000000 - TinyBound.  */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t sqsum, svfloat32_t x, svfloat32_t y, svbool_t pg,
+              svbool_t special)
+{
+  return sv_call2_f32 (hypotf, x, y, svsqrt_x (pg, sqsum), special);
+}
+
+/* SVE implementation of single-precision hypot.
+   Maximum error observed is 1.21 ULP:
+   _ZGVsMxvv_hypotf (0x1.6a213cp-19, -0x1.32b982p-26) got 0x1.6a2346p-19
+                                                     want 0x1.6a2344p-19.  */
+svfloat32_t SV_NAME_F2 (hypot) (svfloat32_t x, svfloat32_t y,
                                const svbool_t pg)
+{
+  svfloat32_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y);
+
+  svbool_t special = svcmpge (
+      pg, svsub_x (pg, svreinterpret_u32 (sqsum), TinyBound), Thres);
+
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    return special_case (sqsum, x, y, pg, special);
+
+  return svsqrt_x (pg, sqsum);
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index f2d8714075ab99b8..417125be476cd75f 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -38,6 +38,7 @@ VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
 VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
 VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
 VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1)
+VPCS_VECTOR_WRAPPER_ff (hypot_advsimd, _ZGVnN2vv_hypot)
 VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
 VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
 VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 37873d5e432ae9e8..31ebf18705f68856 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -57,6 +57,7 @@ SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
 SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
 SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
 SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1)
+SVE_VECTOR_WRAPPER_ff (hypot_sve, _ZGVsMxvv_hypot)
 SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
 SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
 SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 08e33115b9dc6f5e..dab0f1cfcb79a305 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -38,6 +38,7 @@ VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
 VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
 VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
 VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f)
+VPCS_VECTOR_WRAPPER_ff (hypotf_advsimd, _ZGVnN4vv_hypotf)
 VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
 VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
 VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 025daa662efd6f7f..2aa6cbcc28d69cf8 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -57,6 +57,7 @@ SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
 SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
 SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
 SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f)
+SVE_VECTOR_WRAPPER_ff (hypotf_sve, _ZGVsMxvv_hypotf)
 SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
 SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
 SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 055da83d639a2430..17723d0c9e2dfcf5 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1174,10 +1174,18 @@ double: 1
 float: 1
 ldouble: 1
 
+Function: "hypot_advsimd":
+double: 1
+float: 1
+
 Function: "hypot_downward":
 double: 1
 ldouble: 1
 
+Function: "hypot_sve":
+double: 1
+float: 1
+
 Function: "hypot_towardzero":
 double: 1
 ldouble: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 26c3fbf18b2f12a9..1184374efd25cfa6 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -89,6 +89,8 @@ GLIBC_2.40 _ZGVnN2v_sinh F
 GLIBC_2.40 _ZGVnN2v_sinhf F
 GLIBC_2.40 _ZGVnN2v_tanh F
 GLIBC_2.40 _ZGVnN2v_tanhf F
+GLIBC_2.40 _ZGVnN2vv_hypot F
+GLIBC_2.40 _ZGVnN2vv_hypotf F
 GLIBC_2.40 _ZGVnN4v_acoshf F
 GLIBC_2.40 _ZGVnN4v_asinhf F
 GLIBC_2.40 _ZGVnN4v_atanhf F
@@ -97,6 +99,7 @@ GLIBC_2.40 _ZGVnN4v_erfcf F
 GLIBC_2.40 _ZGVnN4v_erff F
 GLIBC_2.40 _ZGVnN4v_sinhf F
 GLIBC_2.40 _ZGVnN4v_tanhf F
+GLIBC_2.40 _ZGVnN4vv_hypotf F
 GLIBC_2.40 _ZGVsMxv_acosh F
 GLIBC_2.40 _ZGVsMxv_acoshf F
 GLIBC_2.40 _ZGVsMxv_asinh F
@@ -113,3 +116,5 @@ GLIBC_2.40 _ZGVsMxv_sinh F
 GLIBC_2.40 _ZGVsMxv_sinhf F
 GLIBC_2.40 _ZGVsMxv_tanh F
 GLIBC_2.40 _ZGVsMxv_tanhf F
+GLIBC_2.40 _ZGVsMxvv_hypot F
+GLIBC_2.40 _ZGVsMxvv_hypotf F
glibc-RHEL-118273-11.patch (new file, 715 lines):

commit 75207bde6870eb4b258e16fbb41252b2e6377675
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date:   Tue Apr 30 13:49:59 2024 +0100

    aarch64/fpu: Add vector variants of cbrt

    Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 06657782a1ee7106..990d1135b93485c5 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -5,6 +5,7 @@ libmvec-supported-funcs = acos \
 atan \
 atanh \
 atan2 \
+ cbrt \
 cos \
 cosh \
 erf \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index aedae9457b148983..36a9e4df1e058c46 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -94,6 +94,11 @@ libmvec {
 _ZGVnN4v_atanhf;
 _ZGVsMxv_atanh;
 _ZGVsMxv_atanhf;
+ _ZGVnN2v_cbrt;
+ _ZGVnN2v_cbrtf;
+ _ZGVnN4v_cbrtf;
+ _ZGVsMxv_cbrt;
+ _ZGVsMxv_cbrtf;
 _ZGVnN2v_cosh;
 _ZGVnN2v_coshf;
 _ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index a8889a92fd041585..54858efd8aa0ff82 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -23,6 +23,7 @@ libmvec_hidden_proto (V_NAME_F1(asin));
 libmvec_hidden_proto (V_NAME_F1(asinh));
 libmvec_hidden_proto (V_NAME_F1(atan));
 libmvec_hidden_proto (V_NAME_F1(atanh));
+libmvec_hidden_proto (V_NAME_F1(cbrt));
 libmvec_hidden_proto (V_NAME_F1(cos));
 libmvec_hidden_proto (V_NAME_F1(cosh));
 libmvec_hidden_proto (V_NAME_F1(erf));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index ca3017733959702f..b1c024fe13a7dc32 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -57,6 +57,10 @@
 # define __DECL_SIMD_atan2 __DECL_SIMD_aarch64
 # undef __DECL_SIMD_atan2f
 # define __DECL_SIMD_atan2f __DECL_SIMD_aarch64
+# undef __DECL_SIMD_cbrt
+# define __DECL_SIMD_cbrt __DECL_SIMD_aarch64
+# undef __DECL_SIMD_cbrtf
+# define __DECL_SIMD_cbrtf __DECL_SIMD_aarch64
 # undef __DECL_SIMD_cos
 # define __DECL_SIMD_cos __DECL_SIMD_aarch64
 # undef __DECL_SIMD_cosf
@@ -158,6 +162,7 @@ __vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
@@ -183,6 +188,7 @@ __vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
@@ -213,6 +219,7 @@ __sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_atanhf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_cbrtf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
@@ -238,6 +245,7 @@ __sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_atanh (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_cbrt (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/cbrt_advsimd.c b/sysdeps/aarch64/fpu/cbrt_advsimd.c
new file mode 100644
index 0000000000000000..adfbb60cd3918c95
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrt_advsimd.c
@@ -0,0 +1,121 @@
+/* Double-precision vector (AdvSIMD) cbrt function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+const static struct data
+{
+  float64x2_t poly[4], one_third, shift;
+  int64x2_t exp_bias;
+  uint64x2_t abs_mask, tiny_bound;
+  uint32x4_t thresh;
+  double table[5];
+} data = {
+  .shift = V2 (0x1.8p52),
+  .poly = { /* Generated with fpminimax in [0.5, 1].  */
+            V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1),
+            V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) },
+  .exp_bias = V2 (1022),
+  .abs_mask = V2(0x7fffffffffffffff),
+  .tiny_bound = V2(0x0010000000000000), /* Smallest normal.  */
+  .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound.  */
+  .one_third = V2(0x1.5555555555555p-2),
+  .table = { /* table[i] = 2^((i - 2) / 3).  */
+             0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
+             0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 }
+};
+
+#define MantissaMask v_u64 (0x000fffffffffffff)
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint32x2_t special)
+{
+  return v_call_f64 (cbrt, x, y, vmovl_u32 (special));
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order polynomial
+   and two Newton iterations.  Greatest observed error is 1.79 ULP.  Errors repeat
+   according to the exponent, for instance an error observed for double value
+   m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+   integer.
+   __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+                                 want 0x1.965fe72821e99p+0.  */
+VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+  /* Subnormal, +/-0 and special values.  */
+  uint32x2_t special
+      = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh));
+
+  /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0].  This is a vector
+     version of frexp, which gets subnormal values wrong - these have to be
+     special-cased as a result.  */
+  float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5));
+  int64x2_t exp_bias = d->exp_bias;
+  uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
+  int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);
+
+  /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
+     Newton iterations.  */
+  float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
+  float64x2_t one_third = d->one_third;
+  /* Two iterations of Newton's method for iteratively approximating cbrt.  */
+  float64x2_t m_by_3 = vmulq_f64 (m, one_third);
+  float64x2_t two_thirds = vaddq_f64 (one_third, one_third);
+  float64x2_t a
+      = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p);
+  a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a);
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+     not necessarily a multiple of 3 we lose some information.
+
+     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
+     an integer in [-2, 2], and can be looked up in the table T.  Hence the
+     result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
+
+  float64x2_t ef = vcvtq_f64_s64 (e);
+  float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third));
+  int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3)));
+  int64x2_t ey = vcvtq_s64_f64 (eb3f);
+
+  float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] };
+  my = vmulq_f64 (my, a);
+
+  /* Vector version of ldexp.  */
+  float64x2_t y = vreinterpretq_f64_s64 (
+      vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52));
+  y = vmulq_f64 (y, my);
+
+  if (__glibc_unlikely (v_any_u32h (special)))
+    return special_case (x, vbslq_f64 (d->abs_mask, y, x), special);
+
+  /* Copy sign.  */
+  return vbslq_f64 (d->abs_mask, y, x);
+}
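Editor's note (not part of the patch): a scalar sketch of the exponent decomposition described in the comment above. For |x| = m * 2^e with m in [0.5, 1), write e = 3*q + i with q = round (e / 3) and i in [-2, 2]; then cbrt (x) = cbrt (m) * 2^(i/3) * 2^q, with 2^(i/3) read from a five-entry table indexed by i + 2. For example e = 10 gives q = 3 and i = 1. The helper name is hypothetical:

#include <math.h>

static const double T[5] = { /* 2^((k - 2) / 3) for k = 0..4.  */
  0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
  0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0
};

static double
assemble_cbrt (double cbrt_m, int e, double sign)
{
  int q = (int) nearbyint (e / 3.0);   /* Round-to-nearest, like vrndnq.  */
  int i = e - 3 * q;                   /* Remainder, in [-2, 2].  */
  return sign * ldexp (cbrt_m * T[i + 2], q);
}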
diff --git a/sysdeps/aarch64/fpu/cbrt_sve.c b/sysdeps/aarch64/fpu/cbrt_sve.c
new file mode 100644
index 0000000000000000..fc976eda2a6018f7
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrt_sve.c
@@ -0,0 +1,128 @@
+/* Double-precision vector (SVE) cbrt function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+const static struct data
+{
+  float64_t poly[4];
+  float64_t table[5];
+  float64_t one_third, two_thirds, shift;
+  int64_t exp_bias;
+  uint64_t tiny_bound, thresh;
+} data = {
+  /* Generated with FPMinimax in [0.5, 1].  */
+  .poly = { 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1,
+            0x1.2c74eaa3ba428p-3, },
+  /* table[i] = 2^((i - 2) / 3).  */
+  .table = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
+             0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0, },
+  .one_third = 0x1.5555555555555p-2,
+  .two_thirds = 0x1.5555555555555p-1,
+  .shift = 0x1.8p52,
+  .exp_bias = 1022,
+  .tiny_bound = 0x0010000000000000, /* Smallest normal.  */
+  .thresh = 0x7fe0000000000000, /* asuint64 (infinity) - tiny_bound.  */
+};
+
+#define MantissaMask 0x000fffffffffffff
+#define HalfExp 0x3fe0000000000000
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+  return sv_call_f64 (cbrt, x, y, special);
+}
+
+static inline svfloat64_t
+shifted_lookup (const svbool_t pg, const float64_t *table, svint64_t i)
+{
+  return svld1_gather_index (pg, table, svadd_x (pg, i, 2));
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order
+   polynomial and two Newton iterations.  Greatest observed error is 1.79 ULP.
+   Errors repeat according to the exponent, for instance an error observed for
+   double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i
+   is an integer.
+   _ZGVsMxv_cbrt (0x0.3fffb8d4413f3p-1022) got 0x1.965f53b0e5d97p-342
+                                          want 0x1.965f53b0e5d95p-342.  */
+svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svfloat64_t ax = svabs_x (pg, x);
+  svuint64_t iax = svreinterpret_u64 (ax);
+  svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+
+  /* Subnormal, +/-0 and special values.  */
+  svbool_t special = svcmpge (pg, svsub_x (pg, iax, d->tiny_bound), d->thresh);
+
+  /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0].  This is a vector
+     version of frexp, which gets subnormal values wrong - these have to be
+     special-cased as a result.  */
+  svfloat64_t m = svreinterpret_f64 (svorr_x (
+      pg, svand_x (pg, svreinterpret_u64 (x), MantissaMask), HalfExp));
+  svint64_t e
+      = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, iax, 52)), d->exp_bias);
+
+  /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+     for Newton iterations.  */
+  svfloat64_t p
+      = sv_pairwise_poly_3_f64_x (pg, m, svmul_x (pg, m, m), d->poly);
+
+  /* Two iterations of Newton's method for iteratively approximating cbrt.  */
+  svfloat64_t m_by_3 = svmul_x (pg, m, d->one_third);
+  svfloat64_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p,
+                           d->two_thirds);
+  a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, a, a)), a, d->two_thirds);
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+     not necessarily a multiple of 3 we lose some information.
+
+     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+     is an integer in [-2, 2], and can be looked up in the table T.  Hence the
+     result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
+  svfloat64_t eb3f = svmul_x (pg, svcvt_f64_x (pg, e), d->one_third);
+  svint64_t ey = svcvt_s64_x (pg, eb3f);
+  svint64_t em3 = svmls_x (pg, e, ey, 3);
+
+  svfloat64_t my = shifted_lookup (pg, d->table, em3);
+  my = svmul_x (pg, my, a);
+
+  /* Vector version of ldexp.  */
+  svfloat64_t y = svscale_x (pg, my, ey);
+
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    return special_case (
+        x, svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)),
+        special);
+
+  /* Copy sign.  */
+  return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/cbrtf_advsimd.c b/sysdeps/aarch64/fpu/cbrtf_advsimd.c
new file mode 100644
index 0000000000000000..27debb8b57c8c3e2
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrtf_advsimd.c
@@ -0,0 +1,123 @@
+/* Single-precision vector (AdvSIMD) cbrt function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+const static struct data
+{
+  float32x4_t poly[4], one_third;
+  float table[5];
+} data = {
+  .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with
+               FPMinimax.  */
+            V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1),
+            V4 (0x1.2c74c2p-3) },
+  .table = { /* table[i] = 2^((i - 2) / 3).  */
+             0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+  .one_third = V4 (0x1.555556p-2f),
+};
+
+#define SignMask v_u32 (0x80000000)
+#define SmallestNormal v_u32 (0x00800000)
+#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal.  */
+#define MantissaMask v_u32 (0x007fffff)
+#define HalfExp v_u32 (0x3f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special)
+{
+  return v_call_f32 (cbrtf, x, y, vmovl_u16 (special));
+}
+
+static inline float32x4_t
+shifted_lookup (const float *table, int32x4_t i)
+{
+  return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2],
+                        table[i[3] + 2] };
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration
+   with initial guess obtained by a low-order polynomial.  Greatest error
+   is 1.64 ULP.  This is observed for every value where the mantissa is
+   0x1.85a2aa and the exponent is a multiple of 3, for example:
+   _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1
+                                want 0x1.267932p+1.  */
+VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
+  /* Subnormal, +/-0 and special values.  */
+  uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh);
+
+  /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0].  This is a vector
+     version of frexpf, which gets subnormal values wrong - these have to be
+     special-cased as a result.  */
+  float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5));
+  int32x4_t e
+      = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126));
+
+  /* p is a rough approximation for cbrt(m) in [0.5, 1.0].  The better this is,
+     the less accurate the next stage of the algorithm needs to be.  An order-4
+     polynomial is enough for one Newton iteration.  */
+  float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly);
+
+  float32x4_t one_third = d->one_third;
+  float32x4_t two_thirds = vaddq_f32 (one_third, one_third);
+
+  /* One iteration of Newton's method for iteratively approximating cbrt.  */
+  float32x4_t m_by_3 = vmulq_f32 (m, one_third);
+  float32x4_t a
+      = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p);
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+     not necessarily a multiple of 3 we lose some information.
+
+     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+     is an integer in [-2, 2], and can be looked up in the table T.  Hence the
+     result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
+  float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third);
+  int32x4_t ey = vcvtq_s32_f32 (ef);
+  int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3)));
+
+  float32x4_t my = shifted_lookup (d->table, em3);
+  my = vmulq_f32 (my, a);
+
+  /* Vector version of ldexpf.  */
+  float32x4_t y
+      = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23));
+  y = vmulq_f32 (y, my);
+
+  if (__glibc_unlikely (v_any_u16h (special)))
+    return special_case (x, vbslq_f32 (SignMask, x, y), special);
+
+  /* Copy sign.  */
+  return vbslq_f32 (SignMask, x, y);
+}
+libmvec_hidden_def (V_NAME_F1 (cbrt))
+HALF_WIDTH_ALIAS_F1 (cbrt)
diff --git a/sysdeps/aarch64/fpu/cbrtf_sve.c b/sysdeps/aarch64/fpu/cbrtf_sve.c
new file mode 100644
index 0000000000000000..23c220c202244c1f
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrtf_sve.c
@@ -0,0 +1,122 @@
+/* Single-precision vector (SVE) cbrt function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+
+const static struct data
+{
+  float32_t poly[4];
+  float32_t table[5];
+  float32_t one_third, two_thirds;
+} data = {
+  /* Very rough approximation of cbrt(x) in [0.5, 1], generated with FPMinimax.
+   */
+  .poly = { 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1,
+            0x1.2c74c2p-3, },
+  /* table[i] = 2^((i - 2) / 3).  */
+  .table = { 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+  .one_third = 0x1.555556p-2f,
+  .two_thirds = 0x1.555556p-1f,
+};
+
+#define SmallestNormal 0x00800000
+#define Thresh 0x7f000000 /* asuint(INFINITY) - SmallestNormal.  */
+#define MantissaMask 0x007fffff
+#define HalfExp 0x3f000000
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+  return sv_call_f32 (cbrtf, x, y, special);
+}
+
+static inline svfloat32_t
+shifted_lookup (const svbool_t pg, const float32_t *table, svint32_t i)
+{
+  return svld1_gather_index (pg, table, svadd_x (pg, i, 2));
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration
+   with initial guess obtained by a low-order polynomial.  Greatest error
+   is 1.64 ULP.  This is observed for every value where the mantissa is
+   0x1.85a2aa and the exponent is a multiple of 3, for example:
+   _ZGVsMxv_cbrtf (0x1.85a2aap+3) got 0x1.267936p+1
+                                 want 0x1.267932p+1.  */
+svfloat32_t SV_NAME_F1 (cbrt) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svfloat32_t ax = svabs_x (pg, x);
+  svuint32_t iax = svreinterpret_u32 (ax);
+  svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+
+  /* Subnormal, +/-0 and special values.  */
+  svbool_t special = svcmpge (pg, svsub_x (pg, iax, SmallestNormal), Thresh);
+
+  /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0].  This is a vector
+     version of frexpf, which gets subnormal values wrong - these have to be
+     special-cased as a result.  */
+  svfloat32_t m = svreinterpret_f32 (svorr_x (
+      pg, svand_x (pg, svreinterpret_u32 (x), MantissaMask), HalfExp));
+  svint32_t e = svsub_x (pg, svreinterpret_s32 (svlsr_x (pg, iax, 23)), 126);
+
+  /* p is a rough approximation for cbrt(m) in [0.5, 1.0].  The better this is,
+     the less accurate the next stage of the algorithm needs to be.  An order-4
+     polynomial is enough for one Newton iteration.  */
+  svfloat32_t p
+      = sv_pairwise_poly_3_f32_x (pg, m, svmul_x (pg, m, m), d->poly);
+
+  /* One iteration of Newton's method for iteratively approximating cbrt.  */
+  svfloat32_t m_by_3 = svmul_x (pg, m, d->one_third);
+  svfloat32_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p,
+                           d->two_thirds);
+
+  /* Assemble the result by the following:
+
+     cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+     We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+     not necessarily a multiple of 3 we lose some information.
+
+     Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+     Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+     is an integer in [-2, 2], and can be looked up in the table T.  Hence the
+     result is assembled as:
+
+     cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign.  */
+  svfloat32_t ef = svmul_x (pg, svcvt_f32_x (pg, e), d->one_third);
+  svint32_t ey = svcvt_s32_x (pg, ef);
+  svint32_t em3 = svmls_x (pg, e, ey, 3);
+
+  svfloat32_t my = shifted_lookup (pg, d->table, em3);
+  my = svmul_x (pg, my, a);
+
+  /* Vector version of ldexpf.  */
+  svfloat32_t y = svscale_x (pg, my, ey);
+
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    return special_case (
+        x, svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)),
+        special);
+
+  /* Copy sign.  */
+  return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 417125be476cd75f..1877db3ac6932037 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -30,6 +30,7 @@ VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh)
 VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
 VPCS_VECTOR_WRAPPER (atanh_advsimd, _ZGVnN2v_atanh)
 VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
+VPCS_VECTOR_WRAPPER (cbrt_advsimd, _ZGVnN2v_cbrt)
 VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
 VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
 VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 31ebf18705f68856..b702f942dea0749f 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -49,6 +49,7 @@ SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh)
 SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
 SVE_VECTOR_WRAPPER (atanh_sve, _ZGVsMxv_atanh)
 SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
+SVE_VECTOR_WRAPPER (cbrt_sve, _ZGVsMxv_cbrt)
 SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
 SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
 SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index dab0f1cfcb79a305..9cb451b4f045e625 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -30,6 +30,7 @@ VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf)
 VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
 VPCS_VECTOR_WRAPPER (atanhf_advsimd, _ZGVnN4v_atanhf)
 VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
+VPCS_VECTOR_WRAPPER (cbrtf_advsimd, _ZGVnN4v_cbrtf)
 VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
 VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
 VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 2aa6cbcc28d69cf8..5b3dd22916d2a50d 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -49,6 +49,7 @@ SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf)
 SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
 SVE_VECTOR_WRAPPER (atanhf_sve, _ZGVsMxv_atanhf)
 SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
+SVE_VECTOR_WRAPPER (cbrtf_sve, _ZGVsMxv_cbrtf)
 SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
 SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
 SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 17723d0c9e2dfcf5..a67cd7cd7399c533 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -477,11 +477,19 @@ double: 4
 float: 1
 ldouble: 1
 
+Function: "cbrt_advsimd":
+double: 1
+float: 1
+
 Function: "cbrt_downward":
 double: 4
 float: 1
 ldouble: 1
 
+Function: "cbrt_sve":
+double: 1
+float: 1
+
 Function: "cbrt_towardzero":
 double: 3
 float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 1184374efd25cfa6..89ac1dfa36279eb0 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -79,6 +79,8 @@ GLIBC_2.40 _ZGVnN2v_asinh F
 GLIBC_2.40 _ZGVnN2v_asinhf F
 GLIBC_2.40 _ZGVnN2v_atanh F
 GLIBC_2.40 _ZGVnN2v_atanhf F
+GLIBC_2.40 _ZGVnN2v_cbrt F
+GLIBC_2.40 _ZGVnN2v_cbrtf F
 GLIBC_2.40 _ZGVnN2v_cosh F
 GLIBC_2.40 _ZGVnN2v_coshf F
 GLIBC_2.40 _ZGVnN2v_erf F
@@ -94,6 +96,7 @@ GLIBC_2.40 _ZGVnN2vv_hypotf F
 GLIBC_2.40 _ZGVnN4v_acoshf F
 GLIBC_2.40 _ZGVnN4v_asinhf F
 GLIBC_2.40 _ZGVnN4v_atanhf F
+GLIBC_2.40 _ZGVnN4v_cbrtf F
 GLIBC_2.40 _ZGVnN4v_coshf F
 GLIBC_2.40 _ZGVnN4v_erfcf F
 GLIBC_2.40 _ZGVnN4v_erff F
@@ -106,6 +109,8 @@ GLIBC_2.40 _ZGVsMxv_asinh F
 GLIBC_2.40 _ZGVsMxv_asinhf F
 GLIBC_2.40 _ZGVsMxv_atanh F
 GLIBC_2.40 _ZGVsMxv_atanhf F
+GLIBC_2.40 _ZGVsMxv_cbrt F
+GLIBC_2.40 _ZGVsMxv_cbrtf F
 GLIBC_2.40 _ZGVsMxv_cosh F
 GLIBC_2.40 _ZGVsMxv_coshf F
 GLIBC_2.40 _ZGVsMxv_erf F
2511
glibc-RHEL-118273-12.patch
Normal file
File diff suppressed because it is too large
319
glibc-RHEL-118273-13.patch
Normal file
@@ -0,0 +1,319 @@
commit 7900ac490db32f6bccff812733f00280dde34e27
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:32:53 2024 +0100

AArch64: Improve codegen in users of ADVSIMD expm1f helper

Rearrange operations so MOV is not necessary in reduction or around
the special-case handler. Reduce memory access by using more indexed
MLAs in polynomial.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
index a0616ec7542cbfce..8303ca296e030c2e 100644
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@@ -18,27 +18,18 @@
<https://www.gnu.org/licenses/>. */

#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_expm1f_inline.h"

static const struct data
{
- float32x4_t poly[5];
- float invln2_and_ln2[4];
- float32x4_t shift;
- int32x4_t exponent_bias;
+ struct v_expm1f_data d;
#if WANT_SIMD_EXCEPT
uint32x4_t thresh;
#else
float32x4_t oflow_bound;
#endif
} data = {
- /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
- /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
- .shift = V4 (0x1.8p23f),
- .exponent_bias = V4 (0x3f800000),
+ .d = V_EXPM1F_DATA,
#if !WANT_SIMD_EXCEPT
/* Value above which expm1f(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
@@ -55,67 +46,38 @@ static const struct data
#define TinyBound v_u32 (0x34000000 << 1)

static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
{
- return v_call_f32 (expm1f, x, y, special);
+ return v_call_f32 (
+ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
}

/* Single-precision vector exp(x) - 1 function.
- The maximum error is 1.51 ULP:
- _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
- want 0x1.e2fb94p-2. */
+ The maximum error is 1.62 ULP:
+ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
+ want 0x1.da9f44p-2. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t ix = vreinterpretq_u32_f32 (x);

#if WANT_SIMD_EXCEPT
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint32x4_t special
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
- if (__glibc_unlikely (v_any_u32 (special)))
- x = v_zerofy_f32 (x, special);
#else
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
#endif

- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- float32x4_t j
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
- int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float32x4_t p = v_horner_4_f32 (f, d->poly);
- p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
- float32x4_t t = vreinterpretq_f32_s32 (u);
-
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
- special);
+ return special_case (x, special, d);

/* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+ return expm1f_inline (x, &d->d);
}
libmvec_hidden_def (V_NAME_F1 (expm1))
HALF_WIDTH_ALIAS_F1 (expm1)
diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c
index 6bb7482dc28795c1..c6ed7598e7deca1b 100644
--- a/sysdeps/aarch64/fpu/sinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
@@ -23,15 +23,13 @@
static const struct data
{
struct v_expm1f_data expm1f_consts;
- uint32x4_t halff;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound, thresh;
#else
- uint32x4_t oflow_bound;
+ float32x4_t oflow_bound;
#endif
} data = {
.expm1f_consts = V_EXPM1F_DATA,
- .halff = V4 (0x3f000000),
#if WANT_SIMD_EXCEPT
/* 0x1.6a09e8p-32, below which expm1f underflows. */
.tiny_bound = V4 (0x2fb504f4),
@@ -39,14 +37,15 @@ static const struct data
.thresh = V4 (0x12fbbbb3),
#else
/* 0x1.61814ep+6, above which expm1f helper overflows. */
- .oflow_bound = V4 (0x42b0c0a7),
+ .oflow_bound = V4 (0x1.61814ep+6),
#endif
};

static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+ uint32x4_t special)
{
- return v_call_f32 (sinhf, x, y, special);
+ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
}

/* Approximation for vector single-precision sinh(x) using expm1.
@@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)

uint32x4_t ix = vreinterpretq_u32_f32 (x);
float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t sign = veorq_u32 (ix, iax);
- float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+ float32x4_t halfsign = vreinterpretq_f32_u32 (
+ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));

#if WANT_SIMD_EXCEPT
- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+ uint32x4_t special = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
ax = v_zerofy_f32 (ax, special);
#else
- uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+ uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
#endif

/* Up to the point that expm1f overflows, we can use it to calculate sinhf
@@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
/* Fall back to the scalar variant for any lanes that should trigger an
exception. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (t, halfsign), special);
+ return special_case (x, t, halfsign, special);

return vmulq_f32 (t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c
index 50defd6ef03926f4..3ced9b7a414c812c 100644
--- a/sysdeps/aarch64/fpu/tanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
@@ -28,13 +28,16 @@ static const struct data
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
.boring_bound = V4 (0x41102cb3),
.large_bound = V4 (0x7f800000),
- .onef = V4 (0x3f800000),
};

static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+ float32x4_t q, uint32x4_t special)
{
- return v_call_f32 (tanhf, x, y, special);
+ return v_call_f32 (
+ tanhf, x,
+ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+ special);
}

/* Approximation for single-precision vector tanh(x), using a simplified
@@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t sign = veorq_u32 (ix, iax);
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
- float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+ /* expm1 exponent bias is 1.0f reinterpreted to int. */
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));

#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered properly, set all special and boring
@@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)

/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
- float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vbslq_f32 (is_boring, boring, y), special);
+ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+ special);
+
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
return vbslq_f32 (is_boring, boring, y);
}
libmvec_hidden_def (V_NAME_F1 (tanh))
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 59b552da6b74785e..1daedfdd51cfc54b 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -21,48 +21,47 @@
#define AARCH64_FPU_V_EXPM1F_INLINE_H

#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "math_config.h"

struct v_expm1f_data
{
- float32x4_t poly[5];
- float invln2_and_ln2[4];
- float32x4_t shift;
+ float32x4_t c0, c2;
int32x4_t exponent_bias;
+ float c1, c3, inv_ln2, c4;
+ float ln2_hi, ln2_lo;
};

/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
- log(2)/2]. Exponent bias is asuint(1.0f).
- invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */
+ log(2)/2]. Exponent bias is asuint(1.0f). */
#define V_EXPM1F_DATA \
{ \
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
- .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
}

static inline float32x4_t
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
{
- /* Helper routine for calculating exp(x) - 1.
- Copied from v_expm1f_1u6.c, with all special-case handling removed - the
- calling routine should handle special values if required. */
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+ float32x4_t lane_consts = vld1q_f32 (&d->c1);

/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- float32x4_t j
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
+ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
+ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
+ f = vfmsq_lane_f32 (f, j, ln2, 1);

- /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
- Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
- Horner. */
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
float32x4_t f2 = vmulq_f32 (f, f);
float32x4_t f4 = vmulq_f32 (f2, f2);
- float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+ float32x4_t p = vfmaq_f32 (p01, f2, p23);
+ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
p = vfmaq_f32 (f, f2, p);

/* t = 2^i. */
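
The scheme that the new v_expm1f_inline.h helper shares between expm1f, sinhf and tanhf is easiest to follow in scalar form. The sketch below is illustrative only - the function name and the use of roundf/memcpy are this note's, not the patch's - and, like the helper, it omits all special-case handling:

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Scalar sketch of the expm1f reduction (coefficients copied from the
   patch): x = f + i*ln2 with |f| <= ln2/2, expm1(f) ~= f + f^2*P(f),
   then reconstruct with t = 2^i, since expm1(x) = p*t + (t - 1).  */
static float
expm1f_sketch (float x)
{
  const float c0 = 0x1.fffffep-2f, c1 = 0x1.5554aep-3f, c2 = 0x1.555736p-5f,
	      c3 = 0x1.12287cp-7f, c4 = 0x1.6b55a2p-10f;
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f, ln2_lo = 0x1.7f7d1cp-20f;

  /* i = round(x/ln2); f = x - i*ln2, subtracted in two steps for accuracy. */
  float j = roundf (x * inv_ln2);
  float f = x - j * ln2_hi;
  f = f - j * ln2_lo;

  /* P(f), then expm1(f) ~= f + f^2 * P(f). */
  float p = c0 + f * (c1 + f * (c2 + f * (c3 + f * c4)));
  p = f + (f * f) * p;

  /* t = 2^i built directly in the exponent field, as in the helper. */
  uint32_t t_bits = ((uint32_t) (int32_t) j << 23) + 0x3f800000u;
  float t;
  memcpy (&t, &t_bits, sizeof t);
  return p * t + (t - 1.0f);
}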
495
glibc-RHEL-118273-14.patch
Normal file
@@ -0,0 +1,495 @@
commit 5bc100bd4b7e00db3009ae93d25d303341545d23
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:32:14 2024 +0100

AArch64: Improve codegen in users of AdvSIMD log1pf helper

log1pf is quite register-intensive - use fewer registers for the
polynomial, and make various changes to shorten dependency chains in
parent routines. There is now no spilling with GCC 14. Accuracy moves
around a little - comments adjusted accordingly but does not require
regen-ulps.

Use the helper in log1pf as well, instead of having separate
implementations. The more accurate polynomial means special-casing can
be simplified, and the shorter dependency chain avoids the usual dance
around v0, which is otherwise difficult.

There is a small duplication of vectors containing 1.0f (or 0x3f800000) -
GCC is not currently able to efficiently handle values which fit in FMOV
but not MOVI, and are reinterpreted to integer. There may be potential
for more optimisation if this is fixed.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

Conflicts:
sysdeps/aarch64/fpu/log1pf_advsimd.c
(Fixup context to apply without out-of-scope dependency 751a5502)

diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
index 8916dcbf409922a9..004474acf9e9322b 100644
--- a/sysdeps/aarch64/fpu/acoshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -25,35 +25,32 @@ const static struct data
{
struct v_log1pf_data log1pf_consts;
uint32x4_t one;
- uint16x4_t thresh;
-} data = {
- .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
- .one = V4 (0x3f800000),
- .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
-};
+} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
+
+#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */

static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
- const struct v_log1pf_data d)
+ const struct v_log1pf_data *d)
{
return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
}

/* Vector approximation for single-precision acosh, based on log1p. Maximum
error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
- is 2.78 ULP:
- __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
- want 0x1.ef9ea2p-3.
+ is 3.00 ULP:
+ _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
+ want 0x1.ef0a7cp-4.
With exceptions disabled, we can compute u with a shorter dependency chain,
- which gives maximum error of 3.07 ULP:
- __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
- want 0x1.fbc7f4p-4. */
+ which gives maximum error of 3.22 ULP:
+ _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
+ want 0x1.fdcdd2p-5. */

VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);

#if WANT_SIMD_EXCEPT
/* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
@@ -64,15 +61,16 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
#else
- float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
- float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+ float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+ float32x4_t u
+ = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
#endif

float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));

if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, y, special, d->log1pf_consts);
- return log1pf_inline (y, d->log1pf_consts);
+ return special_case (x, y, special, &d->log1pf_consts);
+ return log1pf_inline (y, &d->log1pf_consts);
}
libmvec_hidden_def (V_NAME_F1 (acosh))
HALF_WIDTH_ALIAS_F1 (acosh)
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
index 09fd8a614305563d..eb789b91b600af52 100644
--- a/sysdeps/aarch64/fpu/asinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
@@ -20,16 +20,16 @@
#include "v_math.h"
#include "v_log1pf_inline.h"

-#define SignMask v_u32 (0x80000000)
-
const static struct data
{
struct v_log1pf_data log1pf_consts;
+ float32x4_t one;
uint32x4_t big_bound;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound;
#endif
} data = {
+ .one = V4 (1),
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
#if WANT_SIMD_EXCEPT
@@ -38,20 +38,27 @@ const static struct data
};

static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
+ uint32x4_t special, const struct data *d)
{
- return v_call_f32 (asinhf, x, y, special);
+ return v_call_f32 (
+ asinhf, x,
+ vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
+ special);
}

/* Single-precision implementation of vector asinh(x), using vector log1p.
- Worst-case error is 2.66 ULP, at roughly +/-0.25:
- __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
+ Worst-case error is 2.59 ULP:
+ _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
+ want 0x1.d449c4p-3. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
- uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
- float32x4_t ax = vreinterpretq_f32_u32 (iax);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
float32x4_t special_arg = x;

#if WANT_SIMD_EXCEPT
@@ -68,13 +75,13 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
/* asinh(x) = log(x + sqrt(x * x + 1)).
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
float32x4_t d
- = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
- float32x4_t y = log1pf_inline (
- vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
+ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));

if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
- return vbslq_f32 (SignMask, x, y);
+ return special_case (special_arg, sign, y, special, dat);
+ return vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
}
libmvec_hidden_def (V_NAME_F1 (asinh))
HALF_WIDTH_ALIAS_F1 (asinh)
diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c
index ae488f7b54ddce26..818b6c92adcd48bb 100644
--- a/sysdeps/aarch64/fpu/atanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c
@@ -40,15 +40,17 @@ const static struct data
#define Half v_u32 (0x3f000000)

static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
+ uint32x4_t special)
{
- return v_call_f32 (atanhf, x, y, special);
+ return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
+ vmulq_f32 (halfsign, y), special);
}

/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 3.08 ULP:
- __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
- want 0x1.ffcb82p-5. */
+ The maximum error is 2.93 ULP:
+ _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
+ want 0x1.f4dcf8p-5. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -68,11 +70,19 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
uint32x4_t special = vcgeq_u32 (iax, d->one);
#endif

- float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
- y = log1pf_inline (y, d->log1pf_consts);
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
+ vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
+ y = log1pf_inline (y, &d->log1pf_consts);

+ /* If exceptions not required, pass ax to special-case for shorter dependency
+ chain. If exceptions are required ax will have been zerofied, so have to
+ pass x. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (halfsign, y), special);
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special);
+#else
+ return special_case (ax, halfsign, y, special);
+#endif
return vmulq_f32 (halfsign, y);
}
libmvec_hidden_def (V_NAME_F1 (atanh))
diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
index dc15334a8537b1fc..f2d47962fe13fbdd 100644
--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
@@ -18,113 +18,78 @@
<https://www.gnu.org/licenses/>. */

#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_log1pf_inline.h"
+
+#if WANT_SIMD_EXCEPT

const static struct data
{
- float32x4_t poly[8], ln2;
- uint32x4_t tiny_bound, minus_one, four, thresh;
- int32x4_t three_quarters;
+ uint32x4_t minus_one, thresh;
+ struct v_log1pf_data d;
} data = {
- .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
- (1, -0.5) are not stored as they can be generated more
- efficiently. */
- V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
- .ln2 = V4 (0x1.62e43p-1f),
- .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
- .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */
+ .d = V_LOG1PF_CONSTANTS_TABLE,
+ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
.minus_one = V4 (0xbf800000),
- .four = V4 (0x40800000),
- .three_quarters = V4 (0x3f400000)
};

-static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *p)
-{
- /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */
- float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
- float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
- float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
- float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
-
- float32x4_t m2 = vmulq_f32 (m, m);
- float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
- float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
- float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
-
- float32x4_t m4 = vmulq_f32 (m2, m2);
- float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
- return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
-}
+/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
+# define TinyBound v_u32 (0x34000000)

static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
{
- return v_call_f32 (log1pf, x, y, special);
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
+ return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
}

-/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
- is roughly 2.02 ULP:
- log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.69 ULP:
+ _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
+ want 0x1.cfcbdcp-3. */
VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
uint32x4_t special_cases
- = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
+ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
vcgeq_u32 (ix, d->minus_one));
- float32x4_t special_arg = x;

-#if WANT_SIMD_EXCEPT
if (__glibc_unlikely (v_any_u32 (special_cases)))
- /* Side-step special lanes so fenv exceptions are not triggered
- inadvertently. */
- x = v_zerofy_f32 (x, special_cases);
-#endif
+ return special_case (x, special_cases, d);

- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
+ return log1pf_inline (x, &d->d);
+}

- float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+#else

- /* Choose k to scale x to the range [-1/4, 1/2]. */
- int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
- v_s32 (0xff800000));
- uint32x4_t ku = vreinterpretq_u32_s32 (k);
+const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;

- /* Scale x by exponent manipulation. */
- float32x4_t m_scale
- = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp)
+{
+ return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
+}

- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
- m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.63 ULP:
+ _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
+ want 0x1.fdcb16p-3. */
+VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
+{
+ uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
+ vcaleq_f32 (x, v_f32 (0x1p127f)));

- /* Evaluate polynomial on the reduced interval. */
- float32x4_t p = eval_poly (m_scale, d->poly);
+ if (__glibc_unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases);

- /* The scale factor to be applied back at the end - by multiplying float(k)
- by 2^-23 we get the unbiased exponent of k. */
- float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
+ return log1pf_inline (x, ptr_barrier (&data));
+}

- /* Apply the scaling back. */
- float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
+#endif

- if (__glibc_unlikely (v_any_u32 (special_cases)))
- return special_case (special_arg, y, special_cases);
- return y;
-}
libmvec_hidden_def (V_NAME_F1 (log1p))
HALF_WIDTH_ALIAS_F1 (log1p)
diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h
index 643a6cdcfc498970..73e45a942e24a26f 100644
--- a/sysdeps/aarch64/fpu/v_log1pf_inline.h
+++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h
@@ -25,54 +25,81 @@

struct v_log1pf_data
{
- float32x4_t poly[8], ln2;
uint32x4_t four;
int32x4_t three_quarters;
+ float c0, c3, c5, c7;
+ float32x4_t c4, c6, c1, c2, ln2;
};

/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
(1, -0.5) are not stored as they can be generated more efficiently. */
#define V_LOG1PF_CONSTANTS_TABLE \
{ \
- .poly \
- = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
- .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
- .three_quarters = V4 (0x3f400000) \
+ .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
+ .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
+ .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
+ .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
}

static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *c)
+eval_poly (float32x4_t m, const struct v_log1pf_data *d)
{
- /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
- uses split Estrin, but this way reduces register pressure in the calling
- routine). */
- float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float32x4_t c0357 = vld1q_f32 (&d->c0);
+ float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
float32x4_t m2 = vmulq_f32 (m, m);
- q = vfmaq_f32 (m, m2, q);
- float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
+ float32x4_t p = vfmaq_f32 (p45, m2, p67);
+ p = vfmaq_f32 (p23, m2, p);
+ p = vfmaq_f32 (d->c1, m, p);
p = vmulq_f32 (m2, p);
- return vfmaq_f32 (q, m2, p);
+ p = vfmaq_f32 (m, m2, p);
+ return vfmaq_f32 (p, m2, q);
}

static inline float32x4_t
-log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
+log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
{
- /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
- special-case handling. See that file for details of the algorithm. */
+ /* Helper for calculating log(x + 1). */
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
v_s32 (0xff800000));
uint32x4_t ku = vreinterpretq_u32_s32 (k);
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+
+ /* Scale x by exponent manipulation. */
float32x4_t m_scale
= vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
- float32x4_t p = eval_poly (m_scale, d.poly);
+
+ /* Evaluate polynomial on the reduced interval. */
+ float32x4_t p = eval_poly (m_scale, d);
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
- return vfmaq_f32 (p, scale_back, d.ln2);
+
+ /* Apply the scaling back. */
+ return vfmaq_f32 (p, scale_back, d->ln2);
}

#endif
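
The reduction in v_log1pf_inline.h above is easier to check in scalar form. The sketch below is this note's own illustration (names, helpers and the plain-Horner evaluation are not from the patch); it assumes x > -1 and omits all special-case handling, exactly as the vector helper does:

#include <math.h>
#include <stdint.h>
#include <string.h>

static uint32_t asuint (float f) { uint32_t u; memcpy (&u, &f, sizeof u); return u; }
static float asfloat (uint32_t u) { float f; memcpy (&f, &u, sizeof f); return f; }

/* Scalar sketch of log1pf_inline, coefficients from V_LOG1PF_CONSTANTS_TABLE;
   the leading 1 and -0.5 terms are generated rather than stored. */
static float
log1pf_sketch (float x)
{
  const float c[8] = { 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
		       -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
		       0x1.abcb6p-4f, -0x1.6f0d5ep-5f };
  float m = x + 1.0f;

  /* k = exponent step such that m * 2^-k is in [0.75, 1.5). */
  int32_t k = (int32_t) ((asuint (m) - 0x3f400000u) & 0xff800000u);

  /* s = 4 * 2^-k; m_scale = x*2^-k + (0.25*s - 1) = (x + 1)*2^-k - 1. */
  float s = asfloat (0x40800000u - (uint32_t) k);
  float m_scale = asfloat (asuint (x) - (uint32_t) k);
  m_scale += fmaf (0.25f, s, -1.0f);

  /* log1p(m_scale) ~= m_scale - 0.5*m_scale^2 + m_scale^3 * C(m_scale)
     (plain Horner here; the patch evaluates the same polynomial pairwise). */
  float p = c[7];
  for (int i = 6; i >= 0; i--)
    p = c[i] + m_scale * p;
  float m2 = m_scale * m_scale;
  p = m_scale - 0.5f * m2 + m2 * m_scale * p;

  /* (float) k * 2^-23 recovers the unbiased exponent; scale back by ln2. */
  return p + ((float) k * 0x1.0p-23f) * 0x1.62e43p-1f;
}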
261
glibc-RHEL-118273-15.patch
Normal file
@@ -0,0 +1,261 @@
commit a15b1394b5eba98ffe28a02a392b587e4fe13c0d
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:30:20 2024 +0100

AArch64: Improve codegen in SVE F32 logs

Reduce MOVPRFXs by using unpredicated (non-destructive) instructions
where possible. Similar to the recent change to AdvSIMD F32 logs,
adjust special-case arguments and bounds to allow for more optimal
register usage. For all 3 routines one MOVPRFX remains in the
reduction, which cannot be avoided as immediate AND and ASR are both
destructive.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c
index bdbb49cd32feccb4..7913679f6795502a 100644
--- a/sysdeps/aarch64/fpu/log10f_sve.c
+++ b/sysdeps/aarch64/fpu/log10f_sve.c
@@ -24,6 +24,7 @@ static const struct data
float poly_0246[4];
float poly_1357[4];
float ln2, inv_ln10;
+ uint32_t off, lower;
} data = {
.poly_1357 = {
/* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -35,18 +36,23 @@ static const struct data
-0x1.0fc92cp-4f },
.ln2 = 0x1.62e43p-1f,
.inv_ln10 = 0x1.bcb7b2p-2f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};

-#define Min 0x00800000
-#define Max 0x7f800000
-#define Thres 0x7f000000 /* Max - Min. */
-#define Offset 0x3f2aaaab /* 0.666667. */
+#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. */
#define MantissaMask 0x007fffff

static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (log10f, x, y, special);
+ return sv_call_f32 (
+ log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}

/* Optimised implementation of SVE log10f using the same algorithm and
@@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t ix = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
+
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);

/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- ix = svsub_x (pg, ix, Offset);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */
- ix = svand_x (pg, ix, MantissaMask);
- ix = svadd_x (pg, ix, Offset);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */
+ svuint32_t ix = svand_x (pg, u_off, MantissaMask);
+ ix = svadd_x (pg, ix, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);

/* y = log10(1+r) + n*log10(2)
log10(1+r) ~ r * InvLn(10) + P(r)
where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t r4 = svmul_x (pg, r2, r2);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2);
svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
@@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
hi = svmul_x (pg, hi, d->inv_ln10);

if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
- special);
- return svmla_x (pg, hi, r2, y);
+ return special_case (u_off, hi, r2, y, special);
+ return svmla_x (svptrue_b32 (), hi, r2, y);
}
diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c
index 5031c4248359295e..939d89bfb9a95a11 100644
--- a/sysdeps/aarch64/fpu/log2f_sve.c
+++ b/sysdeps/aarch64/fpu/log2f_sve.c
@@ -23,6 +23,7 @@ static const struct data
{
float poly_02468[5];
float poly_1357[4];
+ uint32_t off, lower;
} data = {
.poly_1357 = {
/* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -32,18 +33,23 @@ static const struct data
},
.poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
0x1.9d8ecap-3f, 0x1.9e495p-3f },
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};

-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thres (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
#define MantissaMask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */

static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (log2f, x, y, cmp);
+ return sv_call_f32 (
+ log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}

/* Optimised implementation of SVE log2f, using the same algorithm
@@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);

- svuint32_t u = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres);
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);

/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_x (pg, u, Off);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
- u = svand_x (pg, u, MantissaMask);
- u = svadd_x (pg, u, Off);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+ svuint32_t u = svand_x (pg, u_off, MantissaMask);
+ u = svadd_x (pg, u, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);

/* y = log2(1+r) + n. */
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);

/* Evaluate polynomial using pairwise Horner scheme. */
svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
@@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
y = svmla_x (pg, q_01, r2, y);

if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special);
- return svmla_x (pg, n, r, y);
+ return special_case (u_off, n, r, y, special);
+ return svmla_x (svptrue_b32 (), n, r, y);
}
diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
index d64e810cfec9aa19..5b9324678d99455b 100644
--- a/sysdeps/aarch64/fpu/logf_sve.c
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@@ -24,6 +24,7 @@ static const struct data
float poly_0135[4];
float poly_246[3];
float ln2;
+ uint32_t off, lower;
} data = {
.poly_0135 = {
/* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
@@ -32,19 +33,24 @@ static const struct data
-0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
},
.poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
- .ln2 = 0x1.62e43p-1f
+ .ln2 = 0x1.62e43p-1f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};

-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thresh (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
#define Mask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */

static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (logf, x, y, cmp);
+ return sv_call_f32 (
+ logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}

/* Optimised implementation of SVE logf, using the same algorithm and
@@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);

- svuint32_t u = svreinterpret_u32 (x);
- svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);

/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_x (pg, u, Off);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
- u = svand_x (pg, u, Mask);
- u = svadd_x (pg, u, Off);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+
+ svuint32_t u = svand_x (pg, u_off, Mask);
+ u = svadd_x (pg, u, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);

/* y = log(1+r) + n*ln2. */
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
/* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */
svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
@@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
p = svmla_x (pg, r, n, d->ln2);

if (__glibc_unlikely (svptest_any (pg, cmp)))
- return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+ return special_case (u_off, p, r2, y, cmp);
return svmla_x (pg, p, r2, y);
}
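
The "wraps around" remark in the new Lower-bound comments above is the whole trick: u_off = asuint(x) - Off is already needed for the log reduction, and because unsigned arithmetic is modular, a single compare against a wrapped bound reproduces the old asuint(x) - Min >= Thresh test without keeping asuint(x) live. A scalar sketch (helper name invented for this note, not part of the patch):

#include <stdint.h>

/* True iff asuint(x) - 0x00800000 >= 0x7f000000 (mod 2^32), i.e. x is
   zero, subnormal, negative, infinite or NaN - expressed via u_off only. */
static int
sve_logf_special_sketch (uint32_t u_off) /* u_off = asuint(x) - 0x3f2aaaab. */
{
  const uint32_t lower = 0x00800000u - 0x3f2aaaabu; /* wraps mod 2^32. */
  const uint32_t thresh = 0x7f000000u; /* asuint(inf) - 0x00800000. */
  return u_off - lower >= thresh;
}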
467
glibc-RHEL-118273-16.patch
Normal file
@@ -0,0 +1,467 @@
|
||||
commit 7b8c134b5460ed933d610fa92ed1227372b68fdc
|
||||
Author: Joe Ramsay <Joe.Ramsay@arm.com>
|
||||
Date: Mon Sep 23 15:26:12 2024 +0100
|
||||
|
||||
AArch64: Improve codegen in SVE expf & related routines
|
||||
|
||||
Reduce MOV and MOVPRFX by improving special-case handling. Use inline
|
||||
helper to duplicate the entire computation between the special- and
|
||||
non-special case branches, removing the contention for z0 between x
|
||||
and the return value.
|
||||
|
||||
Also rearrange some MLAs and MLSs - by making the multiplicand the
|
||||
destination we can avoid a MOVPRFX in several cases. Also change which
|
||||
constants go in the vector used for lanewise ops - the last lane is no
|
||||
longer wasted.
|
||||
|
||||
Spotted that shift was incorrect in exp2f and exp10f, w.r.t. to the
|
||||
comment that explains it. Fixed - worst-case ULP for exp2f moves
|
||||
around but it doesn't change significantly for either routine.
|
||||
|
||||
Worst-case error for coshf increases due to passing x to exp rather
|
||||
than abs(x) - updated the comment, but does not require regen-ulps.
|
||||
|
||||
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
|
||||
|
||||
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
|
||||
index e5d8a299c6aa7ceb..7ad6efa0fc218278 100644
|
||||
--- a/sysdeps/aarch64/fpu/coshf_sve.c
|
||||
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
|
||||
@@ -23,37 +23,42 @@
|
||||
static const struct data
|
||||
{
|
||||
struct sv_expf_data expf_consts;
|
||||
- uint32_t special_bound;
|
||||
+ float special_bound;
|
||||
} data = {
|
||||
.expf_consts = SV_EXPF_DATA,
|
||||
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
|
||||
- .special_bound = 0x42ad496c,
|
||||
+ .special_bound = 0x1.5a92d8p+6,
|
||||
};
|
||||
|
||||
static svfloat32_t NOINLINE
|
||||
-special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
|
||||
+special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
|
||||
+ svbool_t pg)
|
||||
{
|
||||
- return sv_call_f32 (coshf, x, y, pg);
|
||||
+ return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e),
|
||||
+ pg);
|
||||
}
|
||||
|
||||
/* Single-precision vector cosh, using vector expf.
|
||||
- Maximum error is 1.89 ULP:
|
||||
- _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
|
||||
- want 0x1.f00adcp+127. */
|
||||
+ Maximum error is 2.77 ULP:
|
||||
+ _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
|
||||
+ want 0x1.e4594cp+2. */
|
||||
svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
- svfloat32_t ax = svabs_x (pg, x);
|
||||
- svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
|
||||
+ svbool_t special = svacge (pg, x, d->special_bound);
|
||||
|
||||
- /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
|
||||
- svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
|
||||
- svfloat32_t half_t = svmul_x (pg, t, 0.5);
|
||||
- svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
|
||||
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.
|
||||
+ Note that x is passed to exp here, rather than |x|. This is to avoid using
|
||||
+ destructive unary ABS for better register usage. However it means the
|
||||
+ routine is not exactly symmetrical, as the exp helper is slightly less
|
||||
+ accurate in the negative range. */
|
||||
+ svfloat32_t e = expf_inline (x, pg, &d->expf_consts);
|
||||
+ svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5);
|
||||
+ svfloat32_t half_over_e = svdivr_x (pg, e, 0.5);
|
||||
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
|
||||
+ return special_case (x, half_e, half_over_e, special);
|
||||
|
||||
- return svadd_x (pg, half_t, half_over_t);
|
||||
+ return svadd_x (svptrue_b32 (), half_e, half_over_e);
|
||||
}
|
||||
diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c
|
||||
index e09b2f3b2705515a..8aa3fa9c4335cfb8 100644
|
||||
--- a/sysdeps/aarch64/fpu/exp10f_sve.c
|
||||
+++ b/sysdeps/aarch64/fpu/exp10f_sve.c
|
||||
@@ -18,74 +18,83 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
-#include "poly_sve_f32.h"
|
||||
|
||||
-/* For x < -SpecialBound, the result is subnormal and not handled correctly by
|
||||
+/* For x < -Thres, the result is subnormal and not handled correctly by
|
||||
FEXPA. */
|
||||
-#define SpecialBound 37.9
|
||||
+#define Thres 37.9
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- float poly[5];
|
||||
- float shift, log10_2, log2_10_hi, log2_10_lo, special_bound;
|
||||
+ float log2_10_lo, c0, c2, c4;
|
||||
+ float c1, c3, log10_2;
|
||||
+ float shift, log2_10_hi, thres;
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm with minimisation of relative
|
||||
error.
|
||||
rel error: 0x1.89dafa3p-24
|
||||
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
|
||||
maxerr: 0.52 +0.5 ulp. */
|
||||
- .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f,
|
||||
- 0x1.12b41ap-1f },
|
||||
+ .c0 = 0x1.26bb16p+1f,
|
||||
+ .c1 = 0x1.5350d2p+1f,
|
||||
+ .c2 = 0x1.04744ap+1f,
|
||||
+ .c3 = 0x1.2d8176p+0f,
|
||||
+ .c4 = 0x1.12b41ap-1f,
|
||||
/* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
|
||||
- .shift = 0x1.903f8p17f,
|
||||
+ .shift = 0x1.803f8p17f,
|
||||
.log10_2 = 0x1.a934fp+1,
|
||||
.log2_10_hi = 0x1.344136p-2,
|
||||
.log2_10_lo = -0x1.ec10cp-27,
|
||||
- .special_bound = SpecialBound,
|
||||
+ .thres = Thres,
|
||||
};
|
||||
|
||||
-static svfloat32_t NOINLINE
|
||||
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
|
||||
+static inline svfloat32_t
|
||||
+sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
|
||||
{
|
||||
- return sv_call_f32 (exp10f, x, y, special);
|
||||
-}
|
||||
-
|
||||
-/* Single-precision SVE exp10f routine. Implements the same algorithm
|
||||
- as AdvSIMD exp10f.
|
||||
- Worst case error is 1.02 ULPs.
|
||||
- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
|
||||
- want 0x1.ba5f9cp-1. */
|
||||
-svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
|
||||
-{
|
||||
- const struct data *d = ptr_barrier (&data);
|
||||
/* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */

- /* Load some constants in quad-word chunks to minimise memory access (last
- lane is wasted). */
- svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);

/* n = round(x/(log10(2)/N)). */
svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0);
- svfloat32_t n = svsub_x (pg, z, shift);
+ svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);

/* r = x - n*log10(2)/N. */
- svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1);
- r = svmls_lane (r, n, log10_2_and_inv, 2);
+ svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
+ r = svmls_lane (r, n, lane_consts, 0);

- svbool_t special = svacgt (pg, x, d->special_bound);
svfloat32_t scale = svexpa (svreinterpret_u32 (z));

/* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t poly
- = svmla_x (pg, svmul_x (pg, r, d->poly[0]),
- sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2);
-
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (pg, scale, scale, poly), special);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);

return svmla_x (pg, scale, scale, poly);
}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp10f routine. Implements the same algorithm
+ as AdvSIMD exp10f.
+ Worst case error is 1.02 ULPs.
+ _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
+ want 0x1.ba5f9cp-1. */
+svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (__glibc_unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp10f_inline (x, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
index 8a686e3e054cb7f5..c6216bed9e9e7538 100644
--- a/sysdeps/aarch64/fpu/exp2f_sve.c
+++ b/sysdeps/aarch64/fpu/exp2f_sve.c
@@ -24,54 +24,64 @@

static const struct data
{
- float poly[5];
+ float c0, c2, c4, c1, c3;
float shift, thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. */
- .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f,
- 0x1.59977ap-10f },
+ /* Coefficients copied from the polynomial in AdvSIMD variant. */
+ .c0 = 0x1.62e422p-1f,
+ .c1 = 0x1.ebf9bcp-3f,
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = 0x1.3ce9e4p-7f,
+ .c4 = 0x1.59977ap-10f,
/* 1.5*2^17 + 127. */
- .shift = 0x1.903f8p17f,
+ .shift = 0x1.803f8p17f,
/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
correctly by FEXPA. */
.thres = Thres,
};

-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
- return sv_call_f32 (exp2f, x, y, special);
-}
-
-/* Single-precision SVE exp2f routine. Implements the same algorithm
- as AdvSIMD exp2f.
- Worst case error is 1.04 ULPs.
- SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
- want 0x1.ba7ebp+0. */
-svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+static inline svfloat32_t
+sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
{
- const struct data *d = ptr_barrier (&data);
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
- svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svadd_x (pg, x, shift);
- svfloat32_t n = svsub_x (pg, z, shift);
- svfloat32_t r = svsub_x (pg, x, n);
+ svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift);
+ svfloat32_t r = svsub_x (svptrue_b32 (), x, n);

- svbool_t special = svacgt (pg, x, d->thres);
svfloat32_t scale = svexpa (svreinterpret_u32 (z));

/* Polynomial evaluation: poly(r) ~ exp2(r)-1.
Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for
coefficients 1 to 4, and apply most significant coefficient directly. */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1);
- svfloat32_t p0 = svmul_x (pg, r, d->poly[0]);
+ svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
+ svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
+ svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
svfloat32_t poly = svmla_x (pg, p0, r2, p14);

- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (pg, scale, scale, poly), special);
-
return svmla_x (pg, scale, scale, poly);
}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp2f routine. Implements the same algorithm
+ as AdvSIMD exp2f.
+ Worst case error is 1.04 ULPs.
+ _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
+ want 0x1.ba6a64p-1. */
+svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (__glibc_unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp2f_inline (x, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
index 3ba79bc4f11a05f9..da93e01b87e0e890 100644
--- a/sysdeps/aarch64/fpu/expf_sve.c
+++ b/sysdeps/aarch64/fpu/expf_sve.c
@@ -18,33 +18,25 @@
<https://www.gnu.org/licenses/>. */

#include "sv_math.h"
+#include "sv_expf_inline.h"
+
+/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+ correctly by FEXPA. */
+#define Thres 0x1.5d5e2ap+6f

static const struct data
{
- float poly[5];
- float inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ struct sv_expf_data d;
+ float thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. */
- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f,
- 0x1.0e4020p-7f },
- .inv_ln2 = 0x1.715476p+0f,
- .ln2_hi = 0x1.62e4p-1f,
- .ln2_lo = 0x1.7f7d1cp-20f,
- /* 1.5*2^17 + 127. */
- .shift = 0x1.903f8p17f,
- /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
- correctly by FEXPA. */
- .thres = 0x1.5d5e2ap+6f,
+ .d = SV_EXPF_DATA,
+ .thres = Thres,
};

-#define C(i) sv_f32 (d->poly[i])
-#define ExponentBias 0x3f800000
-
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
{
- return sv_call_f32 (expf, x, y, special);
+ return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special);
}

/* Optimised single-precision SVE exp function.
@@ -54,36 +46,8 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
-
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-
- /* Load some constants in quad-word chunks to minimise memory access (last
- lane is wasted). */
- svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2);
-
- /* n = round(x/(ln2/N)). */
- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0);
- svfloat32_t n = svsub_x (pg, z, d->shift);
-
- /* r = x - n*ln2/N. */
- svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1);
- r = svmls_lane (r, n, invln2_and_ln2, 2);
-
- /* scale = 2^(n/N). */
svbool_t is_special_case = svacgt (pg, x, d->thres);
- svfloat32_t scale = svexpa (svreinterpret_u32 (z));
-
- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
- svfloat32_t p34 = svmla_x (pg, C (3), C (4), r);
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_x (pg, r, C (0));
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
if (__glibc_unlikely (svptest_any (pg, is_special_case)))
- return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case);
-
- return svmla_x (pg, scale, scale, poly);
+ return special_case (x, is_special_case, &d->d);
+ return expf_inline (x, pg, &d->d);
}
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index 23963b5f8ec89ead..6166df65533555a6 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -24,19 +24,20 @@

struct sv_expf_data
{
- float poly[5];
- float inv_ln2, ln2_hi, ln2_lo, shift;
+ float c1, c3, inv_ln2;
+ float ln2_lo, c0, c2, c4;
+ float ln2_hi, shift;
};

/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
#define SV_EXPF_DATA \
{ \
- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
- 0x1.0e4020p-7f }, \
- \
- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
- .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
+ /* Coefficients copied from the polynomial in AdvSIMD variant. */ \
+ .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \
+ .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .shift = 0x1.803f8p17f, \
}

#define C(i) sv_f32 (d->poly[i])
@@ -47,26 +48,25 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */

- /* Load some constants in quad-word chunks to minimise memory access. */
- svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);

/* n = round(x/(ln2/N)). */
- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
+ svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
svfloat32_t n = svsub_x (pg, z, d->shift);

/* r = x - n*ln2/N. */
- svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
- r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
+ svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
+ r = svmls_lane (r, n, lane_consts, 0);

/* scale = 2^(n/N). */
- svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));

/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
- svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
- svfloat32_t r2 = svmul_f32_x (pg, r, r);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
+ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
svfloat32_t poly = svmla_x (pg, p0, r2, p14);

return svmla_x (pg, scale, scale, poly);
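
A note on the scheme shared by the three SVE routines above: each reduces x to n + r, picks up scale = 2^n (FEXPA reads it straight out of the bit pattern of the shifted value z, which is why the shift is 1.5*2^17 + 127), then reconstructs the result as scale + scale * poly(r). A rough scalar model of the exp2 case, under the simplification that n is a whole integer and ldexpf stands in for FEXPA (hypothetical exp2f_model name; special cases and overflow are ignored):

#include <math.h>

static float
exp2f_model (float x)
{
  /* Adding 1.5*2^23 rounds x to the nearest integer for |x| < 2^22.  */
  float shift = 0x1.8p23f;
  float z = x + shift;
  float n = z - shift;
  float r = x - n; /* r in [-0.5, 0.5].  */

  /* poly(r) ~ 2^r - 1; coefficients as in the patch above.  */
  float poly
    = r * (0x1.62e422p-1f
           + r * (0x1.ebf9bcp-3f
                  + r * (0x1.c6bd32p-5f
                         + r * (0x1.3ce9e4p-7f + r * 0x1.59977ap-10f))));

  float scale = ldexpf (1.0f, (int) n); /* scale = 2^n.  */
  return scale + scale * poly;
}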
124
glibc-RHEL-118273-17.patch
Normal file
@@ -0,0 +1,124 @@
commit 1cf29fbc5be23db775d1dfa6b332ded6e6554252
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Oct 28 14:58:35 2024 +0000

AArch64: Small optimisation in AdvSIMD erf and erfc

In both routines, reduce register pressure such that GCC 14 emits no
spills for erf and fewer spills for erfc. Also use more efficient
comparison for the special-case in erf.

Benchtests show erf improves by 6.4%, erfc by 1.0%.

diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c
index 19cbb7d0f42eb4e2..c0116735e408066d 100644
--- a/sysdeps/aarch64/fpu/erf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erf_advsimd.c
@@ -22,19 +22,21 @@
static const struct data
{
float64x2_t third;
- float64x2_t tenth, two_over_five, two_over_fifteen;
- float64x2_t two_over_nine, two_over_fortyfive;
+ float64x2_t tenth, two_over_five, two_over_nine;
+ double two_over_fifteen, two_over_fortyfive;
float64x2_t max, shift;
+ uint64x2_t max_idx;
#if WANT_SIMD_EXCEPT
float64x2_t tiny_bound, huge_bound, scale_minus_one;
#endif
} data = {
+ .max_idx = V2 (768),
.third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
- .two_over_fifteen = V2 (0x1.1111111111111p-3),
+ .two_over_fifteen = 0x1.1111111111111p-3,
.tenth = V2 (-0x1.999999999999ap-4),
.two_over_five = V2 (-0x1.999999999999ap-2),
.two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
- .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
+ .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
.max = V2 (5.9921875), /* 6 - 1/128. */
.shift = V2 (0x1p45),
#if WANT_SIMD_EXCEPT
@@ -87,8 +89,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t a = vabsq_f64 (x);
/* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
to return expected results. */
- uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
- uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
+ uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
+ uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);

#if WANT_SIMD_EXCEPT
/* |x| huge or tiny. */
@@ -115,7 +117,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
segfault. */
uint64x2_t i
= vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
- i = vbslq_u64 (a_le_max, i, v_u64 (768));
+ i = vbslq_u64 (a_le_max, i, dat->max_idx);
struct entry e = lookup (i);

float64x2_t r = vsubq_f64 (z, shift);
@@ -125,14 +127,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t d2 = vmulq_f64 (d, d);
float64x2_t r2 = vmulq_f64 (r, r);

+ float64x2_t two_over_fifteen_and_fortyfive
+ = vld1q_f64 (&dat->two_over_fifteen);
+
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
float64x2_t p1 = r;
float64x2_t p2
= vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
- float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
+ two_over_fifteen_and_fortyfive, 0);
p4 = vfmsq_f64 (dat->tenth, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
+ two_over_fifteen_and_fortyfive, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));

float64x2_t p34 = vfmaq_f64 (p3, d, p4);
diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
index f1b3bfe8304c73b5..2f2f755c46e71b58 100644
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@@ -24,8 +24,8 @@ static const struct data
{
uint64x2_t offset, table_scale;
float64x2_t max, shift;
- float64x2_t p20, p40, p41, p42;
- float64x2_t p51, p52;
+ float64x2_t p20, p40, p41, p51;
+ double p42, p52;
double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
@@ -41,9 +41,9 @@ static const struct data
.p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
.p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
.p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
- .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
+ .p42 = 0x1.1111111111111p-3, /* 2/15. */
.p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
- .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
+ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
/* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
.qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
.qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
@@ -157,9 +157,10 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
float64x2_t p1 = r;
float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
- float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
+ float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
p4 = vfmsq_f64 (dat->p40, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
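
The layout change is the interesting part of this patch: pairs of constants that were previously broadcast into their own float64x2_t registers (two_over_fifteen/two_over_fortyfive, p42/p52) are stored as adjacent scalars, loaded once with vld1q_f64, and consumed by lane-indexed FMAs. A minimal sketch of the idiom (hypothetical names, not the patched code):

#include <arm_neon.h>

/* c_a and c_b kept adjacent so one 128-bit load covers both.  */
static const double consts[2] = { 0.25, 0.125 };

static float64x2_t
two_fmas_one_register (float64x2_t acc_a, float64x2_t acc_b, float64x2_t r2)
{
  float64x2_t k = vld1q_f64 (consts);                 /* { c_a, c_b }.  */
  float64x2_t a = vfmaq_laneq_f64 (acc_a, r2, k, 0);  /* acc_a + r2*c_a.  */
  float64x2_t b = vfmaq_laneq_f64 (acc_b, r2, k, 1);  /* acc_b + r2*c_b.  */
  return vaddq_f64 (a, b);
}

Only one vector register holds both constants, which is what relieves the register pressure the commit message describes.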
2973
glibc-RHEL-118273-18.patch
Normal file
File diff suppressed because it is too large
Load Diff
461
glibc-RHEL-118273-19.patch
Normal file
@@ -0,0 +1,461 @@
commit 13a7ef5999de56add448a24fefb0250236271a06
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Mon Dec 9 15:58:47 2024 +0000

AArch64: Improve codegen in users of ADVSIMD expm1 helper

Add inline helper for expm1 and rearrange operations so MOV
is not necessary in reduction or around the special-case handler.
Reduce memory access by using more indexed MLAs in polynomial.
Speedup on Neoverse V1 for expm1 (19%), sinh (8.5%), and tanh (7.5%).

diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
index 3db3b80c49292947..f2042db8bcc8466a 100644
--- a/sysdeps/aarch64/fpu/expm1_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
@@ -18,31 +18,18 @@
<https://www.gnu.org/licenses/>. */

#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_expm1_inline.h"

static const struct data
{
- float64x2_t poly[11];
- float64x2_t invln2;
- double ln2[2];
- float64x2_t shift;
- int64x2_t exponent_bias;
+ struct v_expm1_data d;
#if WANT_SIMD_EXCEPT
uint64x2_t thresh, tiny_bound;
#else
float64x2_t oflow_bound;
#endif
} data = {
- /* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
- .invln2 = V2 (0x1.71547652b82fep0),
- .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
- .shift = V2 (0x1.8p52),
- .exponent_bias = V2 (0x3ff0000000000000),
+ .d = V_EXPM1_DATA,
#if WANT_SIMD_EXCEPT
/* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
compare. */
@@ -58,67 +45,36 @@ static const struct data
};

static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, uint64x2_t special, const struct data *d)
{
- return v_call_f64 (expm1, x, y, special);
+ return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
+ special);
}

/* Double-precision vector exp(x) - 1 function.
- The maximum error observed error is 2.18 ULP:
- _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
- want 0x1.a8b9ea8d66e2p-2. */
+ The maximum error observed error is 2.05 ULP:
+ _ZGVnN2v_expm1(0x1.634902eaff3adp-2) got 0x1.a8b636e2a9388p-2
+ want 0x1.a8b636e2a9386p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);

- uint64x2_t ix = vreinterpretq_u64_f64 (x);
-
#if WANT_SIMD_EXCEPT
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint64x2_t special
= vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
- if (__glibc_unlikely (v_any_u64 (special)))
- x = v_zerofy_f64 (x, special);
#else
/* Large input, NaNs and Infs. */
uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
#endif

- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (n);
- float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
- float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
- f = vfmsq_laneq_f64 (f, n, ln2, 1);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t f8 = vmulq_f64 (f4, f4);
- float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
- float64x2_t t = vreinterpretq_f64_s64 (u);
-
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (vreinterpretq_f64_u64 (ix),
- vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t),
- special);
+ return special_case (x, special, d);

/* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+ return expm1_inline (x, &d->d);
}
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
index 3e3b76c502b01e16..7adf771517de2507 100644
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -18,72 +18,31 @@
<https://www.gnu.org/licenses/>. */

#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_expm1_inline.h"

static const struct data
{
- float64x2_t poly[11], inv_ln2;
- double m_ln2[2];
- float64x2_t shift;
+ struct v_expm1_data d;
uint64x2_t halff;
- int64x2_t onef;
#if WANT_SIMD_EXCEPT
uint64x2_t tiny_bound, thresh;
#else
- uint64x2_t large_bound;
+ float64x2_t large_bound;
#endif
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
-
- .inv_ln2 = V2 (0x1.71547652b82fep0),
- .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
- .shift = V2 (0x1.8p52),
-
+ .d = V_EXPM1_DATA,
.halff = V2 (0x3fe0000000000000),
- .onef = V2 (0x3ff0000000000000),
#if WANT_SIMD_EXCEPT
/* 2^-26, below which sinh(x) rounds to x. */
.tiny_bound = V2 (0x3e50000000000000),
/* asuint(large_bound) - asuint(tiny_bound). */
.thresh = V2 (0x0230000000000000),
#else
-/* 2^9. expm1 helper overflows for large input. */
- .large_bound = V2 (0x4080000000000000),
+ /* 2^9. expm1 helper overflows for large input. */
+ .large_bound = V2 (0x1p+9),
#endif
};

-static inline float64x2_t
-expm1_inline (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
-
- /* Reduce argument:
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where i = round(x / ln2)
- and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
- float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (j);
-
- float64x2_t m_ln2 = vld1q_f64 (d->m_ln2);
- float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0);
- f = vfmaq_laneq_f64 (f, j, m_ln2, 1);
- /* Approximate expm1(f) using polynomial. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t f8 = vmulq_f64 (f4, f4);
- float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
- /* t = 2^i. */
- float64x2_t t = vreinterpretq_f64_u64 (
- vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef)));
- /* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
-}
-
static float64x2_t NOINLINE VPCS_ATTR
special_case (float64x2_t x)
{
@@ -92,23 +51,23 @@ special_case (float64x2_t x)

/* Approximation for vector double-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
- The greatest observed error is 2.57 ULP:
- _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
- want 0x1.ab34e59d678d9p-2. */
+ The greatest observed error is 2.52 ULP:
+ _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
+ want -0x1.ac2f05bb66fc9p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);

float64x2_t ax = vabsq_f64 (x);
- uint64x2_t sign
- = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
- float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff));
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ float64x2_t halfsign = vreinterpretq_f64_u64 (
+ vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));

#if WANT_SIMD_EXCEPT
uint64x2_t special = vcgeq_u64 (
vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
#else
- uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound);
+ uint64x2_t special = vcageq_f64 (x, d->large_bound);
#endif

/* Fall back to scalar variant for all lanes if any of them are special. */
@@ -118,7 +77,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
- float64x2_t t = expm1_inline (ax);
+ float64x2_t t = expm1_inline (ax, &d->d);
t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
return vmulq_f64 (t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/tanh_advsimd.c b/sysdeps/aarch64/fpu/tanh_advsimd.c
index 1da1dfa5dbe418b6..402ba9d8ad2478a8 100644
--- a/sysdeps/aarch64/fpu/tanh_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanh_advsimd.c
@@ -18,68 +18,30 @@
<https://www.gnu.org/licenses/>. */

#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_expm1_inline.h"

static const struct data
{
- float64x2_t poly[11];
- float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
- uint64x2_t onef;
+ struct v_expm1_data d;
uint64x2_t thresh, tiny_bound;
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
-
- .inv_ln2 = V2 (0x1.71547652b82fep0),
- .ln2_hi = V2 (-0x1.62e42fefa39efp-1),
- .ln2_lo = V2 (-0x1.abc9e3b39803fp-56),
- .shift = V2 (0x1.8p52),
-
- .onef = V2 (0x3ff0000000000000),
+ .d = V_EXPM1_DATA,
.tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
/* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
.thresh = V2 (0x01f241bf835f9d5f),
};

-static inline float64x2_t
-expm1_inline (float64x2_t x, const struct data *d)
-{
- /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
- the scalar variant of tanh. */
-
- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (j);
- float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi);
- f = vfmaq_f64 (f, j, d->ln2_lo);
-
- /* Approximate expm1(f) using polynomial. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t p = vfmaq_f64 (
- f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly));
-
- /* t = 2 ^ i. */
- float64x2_t t = vreinterpretq_f64_u64 (
- vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef));
- /* expm1(x) = p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t);
-}
-
static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
+ uint64x2_t special)
{
- return v_call_f64 (tanh, x, y, special);
+ return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special);
}

/* Vector approximation for double-precision tanh(x), using a simplified
- version of expm1. The greatest observed error is 2.77 ULP:
- _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
- want -0x1.bd6a21a163624p-3. */
+ version of expm1. The greatest observed error is 2.70 ULP:
+ _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3
+ want -0x1.be5452a6459fbp-3. */
float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -100,10 +62,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
u = vaddq_f64 (u, u);

/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- float64x2_t q = expm1_inline (u, d);
- float64x2_t qp2 = vaddq_f64 (q, v_f64 (2));
+ float64x2_t q = expm1_inline (u, &d->d);
+ float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));

if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, vdivq_f64 (q, qp2), special);
+ return special_case (x, q, qp2, special);
return vdivq_f64 (q, qp2);
}
diff --git a/sysdeps/aarch64/fpu/v_expm1_inline.h b/sysdeps/aarch64/fpu/v_expm1_inline.h
new file mode 100644
index 0000000000000000..a925183d4e5e4623
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_expm1_inline.h
@@ -0,0 +1,97 @@
+/* Double-precision inline helper for vector (Advanced SIMD) expm1 function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_EXPM1_INLINE_H
+#define AARCH64_FPU_V_EXPM1_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1_data
+{
+ float64x2_t c2, c4, c6, c8;
+ float64x2_t invln2;
+ int64x2_t exponent_bias;
+ double c1, c3, c5, c7, c9, c10;
+ double ln2[2];
+};
+
+/* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */
+#define V_EXPM1_DATA \
+ { \
+ .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \
+ .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \
+ .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \
+ .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \
+ .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \
+ .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \
+ .invln2 = V2 (0x1.71547652b82fep0), \
+ .exponent_bias = V2 (0x3ff0000000000000), \
+ }
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct v_expm1_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2));
+ int64x2_t i = vcvtq_s64_f64 (n);
+ float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
+ f = vfmsq_laneq_f64 (f, n, ln2, 1);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
+ float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
+ float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
+ float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
+ float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
+ p = vfmaq_f64 (p47, f4, p);
+ p = vfmaq_f64 (p03, f4, p);
+
+ p = vfmaq_f64 (f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+ float64x2_t t = vreinterpretq_f64_s64 (u);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 1daedfdd51cfc54b..c1fb88b5e027b322 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -21,7 +21,6 @@
#define AARCH64_FPU_V_EXPM1F_INLINE_H

#include "v_math.h"
-#include "math_config.h"

struct v_expm1f_data
{
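
For reference, the reduction implemented by the new v_expm1_inline.h above is
  i = round(x / ln2),  f = x - i*ln2  (f in [-ln2/2, ln2/2]),
  t = 2^i,  expm1(x) = p*t + (t - 1)  where p ~= expm1(f),
with 2^i built by adding i to the exponent field of 1.0. A scalar model (hypothetical expm1_model name; the libm expm1 call stands in for the degree-12 polynomial the vector code uses, and overflow/special inputs are ignored):

#include <math.h>
#include <stdint.h>

static double
expm1_model (double x)
{
  double n = round (x * 0x1.71547652b82fep0); /* i = round(x/ln2).  */
  double f = x - n * 0x1.62e42fefa39efp-1;    /* subtract hi part of ln2.  */
  f -= n * 0x1.abc9e3b39803fp-56;             /* ... then the lo part.  */
  int64_t i = (int64_t) n;

  union { uint64_t u; double d; } t;
  t.u = ((uint64_t) i << 52) + 0x3ff0000000000000ULL; /* t = 2^i.  */

  return expm1 (f) * t.d + (t.d - 1.0);       /* p*t + (t - 1).  */
}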
862
glibc-RHEL-118273-2.patch
Normal file
@@ -0,0 +1,862 @@
commit bdb5705b7bab618ed4445f4b17d4b1e4fbbf94a7
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:39 2024 +0000

aarch64/fpu: Add vector variants of cosh

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 320b6ed43a9a454c..019c3a51880e2306 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -3,6 +3,7 @@ libmvec-supported-funcs = acos \
atan \
atan2 \
cos \
+ cosh \
erf \
exp \
exp10 \
@@ -32,7 +33,8 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
erf_data \
erff_data \
sv_erf_data \
- sv_erff_data
+ sv_erff_data \
+ v_exp_tail_data
endif

sve-cflags = -march=armv8-a+sve
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index d7b1e87191b66439..884b4b57f097635f 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -79,6 +79,11 @@ libmvec {
_ZGVsMxv_tan;
}
GLIBC_2.40 {
+ _ZGVnN2v_cosh;
+ _ZGVnN2v_coshf;
+ _ZGVnN4v_coshf;
+ _ZGVsMxv_cosh;
+ _ZGVsMxv_coshf;
_ZGVnN2v_erf;
_ZGVnN2v_erff;
_ZGVnN4v_erff;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index d8d88de2181569f9..c63b2948d4938b0d 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -21,6 +21,7 @@ libmvec_hidden_proto (V_NAME_F1(acos));
libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(cos));
+libmvec_hidden_proto (V_NAME_F1(cosh));
libmvec_hidden_proto (V_NAME_F1(erf));
libmvec_hidden_proto (V_NAME_F1(exp10));
libmvec_hidden_proto (V_NAME_F1(exp2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 71f53363a071126d..8ca55098706a54c2 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -49,6 +49,10 @@
# define __DECL_SIMD_cos __DECL_SIMD_aarch64
# undef __DECL_SIMD_cosf
# define __DECL_SIMD_cosf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_cosh
+# define __DECL_SIMD_cosh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_coshf
+# define __DECL_SIMD_coshf __DECL_SIMD_aarch64
# undef __DECL_SIMD_erf
# define __DECL_SIMD_erf __DECL_SIMD_aarch64
# undef __DECL_SIMD_erff
@@ -124,6 +128,7 @@ __vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
@@ -141,6 +146,7 @@ __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
@@ -163,6 +169,7 @@ __sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
@@ -180,6 +187,7 @@ __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c
new file mode 100644
index 0000000000000000..ec7b59637e973da9
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cosh_advsimd.c
@@ -0,0 +1,108 @@
+/* Double-precision vector (AdvSIMD) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+
+static const struct data
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2, ln2, shift, thres;
+ uint64x2_t index_mask, special_bound;
+} data = {
+ .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
+ V2 (0x1.5555576a59599p-5), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */
+ /* -ln2/N. */
+ .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64},
+ .shift = V2 (0x1.8p+52),
+ .thres = V2 (704.0),
+
+ .index_mask = V2 (0xff),
+ /* 0x1.6p9, above which exp overflows. */
+ .special_bound = V2 (0x4086000000000000),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from v_exp_tail, with no
+ special-case handling or tail. */
+static inline float64x2_t
+exp_inline (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* n = round(x/(ln2/N)). */
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n*ln2/N. */
+ float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
+ r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
+
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
+ uint64x2_t i = vandq_u64 (u, d->index_mask);
+
+ /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
+ float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r);
+ y = vfmaq_f64 (d->poly[0], y, r);
+ y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r);
+
+ /* s = 2^(n/N). */
+ u = v_lookup_u64 (__v_exp_tail_data, i);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ return vfmaq_f64 (s, y, s);
+}
+
+/* Approximation for vector double-precision cosh(x) using exp_inline.
+ cosh(x) = (exp(x) + exp(-x)) / 2.
+ The greatest observed error is in the scalar fall-back region, so is the
+ same as the scalar routine, 1.93 ULP:
+ _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
+ want 0x1.fdf28623ef923p+1021.
+
+ The greatest observed error in the non-special region is 1.54 ULP:
+ _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
+ want 0x1.f711dcb0c77b1p+7. */
+float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t special
+ = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound);
+
+ /* Up to the point that exp overflows, we can use it to calculate cosh by
+ exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
+ float64x2_t t = exp_inline (ax);
+ float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+ float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+
+ /* Fall back to scalar for any special cases. */
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, vaddq_f64 (half_t, half_over_t), special);
+
+ return vaddq_f64 (half_t, half_over_t);
+}
diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
new file mode 100644
index 0000000000000000..919f34604a452b4a
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -0,0 +1,105 @@
+/* Double-precision vector (SVE) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+static const struct data
+{
+ float64_t poly[3];
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ uint64_t index_mask, special_bound;
+} data = {
+ .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
+ 0x1.5555576a59599p-5, },
+
+ .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
+ /* -ln2/N. */
+ .ln2_hi = -0x1.62e42fefa39efp-9,
+ .ln2_lo = -0x1.abc9e3b39803f3p-64,
+ .shift = 0x1.8p+52,
+ .thres = 704.0,
+
+ .index_mask = 0xff,
+ /* 0x1.6p9, above which exp overflows. */
+ .special_bound = 0x4086000000000000,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
+ special-case handling or tail. */
+static inline svfloat64_t
+exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+{
+ /* Calculate exp(x). */
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
+ r = svmla_x (pg, r, n, d->ln2_lo);
+
+ svuint64_t u = svreinterpret_u64 (z);
+ svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
+ svuint64_t i = svand_x (pg, u, d->index_mask);
+
+ svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
+ y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
+ y = svmla_x (pg, sv_f64 (1.0), r, y);
+ y = svmul_x (pg, r, y);
+
+ /* s = 2^(n/N). */
+ u = svld1_gather_index (pg, __v_exp_tail_data, i);
+ svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
+
+ return svmla_x (pg, s, s, y);
+}
+
+/* Approximation for SVE double-precision cosh(x) using exp_inline.
+ cosh(x) = (exp(x) + exp(-x)) / 2.
+ The greatest observed error is in the scalar fall-back region, so is the
+ same as the scalar routine, 1.93 ULP:
+ _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
+ want 0x1.fd774e958236fp+1021.
+
+ The greatest observed error in the non-special region is 1.54 ULP:
+ _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
+ want 0x1.f5e2bb8d5c991p+8. */
+svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
+
+ /* Up to the point that exp overflows, we can use it to calculate cosh by
+ exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
+ svfloat64_t t = exp_inline (ax, pg, d);
+ svfloat64_t half_t = svmul_x (pg, t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
+
+ /* Fall back to scalar for any special cases. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+
+ return svadd_x (pg, half_t, half_over_t);
+}
diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c
new file mode 100644
index 0000000000000000..c1ab4923b826569b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/coshf_advsimd.c
@@ -0,0 +1,84 @@
+/* Single-precision vector (AdvSIMD) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_expf_inline.h"
+#include "v_math.h"
+
+static const struct data
+{
+ struct v_expf_data expf_consts;
+ uint32x4_t tiny_bound, special_bound;
+} data = {
+ .expf_consts = V_EXPF_DATA,
+ .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .special_bound = V4 (0x42ad496c),
+};
+
+#if !WANT_SIMD_EXCEPT
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (coshf, x, y, special);
+}
+#endif
+
+/* Single-precision vector cosh, using vector expf.
+ Maximum error is 2.38 ULP:
+ _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
+ want 0x1.6a4922p+4. */
+float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered correctly, fall back to the scalar
+ variant for all inputs if any input is a special value or above the bound
+ at which expf overflows. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return v_call_f32 (coshf, x, x, v_u32 (-1));
+
+ uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound);
+ /* If any input is tiny, avoid underflow exception by fixing tiny lanes of
+ input to 0, which will generate no exceptions. */
+ if (__glibc_unlikely (v_any_u32 (tiny)))
+ ax = v_zerofy_f32 (ax, tiny);
+#endif
+
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+ float32x4_t half_t = vmulq_n_f32 (t, 0.5);
+ float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
+
+#if WANT_SIMD_EXCEPT
+ if (__glibc_unlikely (v_any_u32 (tiny)))
+ return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
+#else
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, vaddq_f32 (half_t, half_over_t), special);
+#endif
+
+ return vaddq_f32 (half_t, half_over_t);
+}
+libmvec_hidden_def (V_NAME_F1 (cosh))
+HALF_WIDTH_ALIAS_F1 (cosh)
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
new file mode 100644
index 0000000000000000..e5d8a299c6aa7ceb
--- /dev/null
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -0,0 +1,59 @@
|
||||
+/* Single-precision vector (SVE) cosh function
|
||||
+
|
||||
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include "sv_math.h"
|
||||
+#include "sv_expf_inline.h"
|
||||
+
|
||||
+static const struct data
|
||||
+{
|
||||
+ struct sv_expf_data expf_consts;
|
||||
+ uint32_t special_bound;
|
||||
+} data = {
|
||||
+ .expf_consts = SV_EXPF_DATA,
|
||||
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
|
||||
+ .special_bound = 0x42ad496c,
|
||||
+};
|
||||
+
|
||||
+static svfloat32_t NOINLINE
|
||||
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
|
||||
+{
|
||||
+ return sv_call_f32 (coshf, x, y, pg);
|
||||
+}
|
||||
+
|
||||
+/* Single-precision vector cosh, using vector expf.
|
||||
+ Maximum error is 1.89 ULP:
|
||||
+ _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
|
||||
+ want 0x1.f00adcp+127. */
|
||||
+svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
|
||||
+{
|
||||
+ const struct data *d = ptr_barrier (&data);
|
||||
+
|
||||
+ svfloat32_t ax = svabs_x (pg, x);
|
||||
+ svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
|
||||
+
|
||||
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
|
||||
+ svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
|
||||
+ svfloat32_t half_t = svmul_x (pg, t, 0.5);
|
||||
+ svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
|
||||
+
|
||||
+ if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
+ return special_case (x, svadd_x (pg, half_t, half_over_t), special);
|
||||
+
|
||||
+ return svadd_x (pg, half_t, half_over_t);
|
||||
+}
|
||||
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
new file mode 100644
index 0000000000000000..23963b5f8ec89ead
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -0,0 +1,75 @@
|
||||
+/* SVE helper for single-precision routines which depend on exp
|
||||
+
|
||||
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifndef AARCH64_FPU_SV_EXPF_INLINE_H
|
||||
+#define AARCH64_FPU_SV_EXPF_INLINE_H
|
||||
+
|
||||
+#include "sv_math.h"
|
||||
+
|
||||
+struct sv_expf_data
|
||||
+{
|
||||
+ float poly[5];
|
||||
+ float inv_ln2, ln2_hi, ln2_lo, shift;
|
||||
+};
|
||||
+
|
||||
+/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
|
||||
+ compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
|
||||
+#define SV_EXPF_DATA \
|
||||
+ { \
|
||||
+ .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
|
||||
+ 0x1.0e4020p-7f }, \
|
||||
+ \
|
||||
+ .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
|
||||
+ .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
|
||||
+ }
|
||||
+
|
||||
+#define C(i) sv_f32 (d->poly[i])
|
||||
+
|
||||
+static inline svfloat32_t
|
||||
+expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
|
||||
+{
|
||||
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
+
|
||||
+ /* Load some constants in quad-word chunks to minimise memory access. */
|
||||
+ svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
|
||||
+
|
||||
+ /* n = round(x/(ln2/N)). */
|
||||
+ svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
|
||||
+ svfloat32_t n = svsub_x (pg, z, d->shift);
|
||||
+
|
||||
+ /* r = x - n*ln2/N. */
|
||||
+ svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
|
||||
+ r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
|
||||
+
|
||||
+ /* scale = 2^(n/N). */
|
||||
+ svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
|
||||
+
|
||||
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
|
||||
+ svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
|
||||
+ svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
|
||||
+ svfloat32_t r2 = svmul_f32_x (pg, r, r);
|
||||
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
|
||||
+ svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
|
||||
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
|
||||
+
|
||||
+ return svmla_x (pg, scale, scale, poly);
|
||||
+}
|
||||
+
|
||||
+#endif
|
||||
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 41fdb92d7ea6e707..b37cb7d5e9c0d96a 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
+VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf)
VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 8e3d64da420348a7..011f07d2c15b148f 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
+SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf)
SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 33ae92878f774ac3..35452991431e238a 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
+VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff)
VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index ac0464f196e7972f..bbc74ede88c9e6c8 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
+SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff)
SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
diff --git a/sysdeps/aarch64/fpu/v_exp_tail_data.c b/sysdeps/aarch64/fpu/v_exp_tail_data.c
new file mode 100644
index 0000000000000000..151e97c21bbc11ae
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_exp_tail_data.c
@@ -0,0 +1,110 @@
|
||||
+/* Lookup table for high-precision exp(x, tail) function
|
||||
+
|
||||
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include "vecmath_config.h"
|
||||
+
|
||||
+/* 2^(j/N), j=0..N, N=2^8=256. */
|
||||
+const uint64_t __v_exp_tail_data[] = {
|
||||
+ 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
|
||||
+ 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
|
||||
+ 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
|
||||
+ 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
|
||||
+ 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
|
||||
+ 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
|
||||
+ 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
|
||||
+ 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
|
||||
+ 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
|
||||
+ 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
|
||||
+ 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
|
||||
+ 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
|
||||
+ 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
|
||||
+ 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
|
||||
+ 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
|
||||
+ 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
|
||||
+ 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
|
||||
+ 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
|
||||
+ 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
|
||||
+ 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
|
||||
+ 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
|
||||
+ 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
|
||||
+ 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
|
||||
+ 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
|
||||
+ 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
|
||||
+ 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
|
||||
+ 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
|
||||
+ 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
|
||||
+ 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
|
||||
+ 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
|
||||
+ 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
|
||||
+ 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
|
||||
+ 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
|
||||
+ 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
|
||||
+ 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
|
||||
+ 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
|
||||
+ 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
|
||||
+ 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
|
||||
+ 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
|
||||
+ 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
|
||||
+ 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
|
||||
+ 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
|
||||
+ 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
|
||||
+ 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
|
||||
+ 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
|
||||
+ 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
|
||||
+ 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
|
||||
+ 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
|
||||
+ 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
|
||||
+ 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
|
||||
+ 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
|
||||
+ 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
|
||||
+ 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
|
||||
+ 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
|
||||
+ 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
|
||||
+ 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
|
||||
+ 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
|
||||
+ 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
|
||||
+ 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
|
||||
+ 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
|
||||
+ 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
|
||||
+ 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
|
||||
+ 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
|
||||
+ 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
|
||||
+ 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
|
||||
+ 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
|
||||
+ 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
|
||||
+ 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
|
||||
+ 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
|
||||
+ 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
|
||||
+ 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
|
||||
+ 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
|
||||
+ 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
|
||||
+ 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
|
||||
+ 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
|
||||
+ 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
|
||||
+ 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
|
||||
+ 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
|
||||
+ 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
|
||||
+ 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
|
||||
+ 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
|
||||
+ 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
|
||||
+ 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
|
||||
+ 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
|
||||
+ 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
|
||||
+ 0x3feff9d96b2a23d9,
|
||||
+};
|
||||
diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
new file mode 100644
index 0000000000000000..a3b0e32f9eb42021
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@@ -0,0 +1,71 @@
|
||||
+/* Helper for single-precision AdvSIMD routines which depend on exp
|
||||
+
|
||||
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifndef AARCH64_FPU_V_EXPF_INLINE_H
|
||||
+#define AARCH64_FPU_V_EXPF_INLINE_H
|
||||
+
|
||||
+#include "v_math.h"
|
||||
+
|
||||
+struct v_expf_data
|
||||
+{
|
||||
+ float32x4_t poly[5];
|
||||
+ float32x4_t shift, invln2_and_ln2;
|
||||
+};
|
||||
+
|
||||
+/* maxerr: 1.45358 +0.5 ulp. */
|
||||
+#define V_EXPF_DATA \
|
||||
+ { \
|
||||
+ .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
|
||||
+ V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
|
||||
+ .shift = V4 (0x1.8p23f), \
|
||||
+ .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
|
||||
+ }
|
||||
+
|
||||
+#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
|
||||
+#define C(i) d->poly[i]
|
||||
+
|
||||
+static inline float32x4_t
|
||||
+v_expf_inline (float32x4_t x, const struct v_expf_data *d)
|
||||
+{
|
||||
+ /* Helper routine for calculating exp(x).
|
||||
+ Copied from v_expf.c, with all special-case handling removed - the
|
||||
+ calling routine should handle special values if required. */
|
||||
+
|
||||
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
+ float32x4_t n, r, z;
|
||||
+ z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
|
||||
+ n = vsubq_f32 (z, d->shift);
|
||||
+ r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
|
||||
+ r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
|
||||
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
|
||||
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
|
||||
+
|
||||
+ /* Custom order-4 Estrin avoids building high order monomial. */
|
||||
+ float32x4_t r2 = vmulq_f32 (r, r);
|
||||
+ float32x4_t p, q, poly;
|
||||
+ p = vfmaq_f32 (C (1), C (0), r);
|
||||
+ q = vfmaq_f32 (C (3), C (2), r);
|
||||
+ q = vfmaq_f32 (q, p, r2);
|
||||
+ p = vmulq_f32 (C (4), r);
|
||||
+ poly = vfmaq_f32 (p, q, r2);
|
||||
+ return vfmaq_f32 (scale, poly, scale);
|
||||
+}
|
||||
+
|
||||
+#endif
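
In scalar terms, the reduction and reconstruction this helper performs can be sketched as below; the constants are the ones in V_EXPF_DATA, library expf stands in for the order-4 Estrin polynomial, and expf_sketch is an invented name, not patch code:

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch of v_expf_inline's scheme: x = n*ln2 + r with
   |r| <= ln2/2, so exp(x) = 2^n * exp(r).  2^n is built by sliding n
   into the exponent field; valid while 2^n stays normal.  */
static float
expf_sketch (float x)
{
  float n = nearbyintf (x * 0x1.715476p+0f); /* n = round (x / ln2) */
  float r = x - n * 0x1.62e4p-1f;            /* subtract n * ln2_hi */
  r -= n * 0x1.7f7d1cp-20f;                  /* and n * ln2_lo */
  uint32_t scale_bits = ((uint32_t) (int32_t) n << 23) + 0x3f800000u;
  float scale;                               /* scale = 2^n */
  memcpy (&scale, &scale_bits, sizeof scale);
  return scale * expf (r); /* expf (r) stands in for poly (r) */
}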
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
index 409c0c9bd9b85422..3f0b5f476433ca06 100644
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -59,6 +59,8 @@ extern const struct v_log_data
} table[1 << V_LOG_TABLE_BITS];
} __v_log_data attribute_hidden;

+#define V_EXP_TAIL_TABLE_BITS 8
+extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] attribute_hidden;
#define V_EXP_TABLE_BITS 7
extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] attribute_hidden;

diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index f1103a245645476b..48d747ad5793be96 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -701,11 +701,19 @@ double: 2
float: 2
ldouble: 2

+Function: "cosh_advsimd":
+double: 2
+float: 2
+
Function: "cosh_downward":
double: 3
float: 1
ldouble: 3

+Function: "cosh_sve":
+double: 2
+float: 2
+
Function: "cosh_towardzero":
double: 3
float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 6193518fb001cc92..f66da42c3630bf48 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -73,8 +73,13 @@ GLIBC_2.39 _ZGVsMxv_tan F
GLIBC_2.39 _ZGVsMxv_tanf F
GLIBC_2.39 _ZGVsMxvv_atan2 F
GLIBC_2.39 _ZGVsMxvv_atan2f F
+GLIBC_2.40 _ZGVnN2v_cosh F
+GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
+GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
+GLIBC_2.40 _ZGVsMxv_cosh F
+GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F
GLIBC_2.40 _ZGVsMxv_erff F
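
The evaluation scheme shared by the two new routines fits in a few scalar lines; a minimal C sketch for reference (cosh_sketch is an invented name, and the real code adds the tiny-input and overflow handling shown above):

#include <math.h>
#include <stdio.h>

/* Illustrative sketch, not patch code: cosh(x) = exp(x)/2 + exp(-x)/2,
   computed from a single t = exp(|x|) as t/2 + 0.5/t, exactly the
   half_t + half_over_t split used by the vector routines.  */
static double
cosh_sketch (double x)
{
  double t = exp (fabs (x)); /* cosh is even, so |x| suffices */
  return 0.5 * t + 0.5 / t;
}

int
main (void)
{
  printf ("sketch %a vs libm %a\n", cosh_sketch (1.5), cosh (1.5));
  return 0;
}
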
359
glibc-RHEL-118273-20.patch
Normal file
@ -0,0 +1,359 @@
commit ca0c0d0f26fbf75b9cacc65122b457e8fdec40b8
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Mon Dec 9 15:55:39 2024 +0000

AArch64: Improve codegen in users of ADVSIMD log1p helper

Add inline helper for log1p and rearrange operations so MOV
is not necessary in reduction or around the special-case handler.
Reduce memory access by using more indexed MLAs in polynomial.
Speedup on Neoverse V1 for log1p (3.5%), acosh (7.5%) and atanh (10%).

Conflicts:
sysdeps/aarch64/fpu/log1p_advsimd.c
(Fixup context to apply without out-of-scope dependency 751a5502)

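The reduction log1p_inline performs can be sketched in scalar C with the same bit patterns the helper uses; the degree-19 polynomial is stood in by libm's log1p, and log1p_sketch is an invented name, not patch code:

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch: 1 + x = 2^k * (1 + f) with f in
   [sqrt(2)/2 - 1, sqrt(2) - 1], plus a correction c/m for the error of
   rounding 1 + x to m.  Valid for finite x > -1.  */
static double
log1p_sketch (double x)
{
  double m = x + 1.0;
  uint64_t mi, u;
  memcpy (&mi, &m, sizeof mi);
  /* Bias the exponent so the reduced value lands in [sqrt(2)/2, sqrt(2)).  */
  u = mi + 0x00095f6200000000ULL;
  int64_t k = (int64_t) (u >> 52) - 0x3ff;
  uint64_t u_red = (u & 0x000fffff00000000ULL) + 0x3fe6a09e00000000ULL;
  u_red |= mi & 0xffffffffULL;
  double f;
  memcpy (&f, &u_red, sizeof f);
  f -= 1.0;
  double cm = (x - (m - 1.0)) / m; /* correction term c/m */
  /* k*log(2) + log1p(f) + c/m; libm log1p stands in for the polynomial.  */
  return (double) k * 0x1.62e42fefa39efp-1 + log1p (f) + cm;
}
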
diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c
|
||||
index c88283cf1191f4eb..a98f4a2e4d8cbf42 100644
|
||||
--- a/sysdeps/aarch64/fpu/acosh_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/acosh_advsimd.c
|
||||
@@ -54,9 +54,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
|
||||
x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
|
||||
#endif
|
||||
|
||||
- float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
|
||||
- float64x2_t y;
|
||||
- y = vaddq_f64 (x, v_f64 (1));
|
||||
+ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
|
||||
+ float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
|
||||
y = vmulq_f64 (y, xm1);
|
||||
y = vsqrtq_f64 (y);
|
||||
y = vaddq_f64 (xm1, y);
|
||||
diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c
|
||||
index 3c3d0bd6ad41396d..eb9769aeac29cf15 100644
|
||||
--- a/sysdeps/aarch64/fpu/atanh_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/atanh_advsimd.c
|
||||
@@ -23,15 +23,19 @@
|
||||
const static struct data
|
||||
{
|
||||
struct v_log1p_data log1p_consts;
|
||||
- uint64x2_t one, half;
|
||||
+ uint64x2_t one;
|
||||
+ uint64x2_t sign_mask;
|
||||
} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
|
||||
.one = V2 (0x3ff0000000000000),
|
||||
- .half = V2 (0x3fe0000000000000) };
|
||||
+ .sign_mask = V2 (0x8000000000000000) };
|
||||
|
||||
static float64x2_t VPCS_ATTR NOINLINE
|
||||
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
|
||||
+special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
|
||||
+ uint64x2_t special, const struct data *d)
|
||||
{
|
||||
- return v_call_f64 (atanh, x, y, special);
|
||||
+ y = log1p_inline (y, &d->log1p_consts);
|
||||
+ return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
|
||||
+ vmulq_f64 (halfsign, y), special);
|
||||
}
|
||||
|
||||
/* Approximation for vector double-precision atanh(x) using modified log1p.
|
||||
@@ -43,11 +47,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
+ float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
|
||||
float64x2_t ax = vabsq_f64 (x);
|
||||
uint64x2_t ia = vreinterpretq_u64_f64 (ax);
|
||||
- uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
|
||||
uint64x2_t special = vcgeq_u64 (ia, d->one);
|
||||
- float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
|
||||
|
||||
#if WANT_SIMD_EXCEPT
|
||||
ax = v_zerofy_f64 (ax, special);
|
||||
@@ -55,10 +58,15 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
|
||||
|
||||
float64x2_t y;
|
||||
y = vaddq_f64 (ax, ax);
|
||||
- y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
|
||||
- y = log1p_inline (y, &d->log1p_consts);
|
||||
+ y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
|
||||
|
||||
if (__glibc_unlikely (v_any_u64 (special)))
|
||||
- return special_case (x, vmulq_f64 (y, halfsign), special);
|
||||
+#if WANT_SIMD_EXCEPT
|
||||
+ return special_case (x, halfsign, y, special, d);
|
||||
+#else
|
||||
+ return special_case (ax, halfsign, y, special, d);
|
||||
+#endif
|
||||
+
|
||||
+ y = log1p_inline (y, &d->log1p_consts);
|
||||
return vmulq_f64 (y, halfsign);
|
||||
}
|
||||
diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c
|
||||
index ffc418fc9c24be28..9d18578ce6497787 100644
|
||||
--- a/sysdeps/aarch64/fpu/log1p_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/log1p_advsimd.c
|
||||
@@ -17,43 +17,26 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
-#include "v_math.h"
|
||||
-#include "poly_advsimd_f64.h"
|
||||
+#define WANT_V_LOG1P_K0_SHORTCUT 0
|
||||
+#include "v_log1p_inline.h"
|
||||
|
||||
const static struct data
|
||||
{
|
||||
- float64x2_t poly[19], ln2[2];
|
||||
- uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one;
|
||||
- int64x2_t one_top;
|
||||
-} data = {
|
||||
- /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
|
||||
- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2),
|
||||
- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3),
|
||||
- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3),
|
||||
- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4),
|
||||
- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4),
|
||||
- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4),
|
||||
- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4),
|
||||
- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5),
|
||||
- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4),
|
||||
- V2 (-0x1.cfa7385bdb37ep-6) },
|
||||
- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) },
|
||||
- /* top32(asuint64(sqrt(2)/2)) << 32. */
|
||||
- .hf_rt2_top = V2 (0x3fe6a09e00000000),
|
||||
- /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
|
||||
- .one_m_hf_rt2_top = V2 (0x00095f6200000000),
|
||||
- .umask = V2 (0x000fffff00000000),
|
||||
- .one_top = V2 (0x3ff),
|
||||
- .inf = V2 (0x7ff0000000000000),
|
||||
- .minus_one = V2 (0xbff0000000000000)
|
||||
-};
|
||||
+ struct v_log1p_data d;
|
||||
+ uint64x2_t inf, minus_one;
|
||||
+} data = { .d = V_LOG1P_CONSTANTS_TABLE,
|
||||
+ .inf = V2 (0x7ff0000000000000),
|
||||
+ .minus_one = V2 (0xbff0000000000000) };
|
||||
|
||||
#define BottomMask v_u64 (0xffffffff)
|
||||
|
||||
-static float64x2_t VPCS_ATTR NOINLINE
|
||||
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
|
||||
+static float64x2_t NOINLINE VPCS_ATTR
|
||||
+special_case (float64x2_t x, uint64x2_t cmp, const struct data *d)
|
||||
{
|
||||
- return v_call_f64 (log1p, x, y, special);
|
||||
+ /* Side-step special lanes so fenv exceptions are not triggered
|
||||
+ inadvertently. */
|
||||
+ float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
|
||||
+ return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
|
||||
}
|
||||
|
||||
/* Vector log1p approximation using polynomial on reduced interval. Routine is
|
||||
@@ -66,64 +49,12 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
uint64x2_t ix = vreinterpretq_u64_f64 (x);
|
||||
uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
|
||||
- uint64x2_t special = vcgeq_u64 (ia, d->inf);
|
||||
|
||||
-#if WANT_SIMD_EXCEPT
|
||||
- special = vorrq_u64 (special,
|
||||
- vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1))));
|
||||
- if (__glibc_unlikely (v_any_u64 (special)))
|
||||
- x = v_zerofy_f64 (x, special);
|
||||
-#else
|
||||
- special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1)));
|
||||
-#endif
|
||||
+ uint64x2_t special_cases
|
||||
+ = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one));
|
||||
|
||||
- /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
|
||||
- is in [sqrt(2)/2, sqrt(2)]):
|
||||
- log1p(x) = k*log(2) + log1p(f).
|
||||
+ if (__glibc_unlikely (v_any_u64 (special_cases)))
|
||||
+ return special_case (x, special_cases, d);
|
||||
|
||||
- f may not be representable exactly, so we need a correction term:
|
||||
- let m = round(1 + x), c = (1 + x) - m.
|
||||
- c << m: at very small x, log1p(x) ~ x, hence:
|
||||
- log(1+x) - log(m) ~ c/m.
|
||||
-
|
||||
- We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
|
||||
-
|
||||
- /* Obtain correctly scaled k by manipulation in the exponent.
|
||||
- The scalar algorithm casts down to 32-bit at this point to calculate k and
|
||||
- u_red. We stay in double-width to obtain f and k, using the same constants
|
||||
- as the scalar algorithm but shifted left by 32. */
|
||||
- float64x2_t m = vaddq_f64 (x, v_f64 (1));
|
||||
- uint64x2_t mi = vreinterpretq_u64_f64 (m);
|
||||
- uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
|
||||
-
|
||||
- int64x2_t ki
|
||||
- = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
|
||||
- float64x2_t k = vcvtq_f64_s64 (ki);
|
||||
-
|
||||
- /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
|
||||
- uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
|
||||
- uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
|
||||
- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
|
||||
-
|
||||
- /* Correction term c/m. */
|
||||
- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
|
||||
-
|
||||
- /* Approximate log1p(x) on the reduced input using a polynomial. Because
|
||||
- log1p(0)=0 we choose an approximation of the form:
|
||||
- x + C0*x^2 + C1*x^3 + C2x^4 + ...
|
||||
- Hence approximation has the form f + f^2 * P(f)
|
||||
- where P(x) = C0 + C1*x + C2x^2 + ...
|
||||
- Assembling this all correctly is dealt with at the final step. */
|
||||
- float64x2_t f2 = vmulq_f64 (f, f);
|
||||
- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
|
||||
-
|
||||
- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
|
||||
- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
|
||||
- float64x2_t y = vaddq_f64 (ylo, yhi);
|
||||
-
|
||||
- if (__glibc_unlikely (v_any_u64 (special)))
|
||||
- return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p),
|
||||
- special);
|
||||
-
|
||||
- return vfmaq_f64 (y, f2, p);
|
||||
+ return log1p_inline (x, &d->d);
|
||||
}
|
||||
diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h
|
||||
index 242e43b6eecc0b6e..834ff65adf34ed4a 100644
|
||||
--- a/sysdeps/aarch64/fpu/v_log1p_inline.h
|
||||
+++ b/sysdeps/aarch64/fpu/v_log1p_inline.h
|
||||
@@ -21,29 +21,30 @@
|
||||
#define AARCH64_FPU_V_LOG1P_INLINE_H
|
||||
|
||||
#include "v_math.h"
|
||||
-#include "poly_advsimd_f64.h"
|
||||
|
||||
struct v_log1p_data
|
||||
{
|
||||
- float64x2_t poly[19], ln2[2];
|
||||
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
|
||||
uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
|
||||
int64x2_t one_top;
|
||||
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
|
||||
+ double ln2[2];
|
||||
};
|
||||
|
||||
/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
|
||||
#define V_LOG1P_CONSTANTS_TABLE \
|
||||
{ \
|
||||
- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \
|
||||
- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \
|
||||
- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \
|
||||
- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \
|
||||
- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \
|
||||
- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \
|
||||
- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \
|
||||
- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \
|
||||
- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \
|
||||
- V2 (-0x1.cfa7385bdb37ep-6) }, \
|
||||
- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \
|
||||
+ .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \
|
||||
+ .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \
|
||||
+ .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \
|
||||
+ .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \
|
||||
+ .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \
|
||||
+ .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \
|
||||
+ .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \
|
||||
+ .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \
|
||||
+ .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \
|
||||
+ .c18 = -0x1.cfa7385bdb37ep-6, \
|
||||
+ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \
|
||||
.hf_rt2_top = V2 (0x3fe6a09e00000000), \
|
||||
.one_m_hf_rt2_top = V2 (0x00095f6200000000), \
|
||||
.umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
|
||||
@@ -51,19 +52,45 @@ struct v_log1p_data
|
||||
|
||||
#define BottomMask v_u64 (0xffffffff)
|
||||
|
||||
+static inline float64x2_t
|
||||
+eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d)
|
||||
+{
|
||||
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
|
||||
+ float64x2_t c13 = vld1q_f64 (&d->c1);
|
||||
+ float64x2_t c57 = vld1q_f64 (&d->c5);
|
||||
+ float64x2_t c911 = vld1q_f64 (&d->c9);
|
||||
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
|
||||
+ float64x2_t c1718 = vld1q_f64 (&d->c17);
|
||||
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0);
|
||||
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1);
|
||||
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0);
|
||||
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1);
|
||||
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0);
|
||||
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1);
|
||||
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0);
|
||||
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1);
|
||||
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0);
|
||||
+ float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1);
|
||||
+ p = vfmaq_f64 (p1415, m2, p);
|
||||
+ p = vfmaq_f64 (p1213, m2, p);
|
||||
+ p = vfmaq_f64 (p1011, m2, p);
|
||||
+ p = vfmaq_f64 (p89, m2, p);
|
||||
+ p = vfmaq_f64 (p67, m2, p);
|
||||
+ p = vfmaq_f64 (p45, m2, p);
|
||||
+ p = vfmaq_f64 (p23, m2, p);
|
||||
+ return vfmaq_f64 (p01, m2, p);
|
||||
+}
|
||||
+
|
||||
static inline float64x2_t
|
||||
log1p_inline (float64x2_t x, const struct v_log1p_data *d)
|
||||
{
|
||||
- /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
|
||||
- modifications:
|
||||
+ /* Helper for calculating log(x + 1):
|
||||
- No special-case handling - this should be dealt with by the caller.
|
||||
- - Pairwise Horner polynomial evaluation for improved accuracy.
|
||||
- Optionally simulate the shortcut for k=0, used in the scalar routine,
|
||||
- using v_sel, for improved accuracy when the argument to log1p is close to
|
||||
- 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
|
||||
- the source of the caller before including this file.
|
||||
- See v_log1pf_2u1.c for details of the algorithm. */
|
||||
- float64x2_t m = vaddq_f64 (x, v_f64 (1));
|
||||
+ using v_sel, for improved accuracy when the argument to log1p is close
|
||||
+ to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1
|
||||
+ in the source of the caller before including this file. */
|
||||
+ float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
|
||||
uint64x2_t mi = vreinterpretq_u64_f64 (m);
|
||||
uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
|
||||
|
||||
@@ -74,14 +101,14 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d)
|
||||
/* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
|
||||
uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
|
||||
uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
|
||||
- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
|
||||
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
|
||||
|
||||
/* Correction term c/m. */
|
||||
- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
|
||||
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
|
||||
|
||||
#ifndef WANT_V_LOG1P_K0_SHORTCUT
|
||||
-#error \
|
||||
- "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
|
||||
+# error \
|
||||
+ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
|
||||
#elif WANT_V_LOG1P_K0_SHORTCUT
|
||||
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
|
||||
that the approximation is solely the polynomial. */
|
||||
@@ -92,11 +119,12 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d)
|
||||
|
||||
/* Approximate log1p(f) on the reduced input using a polynomial. */
|
||||
float64x2_t f2 = vmulq_f64 (f, f);
|
||||
- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
|
||||
+ float64x2_t p = eval_poly (f, f2, d);
|
||||
|
||||
/* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
|
||||
- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
|
||||
- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
|
||||
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
|
||||
+ float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1);
|
||||
+ float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0);
|
||||
return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
|
||||
}
|
216
glibc-RHEL-118273-21.patch
Normal file
@ -0,0 +1,216 @@
commit 569cfaaf4984ae70b23c61ee28a609b5aef93fea
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Mon Dec 9 15:53:04 2024 +0000

AArch64: Improve codegen in AdvSIMD pow

Remove spurious ADRP. Improve memory access by shuffling constants and
using more indexed MLAs.

A few more optimisations with no impact on accuracy:
- force fma contraction
- switch from shift-aided rint to the rint instruction

Between 1 and 5% throughput improvement on Neoverse
V1 depending on benchmark.

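For context, the two round-to-nearest idioms the message contrasts, written as hypothetical stand-alone helpers (the patch uses vrndnq_f64 and vcvtaq_s64_f64 directly):

#include <arm_neon.h>

/* Old idiom: adding and subtracting 0x1.8p52 forces the fraction out of
   the significand, rounding to nearest (ties to even) as a side effect.
   Only valid while |z| stays well below 2^51.  */
static inline float64x2_t
round_via_shift (float64x2_t z)
{
  float64x2_t shift = vdupq_n_f64 (0x1.8p52);
  return vsubq_f64 (vaddq_f64 (z, shift), shift);
}

/* New idiom: dedicated instructions.  frintn rounds to nearest, ties to
   even; fcvtas also yields the rounded value directly as an integer.  */
static inline float64x2_t
round_via_insn (float64x2_t z, int64x2_t *ki)
{
  *ki = vcvtaq_s64_f64 (z); /* fcvtas (ties away from zero) */
  return vrndnq_f64 (z);    /* frintn */
}
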
diff --git a/sysdeps/aarch64/fpu/pow_advsimd.c b/sysdeps/aarch64/fpu/pow_advsimd.c
|
||||
index 3c91e3e183599e3e..81e134ac2f0bd2f5 100644
|
||||
--- a/sysdeps/aarch64/fpu/pow_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/pow_advsimd.c
|
||||
@@ -22,9 +22,6 @@
|
||||
/* Defines parameters of the approximation and scalar fallback. */
|
||||
#include "finite_pow.h"
|
||||
|
||||
-#define VecSmallExp v_u64 (SmallExp)
|
||||
-#define VecThresExp v_u64 (ThresExp)
|
||||
-
|
||||
#define VecSmallPowX v_u64 (SmallPowX)
|
||||
#define VecThresPowX v_u64 (ThresPowX)
|
||||
#define VecSmallPowY v_u64 (SmallPowY)
|
||||
@@ -32,36 +29,48 @@
|
||||
|
||||
static const struct data
|
||||
{
|
||||
- float64x2_t log_poly[6];
|
||||
- float64x2_t exp_poly[3];
|
||||
- float64x2_t ln2_hi, ln2_lo;
|
||||
- float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n, small_powx;
|
||||
uint64x2_t inf;
|
||||
+ float64x2_t small_powx;
|
||||
+ uint64x2_t offset, mask;
|
||||
+ uint64x2_t mask_sub_0, mask_sub_1;
|
||||
+ float64x2_t log_c0, log_c2, log_c4, log_c5;
|
||||
+ double log_c1, log_c3;
|
||||
+ double ln2_lo, ln2_hi;
|
||||
+ uint64x2_t small_exp, thres_exp;
|
||||
+ double ln2_lo_n, ln2_hi_n;
|
||||
+ double inv_ln2_n, exp_c2;
|
||||
+ float64x2_t exp_c0, exp_c1;
|
||||
} data = {
|
||||
+ /* Power threshold. */
|
||||
+ .inf = V2 (0x7ff0000000000000),
|
||||
+ .small_powx = V2 (0x1p-126),
|
||||
+ .offset = V2 (Off),
|
||||
+ .mask = V2 (0xfffULL << 52),
|
||||
+ .mask_sub_0 = V2 (1ULL << 52),
|
||||
+ .mask_sub_1 = V2 (52ULL << 52),
|
||||
/* Coefficients copied from v_pow_log_data.c
|
||||
relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
|
||||
Coefficients are scaled to match the scaling during evaluation. */
|
||||
- .log_poly
|
||||
- = { V2 (0x1.555555555556p-2 * -2), V2 (-0x1.0000000000006p-2 * -2),
|
||||
- V2 (0x1.999999959554ep-3 * 4), V2 (-0x1.555555529a47ap-3 * 4),
|
||||
- V2 (0x1.2495b9b4845e9p-3 * -8), V2 (-0x1.0002b8b263fc3p-3 * -8) },
|
||||
- .ln2_hi = V2 (0x1.62e42fefa3800p-1),
|
||||
- .ln2_lo = V2 (0x1.ef35793c76730p-45),
|
||||
+ .log_c0 = V2 (0x1.555555555556p-2 * -2),
|
||||
+ .log_c1 = -0x1.0000000000006p-2 * -2,
|
||||
+ .log_c2 = V2 (0x1.999999959554ep-3 * 4),
|
||||
+ .log_c3 = -0x1.555555529a47ap-3 * 4,
|
||||
+ .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8),
|
||||
+ .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8),
|
||||
+ .ln2_hi = 0x1.62e42fefa3800p-1,
|
||||
+ .ln2_lo = 0x1.ef35793c76730p-45,
|
||||
/* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
|
||||
(0.550 without fma) if |x| < ln2/512. */
|
||||
- .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3),
|
||||
- V2 (0x1.5555576a5adcep-5) },
|
||||
- .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */
|
||||
- .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */
|
||||
- .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */
|
||||
- .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45),
|
||||
- .small_powx = V2 (0x1p-126),
|
||||
- .inf = V2 (0x7ff0000000000000)
|
||||
+ .exp_c0 = V2 (0x1.fffffffffffd4p-2),
|
||||
+ .exp_c1 = V2 (0x1.5555571d6ef9p-3),
|
||||
+ .exp_c2 = 0x1.5555576a5adcep-5,
|
||||
+ .small_exp = V2 (0x3c90000000000000),
|
||||
+ .thres_exp = V2 (0x03f0000000000000),
|
||||
+ .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */
|
||||
+ .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */
|
||||
+ .ln2_lo_n = -0x1.c610ca86c3899p-45,
|
||||
};
|
||||
|
||||
-#define A(i) data.log_poly[i]
|
||||
-#define C(i) data.exp_poly[i]
|
||||
-
|
||||
/* This version implements an algorithm close to scalar pow but
|
||||
- does not implement the trick in the exp's specialcase subroutine to avoid
|
||||
double-rounding,
|
||||
@@ -91,10 +100,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|
||||
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
|
||||
The range is split into N subintervals.
|
||||
The ith subinterval contains z and c is near its center. */
|
||||
- uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off));
|
||||
- int64x2_t k
|
||||
- = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
|
||||
- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52)));
|
||||
+ uint64x2_t tmp = vsubq_u64 (ix, d->offset);
|
||||
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
|
||||
+ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask));
|
||||
float64x2_t z = vreinterpretq_f64_u64 (iz);
|
||||
float64x2_t kd = vcvtq_f64_s64 (k);
|
||||
/* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
|
||||
@@ -105,9 +113,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|
||||
|z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */
|
||||
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
|
||||
/* k*Ln2 + log(c) + r. */
|
||||
- float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi);
|
||||
+ float64x2_t ln2 = vld1q_f64 (&d->ln2_lo);
|
||||
+ float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1);
|
||||
float64x2_t t2 = vaddq_f64 (t1, r);
|
||||
- float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo);
|
||||
+ float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0);
|
||||
float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
|
||||
/* Evaluation is optimized assuming superscalar pipelined execution. */
|
||||
float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r);
|
||||
@@ -118,9 +127,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|
||||
float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
|
||||
float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
|
||||
/* p = log1p(r) - r - A[0]*r*r. */
|
||||
- float64x2_t a56 = vfmaq_f64 (A (4), r, A (5));
|
||||
- float64x2_t a34 = vfmaq_f64 (A (2), r, A (3));
|
||||
- float64x2_t a12 = vfmaq_f64 (A (0), r, A (1));
|
||||
+ float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1);
|
||||
+ float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5);
|
||||
+ float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1);
|
||||
+ float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0);
|
||||
float64x2_t p = vfmaq_f64 (a34, ar2, a56);
|
||||
p = vfmaq_f64 (a12, ar2, p);
|
||||
p = vmulq_f64 (ar3, p);
|
||||
@@ -140,28 +150,28 @@ exp_special_case (float64x2_t x, float64x2_t xtail)
|
||||
|
||||
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
|
||||
static inline float64x2_t
|
||||
-v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
|
||||
+v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d)
|
||||
{
|
||||
/* Fallback to scalar exp_inline for all lanes if any lane
|
||||
contains value of x s.t. |x| <= 2^-54 or >= 512. */
|
||||
- uint64x2_t abstop
|
||||
- = vshrq_n_u64 (vandq_u64 (vreinterpretq_u64_f64 (x), d->inf), 52);
|
||||
- uint64x2_t uoflowx
|
||||
- = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp);
|
||||
+ uint64x2_t uoflowx = vcgeq_u64 (
|
||||
+ vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp),
|
||||
+ d->thres_exp);
|
||||
if (__glibc_unlikely (v_any_u64 (uoflowx)))
|
||||
- return exp_special_case (x, xtail);
|
||||
+ return exp_special_case (x, vnegq_f64 (neg_xtail));
|
||||
|
||||
/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
|
||||
/* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
|
||||
- float64x2_t z = vmulq_f64 (d->inv_ln2_n, x);
|
||||
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
|
||||
- float64x2_t kd = vaddq_f64 (z, d->shift);
|
||||
- uint64x2_t ki = vreinterpretq_u64_f64 (kd);
|
||||
- kd = vsubq_f64 (kd, d->shift);
|
||||
- float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n);
|
||||
- r = vfmsq_f64 (r, kd, d->ln2_lo_n);
|
||||
+ float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n);
|
||||
+ float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0);
|
||||
+ float64x2_t kd = vrndnq_f64 (z);
|
||||
+ uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z));
|
||||
+ float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n);
|
||||
+ float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1);
|
||||
+ r = vfmsq_laneq_f64 (r, kd, ln2_n, 0);
|
||||
/* The code assumes 2^-200 < |xtail| < 2^-8/N. */
|
||||
- r = vaddq_f64 (r, xtail);
|
||||
+ r = vsubq_f64 (r, neg_xtail);
|
||||
/* 2^(k/N) ~= scale. */
|
||||
uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
|
||||
uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
|
||||
@@ -170,8 +180,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
|
||||
sbits = vaddq_u64 (sbits, top);
|
||||
/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
- float64x2_t tmp = vfmaq_f64 (C (1), r, C (2));
|
||||
- tmp = vfmaq_f64 (C (0), r, tmp);
|
||||
+ float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1);
|
||||
+ tmp = vfmaq_f64 (d->exp_c0, r, tmp);
|
||||
tmp = vfmaq_f64 (r, r2, tmp);
|
||||
float64x2_t scale = vreinterpretq_f64_u64 (sbits);
|
||||
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
|
||||
@@ -230,8 +240,8 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
|
||||
{
|
||||
/* Normalize subnormal x so exponent becomes negative. */
|
||||
uint64x2_t vix_norm = vreinterpretq_u64_f64 (
|
||||
- vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (v_u64 (1ULL << 52)))));
|
||||
- vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52));
|
||||
+ vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0))));
|
||||
+ vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1);
|
||||
vix = vbslq_u64 (sub_x, vix_norm, vix);
|
||||
}
|
||||
}
|
||||
@@ -242,8 +252,7 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
|
||||
|
||||
/* Vector Exp(y_loghi, y_loglo). */
|
||||
float64x2_t vehi = vmulq_f64 (y, vhi);
|
||||
- float64x2_t velo = vmulq_f64 (y, vlo);
|
||||
float64x2_t vemi = vfmsq_f64 (vehi, y, vhi);
|
||||
- velo = vsubq_f64 (velo, vemi);
|
||||
- return v_exp_inline (vehi, velo, d);
|
||||
+ float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo);
|
||||
+ return v_exp_inline (vehi, neg_velo, d);
|
||||
}
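
In scalar terms, the extended-precision exp argument assembled at the end of the routine looks like the following sketch (pow_exp_arg_tail is an invented helper; the split log comes from v_log_inline):

#include <math.h>

/* Illustrative sketch, not patch code: log(x) arrives split as hi + lo,
   and y * (hi + lo) is carried as ehi plus a small tail so exp sees
   extra precision.  The patch keeps the tail negated (neg_velo) so the
   negation folds into FMS instructions.  */
static double
pow_exp_arg_tail (double y, double hi, double lo, double *ehi)
{
  *ehi = y * hi;
  double err = fma (y, hi, -*ehi); /* exact rounding error of y * hi */
  return err + y * lo;             /* tail; the patch carries its negation */
}
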
501
glibc-RHEL-118273-22.patch
Normal file
@ -0,0 +1,501 @@
commit cff9648d0b50d19cdaf685f6767add040d4e1a8e
Author: Joana Cruz <Joana.Cruz@arm.com>
Date: Tue Dec 17 14:50:33 2024 +0000

AArch64: Improve codegen of AdvSIMD expf family

Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
Also use intrinsics instead of native operations.
expf: 3% improvement in throughput microbenchmark on Neoverse V1, exp2f: 5%,
exp10f: 13%, coshf: 14%.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

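The packing pattern the message describes — scalar coefficients stored contiguously so a single vld1q feeds several lanewise FMLAs — reduces to a small sketch; vld1q_f32 and vfmaq_laneq_f32 are the real ACLE intrinsics the patch uses, while the struct and helper names here are invented:

#include <arm_neon.h>

/* Four scalars laid out contiguously load as one q-register; each
   vfmaq_laneq_f32 then multiplies by one lane with no dup instruction.  */
struct packed_consts
{
  float c2, c4, ln2_hi, ln2_lo; /* any four scalars the kernel needs */
};

static inline float32x4_t
poly_with_lanes (float32x4_t r, float32x4_t c1, float32x4_t c3,
                 const struct packed_consts *d)
{
  float32x4_t lanes = vld1q_f32 (&d->c2);              /* one load */
  float32x4_t p12 = vfmaq_laneq_f32 (c1, r, lanes, 0); /* c1 + r*c2 */
  float32x4_t p34 = vfmaq_laneq_f32 (c3, r, lanes, 1); /* c3 + r*c4 */
  float32x4_t r2 = vmulq_f32 (r, r);
  return vfmaq_f32 (p12, r2, p34);
}
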
diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c
|
||||
index c1ab4923b826569b..cd5c86652129ea9c 100644
|
||||
--- a/sysdeps/aarch64/fpu/coshf_advsimd.c
|
||||
+++ b/sysdeps/aarch64/fpu/coshf_advsimd.c
|
||||
@@ -23,19 +23,27 @@
|
||||
static const struct data
|
||||
{
|
||||
struct v_expf_data expf_consts;
|
||||
- uint32x4_t tiny_bound, special_bound;
|
||||
+ uint32x4_t tiny_bound;
|
||||
+ float32x4_t bound;
|
||||
+#if WANT_SIMD_EXCEPT
|
||||
+ uint32x4_t special_bound;
|
||||
+#endif
|
||||
} data = {
|
||||
.expf_consts = V_EXPF_DATA,
|
||||
.tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
|
||||
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
|
||||
+ .bound = V4 (0x1.5a92d8p+6),
|
||||
+#if WANT_SIMD_EXCEPT
|
||||
.special_bound = V4 (0x42ad496c),
|
||||
+#endif
|
||||
};
|
||||
|
||||
#if !WANT_SIMD_EXCEPT
|
||||
static float32x4_t NOINLINE VPCS_ATTR
|
||||
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
||||
+special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
|
||||
+ uint32x4_t special)
|
||||
{
|
||||
- return v_call_f32 (coshf, x, y, special);
|
||||
+ return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -47,14 +55,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
- float32x4_t ax = vabsq_f32 (x);
|
||||
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||
- uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
|
||||
-
|
||||
#if WANT_SIMD_EXCEPT
|
||||
/* If fp exceptions are to be triggered correctly, fall back to the scalar
|
||||
variant for all inputs if any input is a special value or above the bound
|
||||
at which expf overflows. */
|
||||
+ float32x4_t ax = vabsq_f32 (x);
|
||||
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
|
||||
if (__glibc_unlikely (v_any_u32 (special)))
|
||||
return v_call_f32 (coshf, x, x, v_u32 (-1));
|
||||
|
||||
@@ -63,10 +70,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
|
||||
input to 0, which will generate no exceptions. */
|
||||
if (__glibc_unlikely (v_any_u32 (tiny)))
|
||||
ax = v_zerofy_f32 (ax, tiny);
|
||||
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
|
||||
+#else
|
||||
+ uint32x4_t special = vcageq_f32 (x, d->bound);
|
||||
+ float32x4_t t = v_expf_inline (x, &d->expf_consts);
|
||||
#endif
|
||||
|
||||
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
- float32x4_t t = v_expf_inline (ax, &d->expf_consts);
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);

@@ -75,7 +85,7 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
#else
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vaddq_f32 (half_t, half_over_t), special);
+ return special_case (x, half_t, half_over_t, special);
#endif

return vaddq_f32 (half_t, half_over_t);
diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
index cf53e73290fcedb6..55d9cd83f2968ab9 100644
--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
@@ -18,16 +18,15 @@
<https://www.gnu.org/licenses/>. */

#include "v_math.h"
-#include "poly_advsimd_f32.h"

#define ScaleBound 192.0f

static const struct data
{
- float32x4_t poly[5];
- float log10_2_and_inv[4];
- float32x4_t shift;
-
+ float32x4_t c0, c1, c3;
+ float log10_2_high, log10_2_low, c2, c4;
+ float32x4_t inv_log10_2, special_bound;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh;
#endif
@@ -37,19 +36,24 @@ static const struct data
rel error: 0x1.89dafa3p-24
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
maxerr: 1.85943 +0.5 ulp. */
- .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
- V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
- .shift = V4 (0x1.8p23f),
-
- /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
- .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
+ .c0 = V4 (0x1.26bb16p+1f),
+ .c1 = V4 (0x1.5350d2p+1f),
+ .c2 = 0x1.04744ap+1f,
+ .c3 = V4 (0x1.2d8176p+0f),
+ .c4 = 0x1.12b41ap-1f,
+ .inv_log10_2 = V4 (0x1.a934fp+1),
+ .log10_2_high = 0x1.344136p-2,
+ .log10_2_low = 0x1.ec10cp-27,
+ /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+ .special_bound = V4 (126.0f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.scale_thresh = V4 (ScaleBound)
#endif
};

-#define ExponentBias v_u32 (0x3f800000)
-
#if WANT_SIMD_EXCEPT

# define SpecialBound 38.0f /* rint(log10(2^127)). */
@@ -67,17 +71,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)

#else

-# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
+# define SpecialBound 126.0f

static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@@ -112,23 +114,23 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
- float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv);
- float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0);
- float32x4_t n = vsubq_f32 (z, d->shift);
- float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1);
- r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2);
- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
+ r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);

- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));

#if !WANT_SIMD_EXCEPT
- uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif

float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t poly
- = vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
- v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);
+ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
+ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
+ float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
+ float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);

if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c
index 69e0b193a1a91249..a4220da63c624490 100644
--- a/sysdeps/aarch64/fpu/exp2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c
@@ -21,24 +21,28 @@

static const struct data
{
- float32x4_t poly[5];
- uint32x4_t exponent_bias;
+ float32x4_t c1, c3;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
- float32x4_t special_bound, scale_thresh;
+ float32x4_t scale_thresh, special_bound;
#endif
+ float c0, c2, c4, zero;
} data = {
/* maxerr: 1.962 ulp. */
- .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
- V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
+ .c0 = 0x1.59977ap-10f,
+ .c1 = V4 (0x1.3ce9e4p-7f),
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = V4 (0x1.ebf9bcp-3f),
+ .c4 = 0x1.62e422p-1f,
.exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
#endif
};

-#define C(i) d->poly[i]
-
#if WANT_SIMD_EXCEPT

# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
@@ -55,16 +59,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)

#else

-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@@ -80,13 +81,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly;
- uint32x4_t cmp, e;

#if WANT_SIMD_EXCEPT
/* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
- cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
float32x4_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
special_case to fix special lanes later. This is only necessary if fenv
@@ -95,23 +94,24 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
x = vbslq_f32 (cmp, v_f32 (1), x);
#endif

- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
- n = vrndaq_f32 (x);
- r = vsubq_f32 (x, n);
- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = vsubq_f32 (x, n);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));

#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif

- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t c024 = vld1q_f32 (&d->c0);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_laneq_f32 (r, c024, 2);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);

if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
index 5c9cb726205ece6e..70f137e2e5b46207 100644
--- a/sysdeps/aarch64/fpu/expf_advsimd.c
+++ b/sysdeps/aarch64/fpu/expf_advsimd.c
@@ -21,20 +21,25 @@

static const struct data
{
- float32x4_t poly[5];
- float32x4_t inv_ln2, ln2_hi, ln2_lo;
- uint32x4_t exponent_bias;
+ float32x4_t c1, c3, c4, inv_ln2;
+ float ln2_hi, ln2_lo, c0, c2;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
#endif
} data = {
/* maxerr: 1.45358 +0.5 ulp. */
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
+ .c0 = 0x1.0e4020p-7f,
+ .c1 = V4 (0x1.573e2ep-5f),
+ .c2 = 0x1.555e66p-3f,
+ .c3 = V4 (0x1.fffdb6p-2f),
+ .c4 = V4 (0x1.ffffecp-1f),
.inv_ln2 = V4 (0x1.715476p+0f),
- .ln2_hi = V4 (0x1.62e4p-1f),
- .ln2_lo = V4 (0x1.7f7d1cp-20f),
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
.exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
@@ -59,19 +64,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)

#else

-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
+ // (s2 + p*s2)*s1 = s2(p+1)s1
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
/* Similar to r1 but avoids double rounding in the subnormal range. */
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
@@ -84,12 +87,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly;
- uint32x4_t cmp, e;
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);

#if WANT_SIMD_EXCEPT
/* asuint(x) - TinyBound >= BigBound - TinyBound. */
- cmp = vcgeq_u32 (
+ uint32x4_t cmp = vcgeq_u32 (
vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
TinyBound),
SpecialBound);
@@ -103,22 +105,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)

/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
- r = vfmsq_f32 (x, n, d->ln2_hi);
- r = vfmsq_f32 (r, n, d->ln2_lo);
- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));

#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif

- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);

if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
index 08b06e0a6b34b4f4..eacd2af24161fe3a 100644
--- a/sysdeps/aarch64/fpu/v_expf_inline.h
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@@ -24,50 +24,45 @@

struct v_expf_data
{
- float32x4_t poly[5];
- float32x4_t shift;
- float invln2_and_ln2[4];
+ float ln2_hi, ln2_lo, c0, c2;
+ float32x4_t inv_ln2, c1, c3, c4;
+ /* asuint(1.0f). */
+ uint32x4_t exponent_bias;
};

/* maxerr: 1.45358 +0.5 ulp. */
#define V_EXPF_DATA \
{ \
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
- .shift = V4 (0x1.8p23f), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \
+ .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \
}

-#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
-#define C(i) d->poly[i]
-
static inline float32x4_t
v_expf_inline (float32x4_t x, const struct v_expf_data *d)
{
- /* Helper routine for calculating exp(x).
+ /* Helper routine for calculating exp(ax).
Copied from v_expf.c, with all special-case handling removed - the
calling routine should handle special values if required. */

- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- float32x4_t n, r, z;
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0);
- n = vsubq_f32 (z, d->shift);
- r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1);
- r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2);
- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+ /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));

/* Custom order-4 Estrin avoids building high order monomial. */
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t p, q, poly;
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
return vfmaq_f32 (scale, poly, scale);
}
-
#endif
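A note on the recurring idiom in the expf/exp2f/exp10f changes above: scalar coefficients laid out contiguously in the data struct are fetched with one vld1q_f32 and consumed with _laneq FMA forms, so each constant costs a lane select rather than its own load or dup. The sketch below is illustrative only (placeholder coefficients and names, not taken from the patches):

#include <arm_neon.h>

/* Four scalar constants packed so one 16-byte LDR feeds all the FMLAs.
   The values here are placeholders, not coefficients from the patches.  */
static const struct
{
  float c0, c1, c2, c3;
} consts = { 1.0f, 0.5f, 0.25f, 0.125f };

/* Evaluate c0 + c1*r + c2*r^2 + c3*r^3 without per-coefficient loads.  */
float32x4_t
poly_lanewise (float32x4_t r)
{
  float32x4_t c = vld1q_f32 (&consts.c0);
  /* Each FMA picks one lane of c, so the four constants share one load.  */
  float32x4_t p01 = vfmaq_laneq_f32 (vdupq_laneq_f32 (c, 0), r, c, 1);
  float32x4_t p23 = vfmaq_laneq_f32 (vdupq_laneq_f32 (c, 2), r, c, 3);
  float32x4_t r2 = vmulq_f32 (r, r);
  return vfmaq_f32 (p01, r2, p23); /* (c0 + c1 r) + r^2 (c2 + c3 r) */
}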
362
glibc-RHEL-118273-23.patch
Normal file
@@ -0,0 +1,362 @@
commit 91c1fadba338752bf514cd4cca057b27b1b10eed
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Fri Jan 3 19:09:05 2025 +0000

AArch64: Improve codegen for SVE log1pf users

Reduce memory access by using lanewise MLA and reduce number of MOVPRFXs.
Move log1pf implementation to inline helper function.
Speedup on Neoverse V1 for log1pf (10%), acoshf (-1%), atanhf (2%), asinhf (2%).

Conflicts:
sysdeps/aarch64/fpu/log1pf_sve.c
(Fixup context to apply without out-of-scope dependency 751a5502)

diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c
index 2110894e629500be..491365e24d692f0f 100644
--- a/sysdeps/aarch64/fpu/acoshf_sve.c
+++ b/sysdeps/aarch64/fpu/acoshf_sve.c
@@ -17,23 +17,26 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

+#include "sv_math.h"
+#include "sv_log1pf_inline.h"
+
#define One 0x3f800000
#define Thres 0x20000000 /* asuint(0x1p64) - One. */

-#include "sv_log1pf_inline.h"
-
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special)
{
+ svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f);
+ svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ());
return sv_call_f32 (acoshf, x, y, special);
}

/* Single-precision SVE acosh(x) routine. Implements the same algorithm as
vector acoshf and log1p.

- Maximum error is 2.78 ULPs:
- SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4
- want 0x1.f45b3cp-4. */
+ Maximum error is 2.47 ULPs:
+ SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4
+ want 0x1.e435a2p-4. */
svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
{
svuint32_t ix = svreinterpret_u32 (x);
@@ -41,9 +44,9 @@ svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)

svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
- svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
+ svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u));

if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
- return y;
+ return special_case (xm1, tmp, special);
+ return sv_log1pf_inline (tmp, pg);
}
diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c
index d85c3a685c0b83ff..b7f253bf32fb9478 100644
--- a/sysdeps/aarch64/fpu/asinhf_sve.c
+++ b/sysdeps/aarch64/fpu/asinhf_sve.c
@@ -20,20 +20,23 @@
#include "sv_math.h"
#include "sv_log1pf_inline.h"

-#define BigBound (0x5f800000) /* asuint(0x1p64). */
+#define BigBound 0x5f800000 /* asuint(0x1p64). */

static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special)
{
+ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+ y = svreinterpret_f32 (
+ svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y)));
return sv_call_f32 (asinhf, x, y, special);
}

/* Single-precision SVE asinh(x) routine. Implements the same algorithm as
vector asinhf and log1p.

- Maximum error is 2.48 ULPs:
- SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
- want 0x1.ffbbb8p-4. */
+ Maximum error is 1.92 ULPs:
+ SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2
+ want -0x1.fd0bc8p-2. */
svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
{
svfloat32_t ax = svabs_x (pg, x);
@@ -49,8 +52,6 @@ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
= sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg);

if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (
- x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))),
- special);
+ return special_case (iax, sign, y, special);
return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y)));
}
diff --git a/sysdeps/aarch64/fpu/atanhf_sve.c b/sysdeps/aarch64/fpu/atanhf_sve.c
index dae83041ef7157f0..2d3005bbc88393ec 100644
--- a/sysdeps/aarch64/fpu/atanhf_sve.c
+++ b/sysdeps/aarch64/fpu/atanhf_sve.c
@@ -17,21 +17,25 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

+#include "sv_math.h"
#include "sv_log1pf_inline.h"

#define One (0x3f800000)
#define Half (0x3f000000)

static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign,
+ svfloat32_t y, svbool_t special)
{
+ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+ y = svmul_x (svptrue_b32 (), halfsign, y);
return sv_call_f32 (atanhf, x, y, special);
}

/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 2.28 ULP:
- _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
- want 0x1.ffbbb6p-5. */
+ The maximum error is 1.99 ULP:
+ _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5
+ want 0x1.f1f4f6p-5. */
svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
{
svfloat32_t ax = svabs_x (pg, x);
@@ -48,7 +52,7 @@ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
y = sv_log1pf_inline (y, pg);

if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmul_x (pg, halfsign, y), special);
+ return special_case (iax, sign, halfsign, y, special);

return svmul_x (pg, halfsign, y);
}
diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
index f645cc997e430bcb..4f17c44e2d96039a 100644
--- a/sysdeps/aarch64/fpu/log1pf_sve.c
+++ b/sysdeps/aarch64/fpu/log1pf_sve.c
@@ -18,30 +18,13 @@
<https://www.gnu.org/licenses/>. */

#include "sv_math.h"
-#include "poly_sve_f32.h"
-
-static const struct data
-{
- float poly[8];
- float ln2, exp_bias;
- uint32_t four, three_quarters;
-} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as
- this can be fmov-ed directly instead of including it in
- the main load-and-mla polynomial schedule. */
- 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
- 0x1.abcb6p-4f, -0x1.6f0d5ep-5f},
- .ln2 = 0x1.62e43p-1f,
- .exp_bias = 0x1p-23f,
- .four = 0x40800000,
- .three_quarters = 0x3f400000};
-
-#define SignExponentMask 0xff800000
+#include "sv_log1pf_inline.h"

static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t special)
{
- return sv_call_f32 (log1pf, x, y, special);
+ return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()),
+ special);
}

/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
@@ -50,51 +33,12 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
want 0x1.9f323ep-2. */
svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
{
- const struct data *d = ptr_barrier (&data);
/* x < -1, Inf/Nan. */
svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000);
special = svorn_z (pg, special, svcmpge (pg, x, -1));

- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
- svfloat32_t m = svadd_x (pg, x, 1);
-
- /* Choose k to scale x to the range [-1/4, 1/2]. */
- svint32_t k
- = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
- sv_s32 (SignExponentMask));
-
- /* Scale x by exponent manipulation. */
- svfloat32_t m_scale = svreinterpret_f32 (
- svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
-
- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
- m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25));
-
- /* Evaluate polynomial on reduced interval. */
- svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale),
- ms4 = svmul_x (pg, ms2, ms2);
- svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly);
- p = svmad_x (pg, m_scale, p, -0.5);
- p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
-
- /* The scale factor to be applied back at the end - by multiplying float(k)
- by 2^-23 we get the unbiased exponent of k. */
- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias);
-
- /* Apply the scaling back. */
- svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2);
-
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
+ return special_case (x, special);

- return y;
+ return sv_log1pf_inline (x, pg);
}
diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
index b94b2da055a6c59b..850297d61556740c 100644
--- a/sysdeps/aarch64/fpu/sv_log1pf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
@@ -22,55 +22,76 @@

#include "sv_math.h"
#include "vecmath_config.h"
-#include "poly_sve_f32.h"
+
+#define SignExponentMask 0xff800000

static const struct sv_log1pf_data
{
- float32_t poly[9];
- float32_t ln2;
- float32_t scale_back;
+ float c0, c2, c4, c6;
+ float c1, c3, c5, c7;
+ float ln2, exp_bias, quarter;
+ uint32_t four, three_quarters;
} sv_log1pf_data = {
- /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
- .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
- -0x1.6f0d5ep-5f },
- .scale_back = 0x1.0p-23f,
- .ln2 = 0x1.62e43p-1f,
+ /* Do not store first term of polynomial, which is -0.5, as
+ this can be fmov-ed directly instead of including it in
+ the main load-and-mla polynomial schedule. */
+ .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f,
+ .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f,
+ .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f,
+ .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000,
+ .three_quarters = 0x3f400000,
};

-static inline svfloat32_t
-eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg)
-{
- svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1]));
- svfloat32_t m2 = svmul_x (pg, m, m);
- svfloat32_t q = svmla_x (pg, m, m2, p_12);
- svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2);
- p = svmul_x (pg, m2, p);
-
- return svmla_x (pg, q, m2, p);
-}
-
static inline svfloat32_t
sv_log1pf_inline (svfloat32_t x, svbool_t pg)
{
const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);

- svfloat32_t m = svadd_x (pg, x, 1.0f);
-
- svint32_t ks = svsub_x (pg, svreinterpret_s32 (m),
- svreinterpret_s32 (svdup_f32 (0.75f)));
- ks = svand_x (pg, ks, 0xff800000);
- svuint32_t k = svreinterpret_u32 (ks);
- svfloat32_t s = svreinterpret_f32 (
- svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k));
-
- svfloat32_t m_scale
- = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k));
- m_scale
- = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
- svfloat32_t p = eval_poly (m_scale, d->poly, pg);
- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back);
- return svmla_x (pg, p, scale_back, d->ln2);
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
+ svfloat32_t m = svadd_x (pg, x, 1);
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
+ svint32_t k
+ = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
+ sv_s32 (SignExponentMask));
+
+ /* Scale x by exponent manipulation. */
+ svfloat32_t m_scale = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
+ svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2);
+ m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2));
+
+ /* Evaluate polynomial on reduced interval. */
+ svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale);
+
+ svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1);
+ svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0);
+ svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1);
+ svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2);
+ svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3);
+
+ svfloat32_t p = svmla_x (pg, p45, p67, ms2);
+ p = svmla_x (pg, p23, p, ms2);
+ p = svmla_x (pg, p01, p, ms2);
+
+ p = svmad_x (pg, m_scale, p, -0.5);
+ p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
+ svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1);
+ return svmla_lane_f32 (p, scale_back, fconst, 0);
}

#endif
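For readers following the sv_log1pf_inline rewrite above, the bit-level range reduction is easier to see in scalar form. A minimal sketch, assuming round-to-nearest and ignoring special cases; the real degree-8 minimax polynomial is replaced by a truncated series, and all names are ours:

#include <stdint.h>
#include <string.h>

/* log1p(x) = log1p(m_scale) + k*ln2, with x + 1 scaled into [0.75, 1.5). */
float
log1pf_sketch (float x)
{
  uint32_t mi, xi;
  float m = x + 1.0f;
  memcpy (&mi, &m, sizeof (mi));
  memcpy (&xi, &x, sizeof (xi));

  /* k = exponent of m relative to 0.75, kept shifted up by 23 bits.  */
  int32_t k = (int32_t) ((mi - 0x3f400000) & 0xff800000);

  /* m_scale = x / 2^(k>>23), done by subtracting k from the exponent.  */
  uint32_t msi = xi - (uint32_t) k;
  float m_scale;
  memcpy (&m_scale, &msi, sizeof (m_scale));

  /* s = 4 * 2^-(k>>23) stays normalised; m_scale + (s/4 - 1) equals
     (x + 1) / 2^(k>>23) - 1, the reduced argument in [-0.25, 0.5].  */
  uint32_t si = 0x40800000 - (uint32_t) k;
  float s;
  memcpy (&s, &si, sizeof (s));
  m_scale += 0.25f * s - 1.0f;

  /* Stand-in for the degree-8 minimax polynomial of the real routine.  */
  float p = m_scale * (1.0f - m_scale * (0.5f - m_scale / 3.0f));

  /* float(k) * 2^-23 recovers the exponent; scale back by k*ln2.  */
  return p + ((float) k * 0x1p-23f) * 0x1.62e43p-1f;
}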
258
glibc-RHEL-118273-24.patch
Normal file
@@ -0,0 +1,258 @@
commit 140b985e5a2071000122b3cb63ebfe88cf21dd29
Author: Luna Lamb <luna.lamb@arm.com>
Date: Fri Jan 3 19:00:12 2025 +0000

AArch64: Improve codegen in AdvSIMD asinh

Improves memory access and removes spills.
Load the polynomial evaluation coefficients into 2 vectors and use lanewise
MLAs. Reduces MOVs 6->3 , LDR 11->5, STR/STP 2->0, ADRP 3->2.

diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
index 6207e7da9531f48d..2739f98b390edca7 100644
--- a/sysdeps/aarch64/fpu/asinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@@ -20,41 +20,71 @@
#include "v_math.h"
#include "poly_advsimd_f64.h"

-#define A(i) v_f64 (__v_log_data.poly[i])
-#define N (1 << V_LOG_TABLE_BITS)
-#define IndexMask (N - 1)
-
const static struct data
{
- float64x2_t poly[18];
- uint64x2_t off, huge_bound, abs_mask;
- float64x2_t ln2, tiny_bound;
+ uint64x2_t huge_bound, abs_mask, off, mask;
+#if WANT_SIMD_EXCEPT
+ float64x2_t tiny_bound;
+#endif
+ float64x2_t lc0, lc2;
+ double lc1, lc3, ln2, lc4;
+
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
+ double c1, c3, c5, c7, c9, c11, c13, c15;
+
} data = {
- .off = V2 (0x3fe6900900000000),
- .ln2 = V2 (0x1.62e42fefa39efp-1),
- .huge_bound = V2 (0x5fe0000000000000),
+
+#if WANT_SIMD_EXCEPT
.tiny_bound = V2 (0x1p-26),
- .abs_mask = V2 (0x7fffffffffffffff),
+#endif
/* Even terms of polynomial s.t. asinh(x) is approximated by
asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
- .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4),
- V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6),
- V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6),
- V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7),
- V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7),
- V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8),
- V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9),
- V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12),
- V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) },
+
+ .c0 = V2 (-0x1.55555555554a7p-3),
+ .c1 = 0x1.3333333326c7p-4,
+ .c2 = V2 (-0x1.6db6db68332e6p-5),
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c4 = V2 (-0x1.6e8b8b654a621p-6),
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c6 = V2 (-0x1.c9871d10885afp-7),
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c8 = V2 (-0x1.3ddca533e9f54p-7),
+ .c9 = 0x1.0becef748dafcp-7,
+ .c10 = V2 (-0x1.b90c7099dd397p-8),
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c12 = V2 (-0x1.d217026a669ecp-9),
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c14 = V2 (-0x1.e0f37daef9127p-11),
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c16 = V2 (-0x1.021a48685e287p-14),
+ .c17 = V2 (0x1.93d4ba83d34dap-18),
+
+ .lc0 = V2 (-0x1.ffffffffffff7p-2),
+ .lc1 = 0x1.55555555170d4p-2,
+ .lc2 = V2 (-0x1.0000000399c27p-2),
+ .lc3 = 0x1.999b2e90e94cap-3,
+ .lc4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+
+ .off = V2 (0x3fe6900900000000),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ .mask = V2 (0xfffULL << 52),
};

static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
+ uint64x2_t special)
{
+ /* Copy sign. */
+ y = vbslq_f64 (abs_mask, y, x);
return v_call_f64 (asinh, x, y, special);
}

+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
struct entry
{
float64x2_t invc;
@@ -76,27 +106,34 @@ lookup (uint64x2_t i)
}

static inline float64x2_t
-log_inline (float64x2_t x, const struct data *d)
+log_inline (float64x2_t xm, const struct data *d)
{
- /* Double-precision vector log, copied from ordinary vector log with some
- cosmetic modification and special-cases removed. */
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint64x2_t tmp = vsubq_u64 (ix, d->off);
- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
- uint64x2_t iz
- = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52)));
+
+ uint64x2_t u = vreinterpretq_u64_f64 (xm);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
- struct entry e = lookup (tmp);
+
+ struct entry e = lookup (u_off);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
float64x2_t kd = vcvtq_f64_s64 (k);
- float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+
+ /* hi = r + log(c) + k*Ln2. */
+ float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2);
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->lc1);
float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t y = vfmaq_f64 (A (2), A (3), r);
- float64x2_t p = vfmaq_f64 (A (0), A (1), r);
- y = vfmaq_f64 (y, A (4), r2);
- y = vfmaq_f64 (p, y, r2);
- y = vfmaq_f64 (hi, y, r2);
- return y;
+ float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1);
+ y = vfmaq_f64 (p, r2, y);
+ return vfmaq_f64 (hi, y, r2);
}

/* Double-precision implementation of vector asinh(x).
@@ -106,23 +143,24 @@ log_inline (float64x2_t x, const struct data *d)
asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
= sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
where log(x) is an optimized log approximation, and P(x) is a polynomial
- shared with the scalar routine. The greatest observed error 3.29 ULP, in
+ shared with the scalar routine. The greatest observed error 2.79 ULP, in
|x| >= 1:
- __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
- want 0x1.ffffcfd0e2352p-1. */
+ _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
+ want 0x1.ffffd003219ddp-1. */
VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
-
float64x2_t ax = vabsq_f64 (x);
- uint64x2_t iax = vreinterpretq_u64_f64 (ax);

uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
- uint64x2_t special = vcgeq_u64 (iax, d->huge_bound);

#if WANT_SIMD_EXCEPT
+ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
+ uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
special = vorrq_u64 (special, tiny);
+#else
+ uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
#endif

/* Option 1: |x| >= 1.
@@ -147,19 +185,45 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
overflow, and tiny lanes, which will underflow, by setting them to 0. They
will be fixed later, either by selecting x or falling back to the scalar
special-case. The largest observed error in this region is 1.47 ULPs:
- __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
- want 0x1.c1d6bf874019cp-1. */
+ _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+ want 0x1.c1d6bf874019cp-1. */
float64x2_t option_2 = v_f64 (0);
+
if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1))))
{
+
#if WANT_SIMD_EXCEPT
ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
#endif
- float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2),
- z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2),
- z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8);
- float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly);
- option_2 = vfmaq_f64 (ax, p, x3);
+ float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
+ /* Order-17 Pairwise Horner scheme. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1);
+ float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17);
+
+ float64x2_t p = vfmaq_f64 (p1415, z2, p1617);
+ p = vfmaq_f64 (p1213, z2, p);
+ p = vfmaq_f64 (p1011, z2, p);
+ p = vfmaq_f64 (p89, z2, p);
+
+ p = vfmaq_f64 (p67, z2, p);
+ p = vfmaq_f64 (p45, z2, p);
+
+ p = vfmaq_f64 (p23, z2, p);
+
+ p = vfmaq_f64 (p01, z2, p);
+ option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
#if WANT_SIMD_EXCEPT
option_2 = vbslq_f64 (tiny, x, option_2);
#endif
@@ -167,10 +231,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)

/* Choose the right option for each lane. */
float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
- /* Copy sign. */
- y = vbslq_f64 (d->abs_mask, y, x);
-
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, y, special);
- return y;
+ {
+ return special_case (x, y, d->abs_mask, special);
+ }
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
}
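The even/odd coefficient split used in the asinh patch above generalises to any polynomial: even-index terms stay as full vectors, odd-index terms are stored as adjacent doubles so one LDR feeds two lanewise FMLAs, and the tree is then summed in powers of x^2. A reduced sketch with five placeholder coefficients (names and values are ours, not from the patch):

#include <arm_neon.h>

#define V2(x) { x, x }

static const struct
{
  float64x2_t c0, c2, c4; /* even terms kept as vectors */
  double c1, c3;          /* odd terms, adjacent for one vld1q_f64 */
} poly = { V2 (1.0), V2 (0.25), V2 (0.0625), 0.5, 0.125 };

/* c0 + c1*x + c2*x^2 + c3*x^3 + c4*x^4 via pairwise Horner.  */
float64x2_t
pairwise_horner (float64x2_t x)
{
  float64x2_t x2 = vmulq_f64 (x, x);
  float64x2_t c13 = vld1q_f64 (&poly.c1);          /* one load, two lanes */
  float64x2_t p01 = vfmaq_laneq_f64 (poly.c0, x, c13, 0); /* c0 + c1*x */
  float64x2_t p23 = vfmaq_laneq_f64 (poly.c2, x, c13, 1); /* c2 + c3*x */
  float64x2_t p = vfmaq_f64 (p23, x2, poly.c4);    /* + c4*x^2 */
  return vfmaq_f64 (p01, x2, p);
}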
221
glibc-RHEL-118273-25.patch
Normal file
@@ -0,0 +1,221 @@
commit f86b4cf87581cf1e45702b07880679ffa0b1f47a
Author: Luna Lamb <luna.lamb@arm.com>
Date: Fri Jan 3 20:15:17 2025 +0000

AArch64: Improve codegen in SVE expm1f and users

Use unpredicated muls, use absolute compare and improve memory access.
Expm1f, sinhf and tanhf show 7%, 5% and 1% improvement in throughput
microbenchmark on Neoverse V1.

diff --git a/sysdeps/aarch64/fpu/expm1f_sve.c b/sysdeps/aarch64/fpu/expm1f_sve.c
index 7c852125cdbd0a2b..05a66400d477b819 100644
--- a/sysdeps/aarch64/fpu/expm1f_sve.c
+++ b/sysdeps/aarch64/fpu/expm1f_sve.c
@@ -18,7 +18,6 @@
<https://www.gnu.org/licenses/>. */

#include "sv_math.h"
-#include "poly_sve_f32.h"

/* Largest value of x for which expm1(x) should round to -1. */
#define SpecialBound 0x1.5ebc4p+6f
@@ -28,20 +27,17 @@ static const struct data
/* These 4 are grouped together so they can be loaded as one quadword, then
used with _lane forms of svmla/svmls. */
float c2, c4, ln2_hi, ln2_lo;
- float c0, c1, c3, inv_ln2, special_bound, shift;
+ float c0, inv_ln2, c1, c3, special_bound;
} data = {
/* Generated using fpminimax. */
.c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3,
.c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7,
- .c4 = 0x1.6b55a2p-10,
+ .c4 = 0x1.6b55a2p-10, .inv_ln2 = 0x1.715476p+0f,
+ .special_bound = SpecialBound, .ln2_lo = 0x1.7f7d1cp-20f,
+ .ln2_hi = 0x1.62e4p-1f,

- .special_bound = SpecialBound, .shift = 0x1.8p23f,
- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,
- .ln2_lo = 0x1.7f7d1cp-20f,
};

-#define C(i) sv_f32 (d->c##i)
-
static svfloat32_t NOINLINE
special_case (svfloat32_t x, svbool_t pg)
{
@@ -71,9 +67,8 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
- j = svsub_x (pg, j, d->shift);
- svint32_t i = svcvt_s32_x (pg, j);
+ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2);
+ j = svrinta_x (pg, j);

svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
f = svmls_lane (f, j, lane_constants, 3);
@@ -83,17 +78,17 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
- svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
svfloat32_t p = svmla_x (pg, p12, f2, p34);
- p = svmla_x (pg, C (0), f, p);
+
+ p = svmla_x (pg, sv_f32 (d->c0), f, p);
p = svmla_x (pg, f, f2, p);

/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
- svfloat32_t t = svreinterpret_f32 (
- svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000));
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
+ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
}
diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c
index 6c204b57a2aa18d3..50dd386774b005ca 100644
--- a/sysdeps/aarch64/fpu/sinhf_sve.c
+++ b/sysdeps/aarch64/fpu/sinhf_sve.c
@@ -63,5 +63,5 @@ svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
if (__glibc_unlikely (svptest_any (pg, special)))
return special_case (x, svmul_x (pg, t, halfsign), special);

- return svmul_x (pg, t, halfsign);
+ return svmul_x (svptrue_b32 (), t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/sv_expm1f_inline.h b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
index 5b7245122294e1b4..e46ddda5437dc826 100644
--- a/sysdeps/aarch64/fpu/sv_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
@@ -27,21 +27,18 @@ struct sv_expm1f_data
/* These 4 are grouped together so they can be loaded as one quadword, then
used with _lane forms of svmla/svmls. */
float32_t c2, c4, ln2_hi, ln2_lo;
- float32_t c0, c1, c3, inv_ln2, shift;
+ float c0, inv_ln2, c1, c3, special_bound;
};

/* Coefficients generated using fpminimax. */
#define SV_EXPM1F_DATA \
{ \
- .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \
- .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .inv_ln2 = 0x1.715476p+0f, \
+ .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, \
\
- .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
- .ln2_lo = 0x1.7f7d1cp-20f, \
+ .c4 = 0x1.6b55a2p-10, .ln2_lo = 0x1.7f7d1cp-20f, .ln2_hi = 0x1.62e4p-1f, \
}

-#define C(i) sv_f32 (d->c##i)
-
static inline svfloat32_t
expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
{
@@ -55,9 +52,8 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
- j = svsub_x (pg, j, d->shift);
- svint32_t i = svcvt_s32_x (pg, j);
+ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2);
+ j = svrinta_x (pg, j);

svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
f = svmls_lane (f, j, lane_constants, 3);
@@ -67,18 +63,18 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
- svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
svfloat32_t p = svmla_x (pg, p12, f2, p34);
- p = svmla_x (pg, C (0), f, p);
+ p = svmla_x (pg, sv_f32 (d->c0), f, p);
p = svmla_x (pg, f, f2, p);

/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
- svfloat32_t t = svscale_x (pg, sv_f32 (1), i);
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
+ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
}

#endif
diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c
index 0b94523cf5074200..80dd679346f13f37 100644
--- a/sysdeps/aarch64/fpu/tanhf_sve.c
+++ b/sysdeps/aarch64/fpu/tanhf_sve.c
@@ -19,20 +19,27 @@

#include "sv_expm1f_inline.h"

+/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). */
+#define BoringBound 0x1.205966p+3f
+
static const struct data
{
struct sv_expm1f_data expm1f_consts;
- uint32_t boring_bound, onef;
+ uint32_t onef, special_bound;
+ float boring_bound;
} data = {
.expm1f_consts = SV_EXPM1F_DATA,
- /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
- .boring_bound = 0x41102cb3,
.onef = 0x3f800000,
+ .special_bound = 0x7f800000,
+ .boring_bound = BoringBound,
};

static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring,
+ svfloat32_t boring, svfloat32_t q, svbool_t special)
{
+ svfloat32_t y
+ = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0)));
return sv_call_f32 (tanhf, x, y, special);
}

@@ -47,15 +54,16 @@ svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
svfloat32_t ax = svabs_x (pg, x);
svuint32_t iax = svreinterpret_u32 (ax);
svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
- svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound);
svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
-
- svbool_t special = svcmpgt (pg, iax, 0x7f800000);
+ svbool_t special = svcmpgt (pg, iax, d->special_bound);
+ svbool_t is_boring = svacgt (pg, x, d->boring_bound);

/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts);
- svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+ svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg,
+ &d->expm1f_consts);
+
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svsel_f32 (is_boring, boring, y), special);
+ return special_case (x, pg, is_boring, boring, q, special);
+ svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
return svsel_f32 (is_boring, boring, y);
}
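The key change in the expm1f rewrite above is that the rounded quotient j never round-trips through the shift-based bit trick: svrinta does the rounding and svscale builds 2^j directly from the converted integer. A condensed sketch of that assembly step, under the assumption that p already holds expm1(x - j*ln2) from the polynomial (function name is ours):

#include <arm_sve.h>

svfloat32_t
expm1f_assemble (svbool_t pg, svfloat32_t x, svfloat32_t p)
{
  /* j = round-to-nearest (x / ln2), kept in floating point.  */
  svfloat32_t j = svrinta_x (pg, svmul_x (svptrue_b32 (), x, 0x1.715476p+0f));
  /* t = 2^j is exact: svscale adds the integer straight to the exponent.  */
  svfloat32_t t = svscale_x (pg, svdup_f32 (1.0f), svcvt_s32_x (pg, j));
  /* expm1(x) = 2^j * (p + 1) - 1 = p*t + (t - 1).  */
  return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
}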
125
glibc-RHEL-118273-26.patch
Normal file
@@ -0,0 +1,125 @@
commit d3f2b71ef1d146137a25dd1367d97a14fac341c6
|
||||
Author: Yury Khrustalev <yury.khrustalev@arm.com>
|
||||
Date: Tue Nov 26 11:38:30 2024 +0000
|
||||
|
||||
aarch64: Fix tests not compatible with targets supporting GCS
|
||||
|
||||
- Add GCS marking to some of the tests when target supports GCS
|
||||
- Fix tst-ro-dynamic-mod.map linker script to avoid removing
|
||||
GNU properties
|
||||
- Add header with macros for GNU properties
|
||||
|
||||
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
|
||||
|
||||
diff --git a/elf/tst-asm-helper.h b/elf/tst-asm-helper.h
|
||||
new file mode 100644
|
||||
index 0000000000000000..6f91ac2ddc54d3f9
|
||||
--- /dev/null
|
||||
+++ b/elf/tst-asm-helper.h
|
||||
@@ -0,0 +1,49 @@
|
||||
+/* Test header that defines macros for GNU properties that need to be
|
||||
+ used in some test assembly files where sysdep.h cannot be included
+ for some reason.
+ Copyright (C) 2024-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+#define FEATURE_1_GCS 4
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
+
+/* Add GNU property note with the supported features to all asm code
+ where sysdep.h is included. */
+#if HAVE_AARCH64_BTI && HAVE_AARCH64_PAC_RET
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC|FEATURE_1_GCS)
+#elif HAVE_AARCH64_BTI
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_GCS)
+#endif
diff --git a/elf/tst-big-note-lib.S b/elf/tst-big-note-lib.S
index 5eb1e03cfbe2cee8..cfd31137e85a1335 100644
--- a/elf/tst-big-note-lib.S
+++ b/elf/tst-big-note-lib.S
@@ -20,6 +20,8 @@
On a typical Linux system with 8MiB "ulimit -s", that was enough
to trigger stack overflow in open_verify. */

+#include "tst-asm-helper.h"
+
#define NOTE_SIZE 8*1024*1024

.pushsection .note.big,"a"
diff --git a/elf/tst-ro-dynamic-mod.map b/elf/tst-ro-dynamic-mod.map
index 2fe4a2998cddd587..2a158480c07d9691 100644
--- a/elf/tst-ro-dynamic-mod.map
+++ b/elf/tst-ro-dynamic-mod.map
@@ -3,14 +3,13 @@ SECTIONS
. = SIZEOF_HEADERS;
.dynamic : { *(.dynamic) } :text :dynamic
.rodata : { *(.data*) *(.bss*) } :text
- /DISCARD/ : {
- *(.note.gnu.property)
- }
- .note : { *(.note.*) } :text :note
+ .note : { *(.note) } :text :note
+ .note.gnu.property : { *(.note.gnu.property) } :text :gnu_property
}
PHDRS
{
text PT_LOAD FLAGS(5) FILEHDR PHDRS;
dynamic PT_DYNAMIC FLAGS(4);
note PT_NOTE FLAGS(4);
+ gnu_property PT_GNU_PROPERTY FLAGS(4);
}
diff --git a/sysdeps/aarch64/tst-vpcs-mod.S b/sysdeps/aarch64/tst-vpcs-mod.S
index 19b01c3c3859e13b..b3b5824eda1fb076 100644
--- a/sysdeps/aarch64/tst-vpcs-mod.S
+++ b/sysdeps/aarch64/tst-vpcs-mod.S
@@ -17,6 +17,8 @@
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */

+#include "tst-asm-helper.h"
+
.variant_pcs vpcs_call
.global vpcs_call
.type vpcs_call, %function
@@ -121,7 +123,7 @@ vpcs_call_regs:
/* Emulate a BL using B, but save x30 before the branch. */
adr x30, .L_return_addr
stp x30, x29, [x1, 240]
- b vpcs_call
+ bl vpcs_call
.L_return_addr:

/* Restore callee-saved registers. */
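
For reference, the note that the GNU_PROPERTY macro above assembles has the following layout, rendered here as a C initializer (illustrative only; the real macro names and constants live in elf.h):

#include <stdint.h>

/* Layout of the NT_GNU_PROPERTY_TYPE_0 note emitted by the macro.  */
struct gnu_property_note
{
  uint32_t namesz;    /* 4: sizeof "GNU" including the NUL.  */
  uint32_t descsz;    /* 16: size of the property payload below.  */
  uint32_t type;      /* 5: NT_GNU_PROPERTY_TYPE_0.  */
  char name[4];       /* "GNU".  */
  uint32_t pr_type;   /* 0xc0000000: the FEATURE_1_AND property.  */
  uint32_t pr_datasz; /* 4: one word of feature bits.  */
  uint32_t pr_data;   /* FEATURE_1_BTI | FEATURE_1_PAC | FEATURE_1_GCS.  */
  uint32_t pr_pad;    /* Zero padding to 8-byte alignment.  */
};

static const struct gnu_property_note note
  = { 4, 16, 5, "GNU", 0xc0000000, 4, 1 | 2 | 4, 0 };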
241
glibc-RHEL-118273-27.patch
Normal file
@@ -0,0 +1,241 @@
commit 95e807209b680257a9afe81a507754f1565dbb4d
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Thu Feb 13 18:03:04 2025 +0000

AArch64: Improve codegen for SVE powf

Improve memory access with indexed/unpredicated instructions.
Eliminate register spills. Speedup on Neoverse V1: 3%.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c
index 4f6a142325ae719b..08d7019a1855ff3c 100644
--- a/sysdeps/aarch64/fpu/powf_sve.c
+++ b/sysdeps/aarch64/fpu/powf_sve.c
@@ -26,7 +26,6 @@
#define Tlogc __v_powf_data.logc
#define Texp __v_powf_data.scale
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
-#define Shift 0x1.8p52
#define Norm 0x1p23f /* 0x4b000000. */

/* Overall ULP error bound for pow is 2.6 ulp
@@ -36,7 +35,7 @@ static const struct data
double log_poly[4];
double exp_poly[3];
float uflow_bound, oflow_bound, small_bound;
- uint32_t sign_bias, sign_mask, subnormal_bias, off;
+ uint32_t sign_bias, subnormal_bias, off;
} data = {
/* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of
V_POWF_EXP2_N. */
@@ -53,7 +52,6 @@ static const struct data
.small_bound = 0x1p-126f,
.off = 0x3f35d000,
.sign_bias = SignBias,
- .sign_mask = 0x80000000,
.subnormal_bias = 0x0b800000, /* 23 << 23. */
};

@@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x)
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint32_t i)
{
- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
+ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
2u * 0x7f800000 - 1);
}

@@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z)
}

/* Scalar fallback for special case routines with custom signature. */
-static inline svfloat32_t
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
{
+ /* Special cases of x or y: zero, inf and nan. */
+ svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
+ svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
+ svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
+
svbool_t p = svpfirst (cmp, svpfalse ());
while (svptest_any (cmp, p))
{
@@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,

/* Polynomial to approximate log1p(r)/ln2. */
svfloat64_t logx = A (0);
- logx = svmla_x (pg, A (1), r, logx);
- logx = svmla_x (pg, A (2), r, logx);
- logx = svmla_x (pg, A (3), r, logx);
- logx = svmla_x (pg, y0, r, logx);
+ logx = svmad_x (pg, r, logx, A (1));
+ logx = svmad_x (pg, r, logx, A (2));
+ logx = svmad_x (pg, r, logx, A (3));
+ logx = svmad_x (pg, r, logx, y0);
*pylogx = svmul_x (pg, y, logx);

/* z - kd is in [-1, 1] in non-nearest rounding modes. */
- svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- kd = svsub_x (pg, kd, Shift);
+ svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
+ svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));

r = svsub_x (pg, *pylogx, kd);

/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
- svuint64_t t
- = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
- svuint64_t ski = svadd_x (pg, ki, sign_bias);
- t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
+ svuint64_t t = svld1_gather_index (
+ svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
+ svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
+ t = svadd_x (svptrue_b64 (), t,
+ svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
svfloat64_t s = svreinterpret_f64 (t);

svfloat64_t p = C (0);
p = svmla_x (pg, C (1), p, r);
p = svmla_x (pg, C (2), p, r);
- p = svmla_x (pg, s, p, svmul_x (pg, s, r));
+ p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));

return p;
}
@@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
{
const svbool_t ptrue = svptrue_b64 ();

- /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
- order to perform core computation in double precision. */
+ /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
+ * in order to perform core computation in double precision. */
const svbool_t pg_lo = svunpklo (pg);
const svbool_t pg_hi = svunpkhi (pg);
- svfloat64_t y_lo = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
- svfloat64_t y_hi = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
- svfloat32_t z = svreinterpret_f32 (iz);
- svfloat64_t z_lo = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
- svfloat64_t z_hi = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
+ svfloat64_t y_lo
+ = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+ svfloat64_t y_hi
+ = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+ svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
+ svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
svuint64_t i_lo = svunpklo (i);
svuint64_t i_hi = svunpkhi (i);
svint64_t k_lo = svunpklo (k);
@@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
/* Implementation of SVE powf.
Provides the same accuracy as AdvSIMD powf, since it relies on the same
algorithm. The theoretical maximum error is under 2.60 ULPs.
- Maximum measured error is 2.56 ULPs:
- SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
- want 0x1.fd4b06p+127. */
+ Maximum measured error is 2.57 ULPs:
+ SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
+ want 0x1.fff862p+127. */
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
@@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
svuint32_t viy0 = svreinterpret_u32 (y);

/* Negative x cases. */
- svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
- svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
+ svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));

/* Set sign_bias and ix depending on sign of x and nature of y. */
- svbool_t yisnotint_xisneg = svpfalse_b ();
+ svbool_t yint_or_xpos = pg;
svuint32_t sign_bias = sv_u32 (0);
svuint32_t vix = vix0;
if (__glibc_unlikely (svptest_any (pg, xisneg)))
{
/* Determine nature of y. */
- yisnotint_xisneg = svisnotint (xisneg, y);
- svbool_t yisint_xisneg = svisint (xisneg, y);
+ yint_or_xpos = svisint (xisneg, y);
svbool_t yisodd_xisneg = svisodd (xisneg, y);
/* ix set to abs(ix) if y is integer. */
- vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
+ vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
/* Set to SignBias if x is negative and y is odd. */
sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
}
@@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
svbool_t cmp = svorr_z (pg, xspecial, yspecial);

/* Small cases of x: |x| < 0x1p-126. */
- svbool_t xsmall = svaclt (pg, x, d->small_bound);
- if (__glibc_unlikely (svptest_any (pg, xsmall)))
+ svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
+ if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
{
/* Normalize subnormal x so exponent becomes negative. */
svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
@@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
vix = svsel (xsmall, vix_norm, vix);
}
/* Part of core computation carried in working precision. */
- svuint32_t tmp = svsub_x (pg, vix, d->off);
- svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
- V_POWF_LOG2_N - 1);
- svuint32_t top = svand_x (pg, tmp, 0xff800000);
- svuint32_t iz = svsub_x (pg, vix, top);
- svint32_t k
- = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
-
- /* Compute core in extended precision and return intermediate ylogx results to
- handle cases of underflow and underflow in exp. */
+ svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
+ svuint32_t i = svand_x (
+ yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+ V_POWF_LOG2_N - 1);
+ svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
+ svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
+ svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
+ (23 - V_POWF_EXP2_TABLE_BITS));
+
+ /* Compute core in extended precision and return intermediate ylogx results
+ * to handle cases of underflow and underflow in exp. */
svfloat32_t ylogx;
- svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
+ svfloat32_t ret
+ = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);

/* Handle exp special cases of underflow and overflow. */
- svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
+ svuint32_t sign
+ = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
svfloat32_t ret_oflow
- = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
+ = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
svfloat32_t ret_uflow = svreinterpret_f32 (sign);
- ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
- ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
+ ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
+ ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);

/* Cases of finite y and finite negative x. */
- ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
+ ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));

- if (__glibc_unlikely (svptest_any (pg, cmp)))
- return sv_call_powf_sc (x, y, ret, cmp);
+ if (__glibc_unlikely (svptest_any (cmp, cmp)))
+ return sv_call_powf_sc (x, y, ret);

return ret;
}
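
The sv_zeroinfnan rewrite above swaps a predicated multiply for i + i, which maps to a plain vector ADD. The underlying bit trick is easier to see in scalar form (a sketch; glibc's scalar powf uses the same idiom):

#include <stdint.h>
#include <string.h>

/* True for +/-0, +/-inf and NaN.  Doubling the bit pattern shifts out the
   sign bit; the unsigned wrap of 0 - 1 then puts zeros above the
   threshold, so a single compare catches all three classes at once.  */
static int
zeroinfnan (float x)
{
  uint32_t i;
  memcpy (&i, &x, sizeof i);                 /* Bit pattern of x.  */
  return (i + i) - 1 >= 2u * 0x7f800000 - 1; /* 0x7f800000 = bits of inf.  */
}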
401
glibc-RHEL-118273-28.patch
Normal file
@@ -0,0 +1,401 @@
commit 0b195651db3ae793187c7dd6d78b5a7a8da9d5e6
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Thu Feb 13 18:02:01 2025 +0000

AArch64: Improve codegen for SVE pow

Move constants to struct. Improve memory access with indexed/unpredicated
instructions. Eliminate register spills. Speedup on Neoverse V1: 24%.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c
index 4c0bf8956c584be7..4242d22a491ed17e 100644
--- a/sysdeps/aarch64/fpu/pow_sve.c
+++ b/sysdeps/aarch64/fpu/pow_sve.c
@@ -44,19 +44,18 @@

/* Data is defined in v_pow_log_data.c. */
#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
-#define A __v_pow_log_data.poly
#define Off 0x3fe6955500000000

/* Data is defined in v_pow_exp_data.c. */
#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
-#define C __v_pow_exp_data.poly
#define SmallExp 0x3c9 /* top12(0x1p-54). */
#define BigExp 0x408 /* top12(512.). */
#define ThresExp 0x03f /* BigExp - SmallExp. */
#define HugeExp 0x409 /* top12(1024.). */

/* Constants associated with pow. */
+#define SmallBoundX 0x1p-126
#define SmallPowX 0x001 /* top12(0x1p-126). */
#define BigPowX 0x7ff /* top12(INFINITY). */
#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
@@ -64,6 +63,31 @@
#define BigPowY 0x43e /* top12(0x1.749p62). */
#define ThresPowY 0x080 /* BigPowY - SmallPowY. */

+static const struct data
+{
+ double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo;
+ double log_c1, log_c3, log_c5, off;
+ double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo;
+ double exp_c0, exp_c1;
+} data = {
+ .log_c0 = -0x1p-1,
+ .log_c1 = -0x1.555555555556p-1,
+ .log_c2 = 0x1.0000000000006p-1,
+ .log_c3 = 0x1.999999959554ep-1,
+ .log_c4 = -0x1.555555529a47ap-1,
+ .log_c5 = -0x1.2495b9b4845e9p0,
+ .log_c6 = 0x1.0002b8b263fc3p0,
+ .off = Off,
+ .exp_c0 = 0x1.fffffffffffd4p-2,
+ .exp_c1 = 0x1.5555571d6ef9p-3,
+ .exp_c2 = 0x1.5555576a5adcep-5,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP,
+ .ln2_over_n_hi = 0x1.62e42fefc0000p-9,
+ .ln2_over_n_lo = -0x1.c610ca86c3899p-45,
+};
+
/* Check if x is an integer. */
static inline svbool_t
sv_isint (svbool_t pg, svfloat64_t x)
@@ -82,7 +106,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x)
static inline svbool_t
sv_isodd (svbool_t pg, svfloat64_t x)
{
- svfloat64_t y = svmul_x (pg, x, 0.5);
+ svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5);
return sv_isnotint (pg, y);
}

@@ -121,7 +145,7 @@ zeroinfnan (uint64_t i)
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint64_t i)
{
- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1),
+ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
2 * asuint64 (INFINITY) - 1);
}

@@ -174,16 +198,17 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
additional 15 bits precision. IX is the bit representation of x, but
normalized in the subnormal range using the sign bit for the exponent. */
static inline svfloat64_t
-sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
+sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
+ const struct data *d)
{
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_x (pg, ix, Off);
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
sv_u64 (N_LOG - 1));
svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52)));
+ svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52));
svfloat64_t z = svreinterpret_f64 (iz);
svfloat64_t kd = svcvt_f64_x (pg, k);

@@ -199,40 +224,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
|z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */
svfloat64_t r = svmad_x (pg, z, invc, -1.0);
/* k*Ln2 + log(c) + r. */
- svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi);
+
+ svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0);
svfloat64_t t2 = svadd_x (pg, t1, r);
- svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo);
+ svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1);
svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);

/* Evaluation is optimized assuming superscalar pipelined execution. */
- svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */
- svfloat64_t ar2 = svmul_x (pg, r, ar);
- svfloat64_t ar3 = svmul_x (pg, r, ar2);
+
+ svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0);
+ svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0);
+ svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar);
+ svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2);
/* k*Ln2 + log(c) + r + A[0]*r*r. */
svfloat64_t hi = svadd_x (pg, t2, ar2);
- svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r);
+ svfloat64_t lo3 = svmls_x (pg, ar2, ar, r);
svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
/* p = log1p(r) - r - A[0]*r*r. */
/* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
A[6])))). */
- svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]);
- svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]);
- svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]);
+
+ svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4);
+ svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1);
+ svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0);
+ svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1);
svfloat64_t p = svmla_x (pg, a34, ar2, a56);
p = svmla_x (pg, a12, ar2, p);
- p = svmul_x (pg, ar3, p);
+ p = svmul_x (svptrue_b64 (), ar3, p);
svfloat64_t lo = svadd_x (
- pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
+ pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
svfloat64_t y = svadd_x (pg, hi, lo);
*tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
return y;
}

+static inline svfloat64_t
+sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
+ svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits,
+ svuint64_t *ki, const struct data *d)
+{
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
+ svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2);
+ svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0);
+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+ svfloat64_t kd = svrinta_x (pg, z);
+ *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd));
+
+ svfloat64_t ln2_over_n_hilo
+ = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi);
+ svfloat64_t r = x;
+ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0);
+ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1);
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r = svadd_x (pg, r, xtail);
+ /* 2^(k/N) ~= scale. */
+ svuint64_t idx = svand_x (pg, *ki, N_EXP - 1);
+ svuint64_t top
+ = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
+ *sbits = svadd_x (pg, *sbits, top);
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1);
+ *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp);
+ *tmp = svmla_x (pg, r, r2, *tmp);
+ svfloat64_t scale = svreinterpret_f64 (*sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ z = svmla_x (pg, scale, scale, *tmp);
+ return z;
+}
+
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
static inline svfloat64_t
sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
- svuint64_t sign_bias)
+ svuint64_t sign_bias, const struct data *d)
{
/* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
and other cases of large values of x (scale * (1 + TMP) oflow). */
@@ -240,73 +310,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
/* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */
svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);

- /* Conditions special, uflow and oflow are all expressed as uoflow &&
- something, hence do not bother computing anything if no lane in uoflow is
- true. */
- svbool_t special = svpfalse_b ();
- svbool_t uflow = svpfalse_b ();
- svbool_t oflow = svpfalse_b ();
+ svfloat64_t tmp;
+ svuint64_t sbits, ki;
if (__glibc_unlikely (svptest_any (pg, uoflow)))
{
+ svfloat64_t z
+ = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
+
/* |x| is tiny (|x| <= 0x1p-54). */
- uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
+ svbool_t uflow
+ = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
uflow = svand_z (pg, uoflow, uflow);
/* |x| is huge (|x| >= 1024). */
- oflow = svcmpge (pg, abstop, HugeExp);
+ svbool_t oflow = svcmpge (pg, abstop, HugeExp);
oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
+
/* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
- or underflow. */
- special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+ or underflow. */
+ svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+
+ /* Update result with special and large cases. */
+ z = sv_call_specialcase (tmp, sbits, ki, z, special);
+
+ /* Handle underflow and overflow. */
+ svbool_t x_is_neg = svcmplt (pg, x, 0);
+ svuint64_t sign_mask
+ = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
+ svfloat64_t res_uoflow
+ = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
+ res_uoflow = svreinterpret_f64 (
+ svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
+ /* Avoid spurious underflow for tiny x. */
+ svfloat64_t res_spurious_uflow
+ = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
+
+ z = svsel (oflow, res_uoflow, z);
+ z = svsel (uflow, res_spurious_uflow, z);
+ return z;
}

- /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
- /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
- svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2);
- /* z - kd is in [-1, 1] in non-nearest rounding modes. */
- svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift);
- svfloat64_t kd = svadd_x (pg, z, shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- kd = svsub_x (pg, kd, shift);
- svfloat64_t r = x;
- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi);
- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo);
- /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
- r = svadd_x (pg, r, xtail);
- /* 2^(k/N) ~= scale. */
- svuint64_t idx = svand_x (pg, ki, N_EXP - 1);
- svuint64_t top
- = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
- /* This is only a valid scale when -1023*N < k < 1024*N. */
- svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
- sbits = svadd_x (pg, sbits, top);
- /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]);
- tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp);
- tmp = svmla_x (pg, r, r2, tmp);
- svfloat64_t scale = svreinterpret_f64 (sbits);
- /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
- is no spurious underflow here even without fma. */
- z = svmla_x (pg, scale, scale, tmp);
-
- /* Update result with special and large cases. */
- if (__glibc_unlikely (svptest_any (pg, special)))
- z = sv_call_specialcase (tmp, sbits, ki, z, special);
-
- /* Handle underflow and overflow. */
- svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63);
- svbool_t x_is_neg = svcmpne (pg, sign_bit, 0);
- svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
- svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
- res_uoflow = svreinterpret_f64 (
- svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
- z = svsel (oflow, res_uoflow, z);
- /* Avoid spurious underflow for tiny x. */
- svfloat64_t res_spurious_uflow
- = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
- z = svsel (uflow, res_spurious_uflow, z);
-
- return z;
+ return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
}

static inline double
@@ -341,47 +384,39 @@ pow_sc (double x, double y)

svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
/* This preamble handles special case conditions used in the final scalar
fallbacks. It also updates ix and sign_bias, that are used in the core
computation too, i.e., exp( y * log (x) ). */
svuint64_t vix0 = svreinterpret_u64 (x);
svuint64_t viy0 = svreinterpret_u64 (y);
- svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52);

/* Negative x cases. */
- svuint64_t sign_bit = svlsr_m (pg, vix0, 63);
- svbool_t xisneg = svcmpeq (pg, sign_bit, 1);
+ svbool_t xisneg = svcmplt (pg, x, 0);

/* Set sign_bias and ix depending on sign of x and nature of y. */
- svbool_t yisnotint_xisneg = svpfalse_b ();
+ svbool_t yint_or_xpos = pg;
svuint64_t sign_bias = sv_u64 (0);
svuint64_t vix = vix0;
- svuint64_t vtopx1 = vtopx0;
if (__glibc_unlikely (svptest_any (pg, xisneg)))
{
/* Determine nature of y. */
- yisnotint_xisneg = sv_isnotint (xisneg, y);
- svbool_t yisint_xisneg = sv_isint (xisneg, y);
+ yint_or_xpos = sv_isint (xisneg, y);
svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
/* ix set to abs(ix) if y is integer. */
- vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff);
- vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff);
+ vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff);
/* Set to SignBias if x is negative and y is odd. */
sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
}

- /* Special cases of x or y: zero, inf and nan. */
- svbool_t xspecial = sv_zeroinfnan (pg, vix0);
- svbool_t yspecial = sv_zeroinfnan (pg, viy0);
- svbool_t special = svorr_z (pg, xspecial, yspecial);
-
/* Small cases of x: |x| < 0x1p-126. */
- svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff);
- svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX);
- if (__glibc_unlikely (svptest_any (pg, xsmall)))
+ svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX);
+ if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
{
/* Normalize subnormal x so exponent becomes negative. */
- svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0);
+ svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52);
+ svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0);

svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
@@ -391,20 +426,24 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)

/* y_hi = log(ix, &y_lo). */
svfloat64_t vlo;
- svfloat64_t vhi = sv_log_inline (pg, vix, &vlo);
+ svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d);

/* z = exp(y_hi, y_lo, sign_bias). */
- svfloat64_t vehi = svmul_x (pg, y, vhi);
- svfloat64_t velo = svmul_x (pg, y, vlo);
- svfloat64_t vemi = svmls_x (pg, vehi, y, vhi);
- velo = svsub_x (pg, velo, vemi);
- svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias);
+ svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi);
+ svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi);
+ svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo);
+ svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d);

/* Cases of finite y and finite negative x. */
- vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz);
+ vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan ("")));
+
+ /* Special cases of x or y: zero, inf and nan. */
+ svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0);
+ svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0);
+ svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial);

/* Cases of zero/inf/nan x or y. */
- if (__glibc_unlikely (svptest_any (pg, special)))
+ if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
vz = sv_call2_f64 (pow_sc, x, y, vz, special);

return vz;
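
The kd/ki change in sv_exp_core above retires the add-a-magic-constant rounding idiom in favour of svrinta plus a float-to-int convert. In scalar terms the two variants look like this (an illustrative sketch, not glibc code):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Old scheme: adding 0x1.8p52 forces z to round to an integer whose value
   sits in the low bits of the double; callers only use those low bits
   (masked or shifted), so the constant's own bits up top are harmless.  */
static uint64_t
round_via_shift (double z, double *kd)
{
  double t = z + 0x1.8p52;
  uint64_t ki;
  memcpy (&ki, &t, sizeof ki);
  *kd = t - 0x1.8p52;          /* Recover the rounded value as a double.  */
  return ki;
}

/* New scheme: one rounding instruction and one convert per lane.  */
static uint64_t
round_via_rinta (double z, double *kd)
{
  *kd = round (z);             /* svrinta: round to nearest, ties away.  */
  return (uint64_t) (int64_t) *kd;
}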
45
glibc-RHEL-118273-29.patch
Normal file
@@ -0,0 +1,45 @@
commit f5ff34cb3c75ec1061c75bb9188b3c1176426947
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Thu Feb 13 18:00:50 2025 +0000

AArch64: Improve codegen for SVE erfcf

Reduce number of MOV/MOVPRFXs and use unpredicated FMUL.
Replace MUL with LSL. Speedup on Neoverse V1: 6%.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c
index ecacb933aca40855..e4869263e31e18bc 100644
--- a/sysdeps/aarch64/fpu/erfcf_sve.c
+++ b/sysdeps/aarch64/fpu/erfcf_sve.c
@@ -76,7 +76,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);

/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
- i = svmul_x (pg, i, 2);
+ i = svlsl_x (svptrue_b32 (), i, 1);
const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
svfloat32_t erfcr = svld1_gather_index (pg, p, i);
svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
@@ -84,15 +84,15 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
/* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
svfloat32_t r = svsub_x (pg, z, shift);
svfloat32_t d = svsub_x (pg, a, r);
- svfloat32_t d2 = svmul_x (pg, d, d);
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);

svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
- svfloat32_t third = svdup_lane (coeffs, 0);

svfloat32_t p1 = r;
- svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
- svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
+ svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1);
+ svfloat32_t p3
+ = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);
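
A scalar picture (illustration only) of the index change above: the table stores {erfc(r), scale} pairs, so the gather index is doubled. A left shift computes the same value, and because integer shifts cannot trap, it can run unpredicated on all lanes, saving the MOVPRFX a predicated multiply would need:

static inline unsigned int
pair_index (unsigned int i)
{
  return i << 1;   /* Same result as i * 2 for in-range indices.  */
}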
873
glibc-RHEL-118273-3.patch
Normal file
@@ -0,0 +1,873 @@
commit b09fee1d21650428a6a3335408a46ebe1165d30d
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:40 2024 +0000

aarch64/fpu: Add vector variants of acosh

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 019c3a51880e2306..2e5bbb5a07f4c9b0 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,4 +1,5 @@
libmvec-supported-funcs = acos \
+ acosh \
asin \
atan \
atan2 \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 884b4b57f097635f..60e1cdeacec3f77e 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -79,6 +79,11 @@ libmvec {
_ZGVsMxv_tan;
}
GLIBC_2.40 {
+ _ZGVnN2v_acosh;
+ _ZGVnN2v_acoshf;
+ _ZGVnN4v_acoshf;
+ _ZGVsMxv_acosh;
+ _ZGVsMxv_acoshf;
_ZGVnN2v_cosh;
_ZGVnN2v_coshf;
_ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c
new file mode 100644
index 0000000000000000..c88283cf1191f4eb
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acosh_advsimd.c
@@ -0,0 +1,67 @@
+/* Double-precision vector (Advanced SIMD) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_V_LOG1P_K0_SHORTCUT 1
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one, thresh;
+} data = {
+ .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .thresh = V2 (0x1ff0000000000000) /* asuint64(0x1p511) - asuint64(1). */
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special,
+ const struct v_log1p_data *d)
+{
+ return v_call_f64 (acosh, x, log1p_inline (y, d), special);
+}
+
+/* Vector approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.02 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ _ZGVnN2v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5
+ want 0x1.f2d6d823bc9e2p-5. */
+VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh);
+ float64x2_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ if (__glibc_unlikely (v_any_u64 (special)))
+ x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
+#endif
+
+ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
+ float64x2_t y;
+ y = vaddq_f64 (x, v_f64 (1));
+ y = vmulq_f64 (y, xm1);
+ y = vsqrtq_f64 (y);
+ y = vaddq_f64 (xm1, y);
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (special_arg, y, special, &d->log1p_consts);
+ return log1p_inline (y, &d->log1p_consts);
+}
diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c
new file mode 100644
index 0000000000000000..3e4faaa5ca686c18
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acosh_sve.c
@@ -0,0 +1,51 @@
+/* Double-precision vector (SVE) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_SV_LOG1P_K0_SHORTCUT 1
+#include "sv_log1p_inline.h"
+
+#define One (0x3ff0000000000000)
+#define Thres (0x1ff0000000000000) /* asuint64 (0x1p511) - One. */
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (acosh, x, y, special);
+}
+
+/* SVE approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.19 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
+ want 0x1.ed23399f51373p-2. */
+svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
+{
+ /* (ix - One) >= (BigBound - One). */
+ svuint64_t ix = svreinterpret_u64 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
+
+ svfloat64_t xm1 = svsub_x (pg, x, 1.0);
+ svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0));
+ svfloat64_t y = svadd_x (pg, xm1, svsqrt_x (pg, u));
+
+ /* Fall back to scalar routine for special lanes. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, sv_log1p_inline (y, pg), special);
+ return sv_log1p_inline (y, pg);
+}
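
Both acosh kernels above lean on the same reduction to log1p; in scalar terms (a sketch of the identity, not the glibc implementation):

#include <math.h>

/* acosh(x) = log(x + sqrt(x^2 - 1))
            = log1p((x - 1) + sqrt((x - 1) * (x + 1))).
   Computing x^2 - 1 as (x - 1) * (x + 1) and feeding the small quantity
   xm1 + sqrt(u) to log1p keeps precision for x close to 1.  */
static double
acosh_via_log1p (double x)
{
  double xm1 = x - 1.0;
  double u = xm1 * (x + 1.0);
  return log1p (xm1 + sqrt (u));
}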
diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
new file mode 100644
index 0000000000000000..8916dcbf409922a9
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -0,0 +1,78 @@
+/* Single-precision vector (Advanced SIMD) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_log1pf_inline.h"
+
+#define SquareLim 0x1p64
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+ uint16x4_t thresh;
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .one = V4 (0x3f800000),
+ .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
+ const struct v_log1pf_data d)
+{
+ return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
+}
+
+/* Vector approximation for single-precision acosh, based on log1p. Maximum
+ error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
+ is 2.78 ULP:
+ __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
+ want 0x1.ef9ea2p-3.
+ With exceptions disabled, we can compute u with a shorter dependency chain,
+ which gives maximum error of 3.07 ULP:
+ __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
+ want 0x1.fbc7f4p-4. */
+
+VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+
+#if WANT_SIMD_EXCEPT
+ /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
+ only xm1 to calculate u, as operating on x will trigger invalid for NaN.
+ Widening sign-extend special predicate in order to mask with it. */
+ uint32x4_t p
+ = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special)));
+ float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
+ float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
+#else
+ float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
+ float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+#endif
+
+ float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
+
+ if (__glibc_unlikely (v_any_u16h (special)))
+ return special_case (x, y, special, d->log1pf_consts);
+ return log1pf_inline (y, d->log1pf_consts);
+}
+libmvec_hidden_def (V_NAME_F1 (acosh))
+HALF_WIDTH_ALIAS_F1 (acosh)
diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c
new file mode 100644
index 0000000000000000..2110894e629500be
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acoshf_sve.c
@@ -0,0 +1,49 @@
+/* Single-precision vector (SVE) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define One 0x3f800000
+#define Thres 0x20000000 /* asuint(0x1p64) - One. */
+
+#include "sv_log1pf_inline.h"
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (acoshf, x, y, special);
+}
+
+/* Single-precision SVE acosh(x) routine. Implements the same algorithm as
+ vector acoshf and log1p.
+
+ Maximum error is 2.78 ULPs:
+ SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4
+ want 0x1.f45b3cp-4. */
+svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
+{
+ svuint32_t ix = svreinterpret_u32 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
+
+ svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
+ svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
+ svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, y, special);
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index c63b2948d4938b0d..22fec4de77395e60 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */

libmvec_hidden_proto (V_NAME_F1(acos));
+libmvec_hidden_proto (V_NAME_F1(acosh));
libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(cos));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 8ca55098706a54c2..841330956c102ff1 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -33,6 +33,10 @@
# define __DECL_SIMD_acos __DECL_SIMD_aarch64
# undef __DECL_SIMD_acosf
# define __DECL_SIMD_acosf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_acosh
+# define __DECL_SIMD_acosh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_acoshf
+# define __DECL_SIMD_acoshf __DECL_SIMD_aarch64
# undef __DECL_SIMD_asin
# define __DECL_SIMD_asin __DECL_SIMD_aarch64
# undef __DECL_SIMD_asinf
@@ -125,6 +129,7 @@ typedef __SVBool_t __sv_bool_t;

__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
@@ -143,6 +148,7 @@ __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);

__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
@@ -166,6 +172,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);

__sv_f32_t _ZGVsMxvv_atan2f (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_acoshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
@@ -184,6 +191,7 @@ __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);

__sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_acosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h
new file mode 100644
index 0000000000000000..da019674f94dbac7
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h
@@ -0,0 +1,109 @@
+/* Helper for double-precision SVE routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_SV_LOG1P_INLINE_H
+#define AARCH64_FPU_SV_LOG1P_INLINE_H
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct sv_log1p_data
+{
+ double poly[19], ln2[2];
+ uint64_t hf_rt2_top;
+ uint64_t one_m_hf_rt2_top;
+ uint32_t bottom_mask;
+ int64_t one_top;
+} sv_log1p_data = {
+ /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1].
+ */
+ .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
+ 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
+ -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
+ 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
+ -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
+ 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
+ -0x1.cfa7385bdb37ep-6 },
+ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 },
+ .hf_rt2_top = 0x3fe6a09e00000000,
+ .one_m_hf_rt2_top = 0x00095f6200000000,
+ .bottom_mask = 0xffffffff,
+ .one_top = 0x3ff
+};
+
+static inline svfloat64_t
+sv_log1p_inline (svfloat64_t x, const svbool_t pg)
+{
+ /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which
+ differs from v_log1p_2u5.c by:
+ - No special-case handling - this should be dealt with by the caller.
+ - Pairwise Horner polynomial evaluation for improved accuracy.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+ using svsel, for improved accuracy when the argument to log1p is close
+ to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1
+ in the source of the caller before including this file.
+ See sv_log1p_2u1.c for details of the algorithm. */
+ const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data);
+ svfloat64_t m = svadd_x (pg, x, 1);
+ svuint64_t mi = svreinterpret_u64 (m);
+ svuint64_t u = svadd_x (pg, mi, d->one_m_hf_rt2_top);
+
+ svint64_t ki
+ = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), d->one_top);
+ svfloat64_t k = svcvt_f64_x (pg, ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ svuint64_t utop
+ = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hf_rt2_top);
+ svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, d->bottom_mask));
+ svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1);
+
+ /* Correction term c/m. */
+ svfloat64_t c = svsub_x (pg, x, svsub_x (pg, m, 1));
+ svfloat64_t cm;
+
+#ifndef WANT_SV_LOG1P_K0_SHORTCUT
+#error \
+ "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_SV_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ svbool_t knot0 = svcmpne (pg, k, 0);
+ cm = svdiv_z (knot0, c, m);
+ if (__glibc_likely (!svptest_any (pg, knot0)))
+ {
+ f = svsel (knot0, f, x);
+ }
+#else
+ /* No shortcut. */
+ cm = svdiv_x (pg, c, m);
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
+ svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]);
+ svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]);
+
+ return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
+}
+
+#endif
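
Worked in scalar C, the reduction sv_log1p_inline performs is roughly the following (a sketch of the scheme only: the real routine does the exponent split with the hf_rt2_top bit manipulation and evaluates the degree-19 polynomial rather than calling log1p):

#include <math.h>

static double
log1p_outline (double x)
{
  double m = 1.0 + x;               /* Rounded sum; error captured below.  */
  int k;
  double t = frexp (m, &k);         /* m = t * 2^k with t in [0.5, 1).  */
  if (t < M_SQRT1_2)                /* Re-centre so t straddles 1.  */
    {
      t *= 2.0;
      k--;
    }
  double f = t - 1.0;               /* f in [sqrt(2)/2 - 1, sqrt(2) - 1].  */
  double c = (x - (m - 1.0)) / m;   /* c/m: rounding error of the 1 + x step.  */
  return k * M_LN2 + log1p (f) + c; /* log1p(f) stands in for the poly.  */
}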
diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
new file mode 100644
index 0000000000000000..b94b2da055a6c59b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
@@ -0,0 +1,76 @@
+/* Helper for single-precision SVE routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_SV_LOG1PF_INLINE_H
+#define AARCH64_FPU_SV_LOG1PF_INLINE_H
+
+#include "sv_math.h"
+#include "vecmath_config.h"
+#include "poly_sve_f32.h"
+
+static const struct sv_log1pf_data
+{
+ float32_t poly[9];
+ float32_t ln2;
+ float32_t scale_back;
+} sv_log1pf_data = {
+ /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
+ .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
+ -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
+ -0x1.6f0d5ep-5f },
+ .scale_back = 0x1.0p-23f,
+ .ln2 = 0x1.62e43p-1f,
+};
+
+static inline svfloat32_t
+eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg)
+{
+ svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1]));
+ svfloat32_t m2 = svmul_x (pg, m, m);
+ svfloat32_t q = svmla_x (pg, m, m2, p_12);
+ svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2);
+ p = svmul_x (pg, m2, p);
+
+ return svmla_x (pg, q, m2, p);
+}
+
+static inline svfloat32_t
+sv_log1pf_inline (svfloat32_t x, svbool_t pg)
+{
+ const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);
+
+ svfloat32_t m = svadd_x (pg, x, 1.0f);
+
+ svint32_t ks = svsub_x (pg, svreinterpret_s32 (m),
+ svreinterpret_s32 (svdup_f32 (0.75f)));
+ ks = svand_x (pg, ks, 0xff800000);
+ svuint32_t k = svreinterpret_u32 (ks);
+ svfloat32_t s = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k));
+
+ svfloat32_t m_scale
+ = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k));
+ m_scale
+ = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
+ svfloat32_t p = eval_poly (m_scale, d->poly, pg);
+ svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back);
+ return svmla_x (pg, p, scale_back, d->ln2);
+}
+
+#endif
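The single-precision variant uses a cheaper all-integer exponent extraction. A scalar model follows; the asu32/asf32 helpers are hypothetical bit-cast shorthands, and log1pf stands in for eval_poly:

#include <math.h>
#include <stdint.h>
#include <string.h>

static uint32_t asu32 (float x) { uint32_t u; memcpy (&u, &x, 4); return u; }
static float asf32 (uint32_t u) { float x; memcpy (&x, &u, 4); return x; }

/* Scalar model of sv_log1pf_inline: k holds 2^23 * e, where e is chosen
   so that (1 + x) / 2^e lands in [0.75, 1.5); m_scale rebuilds that
   scaled argument minus 1 directly from the bits of x.  */
static float
log1pf_model (float x)
{
  float m = x + 1.0f;
  int32_t k = (int32_t) ((asu32 (m) - asu32 (0.75f)) & 0xff800000u);
  float s = asf32 (asu32 (4.0f) - (uint32_t) k);
  float m_scale = asf32 (asu32 (x) - (uint32_t) k)
		  + fmaf (0.25f, s, -1.0f);
  /* eval_poly approximates log1p on [-0.25, 0.5] here.  */
  float p = log1pf (m_scale);
  float scale_back = (float) k * 0x1.0p-23f;
  return fmaf (scale_back, 0x1.62e43p-1f, p);
}

For x = 0.5, k encodes e = 1, m_scale is -0.25 and the result is ln2 + log1p(-0.25) = log(1.5), as expected.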
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index b37cb7d5e9c0d96a..f4ce1d70096888aa 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -24,6 +24,7 @@
#define VEC_TYPE float64x2_t

VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
+VPCS_VECTOR_WRAPPER (acosh_advsimd, _ZGVnN2v_acosh)
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 011f07d2c15b148f..0e973cc9d7ade813 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -43,6 +43,7 @@
}

SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
+SVE_VECTOR_WRAPPER (acosh_sve, _ZGVsMxv_acosh)
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 35452991431e238a..0ce026b5ea96a064 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -24,6 +24,7 @@
#define VEC_TYPE float32x4_t

VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
+VPCS_VECTOR_WRAPPER (acoshf_advsimd, _ZGVnN4v_acoshf)
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index bbc74ede88c9e6c8..398b7373e800cd5b 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -43,6 +43,7 @@
}

SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
+SVE_VECTOR_WRAPPER (acoshf_sve, _ZGVsMxv_acoshf)
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h
new file mode 100644
index 0000000000000000..242e43b6eecc0b6e
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_log1p_inline.h
@@ -0,0 +1,103 @@
+/* Helper for double-precision Advanced SIMD routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_LOG1P_INLINE_H
+#define AARCH64_FPU_V_LOG1P_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+struct v_log1p_data
+{
+ float64x2_t poly[19], ln2[2];
+ uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
+ int64x2_t one_top;
+};
+
+/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
+#define V_LOG1P_CONSTANTS_TABLE \
+ { \
+ .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \
+ V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \
+ V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \
+ V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \
+ V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \
+ V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \
+ V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \
+ V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \
+ V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \
+ V2 (-0x1.cfa7385bdb37ep-6) }, \
+ .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \
+ .hf_rt2_top = V2 (0x3fe6a09e00000000), \
+ .one_m_hf_rt2_top = V2 (0x00095f6200000000), \
+ .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
+ }
+
+#define BottomMask v_u64 (0xffffffff)
+
+static inline float64x2_t
+log1p_inline (float64x2_t x, const struct v_log1p_data *d)
+{
+ /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
+ modifications:
+ - No special-case handling - this should be dealt with by the caller.
+ - Pairwise Horner polynomial evaluation for improved accuracy.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+ using v_sel, for improved accuracy when the argument to log1p is close to
+ 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
+ the source of the caller before including this file.
+ See v_log1pf_2u1.c for details of the algorithm. */
+ float64x2_t m = vaddq_f64 (x, v_f64 (1));
+ uint64x2_t mi = vreinterpretq_u64_f64 (m);
+ uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
+
+ int64x2_t ki
+ = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
+ float64x2_t k = vcvtq_f64_s64 (ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
+ uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
+
+ /* Correction term c/m. */
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
+
+#ifndef WANT_V_LOG1P_K0_SHORTCUT
+#error \
+ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_V_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ uint64x2_t k0 = vceqzq_f64 (k);
+ cm = v_zerofy_f64 (cm, k0);
+ f = vbslq_f64 (k0, x, f);
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
+ float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
+ float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
+ return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h
new file mode 100644
index 0000000000000000..643a6cdcfc498970
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h
@@ -0,0 +1,78 @@
+/* Helper for single-precision Advanced SIMD routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_LOG1PF_INLINE_H
+#define AARCH64_FPU_V_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+struct v_log1pf_data
+{
+ float32x4_t poly[8], ln2;
+ uint32x4_t four;
+ int32x4_t three_quarters;
+};
+
+/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
+ (1, -0.5) are not stored as they can be generated more efficiently. */
+#define V_LOG1PF_CONSTANTS_TABLE \
+ { \
+ .poly \
+ = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
+ V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
+ V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
+ }
+
+static inline float32x4_t
+eval_poly (float32x4_t m, const float32x4_t *c)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
+ uses split Estrin, but this way reduces register pressure in the calling
+ routine). */
+ float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
+ float32x4_t m2 = vmulq_f32 (m, m);
+ q = vfmaq_f32 (m, m2, q);
+ float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
+ p = vmulq_f32 (m2, p);
+ return vfmaq_f32 (q, m2, p);
+}
+
+static inline float32x4_t
+log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
+{
+ /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+ special-case handling. See that file for details of the algorithm. */
+ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+ int32x4_t k
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
+ v_s32 (0xff800000));
+ uint32x4_t ku = vreinterpretq_u32_s32 (k);
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
+ float32x4_t m_scale
+ = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+ m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+ float32x4_t p = eval_poly (m_scale, d.poly);
+ float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
+ return vfmaq_f32 (p, scale_back, d.ln2);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h
index d4d78bc4027abebb..12824fce8c698cf4 100644
--- a/sysdeps/aarch64/fpu/v_math.h
+++ b/sysdeps/aarch64/fpu/v_math.h
@@ -108,6 +108,11 @@ v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
p[2] ? f (x1[2], x2[2]) : y[2],
p[3] ? f (x1[3], x2[3]) : y[3] };
}
+static inline float32x4_t
+v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
+{
+ return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
+}

static inline float64x2_t
v_f64 (double x)
@@ -167,5 +172,10 @@ v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
return (float64x2_t){ p[0] ? f (x1[0], x2[0]) : y[0],
p[1] ? f (x1[1], x2[1]) : y[1] };
}
+static inline float64x2_t
+v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
+{
+ return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
+}

#endif
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 48d747ad5793be96..1646cdbdd22d93d9 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -34,11 +34,19 @@ double: 2
float: 2
ldouble: 4

+Function: "acosh_advsimd":
+double: 2
+float: 2
+
Function: "acosh_downward":
double: 2
float: 2
ldouble: 3

+Function: "acosh_sve":
+double: 2
+float: 2
+
Function: "acosh_towardzero":
double: 2
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index f66da42c3630bf48..f5aaa519f2c8663e 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -73,12 +73,17 @@ GLIBC_2.39 _ZGVsMxv_tan F
GLIBC_2.39 _ZGVsMxv_tanf F
GLIBC_2.39 _ZGVsMxvv_atan2 F
GLIBC_2.39 _ZGVsMxvv_atan2f F
+GLIBC_2.40 _ZGVnN2v_acosh F
+GLIBC_2.40 _ZGVnN2v_acoshf F
GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
+GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
+GLIBC_2.40 _ZGVsMxv_acosh F
+GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F

303
glibc-RHEL-118273-30.patch
Normal file
@@ -0,0 +1,303 @@
commit c0ff447edf19bd4630fe79adf5e8b896405b059f
Author: Luna Lamb <luna.lamb@arm.com>
Date: Thu Feb 13 17:54:46 2025 +0000

Aarch64: Improve codegen in SVE exp and users, and update expf_inline

Use unpredicated muls, and improve memory access.
7%, 3% and 1% improvement in throughput microbenchmark on Neoverse V1,
for exp, exp2 and cosh respectively.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
index 919f34604a452b4a..e375dd8a3407feb2 100644
--- a/sysdeps/aarch64/fpu/cosh_sve.c
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -23,7 +23,7 @@ static const struct data
{
float64_t poly[3];
float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
- uint64_t index_mask, special_bound;
+ uint64_t special_bound;
} data = {
.poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
0x1.5555576a59599p-5, },
@@ -35,14 +35,16 @@ static const struct data
.shift = 0x1.8p+52,
.thres = 704.0,

- .index_mask = 0xff,
/* 0x1.6p9, above which exp overflows. */
.special_bound = 0x4086000000000000,
};

static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special)
{
+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
+ svfloat64_t y = svadd_x (pg, half_t, half_over_t);
return sv_call_f64 (cosh, x, y, special);
}

@@ -60,12 +62,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)

svuint64_t u = svreinterpret_u64 (z);
svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
- svuint64_t i = svand_x (pg, u, d->index_mask);
+ svuint64_t i = svand_x (svptrue_b64 (), u, 0xff);

svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
y = svmla_x (pg, sv_f64 (1.0), r, y);
- y = svmul_x (pg, r, y);
+ y = svmul_x (svptrue_b64 (), r, y);

/* s = 2^(n/N). */
u = svld1_gather_index (pg, __v_exp_tail_data, i);
@@ -94,12 +96,12 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
/* Up to the point that exp overflows, we can use it to calculate cosh by
exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
svfloat64_t t = exp_inline (ax, pg, d);
- svfloat64_t half_t = svmul_x (pg, t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);

/* Fall back to scalar for any special cases. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+ return special_case (x, pg, t, special);

+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
return svadd_x (pg, half_t, half_over_t);
}
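In scalar terms the identity driving this kernel is simply the following (a sketch; the vector code inlines its own exp and routes overflowing lanes to the scalar fallback):

#include <math.h>

/* For |x| below the exp overflow threshold (thres = 704 above), cosh is
   computed as exp(|x|)/2 + 0.5/exp(|x|); svdivr_x supplies the
   reciprocal term in the vector code.  */
static double
cosh_model (double x)
{
  double t = exp (fabs (x));
  return 0.5 * t + 0.5 / t;
}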
diff --git a/sysdeps/aarch64/fpu/exp10_sve.c b/sysdeps/aarch64/fpu/exp10_sve.c
index ddf64708cb1773cd..bfd3fb9e1948a3b8 100644
--- a/sysdeps/aarch64/fpu/exp10_sve.c
+++ b/sysdeps/aarch64/fpu/exp10_sve.c
@@ -18,21 +18,23 @@
<https://www.gnu.org/licenses/>. */

#include "sv_math.h"
-#include "poly_sve_f64.h"

#define SpecialBound 307.0 /* floor (log10 (2^1023)). */

static const struct data
{
- double poly[5];
+ double c1, c3, c2, c4, c0;
double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound;
} data = {
/* Coefficients generated using Remez algorithm.
rel error: 0x1.9fcb9b3p-60
abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ]
max ulp err 0.52 +0.5. */
- .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1,
- 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 },
+ .c0 = 0x1.26bb1bbb55516p1,
+ .c1 = 0x1.53524c73cd32ap1,
+ .c2 = 0x1.0470591daeafbp1,
+ .c3 = 0x1.2bd77b1361ef6p0,
+ .c4 = 0x1.142b5d54e9621p-1,
/* 1.5*2^46+1023. This value is further explained below. */
.shift = 0x1.800000000ffc0p+46,
.log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */
@@ -70,9 +72,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, d->scale_thres);

- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);

return svsel (p_cmp, r1, r0);
}
@@ -103,11 +105,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg)
comes at significant performance cost. */
svuint64_t u = svreinterpret_u64 (z);
svfloat64_t scale = svexpa (u);
-
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
/* Approximate exp10(r) using polynomial. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2,
- sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1));
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ svfloat64_t p14 = svmla_x (pg, p12, p34, r2);
+
+ svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14);

/* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound
multiplication may overflow, so use special case routine. */
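The restructuring is the same across exp10, exp2 and exp below: adjacent coefficients are regrouped in the data struct so that one ld1rq broadcast plus an indexed FMLA replaces a per-coefficient dup. A minimal standalone sketch of the pattern, with made-up coefficients (compile with SVE enabled):

#include <arm_sve.h>

/* The pair {c1, c3} is broadcast to every 128-bit segment by a single
   ld1rq; svmla_lane then selects lane 0 or 1, so no extra register move
   or dup is needed per coefficient.  */
static const struct { double c1, c3, c0, c2; } poly_consts
    = { .c0 = 1.0, .c1 = 1.0, .c2 = 0.5, .c3 = 1.0 / 6.0 };

svfloat64_t
pairwise_poly_sketch (svbool_t pg, svfloat64_t r)
{
  svfloat64_t c13 = svld1rq_f64 (svptrue_b64 (), &poly_consts.c1);
  svfloat64_t r2 = svmul_f64_x (svptrue_b64 (), r, r);
  svfloat64_t p01 = svmla_lane_f64 (svdup_n_f64 (poly_consts.c0), r, c13, 0);
  svfloat64_t p23 = svmla_lane_f64 (svdup_n_f64 (poly_consts.c2), r, c13, 1);
  /* p01 + p23 * r^2 = c0 + c1 r + c2 r^2 + c3 r^3.  */
  return svmla_f64_x (pg, p01, p23, r2);
}

The unpredicated svmul_x (svptrue_b64 (), ...) forms map to plain FMUL rather than the predicated, more constrained encoding, which is where most of the quoted throughput gain comes from.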
diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c
index 22848ebfa5ac21d8..5dfb77cdbc2f6a51 100644
--- a/sysdeps/aarch64/fpu/exp2_sve.c
+++ b/sysdeps/aarch64/fpu/exp2_sve.c
@@ -18,7 +18,6 @@
<https://www.gnu.org/licenses/>. */

#include "sv_math.h"
-#include "poly_sve_f64.h"

#define N (1 << V_EXP_TABLE_BITS)

@@ -27,15 +26,15 @@

static const struct data
{
- double poly[4];
+ double c0, c2;
+ double c1, c3;
double shift, big_bound, uoflow_bound;
} data = {
/* Coefficients are computed using Remez algorithm with
minimisation of the absolute error. */
- .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5,
- 0x1.3b2abf5571ad8p-7 },
- .shift = 0x1.8p52 / N,
- .uoflow_bound = UOFlowBound,
+ .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3,
+ .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7,
+ .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound,
.big_bound = BigBound,
};

@@ -67,9 +66,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);

- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);

return svsel (p_cmp, r1, r0);
}
@@ -99,11 +98,14 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));

+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
/* Approximate exp2(r) using polynomial. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly);
- svfloat64_t y = svmul_x (pg, r, p);
-
+ /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+ svfloat64_t p = svmla_x (pg, p01, p23, r2);
+ svfloat64_t y = svmul_x (svptrue_b64 (), r, p);
/* Assemble exp2(x) = exp2(r) * scale. */
if (__glibc_unlikely (svptest_any (pg, special)))
return special_case (pg, scale, y, kd, d);
diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c
index aabaaa1d61dbab27..b2421d493f2e119f 100644
--- a/sysdeps/aarch64/fpu/exp_sve.c
+++ b/sysdeps/aarch64/fpu/exp_sve.c
@@ -21,12 +21,15 @@

static const struct data
{
- double poly[4];
+ double c0, c2;
+ double c1, c3;
double ln2_hi, ln2_lo, inv_ln2, shift, thres;
+
} data = {
- .poly = { /* ulp error: 0.53. */
- 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5,
- 0x1.1111266d28935p-7 },
+ .c0 = 0x1.fffffffffdbcdp-2,
+ .c1 = 0x1.555555555444cp-3,
+ .c2 = 0x1.555573c6a9f7dp-5,
+ .c3 = 0x1.1111266d28935p-7,
.ln2_hi = 0x1.62e42fefa3800p-1,
.ln2_lo = 0x1.ef35793c76730p-45,
/* 1/ln2. */
@@ -36,7 +39,6 @@ static const struct data
.thres = 704.0,
};

-#define C(i) sv_f64 (d->poly[i])
#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
@@ -56,20 +58,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
svuint64_t b
= svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */

- /* Set s1 to generate overflow depending on sign of exponent n. */
- svfloat64_t s1 = svreinterpret_f64 (
- svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */
- /* Offset s to avoid overflow in final result if n is below threshold. */
+ /* Set s1 to generate overflow depending on sign of exponent n,
+ ie. s1 = 0x70...0 - b. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold.
+ ie. s2 = as_u64 (s) - 0x3010...0 + b. */
svfloat64_t s2 = svreinterpret_f64 (
- svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2),
- b)); /* as_u64 (s) - 0x3010...0 + b. */
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));

/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, 1280.0);

- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);

return svsel (p_cmp, r1, r0);
}
@@ -103,16 +105,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
svuint64_t u = svreinterpret_u64 (z);
svfloat64_t n = svsub_x (pg, z, d->shift);
-
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
/* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */
svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
svfloat64_t r = svmls_lane (x, n, ln2, 0);
r = svmls_lane (r, n, ln2, 1);

/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t p01 = svmla_x (pg, C (0), C (1), r);
- svfloat64_t p23 = svmla_x (pg, C (2), C (3), r);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
svfloat64_t y = svmla_x (pg, r, p04, r2);

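The surrounding kernel relies on the classic shift-rounding reduction. A scalar model, with the split ln2 taken from the table above, the standard 1/ln2 constant assumed, and ldexp standing in for the table-driven 2^n reconstruction:

#include <math.h>

/* Adding shift = 0x1.8p52 forces x/ln2 to be rounded to an integer in
   the low bits of z; subtracting it back yields n, and r = x - n*ln2 is
   formed in two FMA steps so the remainder stays accurate.  */
static double
exp_reduce_model (double x)
{
  const double shift = 0x1.8p52;
  double z = fma (x, 0x1.71547652b82fep0, shift); /* x * 1/ln2 + shift.  */
  double n = z - shift;
  double r = fma (-n, 0x1.62e42fefa3800p-1, x);   /* ln2_hi.  */
  r = fma (-n, 0x1.ef35793c76730p-45, r);         /* ln2_lo.  */
  return ldexp (exp (r), (int) n);                /* exp(x) = 2^n exp(r).  */
}

The real routine scales the shift and 1/ln2 by the table size N and reads the 2^(n/N) scale from a table (or FEXPA in the SVE variants) instead of calling ldexp.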
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index 6166df65533555a6..75781fb4ddcb9790 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -61,7 +61,7 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
/* scale = 2^(n/N). */
svfloat32_t scale = svexpa (svreinterpret_u32 (z));

- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */
svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
@@ -71,5 +71,4 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)

return svmla_x (pg, scale, scale, poly);
}
-
#endif

194
glibc-RHEL-118273-31.patch
Normal file
@@ -0,0 +1,194 @@
commit 8f0e7fe61e0a2ad5ed777933703ce09053810ec4
Author: Luna Lamb <luna.lamb@arm.com>
Date: Thu Feb 13 17:52:09 2025 +0000

Aarch64: Improve codegen in SVE asinh

Use unpredicated muls, use lanewise mla's and improve memory access.
1% regression in throughput microbenchmark on Neoverse V1.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c
index 28dc5c458750bac4..fe8715e06c92ac51 100644
--- a/sysdeps/aarch64/fpu/asinh_sve.c
+++ b/sysdeps/aarch64/fpu/asinh_sve.c
@@ -18,36 +18,49 @@
<https://www.gnu.org/licenses/>. */

#include "sv_math.h"
-#include "poly_sve_f64.h"

#define SignMask (0x8000000000000000)
#define One (0x3ff0000000000000)
#define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). */
+#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1)

static const struct data
{
- double poly[18];
- double ln2, p3, p1, p4, p0, p2;
- uint64_t n;
- uint64_t off;
+ double even_coeffs[9];
+ double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17;
+ uint64_t off, mask;

} data = {
- /* Polynomial generated using Remez on [2^-26, 1]. */
- .poly
- = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
- 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
- -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
- 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
- -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
- 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 },
+ /* Polynomial generated using Remez on [2^-26, 1]. */
+ .even_coeffs ={
+ -0x1.55555555554a7p-3,
+ -0x1.6db6db68332e6p-5,
+ -0x1.6e8b8b654a621p-6,
+ -0x1.c9871d10885afp-7,
+ -0x1.3ddca533e9f54p-7,
+ -0x1.b90c7099dd397p-8,
+ -0x1.d217026a669ecp-9,
+ -0x1.e0f37daef9127p-11,
+ -0x1.021a48685e287p-14, },
+
+ .c1 = 0x1.3333333326c7p-4,
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c9 = 0x1.0becef748dafcp-7,
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c17 = 0x1.93d4ba83d34dap-18,
+
.ln2 = 0x1.62e42fefa39efp-1,
.p0 = -0x1.ffffffffffff7p-2,
.p1 = 0x1.55555555170d4p-2,
.p2 = -0x1.0000000399c27p-2,
.p3 = 0x1.999b2e90e94cap-3,
.p4 = -0x1.554e550bd501ep-3,
- .n = 1 << V_LOG_TABLE_BITS,
- .off = 0x3fe6900900000000
+ .off = 0x3fe6900900000000,
+ .mask = 0xfffULL << 52,
};

static svfloat64_t NOINLINE
@@ -64,11 +77,10 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
of the algorithm used. */

svuint64_t ix = svreinterpret_u64 (x);
- svuint64_t tmp = svsub_x (pg, ix, d->off);
- svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)),
- (d->n - 1) << 1);
- svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+ svuint64_t i_off = svsub_x (pg, ix, d->off);
+ svuint64_t i
+ = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask);
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask));
svfloat64_t z = svreinterpret_f64 (iz);

svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
@@ -78,14 +90,14 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1);

svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
- svfloat64_t kd = svcvt_f64_x (pg, k);
+ svfloat64_t kd
+ = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52));

svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0);
- svfloat64_t r2 = svmul_x (pg, r, r);
-
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1);
-
svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0);
+
y = svmla_lane (y, r2, p1_p4, 1);
y = svmla_x (pg, p, r2, y);
y = svmla_x (pg, hi, r2, y);
@@ -111,7 +123,6 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
svuint64_t iax = svbic_x (pg, ix, SignMask);
svuint64_t sign = svand_x (pg, ix, SignMask);
svfloat64_t ax = svreinterpret_f64 (iax);
-
svbool_t ge1 = svcmpge (pg, iax, One);
svbool_t special = svcmpge (pg, iax, Thres);

@@ -120,7 +131,7 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
svfloat64_t option_1 = sv_f64 (0);
if (__glibc_likely (svptest_any (pg, ge1)))
{
- svfloat64_t x2 = svmul_x (pg, ax, ax);
+ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
option_1 = __sv_log_inline (
svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg);
}
@@ -130,21 +141,53 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
The largest observed error in this region is 1.51 ULPs:
_ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
want 0x1.c1e649ee2681dp-1. */
+
svfloat64_t option_2 = sv_f64 (0);
if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1))))
{
- svfloat64_t x2 = svmul_x (pg, ax, ax);
- svfloat64_t x4 = svmul_x (pg, x2, x2);
- svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly);
- option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
+ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
+ svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2);
+ /* Order-17 Pairwise Horner scheme. */
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
+ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
+ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
+
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1);
+ svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0);
+ svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1);
+ svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0);
+ svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1);
+ svfloat64_t p1213
+ = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0);
+ svfloat64_t p1415
+ = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1);
+ svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17);
+
+ svfloat64_t p = svmla_x (pg, p1415, x4, p1617);
+ p = svmla_x (pg, p1213, x4, p);
+ p = svmla_x (pg, p1011, x4, p);
+ p = svmla_x (pg, p89, x4, p);
+
+ p = svmla_x (pg, p67, x4, p);
+ p = svmla_x (pg, p45, x4, p);
+
+ p = svmla_x (pg, p23, x4, p);
+
+ p = svmla_x (pg, p01, x4, p);
+
+ option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax));
}

if (__glibc_unlikely (svptest_any (pg, special)))
return special_case (
- x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)),
+ x,
+ svreinterpret_f64 (sveor_x (
+ pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)),
special);
+
+ /* Choose the right option for each lane. */
+ svfloat64_t y = svsel (ge1, option_1, option_2);
return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
}
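In scalar terms, the kernel's two branches and the pairwise-Horner shape look roughly like this (a sketch only; the real option_2 coefficients come from the Remez table above):

#include <math.h>

/* Pairwise Horner over x2: each c[2i] + c[2i+1]*x2 pair is an independent
   FMA, and the pairs are folded with x4 = x2^2, halving the dependency
   chain relative to plain Horner.  */
static double
pw_horner_even (const double *c, int npairs, double x2)
{
  double x4 = x2 * x2;
  double p = fma (x2, c[2 * npairs - 1], c[2 * npairs - 2]);
  for (int i = npairs - 2; i >= 0; i--)
    p = fma (x4, p, fma (x2, c[2 * i + 1], c[2 * i]));
  return p;
}

static double
asinh_model (double x, const double *poly) /* 18 coefficients.  */
{
  double ax = fabs (x);
  double y = ax >= 1.0
	     ? log (ax + sqrt (ax * ax + 1.0))          /* option_1.  */
	     : fma (pw_horner_even (poly, 9, ax * ax),
		    ax * ax * ax, ax);                  /* option_2.  */
  return copysign (y, x);
}

Splitting the 18 coefficients into even_coeffs plus lane-loaded odd constants is what lets the vector code form all nine pairs with lanewise MLAs before the fold.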

531
glibc-RHEL-118273-32.patch
Normal file
@@ -0,0 +1,531 @@
commit ce2f26a22e6b6f5c108d156afd9b43a452bb024c
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Tue Dec 31 18:07:36 2024 +0000

AArch64: Remove PTR_ARG/SIZE_ARG defines

This series removes various ILP32 defines that are now
no longer needed.

Remove PTR_ARG/SIZE_ARG.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>

Conflicts:
sysdeps/aarch64/dl-start.S
(Fixup context to apply without out-of-scope dependency 01f52b11de)
sysdeps/aarch64/multiarch/memcpy_thunderx.S
(Dropped by upstream commit e162ab2)
sysdeps/aarch64/multiarch/memcpy_oryon1.S
(Skipped: file from 4dc83cac is out-of-scope)
sysdeps/aarch64/multiarch/memset_oryon1.S
(Skipped: file from 2f1f7a5f is out-of-scope)

diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 7b6add751e6bd96b..452ba0da6d788ce8 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -47,8 +47,6 @@ ENTRY (__longjmp)
cfi_offset(d14, JB_D14<<3)
cfi_offset(d15, JB_D15<<3)

- PTR_ARG (0)
-
#if IS_IN(libc)
/* Disable ZA state of SME in libc.a and libc.so, but not in ld.so. */
# if HAVE_AARCH64_PAC_RET
diff --git a/sysdeps/aarch64/__mtag_tag_region.S b/sysdeps/aarch64/__mtag_tag_region.S
index 22e8d8b75372c8aa..90ac17ced4801f21 100644
--- a/sysdeps/aarch64/__mtag_tag_region.S
+++ b/sysdeps/aarch64/__mtag_tag_region.S
@@ -40,9 +40,6 @@
#define zva_val x4

ENTRY (__libc_mtag_tag_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
add dstend, dstin, count

cmp count, 96
diff --git a/sysdeps/aarch64/__mtag_tag_zero_region.S b/sysdeps/aarch64/__mtag_tag_zero_region.S
index 566698e9146e7da8..e975a2f8bdb85ae0 100644
--- a/sysdeps/aarch64/__mtag_tag_zero_region.S
+++ b/sysdeps/aarch64/__mtag_tag_zero_region.S
@@ -40,9 +40,6 @@
#define zva_val x4

ENTRY (__libc_mtag_tag_zero_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
add dstend, dstin, count

cmp count, 96
diff --git a/sysdeps/aarch64/dl-start.S b/sysdeps/aarch64/dl-start.S
index d645484e79858013..b7ac6c31432e07c9 100644
--- a/sysdeps/aarch64/dl-start.S
+++ b/sysdeps/aarch64/dl-start.S
@@ -26,7 +26,6 @@ ENTRY (_start)
mov x30, #0

mov x0, sp
- PTR_ARG (0)
bl _dl_start
/* Returns user entry point in x0. */
mov PTR_REG (21), PTR_REG (0)
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 9b253b39dd1d9d46..0aeaf64edd2594f1 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -75,7 +75,6 @@
.align 2
_dl_tlsdesc_return:
BTI_C
- PTR_ARG (0)
ldr PTR_REG (0), [x0, #PTR_SIZE]
RET
cfi_endproc
@@ -99,7 +98,6 @@ _dl_tlsdesc_undefweak:
BTI_C
str x1, [sp, #-16]!
cfi_adjust_cfa_offset (16)
- PTR_ARG (0)
ldr PTR_REG (0), [x0, #PTR_SIZE]
mrs x1, tpidr_el0
sub PTR_REG (0), PTR_REG (0), PTR_REG (1)
@@ -145,7 +143,6 @@ _dl_tlsdesc_undefweak:
.align 2
_dl_tlsdesc_dynamic:
BTI_C
- PTR_ARG (0)

/* Save just enough registers to support fast path, if we fall
into slow path we will save additional registers. */
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index a9fa40519c78b7df..7173c7fafa7d6eb5 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -57,8 +57,6 @@
exactly which byte matched. */

ENTRY (MEMCHR)
- PTR_ARG (0)
- SIZE_ARG (2)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 5afa79494bf9cb7f..68dfa604f4b1bd43 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -44,10 +44,6 @@


ENTRY (memcmp)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp limit, 16
b.lo L(less16)
ldp data1, data3, [src1]
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index f21c21d3f2a21d89..fba93faeba52447f 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -70,10 +70,6 @@
from the end. */

ENTRY (MEMCPY)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
add srcend, src, count
add dstend, dstin, count
cmp count, 128
@@ -187,10 +183,6 @@ libc_hidden_builtin_def (MEMCPY)


ENTRY (MEMMOVE)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
add srcend, src, count
add dstend, dstin, count
cmp count, 128
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index c5274f5ebf595268..1bd3e230ca197581 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -55,8 +55,6 @@
exactly which byte matched. */

ENTRY (__memrchr)
- PTR_ARG (0)
- SIZE_ARG (2)
add end, srcin, cntin
sub endm1, end, 1
bic src, endm1, 15
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 71814d0b2f6dd3a7..496ad332882a7e3d 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -40,9 +40,6 @@
#define dstend2 x5

ENTRY (MEMSET)
- PTR_ARG (0)
- SIZE_ARG (2)
-
dup v0.16B, valw
cmp count, 16
b.lo L(set_small)
diff --git a/sysdeps/aarch64/multiarch/memchr_nosimd.S b/sysdeps/aarch64/multiarch/memchr_nosimd.S
index 0a65139b0810e95b..b47059de1ee61f71 100644
--- a/sysdeps/aarch64/multiarch/memchr_nosimd.S
+++ b/sysdeps/aarch64/multiarch/memchr_nosimd.S
@@ -60,9 +60,6 @@

ENTRY (__memchr_nosimd)

- PTR_ARG (0)
- SIZE_ARG (2)
-
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(none_chr)

diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
index d826aafd80ed7b0b..fa693f7c3a5c28a3 100644
--- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -96,10 +96,6 @@

ENTRY (__memcpy_a64fx)

- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cntb vlen
cmp n, vlen, lsl 1
b.hi L(copy_small)
@@ -236,10 +232,6 @@ END (__memcpy_a64fx)

ENTRY_ALIGN (__memmove_a64fx, 4)

- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
/* Fast case for up to 2 vectors. */
cntb vlen
cmp n, vlen, lsl 1
diff --git a/sysdeps/aarch64/multiarch/memcpy_mops.S b/sysdeps/aarch64/multiarch/memcpy_mops.S
index b094af3d22bc4aeb..2c426f008e699101 100644
--- a/sysdeps/aarch64/multiarch/memcpy_mops.S
+++ b/sysdeps/aarch64/multiarch/memcpy_mops.S
@@ -26,10 +26,6 @@
*/

ENTRY (__memcpy_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */
.inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */
diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S
index 3ce49d79ecdb94e0..26375b47174f1ba8 100644
--- a/sysdeps/aarch64/multiarch/memcpy_sve.S
+++ b/sysdeps/aarch64/multiarch/memcpy_sve.S
@@ -61,10 +61,6 @@
.arch armv8.2-a+sve

ENTRY (__memcpy_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp count, 128
b.hi L(copy_long)
cntb vlen
@@ -144,10 +140,6 @@ END (__memcpy_sve)


ENTRY (__memmove_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp count, 128
b.hi L(move_long)
cntb vlen
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index 5d8438a82ea2a3be..02ea27f356fe8ea1 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -67,10 +67,6 @@

ENTRY (__memmove_thunderx)

- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
sub tmp1, dstin, src
cmp count, 96
ccmp tmp1, count, 2, hi
diff --git a/sysdeps/aarch64/multiarch/memmove_mops.S b/sysdeps/aarch64/multiarch/memmove_mops.S
index 7df0d22454ead375..229fccd9d5a7abd2 100644
--- a/sysdeps/aarch64/multiarch/memmove_mops.S
+++ b/sysdeps/aarch64/multiarch/memmove_mops.S
@@ -26,10 +26,6 @@
*/

ENTRY (__memmove_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */
.inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index 2e6d882fc931a882..9ea329a82ae7d0f6 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -48,8 +48,6 @@
#define BTI_C

ENTRY (__memset_a64fx)
- PTR_ARG (0)
- SIZE_ARG (2)

cntb vector_length
dup z0.b, valw
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 6d714ed0e1b396ef..5c33280e0f8bf85a 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -28,9 +28,6 @@

ENTRY (__memset_emag)

- PTR_ARG (0)
- SIZE_ARG (2)
-
bfi valw, valw, 8, 8
bfi valw, valw, 16, 16
bfi val, val, 32, 32
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
index 7b215501376cbe03..93f3bfb8cf7238a5 100644
--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -28,9 +28,6 @@

ENTRY (__memset_kunpeng)

- PTR_ARG (0)
- SIZE_ARG (2)
-
dup v0.16B, valw
add dstend, dstin, count

diff --git a/sysdeps/aarch64/multiarch/memset_mops.S b/sysdeps/aarch64/multiarch/memset_mops.S
index e879c81ab2d047b1..f13a0b561078137e 100644
--- a/sysdeps/aarch64/multiarch/memset_mops.S
+++ b/sysdeps/aarch64/multiarch/memset_mops.S
@@ -26,9 +26,6 @@
*/

ENTRY (__memset_mops)
- PTR_ARG (0)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x19c10443 /* setp [x3]!, x2!, x1 */
.inst 0x19c14443 /* setm [x3]!, x2!, x1 */
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 67dcc94adc587928..3118cd00663b0b25 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -87,7 +87,6 @@
character, return the length, if not, continue in the main loop. */

ENTRY (__strlen_asimd)
- PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
cmp tmp1, MIN_PAGE_SIZE - 32
b.hi L(page_cross)
diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S
index 43fdb1b2fb1b7b78..92dc34e3e9a2650c 100644
--- a/sysdeps/aarch64/setjmp.S
+++ b/sysdeps/aarch64/setjmp.S
@@ -34,8 +34,6 @@ END (_setjmp)
libc_hidden_def (_setjmp)

ENTRY (__sigsetjmp)
- PTR_ARG (0)
-
1:
stp x19, x20, [x0, #JB_X19<<3]
stp x21, x22, [x0, #JB_X21<<3]
diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index ca4c99e6bf9ac960..bc57283361e172ab 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -52,7 +52,6 @@
If it is not a multiple of 4, there was no match. */

ENTRY (strchr)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index e1a1c7eb4383e0f6..09e092bf5f847a7f 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -51,7 +51,6 @@
exactly which byte matched. */

ENTRY (__strchrnul)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S
index 47f6fb1448c464bf..7bf87073be304e0f 100644
--- a/sysdeps/aarch64/strcmp.S
+++ b/sysdeps/aarch64/strcmp.S
@@ -62,8 +62,6 @@
NUL too in big-endian, byte-reverse the data before the NUL check. */

ENTRY(strcmp)
- PTR_ARG (0)
- PTR_ARG (1)
sub off2, src2, src1
mov zeroones, REP8_01
and tmp, src1, 7
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index 705354060055a45e..62fb0248fa5a7ba3 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -69,8 +69,6 @@
exactly which byte matched. */

ENTRY (STRCPY)
- PTR_ARG (0)
- PTR_ARG (1)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index 352fb40d3abbb44b..0d10b6efb7b31e54 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -49,7 +49,6 @@
identifies the first zero byte. */

ENTRY (STRLEN)
- PTR_ARG (0)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index e4fb3506a80756b3..2a2264c0e5427225 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -49,8 +49,6 @@
identifies the first zero byte. */

ENTRY (__strnlen)
- PTR_ARG (0)
- SIZE_ARG (1)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
index e52c9b275347978c..402bce444ef3bb28 100644
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@@ -55,7 +55,6 @@
if the relevant byte matched the NUL end of string. */

ENTRY (strrchr)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
movi vrepmask.16b, 0x33
diff --git a/sysdeps/unix/sysv/linux/aarch64/clone.S b/sysdeps/unix/sysv/linux/aarch64/clone.S
index 0e7ee24e68c85377..fed19acc2f78351f 100644
--- a/sysdeps/unix/sysv/linux/aarch64/clone.S
+++ b/sysdeps/unix/sysv/linux/aarch64/clone.S
@@ -33,12 +33,6 @@
*/
.text
ENTRY(__clone)
- PTR_ARG (0)
- PTR_ARG (1)
- PTR_ARG (3)
- PTR_ARG (4)
- PTR_ARG (5)
- PTR_ARG (6)
/* Save args for the child. */
mov x10, x0
mov x11, x2
diff --git a/sysdeps/unix/sysv/linux/aarch64/clone3.S b/sysdeps/unix/sysv/linux/aarch64/clone3.S
index 92d69a5430518cbc..9b00b6b8853e9b8b 100644
--- a/sysdeps/unix/sysv/linux/aarch64/clone3.S
+++ b/sysdeps/unix/sysv/linux/aarch64/clone3.S
@@ -36,10 +36,6 @@

.text
ENTRY(__clone3)
- PTR_ARG (0)
- PTR_ARG (1)
- PTR_ARG (3)
- PTR_ARG (4)
/* Save args for the child. */
mov x10, x0 /* cl_args */
mov x11, x2 /* func */
diff --git a/sysdeps/unix/sysv/linux/aarch64/getcontext.S b/sysdeps/unix/sysv/linux/aarch64/getcontext.S
index e5b69c9a82b7a448..862bd67aa484ae1a 100644
--- a/sysdeps/unix/sysv/linux/aarch64/getcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/getcontext.S
@@ -30,7 +30,6 @@
.text

ENTRY(__getcontext)
- PTR_ARG (0)
/* The saved context will return to the getcontext() call point
with a return value of 0 */
str xzr, [x0, oX0 + 0 * SZREG]
diff --git a/sysdeps/unix/sysv/linux/aarch64/setcontext.S b/sysdeps/unix/sysv/linux/aarch64/setcontext.S
index ba659438c564dc3b..8c072781cdf98c2b 100644
--- a/sysdeps/unix/sysv/linux/aarch64/setcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/setcontext.S
@@ -34,7 +34,6 @@
.text

ENTRY (__setcontext)
- PTR_ARG (0)
/* Save a copy of UCP. */
mov x9, x0

diff --git a/sysdeps/unix/sysv/linux/aarch64/swapcontext.S b/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
index f049140d35b79ba6..7000f220368bb094 100644
--- a/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
@@ -27,7 +27,6 @@

.text
ENTRY(__swapcontext)
- PTR_ARG (0)
/* Set the value returned when swapcontext() returns in this context.
And set up x1 to become the return address of the caller, so we
can return there with a normal RET instead of an indirect jump. */

113
glibc-RHEL-118273-33.patch
Normal file
@@ -0,0 +1,113 @@
commit cf56eb28fa277d9dbb301654682ca89f71c30a48
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Tue Mar 18 17:07:31 2025 +0000

AArch64: Optimize algorithm in users of SVE expf helper

Polynomial order was unnecessarily high, unlocking multiple
optimizations.
Max error for new SVE expf is 0.88 +0.5ULP.
Max error for new SVE coshf is 2.56 +0.5ULP.
Performance improvement on Neoverse V1: expf (30%), coshf (26%).

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
index 7ad6efa0fc218278..508c0790ee89e0cd 100644
--- a/sysdeps/aarch64/fpu/coshf_sve.c
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -39,9 +39,9 @@ special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
}

/* Single-precision vector cosh, using vector expf.
- Maximum error is 2.77 ULP:
- _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
- want 0x1.e4594cp+2. */
+ Maximum error is 2.56 +0.5 ULP:
+ _ZGVsMxv_coshf(-0x1.5b40f4p+1) got 0x1.e47748p+2
+ want 0x1.e4774ep+2. */
svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
index da93e01b87e0e890..aee86a203379efb3 100644
--- a/sysdeps/aarch64/fpu/expf_sve.c
+++ b/sysdeps/aarch64/fpu/expf_sve.c
@@ -40,9 +40,9 @@ special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
}

/* Optimised single-precision SVE exp function.
- Worst-case error is 1.04 ulp:
- SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
- want 0x1.ba74bap+4. */
+ Worst-case error is 0.88 +0.50 ULP:
+ _ZGVsMxv_expf(-0x1.bba276p-6) got 0x1.f25288p-1
+ want 0x1.f2528ap-1. */
svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
|
||||
index 75781fb4ddcb9790..01fbb4d4c046eb3b 100644
|
||||
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
|
||||
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
|
||||
@@ -24,50 +24,40 @@
|
||||
|
||||
struct sv_expf_data
|
||||
{
|
||||
- float c1, c3, inv_ln2;
|
||||
- float ln2_lo, c0, c2, c4;
|
||||
- float ln2_hi, shift;
|
||||
+ float ln2_hi, ln2_lo, c1, null;
|
||||
+ float inv_ln2, shift;
|
||||
};
|
||||
|
||||
-/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
|
||||
- compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
|
||||
+/* Shift is 1.5*2^17 + 127. */
|
||||
#define SV_EXPF_DATA \
|
||||
{ \
|
||||
- /* Coefficients copied from the polynomial in AdvSIMD variant. */ \
|
||||
- .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \
|
||||
- .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \
|
||||
- .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
|
||||
- .shift = 0x1.803f8p17f, \
|
||||
+ .c1 = 0.5f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
|
||||
+ .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
|
||||
}
|
||||
|
||||
-#define C(i) sv_f32 (d->poly[i])
|
||||
-
|
||||
static inline svfloat32_t
|
||||
expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
|
||||
{
|
||||
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
|
||||
|
||||
- svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);
|
||||
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_hi);
|
||||
|
||||
/* n = round(x/(ln2/N)). */
|
||||
svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
|
||||
svfloat32_t n = svsub_x (pg, z, d->shift);
|
||||
|
||||
/* r = x - n*ln2/N. */
|
||||
- svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
|
||||
+ svfloat32_t r = x;
|
||||
r = svmls_lane (r, n, lane_consts, 0);
|
||||
+ r = svmls_lane (r, n, lane_consts, 1);
|
||||
|
||||
/* scale = 2^(n/N). */
|
||||
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
|
||||
|
||||
- /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */
|
||||
- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
|
||||
- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
|
||||
+ /* poly(r) = exp(r) - 1 ~= r + 0.5 r^2. */
|
||||
svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
|
||||
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
|
||||
- svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
|
||||
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
|
||||
+ svfloat32_t poly = svmla_lane (r, r2, lane_consts, 2);
|
||||
|
||||
return svmla_x (pg, scale, scale, poly);
|
||||
}
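A scalar sketch may help when reading the new expf_inline above. This is my own illustrative C model (function name expf_sketch is hypothetical; constants copied from SV_EXPF_DATA): it mirrors the reduction exp(x) = 2^n * (1 + poly(r)) with n stepped in multiples of 1/64, but uses roundf in place of the shift-constant trick and exp2f in place of the FEXPA instruction, so it is a sketch of the maths rather than the vector code.

#include <math.h>
#include <stdio.h>

/* Scalar model of the SVE expf algorithm (illustrative only):
   exp(x) = 2^n * (1 + poly(r)), x = n*ln2 + r, n a multiple of 1/64.
   FEXPA derives 2^n from the shifted value via a 2^(i/64) table;
   exp2f stands in for it here.  */
static float
expf_sketch (float x)
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f;
  const float ln2_lo = 0x1.7f7d1cp-20f;

  /* The real code gets this rounding by adding and subtracting the
     1.5*2^17 + 127 shift constant.  */
  float n = roundf (x * inv_ln2 * 64.0f) / 64.0f;
  float r = x - n * ln2_hi - n * ln2_lo;   /* r in [-ln2/128, ln2/128] */
  float scale = exp2f (n);                 /* models FEXPA */
  float poly = r + 0.5f * r * r;           /* exp(r) - 1 to order 2 */
  return scale + scale * poly;
}

int
main (void)
{
  printf ("sketch: %a  libm: %a\n", expf_sketch (1.5f), expf (1.5f));
  return 0;
}

With r confined to [-ln2/128, ln2/128] by the 1/64-granular reduction, the order-2 polynomial already gives the quoted 0.88 +0.5 ULP bound, which is what made the old degree-5 polynomial unnecessary.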
217
glibc-RHEL-118273-34.patch
Normal file
@@ -0,0 +1,217 @@
commit 4352e2cc934b2874dba37397157bf890fcee455a
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Fri Mar 28 14:27:45 2025 -0300

aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)

When libgcc is built with pac-ret, it needs to authenticate the
unwinding frames based on CFI information. The _dl_tlsdesc_dynamic
uses a custom calling convention, where it is responsible for saving
and restoring all registers it might use (even volatile).

The pac-ret support added by 1be3d6eb823d8b952fa54b7bbc90cbecb8981380
was added only on the slow-path, but the fast path also adds DWARF
Register Rule Instructions (cfi_adjust_cfa_offset) since it needs
to save/restore some auxiliary registers. It seems that this is not
fully supported by either libgcc or the AArch64 ABI [1].

Instead, move paciasp/autiasp to the function prologue/epilogue so
they are used on both fast and slow paths.

I also corrected the _dl_tlsdesc_dynamic comment description; it was
copied from the i386 implementation without any adjustment.

Checked on aarch64-linux-gnu with a toolchain built with
--enable-standard-branch-protection on a system with pac-ret
support.

[1] https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst#id1

Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>

Conflicts:
sysdeps/unix/sysv/linux/aarch64/Makefile
(Fixup context to apply without out-of-scope dependency f4d00dd60d)

diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 0aeaf64edd2594f1..36195c956855e024 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -119,20 +119,19 @@ _dl_tlsdesc_undefweak:
object referenced by the argument.

ptrdiff_t
- __attribute__ ((__regparm__ (1)))
_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
{
struct tlsdesc_dynamic_arg *td = tdp->arg;
- dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
+ dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer() + TCBHEAD_DTV);
if (__builtin_expect (td->gen_count <= dtv[0].counter
&& (dtv[td->tlsinfo.ti_module].pointer.val
!= TLS_DTV_UNALLOCATED),
1))
return dtv[td->tlsinfo.ti_module].pointer.val
+ td->tlsinfo.ti_offset
- - __thread_pointer;
+ - __thread_pointer();

- return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+ return __tls_get_addr (&td->tlsinfo) - __thread_pointer();
}
*/

@@ -142,7 +141,12 @@ _dl_tlsdesc_undefweak:
cfi_startproc
.align 2
_dl_tlsdesc_dynamic:
+# if HAVE_AARCH64_PAC_RET
+ PACIASP
+ cfi_window_save
+# else
BTI_C
+# endif

/* Save just enough registers to support fast path, if we fall
into slow path we will save additional registers. */
@@ -173,6 +177,10 @@ _dl_tlsdesc_dynamic:
1:
ldp x3, x4, [sp, #16]
ldp x1, x2, [sp], #32
+# if HAVE_AARCH64_PAC_RET
+ AUTIASP
+ cfi_window_save
+# endif
cfi_adjust_cfa_offset (-32)
RET
2:
@@ -182,10 +190,6 @@ _dl_tlsdesc_dynamic:

/* Save the remaining registers that we must treat as caller save. */
cfi_restore_state
-# if HAVE_AARCH64_PAC_RET
- PACIASP
- cfi_window_save
-# endif
# define NSAVEXREGPAIRS 8
stp x29, x30, [sp,#-16*NSAVEXREGPAIRS]!
cfi_adjust_cfa_offset (16*NSAVEXREGPAIRS)
@@ -236,10 +240,6 @@ _dl_tlsdesc_dynamic:
cfi_adjust_cfa_offset (-16*NSAVEXREGPAIRS)
cfi_restore (x29)
cfi_restore (x30)
-# if HAVE_AARCH64_PAC_RET
- AUTIASP
- cfi_window_save
-# endif
b 1b
cfi_endproc
.size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
diff --git a/sysdeps/unix/sysv/linux/aarch64/Makefile b/sysdeps/unix/sysv/linux/aarch64/Makefile
index 40b9a2e5dea1ea89..607a0c56d8dfad8d 100644
--- a/sysdeps/unix/sysv/linux/aarch64/Makefile
+++ b/sysdeps/unix/sysv/linux/aarch64/Makefile
@@ -1,3 +1,16 @@
+ifeq ($(subdir),elf)
+tests += \
+ tst-tlsdesc-pac \
+ # tests
+modules-names += \
+ tst-tlsdesc-pac-mod \
+ # modules-names
+
+LDFLAGS-tst-tlsdesc-pac = -rdynamic
+
+$(objpfx)tst-tlsdesc-pac.out: $(objpfx)tst-tlsdesc-pac-mod.so
+endif
+
ifeq ($(subdir),misc)
sysdep_headers += sys/elf.h
endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac-mod.c b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac-mod.c
new file mode 100644
index 0000000000000000..d34c8beda9b1986d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac-mod.c
@@ -0,0 +1,27 @@
+/* AArch64 tests for unwinding TLSDESC (BZ 32612)
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+_Thread_local int foo;
+/* Make the TLS segment large enough to trigger _dl_tlsdesc_dynamic. */
+_Thread_local int foobar[1000];
+
+void
+bar (void)
+{
+ foo = 1;
+}
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac.c b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac.c
new file mode 100644
index 0000000000000000..24d656aafc2784b4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac.c
@@ -0,0 +1,48 @@
+/* AArch64 tests for unwinding TLSDESC (BZ 32612)
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include <unwind.h>
+#include <support/xdlfcn.h>
+
+static _Unwind_Reason_Code
+unwind_callback (struct _Unwind_Context* context, void* closure)
+{
+ return _URC_NO_REASON;
+}
+
+/* Assume that TLS variable from tst-tlsdesc-pac-mod.so will trigger
+ the slow-path that allocates the required memory with malloc. */
+void *
+malloc (size_t s)
+{
+ _Unwind_Backtrace (unwind_callback, NULL);
+ return calloc (1, s);
+}
+
+static int
+do_test (void)
+{
+ void *h = xdlopen ("tst-tlsdesc-pac-mod.so", RTLD_LAZY);
+ void (*func)(void) = xdlsym (h, "bar");
+ func ();
+
+ return 0;
+}
+
+#include <support/test-driver.c>
76
glibc-RHEL-118273-35.patch
Normal file
@@ -0,0 +1,76 @@
commit 691edbdf7727466ba87e27a8eeae1c3bc5824ef5
Author: Yury Khrustalev <yury.khrustalev@arm.com>
Date: Thu May 8 13:53:38 2025 +0100

aarch64: fix unwinding in longjmp

Previously, longjmp() on aarch64 was using CFI directives around the
call to __libc_arm_za_disable() after CFA was redefined at the start
of longjmp(). This may result in unwinding issues. Move the call and
surrounding CFI directives to the beginning of longjmp().

Suggested-by: Wilco Dijkstra <wilco.dijkstra@arm.com>

diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 452ba0da6d788ce8..30b36cb25d921795 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -24,28 +24,6 @@
/* __longjmp(jmpbuf, val) */

ENTRY (__longjmp)
- cfi_def_cfa(x0, 0)
- cfi_offset(x19, JB_X19<<3)
- cfi_offset(x20, JB_X20<<3)
- cfi_offset(x21, JB_X21<<3)
- cfi_offset(x22, JB_X22<<3)
- cfi_offset(x23, JB_X23<<3)
- cfi_offset(x24, JB_X24<<3)
- cfi_offset(x25, JB_X25<<3)
- cfi_offset(x26, JB_X26<<3)
- cfi_offset(x27, JB_X27<<3)
- cfi_offset(x28, JB_X28<<3)
- cfi_offset(x29, JB_X29<<3)
- cfi_offset(x30, JB_LR<<3)
-
- cfi_offset( d8, JB_D8<<3)
- cfi_offset( d9, JB_D9<<3)
- cfi_offset(d10, JB_D10<<3)
- cfi_offset(d11, JB_D11<<3)
- cfi_offset(d12, JB_D12<<3)
- cfi_offset(d13, JB_D13<<3)
- cfi_offset(d14, JB_D14<<3)
- cfi_offset(d15, JB_D15<<3)

#if IS_IN(libc)
/* Disable ZA state of SME in libc.a and libc.so, but not in ld.so. */
@@ -69,6 +47,29 @@ ENTRY (__longjmp)
# endif
#endif

+ cfi_def_cfa (x0, 0)
+ cfi_offset (x19, JB_X19<<3)
+ cfi_offset (x20, JB_X20<<3)
+ cfi_offset (x21, JB_X21<<3)
+ cfi_offset (x22, JB_X22<<3)
+ cfi_offset (x23, JB_X23<<3)
+ cfi_offset (x24, JB_X24<<3)
+ cfi_offset (x25, JB_X25<<3)
+ cfi_offset (x26, JB_X26<<3)
+ cfi_offset (x27, JB_X27<<3)
+ cfi_offset (x28, JB_X28<<3)
+ cfi_offset (x29, JB_X29<<3)
+ cfi_offset (x30, JB_LR<<3)
+
+ cfi_offset ( d8, JB_D8<<3)
+ cfi_offset ( d9, JB_D9<<3)
+ cfi_offset (d10, JB_D10<<3)
+ cfi_offset (d11, JB_D11<<3)
+ cfi_offset (d12, JB_D12<<3)
+ cfi_offset (d13, JB_D13<<3)
+ cfi_offset (d14, JB_D14<<3)
+ cfi_offset (d15, JB_D15<<3)
+
ldp x19, x20, [x0, #JB_X19<<3]
ldp x21, x22, [x0, #JB_X21<<3]
ldp x23, x24, [x0, #JB_X23<<3]
29
glibc-RHEL-118273-36.patch
Normal file
@@ -0,0 +1,29 @@
commit aa18367c1169700f610565eba8acf3e08429fcf5
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu May 29 15:08:15 2025 +0000

AArch64: Improve enabling of SVE for libmvec

When using a -mcpu option in CFLAGS, GCC can report errors when building libmvec.
Fix this by overriding both -mcpu and -march with a generic variant with SVE added.
Also use a tune for a modern SVE core.

Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>

diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index be8541f6496d6688..aa547b21df5f41d9 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -49,8 +49,11 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
v_powf_data
endif

-sve-cflags = -march=armv8-a+sve
+# Enable SVE for building libmvec. Since CFLAGS may contain a -mcpu or -march,
+# add a generic -mcpu and -march with SVE enabled. Also use a tune for a modern
+# SVE core.

+sve-cflags = -mcpu=generic+sve -march=armv8-a+sve -mtune=neoverse-v2

ifeq ($(build-mathvec),yes)
bench-libmvec = $(addprefix float-advsimd-,$(float-advsimd-funcs)) \
24
glibc-RHEL-118273-37.patch
Normal file
@@ -0,0 +1,24 @@
commit 09795c5612c630db605886dfd55dbf56f381d128
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Fri Jun 6 13:15:30 2025 +0000

AArch64: Fix builderror with GCC 12.1/12.2

Early versions of GCC 12 didn't support -mtune=neoverse-v2, so use
-mtune=neoverse-v1 instead.

Reported-by: Yury Khrustalev <yury.khrustalev@arm.com>

diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index aa547b21df5f41d9..c8a6fb4628d13aec 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -53,7 +53,7 @@ endif
# add a generic -mcpu and -march with SVE enabled. Also use a tune for a modern
# SVE core.

-sve-cflags = -mcpu=generic+sve -march=armv8-a+sve -mtune=neoverse-v2
+sve-cflags = -mcpu=generic+sve -march=armv8-a+sve -mtune=neoverse-v1

ifeq ($(build-mathvec),yes)
bench-libmvec = $(addprefix float-advsimd-,$(float-advsimd-funcs)) \
188
glibc-RHEL-118273-38.patch
Normal file
@@ -0,0 +1,188 @@
commit 6849c5b791edd216f2ec3fdbe4d138bc69b9b333
Author: Luna Lamb <luna.lamb@arm.com>
Date: Wed Jun 18 16:12:19 2025 +0000

AArch64: Improve codegen SVE log1p helper

Improve codegen by packing coefficients.
4% and 2% improvement in throughput microbenchmark on Neoverse V1, for acosh
and atanh respectively.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c
index 3e4faaa5ca686c18..78ebcffbb5737641 100644
--- a/sysdeps/aarch64/fpu/acosh_sve.c
+++ b/sysdeps/aarch64/fpu/acosh_sve.c
@@ -30,10 +30,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
}

/* SVE approximation for double-precision acosh, based on log1p.
- The largest observed error is 3.19 ULP in the region where the
+ The largest observed error is 3.14 ULP in the region where the
argument to log1p falls in the k=0 interval, i.e. x close to 1:
- SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
- want 0x1.ed23399f51373p-2. */
+ SV_NAME_D1 (acosh)(0x1.1e80ed12f0ad1p+0) got 0x1.ef0cee7c33ce1p-2
+ want 0x1.ef0cee7c33ce4p-2. */
svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
{
/* (ix - One) >= (BigBound - One). */
diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c
index 7a52728d70f6d226..a4803e5c1305379e 100644
--- a/sysdeps/aarch64/fpu/atanh_sve.c
+++ b/sysdeps/aarch64/fpu/atanh_sve.c
@@ -30,7 +30,7 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
}

/* SVE approximation for double-precision atanh, based on log1p.
- The greatest observed error is 2.81 ULP:
+ The greatest observed error is 3.3 ULP:
_ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
want 0x1.ffd8ff31b501cp-6. */
svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
@@ -42,7 +42,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));

/* It is special if iax >= 1. */
-// svbool_t special = svcmpge (pg, iax, One);
svbool_t special = svacge (pg, x, 1.0);

/* Computation is performed based on the following sequence of equality:
diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h
index da019674f94dbac7..a9ecd75d19e95d39 100644
--- a/sysdeps/aarch64/fpu/sv_log1p_inline.h
+++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h
@@ -21,11 +21,12 @@
#define AARCH64_FPU_SV_LOG1P_INLINE_H

#include "sv_math.h"
-#include "poly_sve_f64.h"

static const struct sv_log1p_data
{
- double poly[19], ln2[2];
+ double c0, c2, c4, c6, c8, c10, c12, c14, c16;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
+ double ln2_lo, ln2_hi;
uint64_t hf_rt2_top;
uint64_t one_m_hf_rt2_top;
uint32_t bottom_mask;
@@ -33,15 +34,30 @@ static const struct sv_log1p_data
} sv_log1p_data = {
/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1].
*/
- .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
- 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
- -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
- 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
- -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
- 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
- -0x1.cfa7385bdb37ep-6 },
- .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 },
+ .c0 = -0x1.ffffffffffffbp-2,
+ .c1 = 0x1.55555555551a9p-2,
+ .c2 = -0x1.00000000008e3p-2,
+ .c3 = 0x1.9999999a32797p-3,
+ .c4 = -0x1.555555552fecfp-3,
+ .c5 = 0x1.249248e071e5ap-3,
+ .c6 = -0x1.ffffff8bf8482p-4,
+ .c7 = 0x1.c71c8f07da57ap-4,
+ .c8 = -0x1.9999ca4ccb617p-4,
+ .c9 = 0x1.7459ad2e1dfa3p-4,
+ .c10 = -0x1.554d2680a3ff2p-4,
+ .c11 = 0x1.3b4c54d487455p-4,
+ .c12 = -0x1.2548a9ffe80e6p-4,
+ .c13 = 0x1.0f389a24b2e07p-4,
+ .c14 = -0x1.eee4db15db335p-5,
+ .c15 = 0x1.e95b494d4a5ddp-5,
+ .c16 = -0x1.15fdf07cb7c73p-4,
+ .c17 = 0x1.0310b70800fcfp-4,
+ .c18 = -0x1.cfa7385bdb37ep-6,
+ .ln2_lo = 0x1.62e42fefa3800p-1,
+ .ln2_hi = 0x1.ef35793c76730p-45,
+ /* top32(asuint64(sqrt(2)/2)) << 32. */
.hf_rt2_top = 0x3fe6a09e00000000,
+ /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
.one_m_hf_rt2_top = 0x00095f6200000000,
.bottom_mask = 0xffffffff,
.one_top = 0x3ff
@@ -51,14 +67,14 @@ static inline svfloat64_t
sv_log1p_inline (svfloat64_t x, const svbool_t pg)
{
/* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which
- differs from v_log1p_2u5.c by:
+ differs from advsimd/log1p.c by:
- No special-case handling - this should be dealt with by the caller.
- Pairwise Horner polynomial evaluation for improved accuracy.
- Optionally simulate the shortcut for k=0, used in the scalar routine,
using svsel, for improved accuracy when the argument to log1p is close
to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1
in the source of the caller before including this file.
- See sv_log1p_2u1.c for details of the algorithm. */
+ See sve/log1p.c for details of the algorithm. */
const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data);
svfloat64_t m = svadd_x (pg, x, 1);
svuint64_t mi = svreinterpret_u64 (m);
@@ -79,7 +95,7 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
svfloat64_t cm;

#ifndef WANT_SV_LOG1P_K0_SHORTCUT
-#error \
+#error \
"Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
#elif WANT_SV_LOG1P_K0_SHORTCUT
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
@@ -96,14 +112,46 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
#endif

/* Approximate log1p(f) on the reduced input using a polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly);
+ svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f),
+ f4 = svmul_x (svptrue_b64 (), f2, f2),
+ f8 = svmul_x (svptrue_b64 (), f4, f4),
+ f16 = svmul_x (svptrue_b64 (), f8, f8);
+
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
+ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
+ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
+ svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17);
+
+ /* Order-18 Estrin scheme. */
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1);
+ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0);
+ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1);
+
+ svfloat64_t p03 = svmla_x (pg, p01, f2, p23);
+ svfloat64_t p47 = svmla_x (pg, p45, f2, p67);
+ svfloat64_t p07 = svmla_x (pg, p03, f4, p47);
+
+ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0);
+ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1);
+ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0);
+ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1);
+
+ svfloat64_t p811 = svmla_x (pg, p89, f2, p1011);
+ svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415);
+ svfloat64_t p815 = svmla_x (pg, p811, f4, p1215);
+
+ svfloat64_t p015 = svmla_x (pg, p07, f8, p815);
+ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0);
+ svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1);
+ svfloat64_t p = svmla_x (pg, p015, f16, p1618);

/* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
- svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]);
- svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]);
+ svfloat64_t ln2_lo_hi = svld1rq (svptrue_b64 (), &d->ln2_lo);
+ svfloat64_t ylo = svmla_lane (cm, k, ln2_lo_hi, 0);
+ svfloat64_t yhi = svmla_lane (f, k, ln2_lo_hi, 1);

- return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
+ return svmad_x (pg, p, f2, svadd_x (pg, ylo, yhi));
}
-
#endif
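The order-18 Estrin scheme above is easier to follow in scalar form. Below is a minimal illustrative sketch at degree 7 (the function name estrin_7 and the exp Taylor coefficients are mine, not from the patch): each p01/p23/... pair is one fused multiply-add, the pairs are mutually independent, and precomputed powers x^2, x^4 merge the subtrees — the same shape the SVE code builds with svmla_lane and f2/f4/f8/f16, which shortens the dependency chain compared with pairwise Horner.

#include <stdio.h>

/* Estrin evaluation of c0 + c1*x + ... + c7*x^7 (illustrative).  */
static double
estrin_7 (const double c[8], double x)
{
  double x2 = x * x, x4 = x2 * x2;
  double p01 = c[0] + c[1] * x;   /* independent of p23/p45/p67 */
  double p23 = c[2] + c[3] * x;
  double p45 = c[4] + c[5] * x;
  double p67 = c[6] + c[7] * x;
  double p03 = p01 + x2 * p23;    /* degree 0..3 */
  double p47 = p45 + x2 * p67;    /* degree 4..7, scaled by x^4 below */
  return p03 + x4 * p47;
}

int
main (void)
{
  /* Taylor terms of exp, purely for demonstration.  */
  double c[8] = { 1, 1, 1.0 / 2, 1.0 / 6, 1.0 / 24,
                  1.0 / 120, 1.0 / 720, 1.0 / 5040 };
  printf ("%f\n", estrin_7 (c, 0.1)); /* ~exp(0.1) = 1.105171 */
  return 0;
}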
583
glibc-RHEL-118273-39.patch
Normal file
@@ -0,0 +1,583 @@
commit dee22d2a81ab59afc165fb6dcb45d723f13582a0
Author: Dylan Fleming <Dylan.Fleming@arm.com>
Date: Wed Jun 18 16:19:22 2025 +0000

AArch64: Optimise SVE FP64 Hyperbolics

Rework SVE FP64 hyperbolics to use the SVE FEXPA
instruction.

Also update the special case handling for large
inputs to be entirely vectorised.

Performance improvements on Neoverse V1:

cosh_sve: 19% for |x| < 709, 5x otherwise
sinh_sve: 24% for |x| < 709, 5.9x otherwise
tanh_sve: 12% for |x| < 19, 9x otherwise

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
index e375dd8a3407feb2..3561893ae614e2ea 100644
--- a/sysdeps/aarch64/fpu/cosh_sve.c
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -21,71 +21,99 @@

static const struct data
{
- float64_t poly[3];
- float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ double c0, c2;
+ double c1, c3;
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift;
uint64_t special_bound;
} data = {
- .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
- 0x1.5555576a59599p-5, },
-
- .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
- /* -ln2/N. */
- .ln2_hi = -0x1.62e42fefa39efp-9,
- .ln2_lo = -0x1.abc9e3b39803f3p-64,
- .shift = 0x1.8p+52,
- .thres = 704.0,
-
- /* 0x1.6p9, above which exp overflows. */
- .special_bound = 0x4086000000000000,
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1.fffffffffdbcdp-2,
+ .c1 = 0x1.555555555444cp-3,
+ .c2 = 0x1.555573c6a9f7dp-5,
+ .c3 = 0x1.1111266d28935p-7,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ /* 1/ln2. */
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022. */
+
+ /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows. */
+ .special_bound = 0x40862e37e7d8ba72,
};

-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special)
-{
- svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
- svfloat64_t y = svadd_x (pg, half_t, half_over_t);
- return sv_call_f64 (cosh, x, y, special);
-}
-
-/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
- special-case handling or tail. */
+/* Helper for approximating exp(x)/2.
+ Functionally identical to FEXPA exp(x), but an adjustment in
+ the shift value which leads to a reduction in the exponent of scale by 1,
+ thus halving the result at no cost. */
static inline svfloat64_t
-exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
{
/* Calculate exp(x). */
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
svfloat64_t n = svsub_x (pg, z, d->shift);

- svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
- r = svmla_x (pg, r, n, d->ln2_lo);
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);

- svuint64_t u = svreinterpret_u64 (z);
- svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
- svuint64_t i = svand_x (svptrue_b64 (), u, 0xff);
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);

- svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
- y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
- y = svmla_x (pg, sv_f64 (1.0), r, y);
- y = svmul_x (svptrue_b64 (), r, y);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+ svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
+ svfloat64_t p = svmla_x (pg, r, p04, r2);

- /* s = 2^(n/N). */
- u = svld1_gather_index (pg, __v_exp_tail_data, i);
- svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
+ svfloat64_t scale = svexpa (u);

- return svmla_x (pg, s, s, y);
+ return svmla_x (pg, scale, scale, p);
+}
+
+/* Vectorised special case to handle values past where exp_inline overflows.
+ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
+ the valid range of inputs, and returns inf for anything past that. */
+static svfloat64_t NOINLINE
+special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t,
+ const struct data *d)
+{
+ /* Finish fast path to compute values for non-special cases. */
+ svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25);
+ svfloat64_t y = svadd_x (pg, t, inv_twoexp);
+
+ /* Halves input value, and then check if any cases
+ are still going to overflow. */
+ ax = svmul_x (special, ax, 0.5);
+ svbool_t is_safe
+ = svcmplt (special, svreinterpret_u64 (ax), d->special_bound);
+
+ /* Computes exp(x/2), and sets any overflowing lanes to inf. */
+ svfloat64_t half_exp = exp_over_two_inline (special, ax, d);
+ half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY));
+
+ /* Construct special case cosh(x) = (exp(x/2)^2)/2. */
+ svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2);
+ svfloat64_t special_y = svmul_x (special, exp, half_exp);
+
+ /* Select correct return values for special and non-special cases. */
+ special_y = svsel (special, special_y, y);
+
+ /* Ensure an input of nan is correctly propagated. */
+ svbool_t is_nan
+ = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000));
+ return svsel (is_nan, ax, svsel (special, special_y, y));
}

/* Approximation for SVE double-precision cosh(x) using exp_inline.
cosh(x) = (exp(x) + exp(-x)) / 2.
- The greatest observed error is in the scalar fall-back region, so is the
- same as the scalar routine, 1.93 ULP:
- _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
- want 0x1.fd774e958236fp+1021.
-
- The greatest observed error in the non-special region is 1.54 ULP:
- _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
- want 0x1.f5e2bb8d5c991p+8. */
+ The greatest observed error in special case region is 2.66 + 0.5 ULP:
+ _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023
+ want 0x1.f9b2d3d22399bp+1023
+
+ The greatest observed error in the non-special region is 1.01 + 0.5 ULP:
+ _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3
+ want 0x1.890b225657f82p+3. */
svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
@@ -94,14 +122,13 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);

/* Up to the point that exp overflows, we can use it to calculate cosh by
- exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
- svfloat64_t t = exp_inline (ax, pg, d);
+ exp(|x|)/2 + 1 / (2 * exp(|x|)). */
+ svfloat64_t half_exp = exp_over_two_inline (pg, ax, d);

- /* Fall back to scalar for any special cases. */
+ /* Falls back to entirely standalone vectorized special case. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, pg, t, special);
+ return special_case (pg, special, ax, half_exp, d);

- svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
- return svadd_x (pg, half_t, half_over_t);
+ svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25);
+ return svadd_x (pg, half_exp, inv_twoexp);
}
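The identities behind exp_over_two_inline and the cosh special case, written out as a reading aid (my own summary; the bias arithmetic is inferred from the shift comments above, where ...ff80 encodes 1022 and ...ffc0 encodes 1023):

e^{x} = 2^{n}\,(1 + p(r)), \qquad x = n\ln 2 + r,\ |r| \le \tfrac{\ln 2}{128}
\tfrac{1}{2}e^{x} = 2^{n-1}\,(1 + p(r)) \quad\text{(shift biased by $-1$, so FEXPA sees an exponent one smaller)}
\cosh x = \tfrac{1}{2}e^{|x|} + \frac{0.25}{\tfrac{1}{2}e^{|x|}} \quad\text{(fast path: half\_exp + 0.25/half\_exp)}
\cosh x \approx \tfrac{1}{2}\bigl(e^{|x|/2}\bigr)^{2} = 2h^{2},\ h = \tfrac{1}{2}e^{|x|/2} \quad\text{(special case, doubling the input range)}

Baking the halving into the shift constant is what makes exp(x)/2 free; the special case then only needs one extra multiply to square the half-range exponential.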
diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c
index df5f6c8c06e5b173..ac7b306018bda613 100644
--- a/sysdeps/aarch64/fpu/sinh_sve.c
+++ b/sysdeps/aarch64/fpu/sinh_sve.c
@@ -18,90 +18,153 @@
<https://www.gnu.org/licenses/>. */

#include "sv_math.h"
-#include "poly_sve_f64.h"

static const struct data
{
- float64_t poly[11];
- float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift;
uint64_t halff;
- int64_t onef;
- uint64_t large_bound;
+ double c2, c4;
+ double inv_ln2;
+ double ln2_hi, ln2_lo;
+ double c0, c1, c3;
+ double shift, special_bound, bound;
+ uint64_t expm1_data[20];
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .inv_ln2 = 0x1.71547652b82fep0,
- .m_ln2_hi = -0x1.62e42fefa39efp-1,
- .m_ln2_lo = -0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
-
+ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
+ .expm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
+ },
+
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
.halff = 0x3fe0000000000000,
- .onef = 0x3ff0000000000000,
- /* 2^9. expm1 helper overflows for large input. */
- .large_bound = 0x4080000000000000,
+ .special_bound = 0x1.62e37e7d8ba72p+9, /* ln(2^(1024 - 1/128)). */
+ .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64. */
};

+/* A specialised FEXPA expm1 that is only valid for positive inputs and
+ has no special cases. Based off the full FEXPA expm1 implemented for
+ _ZGVsMxv_expm1, with a slightly modified file to keep sinh under 3.5ULP. */
static inline svfloat64_t
-expm1_inline (svfloat64_t x, svbool_t pg)
+expm1_inline (svbool_t pg, svfloat64_t x)
{
const struct data *d = ptr_barrier (&data);

- /* Reduce argument:
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where i = round(x / ln2)
- and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
- svfloat64_t j
- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
- svint64_t i = svcvt_s64_x (pg, j);
- svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi);
- f = svmla_x (pg, f, j, d->m_ln2_lo);
- /* Approximate expm1(f) using polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t f8 = svmul_x (pg, f4, f4);
- svfloat64_t p
- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
- /* t = 2^i. */
- svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
- /* expm1(x) ~= p * t + (t - 1). */
- return svmla_x (pg, svsub_x (pg, t, 1.0), p, t);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+
+ /* We want to construct expm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA. */
+ svbool_t is_small = svaclt (pg, x, d->bound);
+
+ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */
+ svuint64_t base_idx = svand_x (pg, u, 0x1f);
+
+ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */
+ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
+ svuint64_t scalem1_lookup
+ = svld1_gather_index (is_small, d->expm1_data, base_idx);
+
+ /* Select the appropriate scale - 1 value based on x. */
+ svfloat64_t scalem1
+ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
+
+ /* return expm1 = scale - 1 + (scale * poly). */
+ return svmla_x (pg, scalem1, scale, p);
}

+/* Vectorised special case to handle values past where exp_inline overflows.
+ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
+ the valid range of inputs, and returns inf for anything past that. */
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svbool_t pg)
+special_case (svbool_t pg, svbool_t special, svfloat64_t ax,
+ svfloat64_t halfsign, const struct data *d)
{
- return sv_call_f64 (sinh, x, x, pg);
+ /* Halves input value, and then check if any cases
+ are still going to overflow. */
+ ax = svmul_x (special, ax, 0.5);
+ svbool_t is_safe = svaclt (special, ax, d->special_bound);
+
+ svfloat64_t t = expm1_inline (pg, ax);
+
+ /* Finish fast path to compute values for non-special cases. */
+ svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+ y = svmul_x (pg, y, halfsign);
+
+ /* Computes special lane, and set remaining overflow lanes to inf. */
+ svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign);
+ svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t);
+
+ svuint64_t signed_inf
+ = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign),
+ sv_u64 (0x7ff0000000000000));
+ special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf));
+
+ /* Join resulting vectors together and return. */
+ return svsel (special, special_y, y);
}

-/* Approximation for SVE double-precision sinh(x) using expm1.
- sinh(x) = (exp(x) - exp(-x)) / 2.
- The greatest observed error is 2.57 ULP:
- _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2
- want 0x1.ab929fc64bd63p-2. */
+/* Approximation for SVE double-precision sinh(x) using FEXPA expm1.
+ Uses sinh(x) = (e^(2x) - 1) / (2e^x), rewritten for accuracy.
+ The greatest observed error in the non-special region is 2.63 + 0.5 ULP:
+ _ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2
+ want 0x1.c3587faf97b09p-2
+
+ The greatest observed error in the special region is 2.65 + 0.5 ULP:
+ _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023
+ want 0x1.fffd30eea0063p+1023. */
svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);

+ svbool_t special = svacge (pg, x, d->special_bound);
svfloat64_t ax = svabs_x (pg, x);
svuint64_t sign
= sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));

- svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound);
-
/* Fall back to scalar variant for all lanes if any are special. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, pg);
+ return special_case (pg, special, ax, halfsign, d);

/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
- svfloat64_t t = expm1_inline (ax, pg);
+ svfloat64_t t = expm1_inline (pg, ax);
t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
return svmul_x (pg, t, halfsign);
}
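The scale - 1 cancellation that the expm1_data lookup works around can be reproduced with plain libm. This standalone demo is an analogy rather than the FEXPA mechanics (it uses exp of a small x, where the result rounds to a value near 1, instead of FEXPA's 2^(i/64) output), but it shows the same effect: subtracting 1 from a value close to 1 amplifies that value's tiny rounding error into a large relative error.

#include <math.h>
#include <stdio.h>

/* Analogous scalar demo of the cancellation the 20-entry table avoids:
   exp(2^-30) rounds to exactly 1 + 2^-30 in double, so the naive
   subtraction silently drops the ~2^-61 correction term that expm1
   (or, in the SVE code, the tabulated 2^(i/64) - 1 value) retains.  */
int
main (void)
{
  double x = 0x1p-30;
  double naive = exp (x) - 1.0;  /* cancellation: correction term lost */
  double good = expm1 (x);       /* reference */
  printf ("naive = %a\nexpm1 = %a\nrelative error = %.2e\n",
          naive, good, fabs (naive - good) / good);
  return 0;
}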
diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c
index d25e011cea305094..805669845d09e098 100644
--- a/sysdeps/aarch64/fpu/tanh_sve.c
+++ b/sysdeps/aarch64/fpu/tanh_sve.c
@@ -18,83 +18,117 @@
<https://www.gnu.org/licenses/>. */

#include "sv_math.h"
-#include "poly_sve_f64.h"

static const struct data
{
- float64_t poly[11];
- float64_t inv_ln2, ln2_hi, ln2_lo, shift;
- uint64_t thresh, tiny_bound;
+ double ln2_hi, ln2_lo;
+ double c2, c4;
+ double c0, c1, c3;
+ double two_over_ln2, shift;
+ uint64_t tiny_bound;
+ double large_bound, fexpa_bound;
+ uint64_t e2xm1_data[20];
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .inv_ln2 = 0x1.71547652b82fep0,
- .ln2_hi = -0x1.62e42fefa39efp-1,
- .ln2_lo = -0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
-
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .two_over_ln2 = 0x1.71547652b82fep+1,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
.tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */
- /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
- .thresh = 0x01f241bf835f9d5f,
+ .large_bound = 0x1.30fc1931f09cap+4, /* arctanh(1 - 2^-54). */
+ .fexpa_bound = 0x1.a56ef8ec924ccp-4, /* 19/64 * ln2/2. */
+ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
+ .e2xm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
+ },
};

+/* An expm1-inspired, FEXPA-based helper function that returns an
+ accurate estimate for e^2x - 1, with no special cases or support for
+ negative inputs of x. */
static inline svfloat64_t
-expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
-{
- /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
- the scalar variant of tanh. */
-
- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- svfloat64_t j
- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
- svint64_t i = svcvt_s64_x (pg, j);
- svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
- f = svmla_x (pg, f, j, d->ln2_lo);
-
- /* Approximate expm1(f) using polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t p = svmla_x (
- pg, f, f2,
- sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
-
- /* t = 2 ^ i. */
- svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
- /* expm1(x) = p * t + (t - 1). */
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
-}
-
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+e2xm1_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
{
- return sv_call_f64 (tanh, x, y, special);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->two_over_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ /* r = x - n * ln2/2, r is in [-ln2/(2N), ln2/(2N)]. */
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t r = svadd_x (pg, x, x);
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+
+ /* We want to construct e2xm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA. */
+ svbool_t is_small = svaclt (pg, x, d->fexpa_bound);
+
+ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */
+ svuint64_t base_idx = svand_x (pg, u, 0x1f);
+
+ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */
+ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
+ svuint64_t scalem1_lookup
+ = svld1_gather_index (is_small, d->e2xm1_data, base_idx);
+
+ /* Select the appropriate scale - 1 value based on x. */
+ svfloat64_t scalem1
+ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
+ return svmla_x (pg, scalem1, scale, p);
}

-/* SVE approximation for double-precision tanh(x), using a simplified
- version of expm1. The greatest observed error is 2.77 ULP:
- _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
- want -0x1.bd6a21a163624p-3. */
+/* SVE approximation for double-precision tanh(x), using a modified version of
+ FEXPA expm1 to calculate e^2x - 1.
+ The greatest observed error is 2.79 + 0.5 ULP:
+ _ZGVsMxv_tanh (0x1.fff868eb3c223p-9) got 0x1.fff7be486cae6p-9
+ want 0x1.fff7be486cae9p-9. */
svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);

- svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
+ svbool_t large = svacge (pg, x, d->large_bound);

- /* Trigger special-cases for tiny, boring and infinity/NaN. */
- svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh);
+ /* We can use tanh(x) = (e^2x - 1) / (e^2x + 1) to approximate tanh.
+ As an additional optimisation, we can ensure more accurate values of e^x
+ by only using positive inputs. So we calculate tanh(|x|), and restore the
+ sign of the input before returning. */
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t sign_bit
+ = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));

- svfloat64_t u = svadd_x (pg, x, x);
+ svfloat64_t p = e2xm1_inline (pg, ax, d);
+ svfloat64_t q = svadd_x (pg, p, 2);

- /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- svfloat64_t q = expm1_inline (u, pg, d);
- svfloat64_t qp2 = svadd_x (pg, q, 2);
+ /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly
+ rounded; at this point we can return 1 directly, with sign correction.
+ This will also act as a guard against our approximation overflowing. */
+ svfloat64_t y = svsel (large, sv_f64 (1.0), svdiv_x (pg, p, q));

- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svdiv_x (pg, q, qp2), special);
- return svdiv_x (pg, q, qp2);
+ return svreinterpret_f64 (svorr_x (pg, sign_bit, svreinterpret_u64 (y)));
}
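A summary of the rearrangements shared by the sinh and tanh routines above (my own notation as a reading aid; t and p match the variables in the code):

t = e^{|x|} - 1:\qquad \sinh x = \operatorname{sign}(x)\,\frac{e^{2|x|} - 1}{2\,e^{|x|}} = \operatorname{sign}(x)\cdot\tfrac{1}{2}\Bigl(t + \frac{t}{t + 1}\Bigr)
p = e^{2|x|} - 1:\qquad \tanh x = \operatorname{sign}(x)\,\frac{e^{2|x|} - 1}{e^{2|x|} + 1} = \operatorname{sign}(x)\,\frac{p}{p + 2}

Both forms avoid evaluating e^{-|x|} separately: sinh needs only one expm1 plus a division and the final multiply by halfsign, while tanh needs only the p/(p + 2) division plus an OR to restore the sign bit.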
673
glibc-RHEL-118273-4.patch
Normal file
@@ -0,0 +1,673 @@
commit 81406ea3c5b5ad19e307302c13dd642785b47948
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:41 2024 +0000

aarch64/fpu: Add vector variants of asinh

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 2e5bbb5a07f4c9b0..d474f2969dd05c26 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,6 +1,7 @@
libmvec-supported-funcs = acos \
acosh \
asin \
+ asinh \
atan \
atan2 \
cos \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 60e1cdeacec3f77e..08ea15efaec959fb 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -84,6 +84,11 @@ libmvec {
_ZGVnN4v_acoshf;
_ZGVsMxv_acosh;
_ZGVsMxv_acoshf;
+ _ZGVnN2v_asinh;
+ _ZGVnN2v_asinhf;
+ _ZGVnN4v_asinhf;
+ _ZGVsMxv_asinh;
+ _ZGVsMxv_asinhf;
_ZGVnN2v_cosh;
_ZGVnN2v_coshf;
_ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 22fec4de77395e60..1e80721c9f73ba12 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -20,6 +20,7 @@
libmvec_hidden_proto (V_NAME_F1(acos));
libmvec_hidden_proto (V_NAME_F1(acosh));
libmvec_hidden_proto (V_NAME_F1(asin));
+libmvec_hidden_proto (V_NAME_F1(asinh));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(cos));
libmvec_hidden_proto (V_NAME_F1(cosh));
diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
new file mode 100644
index 0000000000000000..544a52f6515d3201
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@@ -0,0 +1,171 @@
+/* Double-precision vector (Advanced SIMD) asinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+#define A(i) v_f64 (__v_log_data.poly[i])
+#define N (1 << V_LOG_TABLE_BITS)
+
+const static struct data
+{
+ float64x2_t poly[18];
+ uint64x2_t off, huge_bound, abs_mask;
+ float64x2_t ln2, tiny_bound;
+} data = {
+ .off = V2 (0x3fe6900900000000),
+ .ln2 = V2 (0x1.62e42fefa39efp-1),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .tiny_bound = V2 (0x1p-26),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ /* Even terms of polynomial s.t. asinh(x) is approximated by
+ asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
+ Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
+ .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4),
+ V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6),
+ V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6),
+ V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7),
+ V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7),
+ V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8),
+ V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9),
+ V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12),
+ V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) },
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (asinh, x, y, special);
+}
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ float64x2_t e0 = vld1q_f64 (
+ &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
+ float64x2_t e1 = vld1q_f64 (
+ &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
+ return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) };
+}
+
+static inline float64x2_t
+log_inline (float64x2_t x, const struct data *d)
+{
+ /* Double-precision vector log, copied from ordinary vector log with some
+ cosmetic modification and special-cases removed. */
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t tmp = vsubq_u64 (ix, d->off);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz
+ = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52)));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+ struct entry e = lookup (tmp);
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+ float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_f64 (A (2), A (3), r);
+ float64x2_t p = vfmaq_f64 (A (0), A (1), r);
+ y = vfmaq_f64 (y, A (4), r2);
+ y = vfmaq_f64 (p, y, r2);
+ y = vfmaq_f64 (hi, y, r2);
+ return y;
+}
+
+/* Double-precision implementation of vector asinh(x).
+ asinh is very sensitive around 1, so it is impractical to devise a single
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
+ Instead we use two different algorithms:
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
|
||||
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
|
||||
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
|
||||
+ shared with the scalar routine. The greatest observed error 3.29 ULP, in
|
||||
+ |x| >= 1:
|
||||
+ __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
|
||||
+ want 0x1.ffffcfd0e2352p-1. */
|
||||
+VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
|
||||
+{
|
||||
+ const struct data *d = ptr_barrier (&data);
|
||||
+
|
||||
+ float64x2_t ax = vabsq_f64 (x);
|
||||
+ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
|
||||
+
|
||||
+ uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
|
||||
+ uint64x2_t special = vcgeq_u64 (iax, d->huge_bound);
|
||||
+
|
||||
+#if WANT_SIMD_EXCEPT
|
||||
+ uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
|
||||
+ special = vorrq_u64 (special, tiny);
|
||||
+#endif
|
||||
+
|
||||
+ /* Option 1: |x| >= 1.
|
||||
+ Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)).
|
||||
+ If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
|
||||
+ overflow, by setting special lanes to 1. These will be fixed later. */
|
||||
+ float64x2_t option_1 = v_f64 (0);
|
||||
+ if (__glibc_likely (v_any_u64 (gt1)))
|
||||
+ {
|
||||
+#if WANT_SIMD_EXCEPT
|
||||
+ float64x2_t xm = v_zerofy_f64 (ax, special);
|
||||
+#else
|
||||
+ float64x2_t xm = ax;
|
||||
+#endif
|
||||
+ option_1 = log_inline (
|
||||
+ vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
|
||||
+ }
|
||||
+
|
||||
+ /* Option 2: |x| < 1.
|
||||
+ Compute asinh(x) using a polynomial.
|
||||
+ If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
|
||||
+ overflow, and tiny lanes, which will underflow, by setting them to 0. They
|
||||
+ will be fixed later, either by selecting x or falling back to the scalar
|
||||
+ special-case. The largest observed error in this region is 1.47 ULPs:
|
||||
+ __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
|
||||
+ want 0x1.c1d6bf874019cp-1. */
|
||||
+ float64x2_t option_2 = v_f64 (0);
|
||||
+ if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1))))
|
||||
+ {
|
||||
+#if WANT_SIMD_EXCEPT
|
||||
+ ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
|
||||
+#endif
|
||||
+ float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2),
|
||||
+ z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2),
|
||||
+ z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8);
|
||||
+ float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly);
|
||||
+ option_2 = vfmaq_f64 (ax, p, x3);
|
||||
+#if WANT_SIMD_EXCEPT
|
||||
+ option_2 = vbslq_f64 (tiny, x, option_2);
|
||||
+#endif
|
||||
+ }
|
||||
+
|
||||
+ /* Choose the right option for each lane. */
|
||||
+ float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
|
||||
+ /* Copy sign. */
|
||||
+ y = vbslq_f64 (d->abs_mask, y, x);
|
||||
+
|
||||
+ if (__glibc_unlikely (v_any_u64 (special)))
|
||||
+ return special_case (x, y, special);
|
||||
+ return y;
|
||||
+}
|
||||
diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..28dc5c458750bac4
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/aarch64/fpu/asinh_sve.c
|
||||
@@ -0,0 +1,150 @@
|
||||
+/* Double-precision vector (SVE) asinh function
|
||||
+
|
||||
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include "sv_math.h"
|
||||
+#include "poly_sve_f64.h"
|
||||
+
|
||||
+#define SignMask (0x8000000000000000)
|
||||
+#define One (0x3ff0000000000000)
|
||||
+#define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). */
|
||||
+
|
||||
+static const struct data
|
||||
+{
|
||||
+ double poly[18];
|
||||
+ double ln2, p3, p1, p4, p0, p2;
|
||||
+ uint64_t n;
|
||||
+ uint64_t off;
|
||||
+
|
||||
+} data = {
|
||||
+ /* Polynomial generated using Remez on [2^-26, 1]. */
|
||||
+ .poly
|
||||
+ = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
|
||||
+ 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
|
||||
+ -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
|
||||
+ 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
|
||||
+ -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
|
||||
+ 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 },
|
||||
+ .ln2 = 0x1.62e42fefa39efp-1,
|
||||
+ .p0 = -0x1.ffffffffffff7p-2,
|
||||
+ .p1 = 0x1.55555555170d4p-2,
|
||||
+ .p2 = -0x1.0000000399c27p-2,
|
||||
+ .p3 = 0x1.999b2e90e94cap-3,
|
||||
+ .p4 = -0x1.554e550bd501ep-3,
|
||||
+ .n = 1 << V_LOG_TABLE_BITS,
|
||||
+ .off = 0x3fe6900900000000
|
||||
+};
|
||||
+
|
||||
+static svfloat64_t NOINLINE
|
||||
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
|
||||
+{
|
||||
+ return sv_call_f64 (asinh, x, y, special);
|
||||
+}
|
||||
+
|
||||
+static inline svfloat64_t
|
||||
+__sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
|
||||
+{
|
||||
+ /* Double-precision SVE log, copied from SVE log implementation with some
|
||||
+ cosmetic modification and special-cases removed. See that file for details
|
||||
+ of the algorithm used. */
|
||||
+
|
||||
+ svuint64_t ix = svreinterpret_u64 (x);
|
||||
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
|
||||
+ svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)),
|
||||
+ (d->n - 1) << 1);
|
||||
+ svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
|
||||
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
|
||||
+ svfloat64_t z = svreinterpret_f64 (iz);
|
||||
+
|
||||
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
|
||||
+ svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
|
||||
+
|
||||
+ svfloat64_t ln2_p3 = svld1rq (svptrue_b64 (), &d->ln2);
|
||||
+ svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1);
|
||||
+
|
||||
+ svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
|
||||
+ svfloat64_t kd = svcvt_f64_x (pg, k);
|
||||
+
|
||||
+ svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0);
|
||||
+ svfloat64_t r2 = svmul_x (pg, r, r);
|
||||
+
|
||||
+ svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1);
|
||||
+
|
||||
+ svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0);
|
||||
+ y = svmla_lane (y, r2, p1_p4, 1);
|
||||
+ y = svmla_x (pg, p, r2, y);
|
||||
+ y = svmla_x (pg, hi, r2, y);
|
||||
+ return y;
|
||||
+}
|
||||
+
|
||||
+/* Double-precision implementation of SVE asinh(x).
|
||||
+ asinh is very sensitive around 1, so it is impractical to devise a single
|
||||
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
|
||||
+ Instead we use two different algorithms:
|
||||
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
|
||||
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
|
||||
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
|
||||
+ shared with the scalar routine. The greatest observed error 2.51 ULP, in
|
||||
+ |x| >= 1:
|
||||
+ _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1
|
||||
+ want 0x1.e3181c43b0f39p-1. */
|
||||
+svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
|
||||
+{
|
||||
+ const struct data *d = ptr_barrier (&data);
|
||||
+
|
||||
+ svuint64_t ix = svreinterpret_u64 (x);
|
||||
+ svuint64_t iax = svbic_x (pg, ix, SignMask);
|
||||
+ svuint64_t sign = svand_x (pg, ix, SignMask);
|
||||
+ svfloat64_t ax = svreinterpret_f64 (iax);
|
||||
+
|
||||
+ svbool_t ge1 = svcmpge (pg, iax, One);
|
||||
+ svbool_t special = svcmpge (pg, iax, Thres);
|
||||
+
|
||||
+ /* Option 1: |x| >= 1.
|
||||
+ Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). */
|
||||
+ svfloat64_t option_1 = sv_f64 (0);
|
||||
+ if (__glibc_likely (svptest_any (pg, ge1)))
|
||||
+ {
|
||||
+ svfloat64_t x2 = svmul_x (pg, ax, ax);
|
||||
+ option_1 = __sv_log_inline (
|
||||
+ svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg);
|
||||
+ }
|
||||
+
|
||||
+ /* Option 2: |x| < 1.
|
||||
+ Compute asinh(x) using a polynomial.
|
||||
+ The largest observed error in this region is 1.51 ULPs:
|
||||
+ _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
|
||||
+ want 0x1.c1e649ee2681dp-1. */
|
||||
+ svfloat64_t option_2 = sv_f64 (0);
|
||||
+ if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1))))
|
||||
+ {
|
||||
+ svfloat64_t x2 = svmul_x (pg, ax, ax);
|
||||
+ svfloat64_t x4 = svmul_x (pg, x2, x2);
|
||||
+ svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly);
|
||||
+ option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
|
||||
+ }
|
||||
+
|
||||
+ /* Choose the right option for each lane. */
|
||||
+ svfloat64_t y = svsel (ge1, option_1, option_2);
|
||||
+
|
||||
+ if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
+ return special_case (
|
||||
+ x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)),
|
||||
+ special);
|
||||
+ return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
|
||||
+}
|
||||
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..09fd8a614305563d
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
|
||||
@@ -0,0 +1,80 @@
|
||||
+/* Single-precision vector (Advanced SIMD) asinh function
|
||||
+
|
||||
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include "v_math.h"
|
||||
+#include "v_log1pf_inline.h"
|
||||
+
|
||||
+#define SignMask v_u32 (0x80000000)
|
||||
+
|
||||
+const static struct data
|
||||
+{
|
||||
+ struct v_log1pf_data log1pf_consts;
|
||||
+ uint32x4_t big_bound;
|
||||
+#if WANT_SIMD_EXCEPT
|
||||
+ uint32x4_t tiny_bound;
|
||||
+#endif
|
||||
+} data = {
|
||||
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
|
||||
+ .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
|
||||
+#if WANT_SIMD_EXCEPT
|
||||
+ .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
|
||||
+#endif
|
||||
+};
|
||||
+
|
||||
+static float32x4_t NOINLINE VPCS_ATTR
|
||||
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
||||
+{
|
||||
+ return v_call_f32 (asinhf, x, y, special);
|
||||
+}
|
||||
+
|
||||
+/* Single-precision implementation of vector asinh(x), using vector log1p.
|
||||
+ Worst-case error is 2.66 ULP, at roughly +/-0.25:
|
||||
+ __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
|
||||
+VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
|
||||
+{
|
||||
+ const struct data *dat = ptr_barrier (&data);
|
||||
+ uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
|
||||
+ float32x4_t ax = vreinterpretq_f32_u32 (iax);
|
||||
+ uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
|
||||
+ float32x4_t special_arg = x;
|
||||
+
|
||||
+#if WANT_SIMD_EXCEPT
|
||||
+ /* Sidestep tiny and large values to avoid inadvertently triggering
|
||||
+ under/overflow. */
|
||||
+ special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
|
||||
+ if (__glibc_unlikely (v_any_u32 (special)))
|
||||
+ {
|
||||
+ ax = v_zerofy_f32 (ax, special);
|
||||
+ x = v_zerofy_f32 (x, special);
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
+ /* asinh(x) = log(x + sqrt(x * x + 1)).
|
||||
+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
|
||||
+ float32x4_t d
|
||||
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
|
||||
+ float32x4_t y = log1pf_inline (
|
||||
+ vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
|
||||
+
|
||||
+ if (__glibc_unlikely (v_any_u32 (special)))
|
||||
+ return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
|
||||
+ return vbslq_f32 (SignMask, x, y);
|
||||
+}
|
||||
+libmvec_hidden_def (V_NAME_F1 (asinh))
|
||||
+HALF_WIDTH_ALIAS_F1 (asinh)
|
||||
diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c
|
||||
new file mode 100644
|
||||
index 0000000000000000..d85c3a685c0b83ff
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/aarch64/fpu/asinhf_sve.c
|
||||
@@ -0,0 +1,56 @@
|
||||
+/* Single-precision vector (SVE) asinh function
|
||||
+
|
||||
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include "sv_math.h"
|
||||
+#include "sv_log1pf_inline.h"
|
||||
+
|
||||
+#define BigBound (0x5f800000) /* asuint(0x1p64). */
|
||||
+
|
||||
+static svfloat32_t NOINLINE
|
||||
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
|
||||
+{
|
||||
+ return sv_call_f32 (asinhf, x, y, special);
|
||||
+}
|
||||
+
|
||||
+/* Single-precision SVE asinh(x) routine. Implements the same algorithm as
|
||||
+ vector asinhf and log1p.
|
||||
+
|
||||
+ Maximum error is 2.48 ULPs:
|
||||
+ SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
|
||||
+ want 0x1.ffbbb8p-4. */
|
||||
+svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
|
||||
+{
|
||||
+ svfloat32_t ax = svabs_x (pg, x);
|
||||
+ svuint32_t iax = svreinterpret_u32 (ax);
|
||||
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
|
||||
+ svbool_t special = svcmpge (pg, iax, BigBound);
|
||||
+
|
||||
+ /* asinh(x) = log(x + sqrt(x * x + 1)).
|
||||
+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
|
||||
+ svfloat32_t ax2 = svmul_x (pg, ax, ax);
|
||||
+ svfloat32_t d = svadd_x (pg, svsqrt_x (pg, svadd_x (pg, ax2, 1.0f)), 1.0f);
|
||||
+ svfloat32_t y
|
||||
+ = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg);
|
||||
+
|
||||
+ if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
+ return special_case (
|
||||
+ x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))),
|
||||
+ special);
|
||||
+ return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y)));
|
||||
+}
|
||||
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
|
||||
index 841330956c102ff1..eb2af35b27757fc6 100644
|
||||
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
|
||||
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
|
||||
@@ -41,6 +41,10 @@
|
||||
# define __DECL_SIMD_asin __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_asinf
|
||||
# define __DECL_SIMD_asinf __DECL_SIMD_aarch64
|
||||
+# undef __DECL_SIMD_asinh
|
||||
+# define __DECL_SIMD_asinh __DECL_SIMD_aarch64
|
||||
+# undef __DECL_SIMD_asinhf
|
||||
+# define __DECL_SIMD_asinhf __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_atan
|
||||
# define __DECL_SIMD_atan __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_atanf
|
||||
@@ -131,6 +135,7 @@ __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
|
||||
+__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
|
||||
@@ -150,6 +155,7 @@ __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
|
||||
+__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
|
||||
@@ -174,6 +180,7 @@ __sv_f32_t _ZGVsMxvv_atan2f (__sv_f32_t, __sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_acoshf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
|
||||
+__sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
|
||||
@@ -193,6 +200,7 @@ __sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_acosh (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
|
||||
+__sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
|
||||
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
|
||||
index f4ce1d70096888aa..3d7177c32dcd77a6 100644
|
||||
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
|
||||
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
|
||||
@@ -26,6 +26,7 @@
|
||||
VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
|
||||
VPCS_VECTOR_WRAPPER (acosh_advsimd, _ZGVnN2v_acosh)
|
||||
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
|
||||
+VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh)
|
||||
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
|
||||
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
|
||||
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
|
||||
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
|
||||
index 0e973cc9d7ade813..b88a2afe5c1198c0 100644
|
||||
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
|
||||
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
|
||||
@@ -45,6 +45,7 @@
|
||||
SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
|
||||
SVE_VECTOR_WRAPPER (acosh_sve, _ZGVsMxv_acosh)
|
||||
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
|
||||
+SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh)
|
||||
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
|
||||
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
|
||||
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
|
||||
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
|
||||
index 0ce026b5ea96a064..533655402d3f3737 100644
|
||||
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
|
||||
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
|
||||
@@ -26,6 +26,7 @@
|
||||
VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
|
||||
VPCS_VECTOR_WRAPPER (acoshf_advsimd, _ZGVnN4v_acoshf)
|
||||
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
|
||||
+VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf)
|
||||
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
|
||||
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
|
||||
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
|
||||
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
|
||||
index 398b7373e800cd5b..f7b673e3358e7d82 100644
|
||||
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
|
||||
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
|
||||
@@ -45,6 +45,7 @@
|
||||
SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
|
||||
SVE_VECTOR_WRAPPER (acoshf_sve, _ZGVsMxv_acoshf)
|
||||
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
|
||||
+SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf)
|
||||
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
|
||||
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
|
||||
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
|
||||
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
|
||||
index 1646cdbdd22d93d9..b916e422432014c2 100644
|
||||
--- a/sysdeps/aarch64/libm-test-ulps
|
||||
+++ b/sysdeps/aarch64/libm-test-ulps
|
||||
@@ -90,11 +90,19 @@ double: 2
|
||||
float: 2
|
||||
ldouble: 4
|
||||
|
||||
+Function: "asinh_advsimd":
|
||||
+double: 1
|
||||
+float: 2
|
||||
+
|
||||
Function: "asinh_downward":
|
||||
double: 3
|
||||
float: 3
|
||||
ldouble: 4
|
||||
|
||||
+Function: "asinh_sve":
|
||||
+double: 1
|
||||
+float: 2
|
||||
+
|
||||
Function: "asinh_towardzero":
|
||||
double: 2
|
||||
float: 2
|
||||
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
|
||||
index f5aaa519f2c8663e..f288afdfdd9c8757 100644
|
||||
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
|
||||
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
|
||||
@@ -75,15 +75,20 @@ GLIBC_2.39 _ZGVsMxvv_atan2 F
|
||||
GLIBC_2.39 _ZGVsMxvv_atan2f F
|
||||
GLIBC_2.40 _ZGVnN2v_acosh F
|
||||
GLIBC_2.40 _ZGVnN2v_acoshf F
|
||||
+GLIBC_2.40 _ZGVnN2v_asinh F
|
||||
+GLIBC_2.40 _ZGVnN2v_asinhf F
|
||||
GLIBC_2.40 _ZGVnN2v_cosh F
|
||||
GLIBC_2.40 _ZGVnN2v_coshf F
|
||||
GLIBC_2.40 _ZGVnN2v_erf F
|
||||
GLIBC_2.40 _ZGVnN2v_erff F
|
||||
GLIBC_2.40 _ZGVnN4v_acoshf F
|
||||
+GLIBC_2.40 _ZGVnN4v_asinhf F
|
||||
GLIBC_2.40 _ZGVnN4v_coshf F
|
||||
GLIBC_2.40 _ZGVnN4v_erff F
|
||||
GLIBC_2.40 _ZGVsMxv_acosh F
|
||||
GLIBC_2.40 _ZGVsMxv_acoshf F
|
||||
+GLIBC_2.40 _ZGVsMxv_asinh F
|
||||
+GLIBC_2.40 _ZGVsMxv_asinhf F
|
||||
GLIBC_2.40 _ZGVsMxv_cosh F
|
||||
GLIBC_2.40 _ZGVsMxv_coshf F
|
||||
GLIBC_2.40 _ZGVsMxv_erf F
|
||||
521
glibc-RHEL-118273-40.patch
Normal file
@ -0,0 +1,521 @@
commit 1e3d1ddf977ecd653de8d0d10eb083d80ac21cf3
Author: Dylan Fleming <Dylan.Fleming@arm.com>
Date: Wed Jun 18 16:17:12 2025 +0000

AArch64: Optimize SVE exp functions

Improve performance of SVE exps by making better use
of the SVE FEXPA instruction.

Performance improvement on Neoverse V1:
exp2_sve: 21%
exp2f_sve: 24%
exp10f_sve: 23%
expm1_sve: 25%

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c
index 8aa3fa9c4335cfb8..0a4c26450601a1db 100644
--- a/sysdeps/aarch64/fpu/exp10f_sve.c
+++ b/sysdeps/aarch64/fpu/exp10f_sve.c
@@ -19,26 +19,19 @@

#include "sv_math.h"

-/* For x < -Thres, the result is subnormal and not handled correctly by
- FEXPA. */
-#define Thres 37.9
+/* For x < -Thres (-log10(2^126)), the result is subnormal and not handled
+ correctly by FEXPA. */
+#define Thres 0x1.2f702p+5

static const struct data
{
- float log2_10_lo, c0, c2, c4;
- float c1, c3, log10_2;
- float shift, log2_10_hi, thres;
+ float log10_2, log2_10_hi, log2_10_lo, c1;
+ float c0, shift, thres;
} data = {
/* Coefficients generated using Remez algorithm with minimisation of relative
- error.
- rel error: 0x1.89dafa3p-24
- abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
- maxerr: 0.52 +0.5 ulp. */
- .c0 = 0x1.26bb16p+1f,
- .c1 = 0x1.5350d2p+1f,
- .c2 = 0x1.04744ap+1f,
- .c3 = 0x1.2d8176p+0f,
- .c4 = 0x1.12b41ap-1f,
+ error. */
+ .c0 = 0x1.26bb62p1,
+ .c1 = 0x1.53524cp1,
/* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
.shift = 0x1.803f8p17f,
.log10_2 = 0x1.a934fp+1,
@@ -53,28 +46,23 @@ sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
/* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
-
- svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log10_2);

/* n = round(x/(log10(2)/N)). */
svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
- svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);
+ svfloat32_t z = svmla_lane (shift, x, lane_consts, 0);
+ svfloat32_t n = svsub_x (pg, z, shift);

/* r = x - n*log10(2)/N. */
- svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
- r = svmls_lane (r, n, lane_consts, 0);
+ svfloat32_t r = x;
+ r = svmls_lane (r, n, lane_consts, 1);
+ r = svmls_lane (r, n, lane_consts, 2);

svfloat32_t scale = svexpa (svreinterpret_u32 (z));

/* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
- svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
+ svfloat32_t poly = svmla_lane (sv_f32 (d->c0), r, lane_consts, 3);
+ poly = svmul_x (pg, poly, r);
return svmla_x (pg, scale, scale, poly);
}

@@ -85,11 +73,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d)
special);
}

-/* Single-precision SVE exp10f routine. Implements the same algorithm
- as AdvSIMD exp10f.
- Worst case error is 1.02 ULPs.
- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
- want 0x1.ba5f9cp-1. */
+/* Single-precision SVE exp10f routine. Based on the FEXPA instruction.
+ Worst case error is 1.10 ULP.
+ _ZGVsMxv_exp10f (0x1.cc76dep+3) got 0x1.be0172p+47
+ want 0x1.be017p+47. */
svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c
index 5dfb77cdbc2f6a51..ed11423e45059133 100644
--- a/sysdeps/aarch64/fpu/exp2_sve.c
+++ b/sysdeps/aarch64/fpu/exp2_sve.c
@@ -19,23 +19,21 @@

#include "sv_math.h"

-#define N (1 << V_EXP_TABLE_BITS)
-
#define BigBound 1022
#define UOFlowBound 1280

static const struct data
{
- double c0, c2;
- double c1, c3;
+ double c2, c4;
+ double c0, c1, c3;
double shift, big_bound, uoflow_bound;
} data = {
/* Coefficients are computed using Remez algorithm with
minimisation of the absolute error. */
- .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3,
- .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7,
- .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound,
- .big_bound = BigBound,
+ .c0 = 0x1.62e42fefa39efp-1, .c1 = 0x1.ebfbdff82a31bp-3,
+ .c2 = 0x1.c6b08d706c8a5p-5, .c3 = 0x1.3b2ad2ff7d2f3p-7,
+ .c4 = 0x1.5d8761184beb3p-10, .shift = 0x1.800000000ffc0p+46,
+ .uoflow_bound = UOFlowBound, .big_bound = BigBound,
};

#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
@@ -64,50 +62,52 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));

/* |n| > 1280 => 2^(n) overflows. */
- svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
+ svbool_t p_cmp = svacle (pg, n, d->uoflow_bound);

svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);

- return svsel (p_cmp, r1, r0);
+ return svsel (p_cmp, r0, r1);
}

/* Fast vector implementation of exp2.
- Maximum measured error is 1.65 ulp.
- _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
- want 0x1.f8db0d4df721dp-1. */
+ Maximum measured error is 0.52 + 0.5 ulp.
+ _ZGVsMxv_exp2 (0x1.3b72ad5b701bfp-1) got 0x1.8861641b49e08p+0
+ want 0x1.8861641b49e07p+0. */
svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svbool_t no_big_scale = svacle (pg, x, d->big_bound);
- svbool_t special = svnot_z (pg, no_big_scale);
-
- /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */
- svfloat64_t shift = sv_f64 (d->shift);
- svfloat64_t kd = svadd_x (pg, x, shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- /* kd = k/N. */
- kd = svsub_x (pg, kd, shift);
- svfloat64_t r = svsub_x (pg, x, kd);
-
- /* scale ~= 2^(k/N). */
- svuint64_t idx = svand_x (pg, ki, N - 1);
- svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx);
- /* This is only a valid scale when -1023*N < k < 1024*N. */
- svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
- svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
-
- svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
- /* Approximate exp2(r) using polynomial. */
- /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+ svbool_t special = svacge (pg, x, d->big_bound);
+
+ svfloat64_t z = svadd_x (svptrue_b64 (), x, d->shift);
+ svfloat64_t n = svsub_x (svptrue_b64 (), z, d->shift);
+ svfloat64_t r = svsub_x (svptrue_b64 (), x, n);
+
+ svfloat64_t scale = svexpa (svreinterpret_u64 (z));
+
svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
- svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
- svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
- svfloat64_t p = svmla_x (pg, p01, p23, r2);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ /* Approximate exp2(r) using polynomial. */
+ /* y = exp2(r) - 1 ~= r * (C0 + C1 r + C2 r^2 + C3 r^3 + C4 r^4). */
+ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ svfloat64_t p = svmla_x (pg, p12, p34, r2);
+ p = svmad_x (pg, p, r, d->c0);
svfloat64_t y = svmul_x (svptrue_b64 (), r, p);
+
/* Assemble exp2(x) = exp2(r) * scale. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (pg, scale, y, kd, d);
+ {
+ /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+ special case function so needs to be copied.
+ e = sign bit of u << 46. */
+ svuint64_t e = svand_x (pg, svlsl_x (pg, svreinterpret_u64 (z), 46),
+ 0x8000000000000000);
+ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+ return special_case (pg, scale, y, n, d);
+ }
+
return svmla_x (pg, scale, scale, y);
}
diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
index c6216bed9e9e7538..cf01820288f1855c 100644
--- a/sysdeps/aarch64/fpu/exp2f_sve.c
+++ b/sysdeps/aarch64/fpu/exp2f_sve.c
@@ -18,21 +18,17 @@
<https://www.gnu.org/licenses/>. */

#include "sv_math.h"
-#include "poly_sve_f32.h"

#define Thres 0x1.5d5e2ap+6f

static const struct data
{
- float c0, c2, c4, c1, c3;
- float shift, thres;
+ float c0, c1, shift, thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant. */
- .c0 = 0x1.62e422p-1f,
- .c1 = 0x1.ebf9bcp-3f,
- .c2 = 0x1.c6bd32p-5f,
- .c3 = 0x1.3ce9e4p-7f,
- .c4 = 0x1.59977ap-10f,
+ /* Coefficients generated using Remez algorithm with minimisation of relative
+ error. */
+ .c0 = 0x1.62e485p-1,
+ .c1 = 0x1.ebfbe0p-3,
/* 1.5*2^17 + 127. */
.shift = 0x1.803f8p17f,
/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
@@ -51,16 +47,8 @@ sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)

svfloat32_t scale = svexpa (svreinterpret_u32 (z));

- /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
- Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for
- coefficients 1 to 4, and apply most significant coefficient directly. */
- svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
- svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
- svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
- svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+ svfloat32_t poly = svmla_x (pg, sv_f32 (d->c0), r, sv_f32 (d->c1));
+ poly = svmul_x (svptrue_b32 (), poly, r);

return svmla_x (pg, scale, scale, poly);
}
@@ -72,11 +60,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d)
special);
}

-/* Single-precision SVE exp2f routine. Implements the same algorithm
- as AdvSIMD exp2f.
- Worst case error is 1.04 ULPs.
- _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
- want 0x1.ba6a64p-1. */
+/* Single-precision SVE exp2f routine, based on the FEXPA instruction.
+ Worst case error is 1.09 ULPs.
+ _ZGVsMxv_exp2f (0x1.9a2a94p-1) got 0x1.be1054p+0
+ want 0x1.be1052p+0. */
svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c
index c933cf9c0eb2406b..4c35e0341d34aee0 100644
--- a/sysdeps/aarch64/fpu/expm1_sve.c
+++ b/sysdeps/aarch64/fpu/expm1_sve.c
@@ -18,82 +18,164 @@
<https://www.gnu.org/licenses/>. */

#include "sv_math.h"
-#include "poly_sve_f64.h"

-#define SpecialBound 0x1.62b7d369a5aa9p+9
-#define ExponentBias 0x3ff0000000000000
+#define FexpaBound 0x1.4cb5ecef28adap-3 /* 15*ln2/64. */
+#define SpecialBound 0x1.628c2855bfaddp+9 /* ln(2^(1023 + 1/128)). */

static const struct data
{
- double poly[11];
- double shift, inv_ln2, special_bound;
- /* To be loaded in one quad-word. */
+ double c2, c4;
+ double inv_ln2;
double ln2_hi, ln2_lo;
+ double c0, c1, c3;
+ double shift, thres;
+ uint64_t expm1_data[32];
} data = {
- /* Generated using fpminimax. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
- 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .special_bound = SpecialBound,
- .inv_ln2 = 0x1.71547652b82fep0,
- .ln2_hi = 0x1.62e42fefa39efp-1,
- .ln2_lo = 0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
+ /* Table emulating FEXPA - 1, for values of FEXPA close to 1.
+ The table holds values of 2^(i/64) - 1, computed in arbitrary precision.
+ The first half of the table stores values associated to i from 0 to 15.
+ The second half of the table stores values associated to i from 0 to -15. */
+ .expm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x0000000000000000, 0xbfc331751ec3a814, 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7,
+ 0xbfbf332113d56b1f, 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424,
+ 0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a, 0xbfaafd11874c009e,
+ 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89, 0xbf95f134923757f3, 0xbf860f9f985bc9f4,
+ },
+
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
+ .thres = SpecialBound,
};

-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t pg)
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+static NOINLINE svfloat64_t
+special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p,
+ svfloat64_t n)
{
- return sv_call_f64 (expm1, x, y, pg);
+ /* s=2^n may overflow, break it up into s=s1*s2,
+ such that exp = s + s*y can be computed as s1*(s2+s2*y)
+ and s1*s1 overflows only if n>0. */
+
+ /* If n<=0 then set b to 0x6, 0 otherwise. */
+ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+ svuint64_t b
+ = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
+
+ /* Set s1 to generate overflow depending on sign of exponent n,
+ ie. s1 = 0x70...0 - b. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold.
+ ie. s2 = as_u64 (s) - 0x3010...0 + b. */
+ svfloat64_t s2 = svreinterpret_f64 (
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+ /* |n| > 1280 => 2^(n) overflows. */
+ svbool_t p_cmp = svacgt (pg, n, 1280.0);
+
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
+ svfloat64_t r2 = svmla_x (pg, s2, s2, p);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
+
+ svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */
+ return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0));
}

-/* Double-precision vector exp(x) - 1 function.
- The maximum error observed error is 2.18 ULP:
- _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
- want 0x1.a8b9ea8d66e2p-2. */
+/* FEXPA based SVE expm1 algorithm.
+ Maximum measured error is 2.81 + 0.5 ULP:
+ _ZGVsMxv_expm1 (0x1.974060e619bfp-3) got 0x1.c290e5858bb53p-3
+ want 0x1.c290e5858bb5p-3. */
svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);

- /* Large, Nan/Inf. */
- svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
-
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- svfloat64_t shift = sv_f64 (d->shift);
- svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift);
- svint64_t i = svcvt_s64_x (pg, n);
- svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
- svfloat64_t f = svmls_lane (x, n, ln2, 0);
- f = svmls_lane (f, n, ln2, 1);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t f8 = svmul_x (pg, f4, f4);
- svfloat64_t p
- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias);
- svfloat64_t t = svreinterpret_f64 (u);
-
- /* expm1(x) ~= p * t + (t - 1). */
- svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svbool_t special = svacgt (pg, x, d->thres);

- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);

+ /* r = x - n * ln2, r is in [-ln2/128, ln2/128]. */
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+ svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0));
+
+ /* We want to construct expm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA.
+
+ This bound is based upon the table size:
+ Bound = (TableSize-1/64) * ln2.
+ The current bound is based upon a table size of 16. */
+ svbool_t is_small = svaclt (pg, x, FexpaBound);
+
+ if (svptest_any (pg, is_small))
+ {
+ /* Index via the input of FEXPA, but we only care about the lower 4 bits.
+ */
+ svuint64_t base_idx = svand_x (pg, u, 0xf);
+
+ /* We can use the sign of x as a fifth bit to account for the asymmetry
+ of e^x around 0. */
+ svuint64_t signBit
+ = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4);
+ svuint64_t idx = svorr_x (pg, base_idx, signBit);
+
+ /* Lookup values for scale - 1 for small x. */
+ svfloat64_t lookup = svreinterpret_f64 (
+ svld1_gather_index (is_small, d->expm1_data, idx));
+
+ /* Select the appropriate scale - 1 value based on x. */
+ scalem1 = svsel (is_small, lookup, scalem1);
+ }
+
+ svfloat64_t y = svmla_x (pg, scalem1, scale, p);
+
+ /* FEXPA returns nan for large inputs so we special case those. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ {
+ /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+ special case function so needs to be copied.
+ e = sign bit of u << 46. */
+ svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+ /* Copy sign to s. */
+ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+ return special_case (pg, y, scale, p, n);
+ }
+
+ /* return expm1 = (scale - 1) + (scale * poly). */
return y;
}
49
glibc-RHEL-118273-41.patch
Normal file
@ -0,0 +1,49 @@
commit aac077645a645bba0d67f3250e82017c539d0f4b
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Wed Aug 20 17:41:50 2025 +0000

AArch64: Fix SVE powf routine [BZ #33299]

Fix a bug in predicate logic introduced in last change.
A slight performance improvement from relying on all true
predicates during conversion from single to double.
This fixes BZ #33299.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c
index 08d7019a1855ff3c..33bba96054cf4cc8 100644
--- a/sysdeps/aarch64/fpu/powf_sve.c
+++ b/sysdeps/aarch64/fpu/powf_sve.c
@@ -223,15 +223,15 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
const svbool_t ptrue = svptrue_b64 ();

/* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
- * in order to perform core computation in double precision. */
+ in order to perform core computation in double precision. */
const svbool_t pg_lo = svunpklo (pg);
const svbool_t pg_hi = svunpkhi (pg);
- svfloat64_t y_lo
- = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
- svfloat64_t y_hi
- = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
- svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
- svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
+ svfloat64_t y_lo = svcvt_f64_x (
+ ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+ svfloat64_t y_hi = svcvt_f64_x (
+ ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+ svfloat64_t z_lo = svcvt_f64_x (ptrue, svreinterpret_f32 (svunpklo (iz)));
+ svfloat64_t z_hi = svcvt_f64_x (ptrue, svreinterpret_f32 (svunpkhi (iz)));
svuint64_t i_lo = svunpklo (i);
svuint64_t i_hi = svunpkhi (i);
svint64_t k_lo = svunpklo (k);
@@ -312,7 +312,7 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
(23 - V_POWF_EXP2_TABLE_BITS));

/* Compute core in extended precision and return intermediate ylogx results
- * to handle cases of underflow and underflow in exp. */
+ to handle cases of underflow and overflow in exp. */
svfloat32_t ylogx;
svfloat32_t ret
= sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
174
glibc-RHEL-118273-42.patch
Normal file
@ -0,0 +1,174 @@
|
||||
commit e20ca759af46fbb7eae20c52b857e7636eb50e1b
|
||||
Author: remph <lhr@disroot.org>
|
||||
Date: Thu Sep 4 12:53:56 2025 +0000
|
||||
|
||||
AArch64: add optimised strspn/strcspn
|
||||
|
||||
Requires Neon (aka. Advanced SIMD). Looks up 16 characters at a time,
|
||||
for a 2-3x perfomance improvement, and a ~30% speedup on the strtok &
|
||||
strsep benchtests, as tested on Cortex A-{53,72}.
|
||||
|
||||
Signed-off-by: remph <lhr@disroot.org>
|
||||
|
||||
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
|
||||
|
||||
diff --git a/sysdeps/aarch64/strcspn.S b/sysdeps/aarch64/strcspn.S
|
||||
new file mode 100644
|
||||
index 0000000000000000..f2a69e9856cba04c
--- /dev/null
+++ b/sysdeps/aarch64/strcspn.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCSPN 1
+#include "strspn.S"
diff --git a/sysdeps/aarch64/strspn.S b/sysdeps/aarch64/strspn.S
new file mode 100644
index 0000000000000000..edbb705b15991e39
--- /dev/null
+++ b/sysdeps/aarch64/strspn.S
@@ -0,0 +1,146 @@
+/* Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STRCSPN
+# define STRSPN strcspn
+# define SBT orr /* SBT -- `set bit' */
+#else
+# define STRSPN strspn
+# define SBT bic
+#endif
+
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+# define LS_BK lsr
+#else
+# define LS_FW lsr
+# define LS_BK lsl
+#endif
+
+#define og_s x0
+#define set x1 /* ACCEPT for strspn, REJECT for strcspn */
+
+#define byte_i x3
+#define bits_i x4
+#define one x6
+
+#define syndrome x5
+#define s x6
+
+#define vbyte_i v1.16b
+#define vbits_i v2.16b
+#define table v4.16b-v5.16b
+#define table_a v4
+#define table_b v5
+#define sevens v7.16b
+
+ENTRY(STRSPN)
+ ldrb w2, [set]
+ cbz w2, L(early)
+#ifdef USE_AS_STRCSPN
+ ldrb w3, [set, 1]
+ cbz w3, L(early)
+#endif
+
+ /* Table has ones for bytes to reject and zeros for bytes to accept */
+ mov one, 1
+#ifdef USE_AS_STRCSPN
+ stp one, xzr, [sp, -32]!
+ .cfi_def_cfa_offset 32
+ stp xzr, xzr, [sp, 16]
+#else
+ mvni v0.4s, 0
+ stp q0, q0, [sp, -32]!
+ .cfi_def_cfa_offset 32
+#endif
+
+ .p2align 4
+L(fill_table):
+ lsr byte_i, x2, 6 /* x2 / 64 */
+ lsl bits_i, one, x2 /* x2 % 64 implicitly */
+ ldrb w2, [set, 1]!
+ ldr x5, [sp, byte_i, lsl 3]
+ SBT x5, x5, bits_i
+ str x5, [sp, byte_i, lsl 3]
+ cbnz w2, L(fill_table)
+
+ ld1 {table_a.2d-table_b.2d}, [sp], 32
+ .cfi_def_cfa_offset 0
+ ubfiz syndrome, og_s, 2, 4 /* Bottom 4 bits, times 4 to count nibbles */
+ and s, og_s, -16 /* Round S down to 16-byte boundary */
+ movi sevens, 7
+ /* Bias the syndrome to mask off these nibbles */
+ mov x8, -1
+ LS_BK syndrome, x8, syndrome
+ mvn syndrome, syndrome
+
+L(loop):
+ ldr q0, [s], 16
+ ushr vbyte_i, v0.16b, 3
+ bic vbits_i, sevens, v0.16b
+ tbl v0.16b, {table}, vbyte_i
+ /* Bring the relevant bit to the MSB of each byte */
+ sshl v0.16b, v0.16b, vbits_i
+ /* Set every bit of each byte to its MSB */
+ cmlt v0.16b, v0.16b, 0
+ /* Bytes->nibbles */
+ shrn v0.8b, v0.8h, 4
+ fmov x2, d0
+ bic syndrome, x2, syndrome
+ cbz syndrome, L(loop)
+
+#ifndef __AARCH64EB__
+ rbit syndrome, syndrome
+#endif
+ sub s, s, 16
+ clz syndrome, syndrome
+ sub x0, s, og_s
+ add x0, x0, syndrome, lsr 2
+ ret
+
+ .balign 8 /* For strspn, which has only 2 instructions here */
+L(early):
+#ifdef USE_AS_STRCSPN
+ /* strlen(set) < 2: call strchrnul(s, *set) and get its offset from S */
+ stp fp, lr, [sp, -32]!
+ .cfi_def_cfa_offset 32
+ .cfi_offset fp, -32
+ .cfi_offset lr, -24
+ str x19, [sp, 16]
+ .cfi_offset 19, -16
+ mov w1, w2
+ mov fp, sp
+ mov x19, x0
+ bl __strchrnul
+ sub x0, x0, x19
+ ldr x19, [sp, 16]
+ ldp fp, lr, [sp], 32
+ .cfi_restore lr
+ .cfi_restore fp
+ .cfi_restore 19
+ .cfi_def_cfa_offset 0
+#else
+ mov w0, 0
+#endif
+ ret
+END(STRSPN)
+
+#undef set
+libc_hidden_def(STRSPN)
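The routine above builds a 256-bit membership table on the stack (one bit per byte value; SBT is orr for strcspn and bic from an all-ones table for strspn, so a set bit always means "stop here"), then classifies 16 bytes per iteration with ushr/tbl/sshl and compresses the result to a nibble mask. A minimal scalar sketch of the same table technique in C; the function and parameter names are illustrative, and the 16-byte vector probe is replaced by a plain byte loop:

#include <stddef.h>
#include <stdint.h>

static size_t
span_scalar (const char *s, const char *set, int is_strcspn)
{
  uint64_t table[4] = { 0, 0, 0, 0 };

  /* Set bit c for every byte c of SET (the SBT step above).  */
  for (const unsigned char *p = (const unsigned char *) set; *p != 0; p++)
    table[*p >> 6] |= 1ULL << (*p & 63);

  /* For strspn a set bit must mean "stop", so flip the table.  */
  if (!is_strcspn)
    for (int i = 0; i < 4; i++)
      table[i] = ~table[i];
  table[0] |= 1;	/* Always stop at the NUL terminator.  */

  size_t n = 0;
  for (const unsigned char *p = (const unsigned char *) s;; p++, n++)
    if (table[*p >> 6] & (1ULL << (*p & 63)))
      return n;
}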
93
glibc-RHEL-118273-43.patch
Normal file
@@ -0,0 +1,93 @@
commit aebaeb2c330482171340e966f7f33fac884a27f4
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu Sep 18 14:24:47 2025 +0000

AArch64: Update math-vector-fortran.h

Update math-vector-fortran.h with the latest set of math functions
and sort by name.

Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>

diff --git a/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h b/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
index 92e15f0d6a758258..161f43d20c51e252 100644
--- a/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
+++ b/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
@@ -15,33 +15,74 @@
! You should have received a copy of the GNU Lesser General Public
! License along with the GNU C Library; if not, see
! <https://www.gnu.org/licenses/>.
+
!GCC$ builtin (acos) attributes simd (notinbranch)
!GCC$ builtin (acosf) attributes simd (notinbranch)
+!GCC$ builtin (acosh) attributes simd (notinbranch)
+!GCC$ builtin (acoshf) attributes simd (notinbranch)
+!GCC$ builtin (acospi) attributes simd (notinbranch)
+!GCC$ builtin (acospif) attributes simd (notinbranch)
!GCC$ builtin (asin) attributes simd (notinbranch)
!GCC$ builtin (asinf) attributes simd (notinbranch)
+!GCC$ builtin (asinh) attributes simd (notinbranch)
+!GCC$ builtin (asinhf) attributes simd (notinbranch)
+!GCC$ builtin (asinpi) attributes simd (notinbranch)
+!GCC$ builtin (asinpif) attributes simd (notinbranch)
!GCC$ builtin (atan) attributes simd (notinbranch)
-!GCC$ builtin (atanf) attributes simd (notinbranch)
!GCC$ builtin (atan2) attributes simd (notinbranch)
!GCC$ builtin (atan2f) attributes simd (notinbranch)
+!GCC$ builtin (atan2pi) attributes simd (notinbranch)
+!GCC$ builtin (atan2pif) attributes simd (notinbranch)
+!GCC$ builtin (atanf) attributes simd (notinbranch)
+!GCC$ builtin (atanh) attributes simd (notinbranch)
+!GCC$ builtin (atanhf) attributes simd (notinbranch)
+!GCC$ builtin (atanpi) attributes simd (notinbranch)
+!GCC$ builtin (atanpif) attributes simd (notinbranch)
+!GCC$ builtin (cbrt) attributes simd (notinbranch)
+!GCC$ builtin (cbrtf) attributes simd (notinbranch)
!GCC$ builtin (cos) attributes simd (notinbranch)
!GCC$ builtin (cosf) attributes simd (notinbranch)
+!GCC$ builtin (cosh) attributes simd (notinbranch)
+!GCC$ builtin (coshf) attributes simd (notinbranch)
+!GCC$ builtin (cospi) attributes simd (notinbranch)
+!GCC$ builtin (cospif) attributes simd (notinbranch)
+!GCC$ builtin (erf) attributes simd (notinbranch)
+!GCC$ builtin (erfc) attributes simd (notinbranch)
+!GCC$ builtin (erfcf) attributes simd (notinbranch)
+!GCC$ builtin (erff) attributes simd (notinbranch)
!GCC$ builtin (exp) attributes simd (notinbranch)
-!GCC$ builtin (expf) attributes simd (notinbranch)
!GCC$ builtin (exp10) attributes simd (notinbranch)
!GCC$ builtin (exp10f) attributes simd (notinbranch)
+!GCC$ builtin (exp10m1) attributes simd (notinbranch)
+!GCC$ builtin (exp10m1f) attributes simd (notinbranch)
!GCC$ builtin (exp2) attributes simd (notinbranch)
!GCC$ builtin (exp2f) attributes simd (notinbranch)
+!GCC$ builtin (exp2m1) attributes simd (notinbranch)
+!GCC$ builtin (exp2m1f) attributes simd (notinbranch)
+!GCC$ builtin (expf) attributes simd (notinbranch)
!GCC$ builtin (expm1) attributes simd (notinbranch)
!GCC$ builtin (expm1f) attributes simd (notinbranch)
+!GCC$ builtin (hypot) attributes simd (notinbranch)
+!GCC$ builtin (hypotf) attributes simd (notinbranch)
!GCC$ builtin (log) attributes simd (notinbranch)
-!GCC$ builtin (logf) attributes simd (notinbranch)
!GCC$ builtin (log10) attributes simd (notinbranch)
!GCC$ builtin (log10f) attributes simd (notinbranch)
!GCC$ builtin (log1p) attributes simd (notinbranch)
!GCC$ builtin (log1pf) attributes simd (notinbranch)
!GCC$ builtin (log2) attributes simd (notinbranch)
!GCC$ builtin (log2f) attributes simd (notinbranch)
+!GCC$ builtin (logf) attributes simd (notinbranch)
+!GCC$ builtin (pow) attributes simd (notinbranch)
+!GCC$ builtin (powf) attributes simd (notinbranch)
!GCC$ builtin (sin) attributes simd (notinbranch)
!GCC$ builtin (sinf) attributes simd (notinbranch)
+!GCC$ builtin (sinh) attributes simd (notinbranch)
+!GCC$ builtin (sinhf) attributes simd (notinbranch)
+!GCC$ builtin (sinpi) attributes simd (notinbranch)
+!GCC$ builtin (sinpif) attributes simd (notinbranch)
!GCC$ builtin (tan) attributes simd (notinbranch)
!GCC$ builtin (tanf) attributes simd (notinbranch)
+!GCC$ builtin (tanh) attributes simd (notinbranch)
+!GCC$ builtin (tanhf) attributes simd (notinbranch)
+!GCC$ builtin (tanpi) attributes simd (notinbranch)
+!GCC$ builtin (tanpif) attributes simd (notinbranch)
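These directives are the gfortran counterpart of what <bits/math-vector.h> does for C: they declare that each listed function has a vector variant with the notinbranch ABI, so plain scalar calls in a loop may be replaced by libmvec kernels. A hedged C illustration; whether the loop actually vectorizes depends on compiler version and options (e.g. gcc -Ofast on aarch64, since the C declarations are only active under fast-math):

#include <math.h>

void
sin_array (float *restrict dst, const float *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = sinf (src[i]);	/* candidate for a _ZGVnN4v_sinf call */
}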
97
glibc-RHEL-118273-44.patch
Normal file
@@ -0,0 +1,97 @@
commit 6c22823da57aa5218f717f569c04c9573c0448c5
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu Nov 6 18:26:54 2025 +0000

AArch64: Fix instability in AdvSIMD tan

Previously presence of special-cases in one lane could affect the
results in other lanes due to unconditional scalar fallback. The old
WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
been removed from AOR, making it easier to spot and fix this. 4%
improvement in throughput with GCC 14 on Neoverse V1. This bug is
present as far back as 2.39 (where tan was first introduced).

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index d56a102dd17a3463..c6a5a17126674d7d 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -25,9 +25,7 @@ static const struct data
 float64x2_t poly[9];
 double half_pi[2];
 float64x2_t two_over_pi, shift;
-#if !WANT_SIMD_EXCEPT
 float64x2_t range_val;
-#endif
} data = {
 /* Coefficients generated using FPMinimax. */
 .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
@@ -38,20 +36,17 @@ static const struct data
 .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
 .two_over_pi = V2 (0x1.45f306dc9c883p-1),
 .shift = V2 (0x1.8p52),
-#if !WANT_SIMD_EXCEPT
 .range_val = V2 (0x1p23),
-#endif
};

#define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */
#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */
-#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */

/* Special cases (fall back to scalar calls). */
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t n, float64x2_t d, uint64x2_t special)
{
- return v_call_f64 (tan, x, x, v_u64 (-1));
+ return v_call_f64 (tan, x, vdivq_f64 (n, d), special);
}

/* Vector approximation for double-precision tan.
@@ -65,14 +60,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 very large inputs. Fall back to scalar routine for all lanes if any are
 too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
 tiny input to avoid underflow. */
-#if WANT_SIMD_EXCEPT
- uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
- /* iax - tiny_bound > range_val - tiny_bound. */
- uint64x2_t special
- = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x);
-#endif

 /* q = nearest integer to 2 * x / pi. */
 float64x2_t q
@@ -81,9 +68,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)

 /* Use q to reduce x to r in [-pi/4, pi/4], by:
 r = x - q * pi/2, in extended precision. */
- float64x2_t r = x;
 float64x2_t half_pi = vld1q_f64 (dat->half_pi);
- r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+ float64x2_t r = vfmsq_laneq_f64 (x, q, half_pi, 0);
 r = vfmsq_laneq_f64 (r, q, half_pi, 1);
 /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
 formula. */
@@ -114,12 +100,13 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)

 uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));

-#if !WANT_SIMD_EXCEPT
 uint64x2_t special = vcageq_f64 (x, dat->range_val);
+ float64x2_t swap = vbslq_f64 (no_recip, n, vnegq_f64 (d));
+ d = vbslq_f64 (no_recip, d, n);
+ n = swap;
+
 if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x);
-#endif
+ return special_case (x, n, d, special);

- return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
- vbslq_f64 (no_recip, d, n));
+ return vdivq_f64 (n, d);
}
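The fix follows a common libmvec pattern: compute the vector result for every lane first, then hand both the inputs and the provisional results to the special-case handler, which re-does only the flagged lanes with the scalar routine. A sketch of that per-lane fixup in C, modelled on (but not identical to) glibc's internal v_call_f64 helper; the name call_lanewise is made up:

#include <arm_neon.h>

static float64x2_t
call_lanewise (double (*f) (double), float64x2_t x, float64x2_t y,
	       uint64x2_t special)
{
  /* Replace only the lanes flagged in SPECIAL with scalar results;
     all other lanes keep the value the vector path already computed.  */
  if (vgetq_lane_u64 (special, 0))
    y = vsetq_lane_f64 (f (vgetq_lane_f64 (x, 0)), y, 0);
  if (vgetq_lane_u64 (special, 1))
    y = vsetq_lane_f64 (f (vgetq_lane_f64 (x, 1)), y, 1);
  return y;
}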
88
glibc-RHEL-118273-45.patch
Normal file
@@ -0,0 +1,88 @@
commit e45af510bc816e860c8e2e1d4a652b4fe15c4b34
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu Nov 6 18:29:33 2025 +0000

AArch64: Fix instability in AdvSIMD sinh

Previously presence of special-cases in one lane could affect the
results in other lanes due to unconditional scalar fallback. The old
WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
been removed from AOR, making it easier to spot and fix
this. No measured change in performance. This patch applies cleanly as
far back as 2.41, however there are conflicts with 2.40 where sinh was
first introduced.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
index 7adf771517de2507..66504cdee84ee77e 100644
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -24,36 +24,26 @@ static const struct data
{
 struct v_expm1_data d;
 uint64x2_t halff;
-#if WANT_SIMD_EXCEPT
- uint64x2_t tiny_bound, thresh;
-#else
 float64x2_t large_bound;
-#endif
} data = {
 .d = V_EXPM1_DATA,
 .halff = V2 (0x3fe0000000000000),
-#if WANT_SIMD_EXCEPT
- /* 2^-26, below which sinh(x) rounds to x. */
- .tiny_bound = V2 (0x3e50000000000000),
- /* asuint(large_bound) - asuint(tiny_bound). */
- .thresh = V2 (0x0230000000000000),
-#else
 /* 2^9. expm1 helper overflows for large input. */
 .large_bound = V2 (0x1p+9),
-#endif
};

static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t t, float64x2_t halfsign,
+ uint64x2_t special)
{
- return v_call_f64 (sinh, x, x, v_u64 (-1));
+ return v_call_f64 (sinh, x, vmulq_f64 (t, halfsign), special);
}

/* Approximation for vector double-precision sinh(x) using expm1.
 sinh(x) = (exp(x) - exp(-x)) / 2.
 The greatest observed error is 2.52 ULP:
- _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
- want -0x1.ac2f05bb66fc9p-2. */
+ _ZGVnN2v_sinh(0x1.9f6ff2ab6fb19p-2) got 0x1.aaed83a3153ccp-2
+ want 0x1.aaed83a3153c9p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
{
 const struct data *d = ptr_barrier (&data);
@@ -63,21 +53,16 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
 float64x2_t halfsign = vreinterpretq_f64_u64 (
 vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));

-#if WANT_SIMD_EXCEPT
- uint64x2_t special = vcgeq_u64 (
- vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
-#else
 uint64x2_t special = vcageq_f64 (x, d->large_bound);
-#endif
-
- /* Fall back to scalar variant for all lanes if any of them are special. */
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x);

 /* Up to the point that expm1 overflows, we can use it to calculate sinh
 using a slight rearrangement of the definition of sinh. This allows us to
 retain acceptable accuracy for very small inputs. */
 float64x2_t t = expm1_inline (ax, &d->d);
 t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, t, halfsign, special);
+
 return vmulq_f64 (t, halfsign);
}
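Both sinh patches lean on the same identity, which is worth spelling out once. With t = expm1(|x|) we have e^|x| = 1 + t and e^-|x| = 1/(1 + t), so sinh(|x|) = ((1+t) - 1/(1+t))/2 = (t + t/(t+1))/2; the sign is restored by the "halfsign" multiply. A scalar C rendering of the reconstruction (a sketch, not the vector implementation):

#include <math.h>

static double
sinh_via_expm1 (double x)
{
  /* expm1 keeps full precision for tiny |x|, unlike exp(x) - 1.  */
  double t = expm1 (fabs (x));
  return (t + t / (t + 1.0)) * copysign (0.5, x);	/* halfsign */
}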
475
glibc-RHEL-118273-5.patch
Normal file
@@ -0,0 +1,475 @@
commit 8b679205286e7874f0b04187c0bc787632168aa2
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Wed Apr 3 12:13:53 2024 +0100

aarch64/fpu: Add vector variants of atanh

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index d474f2969dd05c26..4c878e590681becc 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -3,6 +3,7 @@ libmvec-supported-funcs = acos \
 asin \
 asinh \
 atan \
+ atanh \
 atan2 \
 cos \
 cosh \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 08ea15efaec959fb..092949dc96d55624 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -89,6 +89,11 @@ libmvec {
 _ZGVnN4v_asinhf;
 _ZGVsMxv_asinh;
 _ZGVsMxv_asinhf;
+ _ZGVnN2v_atanh;
+ _ZGVnN2v_atanhf;
+ _ZGVnN4v_atanhf;
+ _ZGVsMxv_atanh;
+ _ZGVsMxv_atanhf;
 _ZGVnN2v_cosh;
 _ZGVnN2v_coshf;
 _ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 1e80721c9f73ba12..afbb01e191b917a4 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -22,6 +22,7 @@ libmvec_hidden_proto (V_NAME_F1(acosh));
libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(asinh));
libmvec_hidden_proto (V_NAME_F1(atan));
+libmvec_hidden_proto (V_NAME_F1(atanh));
libmvec_hidden_proto (V_NAME_F1(cos));
libmvec_hidden_proto (V_NAME_F1(cosh));
libmvec_hidden_proto (V_NAME_F1(erf));
diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c
new file mode 100644
index 0000000000000000..3c3d0bd6ad41396d
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanh_advsimd.c
@@ -0,0 +1,64 @@
+/* Double-precision vector (Advanced SIMD) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one, half;
+} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .half = V2 (0x3fe0000000000000) };
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (atanh, x, y, special);
+}
+
+/* Approximation for vector double-precision atanh(x) using modified log1p.
+ The greatest observed error is 3.31 ULP:
+ _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+ want 0x1.ffd8ff31b501cp-6. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (ax);
+ uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
+ uint64x2_t special = vcgeq_u64 (ia, d->one);
+ float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
+
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, special);
+#endif
+
+ float64x2_t y;
+ y = vaddq_f64 (ax, ax);
+ y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
+ y = log1p_inline (y, &d->log1p_consts);
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, vmulq_f64 (y, halfsign), special);
+ return vmulq_f64 (y, halfsign);
+}
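All four atanh variants in this patch reduce to a single log1p call via atanh(x) = 0.5 * log((1+x)/(1-x)) together with (1+x)/(1-x) = 1 + 2x/(1-x). A scalar sketch of the reduction (illustrative only; the vector code inlines log1p and routes |x| >= 1 through special_case):

#include <math.h>

static double
atanh_via_log1p (double x)
{
  double ax = fabs (x);
  double y = log1p (2.0 * ax / (1.0 - ax));
  return y * copysign (0.5, x);
}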
diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c
new file mode 100644
index 0000000000000000..7a52728d70f6d226
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanh_sve.c
@@ -0,0 +1,59 @@
+/* Double-precision vector (SVE) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_SV_LOG1P_K0_SHORTCUT 0
+#include "sv_log1p_inline.h"
+
+#define One (0x3ff0000000000000)
+#define Half (0x3fe0000000000000)
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (atanh, x, y, special);
+}
+
+/* SVE approximation for double-precision atanh, based on log1p.
+ The greatest observed error is 2.81 ULP:
+ _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+ want 0x1.ffd8ff31b501cp-6. */
+svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
+{
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t iax = svreinterpret_u64 (ax);
+ svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+ svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
+
+ /* It is special if iax >= 1. */
+// svbool_t special = svcmpge (pg, iax, One);
+ svbool_t special = svacge (pg, x, 1.0);
+
+ /* Computation is performed based on the following sequence of equality:
+ (1+x)/(1-x) = 1 + 2x/(1-x). */
+ svfloat64_t y;
+ y = svadd_x (pg, ax, ax);
+ y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1), ax));
+ /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
+ y = sv_log1p_inline (y, pg);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, halfsign, y), special);
+ return svmul_x (pg, halfsign, y);
+}
diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c
new file mode 100644
index 0000000000000000..ae488f7b54ddce26
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c
@@ -0,0 +1,79 @@
+/* Single-precision vector (Advanced SIMD) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "v_log1pf_inline.h"
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .one = V4 (0x3f800000),
+#if WANT_SIMD_EXCEPT
+ /* 0x1p-12, below which atanhf(x) rounds to x. */
+ .tiny_bound = V4 (0x39800000),
+#endif
+};
+
+#define AbsMask v_u32 (0x7fffffff)
+#define Half v_u32 (0x3f000000)
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (atanhf, x, y, special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+ The maximum error is 3.08 ULP:
+ __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
+ want 0x1.ffcb82p-5. */
+VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special
+ = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound));
+ /* Side-step special cases by setting those lanes to 0, which will trigger no
+ exceptions. These will be fixed up later. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ ax = v_zerofy_f32 (ax, special);
+#else
+ uint32x4_t special = vcgeq_u32 (iax, d->one);
+#endif
+
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
+ y = log1pf_inline (y, d->log1pf_consts);
+
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, vmulq_f32 (halfsign, y), special);
+ return vmulq_f32 (halfsign, y);
+}
+libmvec_hidden_def (V_NAME_F1 (atanh))
+HALF_WIDTH_ALIAS_F1 (atanh)
|
||||
new file mode 100644
|
||||
index 0000000000000000..dae83041ef7157f0
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/aarch64/fpu/atanhf_sve.c
|
||||
@@ -0,0 +1,54 @@
|
||||
+/* Single-precision vector (SVE) atanh function
|
||||
+
|
||||
+ Copyright (C) 2024 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include "sv_log1pf_inline.h"
|
||||
+
|
||||
+#define One (0x3f800000)
|
||||
+#define Half (0x3f000000)
|
||||
+
|
||||
+static svfloat32_t NOINLINE
|
||||
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
|
||||
+{
|
||||
+ return sv_call_f32 (atanhf, x, y, special);
|
||||
+}
|
||||
+
|
||||
+/* Approximation for vector single-precision atanh(x) using modified log1p.
|
||||
+ The maximum error is 2.28 ULP:
|
||||
+ _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
|
||||
+ want 0x1.ffbbb6p-5. */
|
||||
+svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
|
||||
+{
|
||||
+ svfloat32_t ax = svabs_x (pg, x);
|
||||
+ svuint32_t iax = svreinterpret_u32 (ax);
|
||||
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
|
||||
+ svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, Half));
|
||||
+ svbool_t special = svcmpge (pg, iax, One);
|
||||
+
|
||||
+ /* Computation is performed based on the following sequence of equality:
|
||||
+ * (1+x)/(1-x) = 1 + 2x/(1-x). */
|
||||
+ svfloat32_t y = svadd_x (pg, ax, ax);
|
||||
+ y = svdiv_x (pg, y, svsub_x (pg, sv_f32 (1), ax));
|
||||
+ /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
|
||||
+ y = sv_log1pf_inline (y, pg);
|
||||
+
|
||||
+ if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
+ return special_case (x, svmul_x (pg, halfsign, y), special);
|
||||
+
|
||||
+ return svmul_x (pg, halfsign, y);
|
||||
+}
|
||||
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
|
||||
index eb2af35b27757fc6..ab7a8f74548854b9 100644
|
||||
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
|
||||
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
|
||||
@@ -49,6 +49,10 @@
|
||||
# define __DECL_SIMD_atan __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_atanf
|
||||
# define __DECL_SIMD_atanf __DECL_SIMD_aarch64
|
||||
+# undef __DECL_SIMD_atanh
|
||||
+# define __DECL_SIMD_atanh __DECL_SIMD_aarch64
|
||||
+# undef __DECL_SIMD_atanhf
|
||||
+# define __DECL_SIMD_atanhf __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_atan2
|
||||
# define __DECL_SIMD_atan2 __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_atan2f
|
||||
@@ -137,6 +141,7 @@ __vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
|
||||
+__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
|
||||
@@ -157,6 +162,7 @@ __vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
|
||||
+__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
|
||||
@@ -182,6 +188,7 @@ __sv_f32_t _ZGVsMxv_acoshf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
|
||||
+__sv_f32_t _ZGVsMxv_atanhf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
|
||||
@@ -202,6 +209,7 @@ __sv_f64_t _ZGVsMxv_acosh (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
|
||||
+__sv_f64_t _ZGVsMxv_atanh (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
|
||||
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
|
||||
index 3d7177c32dcd77a6..a01aa99c16740631 100644
|
||||
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
|
||||
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
|
||||
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (acosh_advsimd, _ZGVnN2v_acosh)
|
||||
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
|
||||
VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh)
|
||||
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
|
||||
+VPCS_VECTOR_WRAPPER (atanh_advsimd, _ZGVnN2v_atanh)
|
||||
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
|
||||
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
|
||||
VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
|
||||
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
|
||||
index b88a2afe5c1198c0..83cb3ad5d0e4d056 100644
|
||||
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
|
||||
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
|
||||
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (acosh_sve, _ZGVsMxv_acosh)
|
||||
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
|
||||
SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh)
|
||||
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
|
||||
+SVE_VECTOR_WRAPPER (atanh_sve, _ZGVsMxv_atanh)
|
||||
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
|
||||
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
|
||||
SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
|
||||
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
|
||||
index 533655402d3f3737..831d4d755272d616 100644
|
||||
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
|
||||
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
|
||||
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (acoshf_advsimd, _ZGVnN4v_acoshf)
|
||||
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
|
||||
VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf)
|
||||
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
|
||||
+VPCS_VECTOR_WRAPPER (atanhf_advsimd, _ZGVnN4v_atanhf)
|
||||
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
|
||||
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
|
||||
VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
|
||||
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
|
||||
index f7b673e3358e7d82..96fd612c3e76f6dc 100644
|
||||
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
|
||||
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
|
||||
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (acoshf_sve, _ZGVsMxv_acoshf)
|
||||
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
|
||||
SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf)
|
||||
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
|
||||
+SVE_VECTOR_WRAPPER (atanhf_sve, _ZGVsMxv_atanhf)
|
||||
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
|
||||
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
|
||||
SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
|
||||
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
|
||||
index b916e422432014c2..7c2e43d3dc5bbc13 100644
|
||||
--- a/sysdeps/aarch64/libm-test-ulps
|
||||
+++ b/sysdeps/aarch64/libm-test-ulps
|
||||
@@ -173,11 +173,19 @@ double: 2
|
||||
float: 2
|
||||
ldouble: 4
|
||||
|
||||
+Function: "atanh_advsimd":
|
||||
+double: 1
|
||||
+float: 1
|
||||
+
|
||||
Function: "atanh_downward":
|
||||
double: 3
|
||||
float: 3
|
||||
ldouble: 4
|
||||
|
||||
+Function: "atanh_sve":
|
||||
+double: 2
|
||||
+float: 1
|
||||
+
|
||||
Function: "atanh_towardzero":
|
||||
double: 2
|
||||
float: 2
|
||||
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
|
||||
index f288afdfdd9c8757..ce42372a3a276832 100644
|
||||
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
|
||||
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
|
||||
@@ -77,18 +77,23 @@ GLIBC_2.40 _ZGVnN2v_acosh F
|
||||
GLIBC_2.40 _ZGVnN2v_acoshf F
|
||||
GLIBC_2.40 _ZGVnN2v_asinh F
|
||||
GLIBC_2.40 _ZGVnN2v_asinhf F
|
||||
+GLIBC_2.40 _ZGVnN2v_atanh F
|
||||
+GLIBC_2.40 _ZGVnN2v_atanhf F
|
||||
GLIBC_2.40 _ZGVnN2v_cosh F
|
||||
GLIBC_2.40 _ZGVnN2v_coshf F
|
||||
GLIBC_2.40 _ZGVnN2v_erf F
|
||||
GLIBC_2.40 _ZGVnN2v_erff F
|
||||
GLIBC_2.40 _ZGVnN4v_acoshf F
|
||||
GLIBC_2.40 _ZGVnN4v_asinhf F
|
||||
+GLIBC_2.40 _ZGVnN4v_atanhf F
|
||||
GLIBC_2.40 _ZGVnN4v_coshf F
|
||||
GLIBC_2.40 _ZGVnN4v_erff F
|
||||
GLIBC_2.40 _ZGVsMxv_acosh F
|
||||
GLIBC_2.40 _ZGVsMxv_acoshf F
|
||||
GLIBC_2.40 _ZGVsMxv_asinh F
|
||||
GLIBC_2.40 _ZGVsMxv_asinhf F
|
||||
+GLIBC_2.40 _ZGVsMxv_atanh F
|
||||
+GLIBC_2.40 _ZGVsMxv_atanhf F
|
||||
GLIBC_2.40 _ZGVsMxv_cosh F
|
||||
GLIBC_2.40 _ZGVsMxv_coshf F
|
||||
GLIBC_2.40 _ZGVsMxv_erf F
|
||||
758
glibc-RHEL-118273-6.patch
Normal file
@@ -0,0 +1,758 @@
commit eedbbca0bf3adf3c45aff6c4e128bae3a5562675
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Wed Apr 3 12:15:41 2024 +0100

aarch64/fpu: Add vector variants of sinh

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 4c878e590681becc..fb5f3a365b27fdf3 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -17,6 +17,7 @@ libmvec-supported-funcs = acos \
 log1p \
 log2 \
 sin \
+ sinh \
 tan

float-advsimd-funcs = $(libmvec-supported-funcs)
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 092949dc96d55624..4774b3efeacf59fb 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -104,5 +104,10 @@ libmvec {
 _ZGVnN4v_erff;
 _ZGVsMxv_erf;
 _ZGVsMxv_erff;
+ _ZGVnN2v_sinh;
+ _ZGVnN2v_sinhf;
+ _ZGVnN4v_sinhf;
+ _ZGVsMxv_sinh;
+ _ZGVsMxv_sinhf;
 }
}
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index afbb01e191b917a4..7d9445d5c0c0c2a8 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -35,5 +35,6 @@ libmvec_hidden_proto (V_NAME_F1(log1p));
libmvec_hidden_proto (V_NAME_F1(log2));
libmvec_hidden_proto (V_NAME_F1(log));
libmvec_hidden_proto (V_NAME_F1(sin));
+libmvec_hidden_proto (V_NAME_F1(sinh));
libmvec_hidden_proto (V_NAME_F1(tan));
libmvec_hidden_proto (V_NAME_F2(atan2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index ab7a8f74548854b9..1e9b76cf41916365 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -105,6 +105,10 @@
 # define __DECL_SIMD_sin __DECL_SIMD_aarch64
 # undef __DECL_SIMD_sinf
 # define __DECL_SIMD_sinf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_sinh
+# define __DECL_SIMD_sinh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_sinhf
+# define __DECL_SIMD_sinhf __DECL_SIMD_aarch64
 # undef __DECL_SIMD_tan
 # define __DECL_SIMD_tan __DECL_SIMD_aarch64
 # undef __DECL_SIMD_tanf
@@ -154,6 +158,7 @@ __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);

__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
@@ -175,6 +180,7 @@ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);

# undef __ADVSIMD_VEC_MATH_SUPPORTED
@@ -201,6 +207,7 @@ __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);

__sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
@@ -222,6 +229,7 @@ __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t);

# undef __SVE_VEC_MATH_SUPPORTED
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
new file mode 100644
index 0000000000000000..fa3723b10c15eb29
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -0,0 +1,121 @@
+/* Double-precision vector (Advanced SIMD) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+ float64x2_t poly[11];
+ float64x2_t inv_ln2, m_ln2, shift;
+ uint64x2_t halff;
+ int64x2_t onef;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t tiny_bound, thresh;
+#else
+ uint64x2_t large_bound;
+#endif
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+ V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+ V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+ V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+ V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep0),
+ .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
+ .shift = V2 (0x1.8p52),
+
+ .halff = V2 (0x3fe0000000000000),
+ .onef = V2 (0x3ff0000000000000),
+#if WANT_SIMD_EXCEPT
+ /* 2^-26, below which sinh(x) rounds to x. */
+ .tiny_bound = V2 (0x3e50000000000000),
+ /* asuint(large_bound) - asuint(tiny_bound). */
+ .thresh = V2 (0x0230000000000000),
+#else
+/* 2^9. expm1 helper overflows for large input. */
+ .large_bound = V2 (0x4080000000000000),
+#endif
+};
+
+static inline float64x2_t
+expm1_inline (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Reduce argument:
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where i = round(x / ln2)
+ and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
+ float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
+ int64x2_t i = vcvtq_s64_f64 (j);
+ float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0);
+ f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1);
+ /* Approximate expm1(f) using polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t f8 = vmulq_f64 (f4, f4);
+ float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
+ /* t = 2^i. */
+ float64x2_t t = vreinterpretq_f64_u64 (
+ vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef)));
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x)
+{
+ return v_call_f64 (sinh, x, x, v_u64 (-1));
+}
+
+/* Approximation for vector double-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The greatest observed error is 2.57 ULP:
+ _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
+ want 0x1.ab34e59d678d9p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t sign
+ = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
+ float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff));
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t special = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
+#else
+ uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound);
+#endif
+
+ /* Fall back to scalar variant for all lanes if any of them are special. */
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x);
+
+ /* Up to the point that expm1 overflows, we can use it to calculate sinh
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ float64x2_t t = expm1_inline (ax);
+ t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+ return vmulq_f64 (t, halfsign);
+}
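The expm1_inline helpers in this patch all use the classic reduction stated in the comment: i = round(x/ln2), f = x - i*ln2, and expm1(x) = 2^i * (expm1(f) + 1) - 1 = p*2^i + (2^i - 1) with p approximating expm1(f). A scalar C model of that reconstruction (the polynomial here is deliberately crude; the real helper uses an 11-coefficient Estrin evaluation and a shift-based round-to-nearest):

#include <math.h>

#ifndef M_LN2
# define M_LN2 0.69314718055994530942	/* ln(2) */
#endif

static double
expm1_reduced (double x)
{
  double j = round (x / M_LN2);		/* i = round(x / ln2) */
  double f = x - j * M_LN2;		/* f in [-ln2/2, ln2/2] */
  double p = f + 0.5 * f * f;		/* placeholder expm1(f) poly */
  double t = ldexp (1.0, (int) j);	/* t = 2^i, exact */
  return p * t + (t - 1.0);		/* expm1(x) = p*2^i + (2^i - 1) */
}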
diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c
new file mode 100644
index 0000000000000000..df5f6c8c06e5b173
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinh_sve.c
@@ -0,0 +1,107 @@
+/* Double-precision vector (SVE) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ float64_t poly[11];
+ float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift;
+ uint64_t halff;
+ int64_t onef;
+ uint64_t large_bound;
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
+ 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
+ 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+ .inv_ln2 = 0x1.71547652b82fep0,
+ .m_ln2_hi = -0x1.62e42fefa39efp-1,
+ .m_ln2_lo = -0x1.abc9e3b39803fp-56,
+ .shift = 0x1.8p52,
+
+ .halff = 0x3fe0000000000000,
+ .onef = 0x3ff0000000000000,
+ /* 2^9. expm1 helper overflows for large input. */
+ .large_bound = 0x4080000000000000,
+};
+
+static inline svfloat64_t
+expm1_inline (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Reduce argument:
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where i = round(x / ln2)
+ and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
+ svfloat64_t j
+ = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
+ svint64_t i = svcvt_s64_x (pg, j);
+ svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi);
+ f = svmla_x (pg, f, j, d->m_ln2_lo);
+ /* Approximate expm1(f) using polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t f4 = svmul_x (pg, f2, f2);
+ svfloat64_t f8 = svmul_x (pg, f4, f4);
+ svfloat64_t p
+ = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
+ /* t = 2^i. */
+ svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return svmla_x (pg, svsub_x (pg, t, 1.0), p, t);
+}
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svbool_t pg)
+{
+ return sv_call_f64 (sinh, x, x, pg);
+}
+
+/* Approximation for SVE double-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The greatest observed error is 2.57 ULP:
+ _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2
+ want 0x1.ab929fc64bd63p-2. */
+svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t sign
+ = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
+ svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
+
+ svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound);
+
+ /* Fall back to scalar variant for all lanes if any are special. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, pg);
+
+ /* Up to the point that expm1 overflows, we can use it to calculate sinh
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ svfloat64_t t = expm1_inline (ax, pg);
+ t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+ return svmul_x (pg, t, halfsign);
+}
diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c
new file mode 100644
index 0000000000000000..6bb7482dc28795c1
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
@@ -0,0 +1,88 @@
+/* Single-precision vector (Advanced SIMD) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+ uint32x4_t halff;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound, thresh;
+#else
+ uint32x4_t oflow_bound;
+#endif
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+ .halff = V4 (0x3f000000),
+#if WANT_SIMD_EXCEPT
+ /* 0x1.6a09e8p-32, below which expm1f underflows. */
+ .tiny_bound = V4 (0x2fb504f4),
+ /* asuint(oflow_bound) - asuint(tiny_bound). */
+ .thresh = V4 (0x12fbbbb3),
+#else
+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
+ .oflow_bound = V4 (0x42b0c0a7),
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (sinhf, x, y, special);
+}
+
+/* Approximation for vector single-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The maximum error is 2.26 ULP:
+ _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+ want 0x1.e469e4p-4. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t sign = veorq_u32 (ix, iax);
+ float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+ ax = v_zerofy_f32 (ax, special);
+#else
+ uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+#endif
+
+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+ using a slight rearrangement of the definition of asinh. This allows us
+ to retain acceptable accuracy for very small inputs. */
+ float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
+ t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+
+ /* Fall back to the scalar variant for any lanes that should trigger an
+ exception. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, vmulq_f32 (t, halfsign), special);
+
+ return vmulq_f32 (t, halfsign);
+}
+libmvec_hidden_def (V_NAME_F1 (sinh))
+HALF_WIDTH_ALIAS_F1 (sinh)
diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c
new file mode 100644
index 0000000000000000..6c204b57a2aa18d3
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinhf_sve.c
@@ -0,0 +1,67 @@
+/* Single-precision vector (SVE) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_expm1f_inline.h"
+#include "sv_math.h"
+
+static const struct data
+{
+ struct sv_expm1f_data expm1f_consts;
+ uint32_t halff, large_bound;
+} data = {
+ .expm1f_consts = SV_EXPM1F_DATA,
+ .halff = 0x3f000000,
+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
+ .large_bound = 0x42b0c0a7,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+{
+ return sv_call_f32 (sinhf, x, y, pg);
+}
+
+/* Approximation for SVE single-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The maximum error is 2.26 ULP:
+ _ZGVsMxv_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+ want 0x1.e469e4p-4. */
+svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t sign
+ = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+ svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
+
+ svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->large_bound);
+
+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+ using a slight rearrangement of the definition of asinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ svfloat32_t t = expm1f_inline (ax, pg, &d->expm1f_consts);
+ t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+
+ /* Fall back to the scalar variant for any lanes which would cause
+ expm1f to overflow. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, t, halfsign), special);
+
+ return svmul_x (pg, t, halfsign);
+}
diff --git a/sysdeps/aarch64/fpu/sv_expm1f_inline.h b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
new file mode 100644
index 0000000000000000..5b7245122294e1b4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
@@ -0,0 +1,84 @@
+/* Single-precision inline helper for vector (SVE) expm1 function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef AARCH64_FPU_SV_EXPM1F_INLINE_H
+#define AARCH64_FPU_SV_EXPM1F_INLINE_H
+
+#include "sv_math.h"
+
+struct sv_expm1f_data
+{
+  /* These 4 are grouped together so they can be loaded as one quadword, then
+     used with _lane forms of svmla/svmls.  */
+  float32_t c2, c4, ln2_hi, ln2_lo;
+  float32_t c0, c1, c3, inv_ln2, shift;
+};
+
+/* Coefficients generated using fpminimax.  */
+#define SV_EXPM1F_DATA                                                       \
+  {                                                                          \
+    .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5,           \
+    .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10,                               \
+                                                                             \
+    .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,   \
+    .ln2_lo = 0x1.7f7d1cp-20f,                                               \
+  }
+
+#define C(i) sv_f32 (d->c##i)
+
+static inline svfloat32_t
+expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
+{
+  /* This vector is reliant on layout of data - it contains constants
+     that can be used with _lane forms of svmla/svmls.  Values are:
+     [ coeff_2, coeff_4, ln2_hi, ln2_lo ].  */
+  svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
+
+  /* Reduce argument to smaller range:
+     Let i = round(x / ln2)
+     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+     where 2^i is exact because i is an integer.  */
+  svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
+  j = svsub_x (pg, j, d->shift);
+  svint32_t i = svcvt_s32_x (pg, j);
+
+  svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
+  f = svmls_lane (f, j, lane_constants, 3);
+
+  /* Approximate expm1(f) using polynomial.
+     Taylor expansion for expm1(x) has the form:
+	 x + ax^2 + bx^3 + cx^4 ....
+     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+     and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
+  svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
+  svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
+  svfloat32_t f2 = svmul_x (pg, f, f);
+  svfloat32_t p = svmla_x (pg, p12, f2, p34);
+  p = svmla_x (pg, C (0), f, p);
+  p = svmla_x (pg, f, f2, p);
+
+  /* Assemble the result.
+     expm1(x) ~= 2^i * (p + 1) - 1
+     Let t = 2^i.  */
+  svfloat32_t t = svscale_x (pg, sv_f32 (1), i);
+  return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
+
+#endif
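The reduction described in the comments above is easier to follow in scalar form. A minimal sketch (not part of the patch; constants taken from SV_EXPM1F_DATA, and the polynomial truncated to its leading coefficient c0 for brevity):

#include <math.h>

/* exp(x) - 1 = 2^i * (expm1(f) + 1) - 1, i = round(x/ln2), f = x - i*ln2.  */
static float
expm1f_reduction_sketch (float x)
{
  float j = roundf (x * 0x1.715476p+0f); /* i = round (x / ln2) */
  float f = x - j * 0x1.62e4p-1f;        /* subtract ln2_hi */
  f -= j * 0x1.7f7d1cp-20f;              /* then ln2_lo: f in [-ln2/2, ln2/2] */
  float p = f + 0x1.fffffep-2f * f * f;  /* expm1(f) ~= f + f^2 * P(f) */
  float t = ldexpf (1.0f, (int) j);      /* t = 2^i, exact */
  return t * p + (t - 1.0f);             /* 2^i * (p + 1) - 1 */
}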
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index a01aa99c16740631..1a57b22c3a92f1e1 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -42,4 +42,5 @@ VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
 VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
 VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
 VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
+VPCS_VECTOR_WRAPPER (sinh_advsimd, _ZGVnN2v_sinh)
 VPCS_VECTOR_WRAPPER (tan_advsimd, _ZGVnN2v_tan)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 83cb3ad5d0e4d056..0c9858f6b74aaef6 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -61,4 +61,5 @@ SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
 SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
 SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
 SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
+SVE_VECTOR_WRAPPER (sinh_sve, _ZGVsMxv_sinh)
 SVE_VECTOR_WRAPPER (tan_sve, _ZGVsMxv_tan)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 831d4d755272d616..4758490c6fc40fda 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -42,4 +42,5 @@ VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
 VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
 VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
 VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
+VPCS_VECTOR_WRAPPER (sinhf_advsimd, _ZGVnN4v_sinhf)
 VPCS_VECTOR_WRAPPER (tanf_advsimd, _ZGVnN4v_tanf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 96fd612c3e76f6dc..7c04f07bbee84777 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -61,4 +61,5 @@ SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
 SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
 SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
 SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
+SVE_VECTOR_WRAPPER (sinhf_sve, _ZGVsMxv_sinhf)
 SVE_VECTOR_WRAPPER (tanf_sve, _ZGVsMxv_tanf)
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
new file mode 100644
index 0000000000000000..337ccfbfab555c97
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -0,0 +1,73 @@
+/* Single-precision inline helper for vector (Advanced SIMD) expm1 function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef AARCH64_FPU_V_EXPM1F_INLINE_H
+#define AARCH64_FPU_V_EXPM1F_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+struct v_expm1f_data
+{
+  float32x4_t poly[5];
+  float32x4_t invln2_and_ln2, shift;
+  int32x4_t exponent_bias;
+};
+
+/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+   log(2)/2]. Exponent bias is asuint(1.0f).
+   invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0.  */
+#define V_EXPM1F_DATA                                                        \
+  {                                                                          \
+    .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),    \
+	      V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },                     \
+    .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000),               \
+    .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },  \
+  }
+
+static inline float32x4_t
+expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
+{
+  /* Helper routine for calculating exp(x) - 1.
+     Copied from v_expm1f_1u6.c, with all special-case handling removed - the
+     calling routine should handle special values if required.  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  float32x4_t j = vsubq_f32 (
+      vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+  int32x4_t i = vcvtq_s32_f32 (j);
+  float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
+  f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
+
+  /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+     Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
+     Horner.  */
+  float32x4_t f2 = vmulq_f32 (f, f);
+  float32x4_t f4 = vmulq_f32 (f2, f2);
+  float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
+  p = vfmaq_f32 (f, f2, p);
+
+  /* t = 2^i.  */
+  int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+  float32x4_t t = vreinterpretq_f32_s32 (u);
+  /* expm1(x) ~= p * t + (t - 1).  */
+  return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+}
+
+#endif
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 7c2e43d3dc5bbc13..fec0972081af734a 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1441,11 +1441,19 @@ double: 2
 float: 2
 ldouble: 2
 
+Function: "sinh_advsimd":
+double: 2
+float: 1
+
 Function: "sinh_downward":
 double: 3
 float: 3
 ldouble: 3
 
+Function: "sinh_sve":
+double: 2
+float: 1
+
 Function: "sinh_towardzero":
 double: 3
 float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index ce42372a3a276832..1db5ba61d64067a2 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -83,11 +83,14 @@ GLIBC_2.40 _ZGVnN2v_cosh F
 GLIBC_2.40 _ZGVnN2v_coshf F
 GLIBC_2.40 _ZGVnN2v_erf F
 GLIBC_2.40 _ZGVnN2v_erff F
+GLIBC_2.40 _ZGVnN2v_sinh F
+GLIBC_2.40 _ZGVnN2v_sinhf F
 GLIBC_2.40 _ZGVnN4v_acoshf F
 GLIBC_2.40 _ZGVnN4v_asinhf F
 GLIBC_2.40 _ZGVnN4v_atanhf F
 GLIBC_2.40 _ZGVnN4v_coshf F
 GLIBC_2.40 _ZGVnN4v_erff F
+GLIBC_2.40 _ZGVnN4v_sinhf F
 GLIBC_2.40 _ZGVsMxv_acosh F
 GLIBC_2.40 _ZGVsMxv_acoshf F
 GLIBC_2.40 _ZGVsMxv_asinh F
@@ -98,3 +101,5 @@ GLIBC_2.40 _ZGVsMxv_cosh F
 GLIBC_2.40 _ZGVsMxv_coshf F
 GLIBC_2.40 _ZGVsMxv_erf F
 GLIBC_2.40 _ZGVsMxv_erff F
+GLIBC_2.40 _ZGVsMxv_sinh F
+GLIBC_2.40 _ZGVsMxv_sinhf F
624
glibc-RHEL-118273-7.patch
Normal file
@@ -0,0 +1,624 @@
commit 3d3a4fb8e4fe854a0bbb3df9c26ba482c10a7e22
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date:   Tue Feb 20 16:59:44 2024 +0000

    aarch64/fpu: Add vector variants of tanh

    Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
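Once this patch is applied, the new symbols are callable directly through the vector ABI. A minimal usage sketch (not part of the patch; the prototype matches the bits/math-vector.h declaration added below, and the program must link against libmvec):

#include <arm_neon.h>

__attribute__ ((aarch64_vector_pcs)) float64x2_t _ZGVnN2v_tanh (float64x2_t);

int
main (void)
{
  float64x2_t x = { 0.5, -1.25 };
  float64x2_t y = _ZGVnN2v_tanh (x);      /* two lanes of tanh per call */
  return !(vgetq_lane_f64 (y, 0) < 1.0);  /* |tanh| < 1 for finite input */
}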
diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in
index 5a690023e9a675cb..4584c5e498ab7194 100644
--- a/math/auto-libm-test-in
+++ b/math/auto-libm-test-in
@@ -7747,7 +7747,7 @@ tan min_subnorm
 tan -min_subnorm
 
 tanh 0
-tanh -0
+tanh -0 no-mathvec
 tanh 0.75
 tanh -0.75
 tanh 1.0
diff --git a/math/auto-libm-test-out-tanh b/math/auto-libm-test-out-tanh
index 8b9427c917f3b388..19ce2e7b9355963d 100644
--- a/math/auto-libm-test-out-tanh
+++ b/math/auto-libm-test-out-tanh
@@ -23,31 +23,31 @@ tanh 0
 = tanh tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok
 = tanh towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok
 = tanh upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok
-tanh -0
-= tanh downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
+tanh -0 no-mathvec
+= tanh downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
 tanh 0.75
 = tanh downward binary32 0xcp-4 : 0xa.2991fp-4 : inexact-ok
 = tanh tonearest binary32 0xcp-4 : 0xa.2991fp-4 : inexact-ok
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index fb5f3a365b27fdf3..e5f418ae4274edb2 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -18,7 +18,8 @@ libmvec-supported-funcs = acos \
                           log2 \
                           sin \
                           sinh \
-                          tan
+                          tan \
+                          tanh
 
 float-advsimd-funcs = $(libmvec-supported-funcs)
 double-advsimd-funcs = $(libmvec-supported-funcs)
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 4774b3efeacf59fb..4dbf3d32441dd43a 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -109,5 +109,10 @@ libmvec {
     _ZGVnN4v_sinhf;
     _ZGVsMxv_sinh;
     _ZGVsMxv_sinhf;
+    _ZGVnN2v_tanh;
+    _ZGVnN2v_tanhf;
+    _ZGVnN4v_tanhf;
+    _ZGVsMxv_tanh;
+    _ZGVsMxv_tanhf;
   }
 }
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 7d9445d5c0c0c2a8..4ff191c324050b42 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -37,4 +37,5 @@ libmvec_hidden_proto (V_NAME_F1(log));
 libmvec_hidden_proto (V_NAME_F1(sin));
 libmvec_hidden_proto (V_NAME_F1(sinh));
 libmvec_hidden_proto (V_NAME_F1(tan));
+libmvec_hidden_proto (V_NAME_F1(tanh));
 libmvec_hidden_proto (V_NAME_F2(atan2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 1e9b76cf41916365..585e022082d62a5d 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -113,6 +113,10 @@
 # define __DECL_SIMD_tan __DECL_SIMD_aarch64
 # undef __DECL_SIMD_tanf
 # define __DECL_SIMD_tanf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_tanh
+# define __DECL_SIMD_tanh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_tanhf
+# define __DECL_SIMD_tanhf __DECL_SIMD_aarch64
 #endif
 
 #if __GNUC_PREREQ(9, 0)
@@ -160,6 +164,7 @@ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t);
 
 __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
@@ -182,6 +187,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t);
 
 # undef __ADVSIMD_VEC_MATH_SUPPORTED
 #endif /* __ADVSIMD_VEC_MATH_SUPPORTED */
@@ -209,6 +215,7 @@ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_tanhf (__sv_f32_t, __sv_bool_t);
 
 __sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
@@ -231,6 +238,7 @@ __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_tanh (__sv_f64_t, __sv_bool_t);
 
 # undef __SVE_VEC_MATH_SUPPORTED
 #endif /* __SVE_VEC_MATH_SUPPORTED */
diff --git a/sysdeps/aarch64/fpu/tanh_advsimd.c b/sysdeps/aarch64/fpu/tanh_advsimd.c
new file mode 100644
index 0000000000000000..1da1dfa5dbe418b6
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanh_advsimd.c
@@ -0,0 +1,109 @@
+/* Double-precision vector (Advanced SIMD) tanh function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+  float64x2_t poly[11];
+  float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+  uint64x2_t onef;
+  uint64x2_t thresh, tiny_bound;
+} data = {
+  /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2].  */
+  .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+	    V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+	    V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+	    V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+	    V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
+
+  .inv_ln2 = V2 (0x1.71547652b82fep0),
+  .ln2_hi = V2 (-0x1.62e42fefa39efp-1),
+  .ln2_lo = V2 (-0x1.abc9e3b39803fp-56),
+  .shift = V2 (0x1.8p52),
+
+  .onef = V2 (0x3ff0000000000000),
+  .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27).  */
+  /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound).  */
+  .thresh = V2 (0x01f241bf835f9d5f),
+};
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct data *d)
+{
+  /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
+     the scalar variant of tanh.  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
+  int64x2_t i = vcvtq_s64_f64 (j);
+  float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi);
+  f = vfmaq_f64 (f, j, d->ln2_lo);
+
+  /* Approximate expm1(f) using polynomial.  */
+  float64x2_t f2 = vmulq_f64 (f, f);
+  float64x2_t f4 = vmulq_f64 (f2, f2);
+  float64x2_t p = vfmaq_f64 (
+      f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly));
+
+  /* t = 2 ^ i.  */
+  float64x2_t t = vreinterpretq_f64_u64 (
+      vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef));
+  /* expm1(x) = p * t + (t - 1).  */
+  return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t);
+}
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+  return v_call_f64 (tanh, x, y, special);
+}
+
+/* Vector approximation for double-precision tanh(x), using a simplified
+   version of expm1. The greatest observed error is 2.77 ULP:
+   _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
+					want -0x1.bd6a21a163624p-3.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+  float64x2_t u = x;
+
+  /* Trigger special-cases for tiny, boring and infinity/NaN.  */
+  uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
+#if WANT_SIMD_EXCEPT
+  /* To trigger fp exceptions correctly, set special lanes to a neutral value.
+     They will be fixed up later by the special-case handler.  */
+  if (__glibc_unlikely (v_any_u64 (special)))
+    u = v_zerofy_f64 (u, special);
+#endif
+
+  u = vaddq_f64 (u, u);
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  float64x2_t q = expm1_inline (u, d);
+  float64x2_t qp2 = vaddq_f64 (q, v_f64 (2));
+
+  if (__glibc_unlikely (v_any_u64 (special)))
+    return special_case (x, vdivq_f64 (q, qp2), special);
+  return vdivq_f64 (q, qp2);
+}
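Every tanh kernel in this patch applies the same identity. As a scalar cross-check (illustration only, not part of the patch): with q = expm1(2x), the numerator e^2x - 1 is q and the denominator e^2x + 1 is q + 2.

#include <math.h>

/* tanh(x) = (e^2x - 1) / (e^2x + 1) = q / (q + 2) with q = expm1(2x).  */
static double
tanh_via_expm1 (double x)
{
  double q = expm1 (2.0 * x);
  return q / (q + 2.0);
}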
diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c
new file mode 100644
index 0000000000000000..d25e011cea305094
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanh_sve.c
@@ -0,0 +1,100 @@
+/* Double-precision vector (SVE) tanh function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+  float64_t poly[11];
+  float64_t inv_ln2, ln2_hi, ln2_lo, shift;
+  uint64_t thresh, tiny_bound;
+} data = {
+  /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2].  */
+  .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+	    0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
+	    0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
+	    0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+	    0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+  .inv_ln2 = 0x1.71547652b82fep0,
+  .ln2_hi = -0x1.62e42fefa39efp-1,
+  .ln2_lo = -0x1.abc9e3b39803fp-56,
+  .shift = 0x1.8p52,
+
+  .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27).  */
+  /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound).  */
+  .thresh = 0x01f241bf835f9d5f,
+};
+
+static inline svfloat64_t
+expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+{
+  /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
+     the scalar variant of tanh.  */
+
+  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
+  svfloat64_t j
+      = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
+  svint64_t i = svcvt_s64_x (pg, j);
+  svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
+  f = svmla_x (pg, f, j, d->ln2_lo);
+
+  /* Approximate expm1(f) using polynomial.  */
+  svfloat64_t f2 = svmul_x (pg, f, f);
+  svfloat64_t f4 = svmul_x (pg, f2, f2);
+  svfloat64_t p = svmla_x (
+      pg, f, f2,
+      sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
+
+  /* t = 2 ^ i.  */
+  svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
+  /* expm1(x) = p * t + (t - 1).  */
+  return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+  return sv_call_f64 (tanh, x, y, special);
+}
+
+/* SVE approximation for double-precision tanh(x), using a simplified
+   version of expm1. The greatest observed error is 2.77 ULP:
+   _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
+					want -0x1.bd6a21a163624p-3.  */
+svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
+
+  /* Trigger special-cases for tiny, boring and infinity/NaN.  */
+  svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh);
+
+  svfloat64_t u = svadd_x (pg, x, x);
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  svfloat64_t q = expm1_inline (u, pg, d);
+  svfloat64_t qp2 = svadd_x (pg, q, 2);
+
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    return special_case (x, svdiv_x (pg, q, qp2), special);
+  return svdiv_x (pg, q, qp2);
+}
diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c
new file mode 100644
index 0000000000000000..50defd6ef03926f4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
@@ -0,0 +1,76 @@
+/* Single-precision vector (Advanced SIMD) tanh function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+  struct v_expm1f_data expm1f_consts;
+  uint32x4_t boring_bound, large_bound, onef;
+} data = {
+  .expm1f_consts = V_EXPM1F_DATA,
+  /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative).  */
+  .boring_bound = V4 (0x41102cb3),
+  .large_bound = V4 (0x7f800000),
+  .onef = V4 (0x3f800000),
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+  return v_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified
+   version of expm1f. The maximum error is 2.58 ULP:
+   _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
+				 want 0x1.f9ba08p-5.  */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  uint32x4_t ix = vreinterpretq_u32_f32 (x);
+  float32x4_t ax = vabsq_f32 (x);
+  uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+  uint32x4_t sign = veorq_u32 (ix, iax);
+  uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
+  float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+
+#if WANT_SIMD_EXCEPT
+  /* If fp exceptions are to be triggered properly, set all special and boring
+     lanes to 0, which will trigger no exceptions, and fix them up later.  */
+  uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound),
+				  vcltq_u32 (iax, v_u32 (0x34000000)));
+  x = v_zerofy_f32 (x, is_boring);
+  if (__glibc_unlikely (v_any_u32 (special)))
+    x = v_zerofy_f32 (x, special);
+#else
+  uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
+#endif
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
+  float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+  if (__glibc_unlikely (v_any_u32 (special)))
+    return special_case (vreinterpretq_f32_u32 (ix),
+			 vbslq_f32 (is_boring, boring, y), special);
+  return vbslq_f32 (is_boring, boring, y);
+}
+libmvec_hidden_def (V_NAME_F1 (tanh))
+HALF_WIDTH_ALIAS_F1 (tanh)
diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c
new file mode 100644
index 0000000000000000..0b94523cf5074200
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanhf_sve.c
@@ -0,0 +1,61 @@
+/* Single-precision vector (SVE) tanh function
+
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_expm1f_inline.h"
+
+static const struct data
+{
+  struct sv_expm1f_data expm1f_consts;
+  uint32_t boring_bound, onef;
+} data = {
+  .expm1f_consts = SV_EXPM1F_DATA,
+  /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative).  */
+  .boring_bound = 0x41102cb3,
+  .onef = 0x3f800000,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+  return sv_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision SVE tanh(x), using a simplified
+   version of expm1f. The maximum error is 2.57 ULP:
+   _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5
+				  want 0x1.fb71aap-5.  */
+svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svfloat32_t ax = svabs_x (pg, x);
+  svuint32_t iax = svreinterpret_u32 (ax);
+  svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+  svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound);
+  svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
+
+  svbool_t special = svcmpgt (pg, iax, 0x7f800000);
+
+  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
+  svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts);
+  svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    return special_case (x, svsel_f32 (is_boring, boring, y), special);
+  return svsel_f32 (is_boring, boring, y);
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 1a57b22c3a92f1e1..7aeda880bd885ce5 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -44,3 +44,4 @@ VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
 VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
 VPCS_VECTOR_WRAPPER (sinh_advsimd, _ZGVnN2v_sinh)
 VPCS_VECTOR_WRAPPER (tan_advsimd, _ZGVnN2v_tan)
+VPCS_VECTOR_WRAPPER (tanh_advsimd, _ZGVnN2v_tanh)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 0c9858f6b74aaef6..95f1ec52221ba626 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -63,3 +63,4 @@ SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
 SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
 SVE_VECTOR_WRAPPER (sinh_sve, _ZGVsMxv_sinh)
 SVE_VECTOR_WRAPPER (tan_sve, _ZGVsMxv_tan)
+SVE_VECTOR_WRAPPER (tanh_sve, _ZGVsMxv_tanh)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 4758490c6fc40fda..bd6800e91c64136f 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -44,3 +44,4 @@ VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
 VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
 VPCS_VECTOR_WRAPPER (sinhf_advsimd, _ZGVnN4v_sinhf)
 VPCS_VECTOR_WRAPPER (tanf_advsimd, _ZGVnN4v_tanf)
+VPCS_VECTOR_WRAPPER (tanhf_advsimd, _ZGVnN4v_tanhf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 7c04f07bbee84777..35ca305fddb7366c 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -63,3 +63,4 @@ SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
 SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
 SVE_VECTOR_WRAPPER (sinhf_sve, _ZGVsMxv_sinhf)
 SVE_VECTOR_WRAPPER (tanf_sve, _ZGVsMxv_tanf)
+SVE_VECTOR_WRAPPER (tanhf_sve, _ZGVsMxv_tanhf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index fec0972081af734a..8398b7bc7749808d 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1496,11 +1496,19 @@ double: 2
 float: 2
 ldouble: 2
 
+Function: "tanh_advsimd":
+double: 2
+float: 2
+
 Function: "tanh_downward":
 double: 3
 float: 3
 ldouble: 4
 
+Function: "tanh_sve":
+double: 2
+float: 2
+
 Function: "tanh_towardzero":
 double: 2
 float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 1db5ba61d64067a2..396082f6a7981686 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -85,12 +85,15 @@ GLIBC_2.40 _ZGVnN2v_erf F
 GLIBC_2.40 _ZGVnN2v_erff F
 GLIBC_2.40 _ZGVnN2v_sinh F
 GLIBC_2.40 _ZGVnN2v_sinhf F
+GLIBC_2.40 _ZGVnN2v_tanh F
+GLIBC_2.40 _ZGVnN2v_tanhf F
 GLIBC_2.40 _ZGVnN4v_acoshf F
 GLIBC_2.40 _ZGVnN4v_asinhf F
 GLIBC_2.40 _ZGVnN4v_atanhf F
 GLIBC_2.40 _ZGVnN4v_coshf F
 GLIBC_2.40 _ZGVnN4v_erff F
 GLIBC_2.40 _ZGVnN4v_sinhf F
+GLIBC_2.40 _ZGVnN4v_tanhf F
 GLIBC_2.40 _ZGVsMxv_acosh F
 GLIBC_2.40 _ZGVsMxv_acoshf F
 GLIBC_2.40 _ZGVsMxv_asinh F
@@ -103,3 +106,5 @@ GLIBC_2.40 _ZGVsMxv_erf F
 GLIBC_2.40 _ZGVsMxv_erff F
 GLIBC_2.40 _ZGVsMxv_sinh F
 GLIBC_2.40 _ZGVsMxv_sinhf F
+GLIBC_2.40 _ZGVsMxv_tanh F
+GLIBC_2.40 _ZGVsMxv_tanhf F
5115
glibc-RHEL-118273-8.patch
Normal file
File diff suppressed because it is too large
348
glibc-RHEL-118273-9.patch
Normal file
@@ -0,0 +1,348 @@
commit 90a6ca8b28bf34e361e577e526e1b0f4c39a32a5
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date:   Thu May 2 16:43:13 2024 +0100

    aarch64: Fix AdvSIMD libmvec routines for big-endian

    Previously many routines used * to load from vector types stored
    in the data table. This is emitted as ldr, which byte-swaps the
    entire vector register, and causes bugs for big-endian when not
    all lanes contain the same value. When a vector is to be used
    this way, it has been replaced with an array and the load with an
    explicit ld1 intrinsic, which byte-swaps only within lanes.

    As well, many routines previously used non-standard GCC syntax
    for vector operations such as indexing into vectors types with []
    and assembling vectors using {}. This syntax should not be mixed
    with ACLE, as the former does not respect endianness whereas the
    latter does. Such examples have been replaced with, for instance,
    vcombine_* and vgetq_lane* intrinsics. Helpers which only use the
    GCC syntax, such as the v_call helpers, do not need changing as
    they do not use intrinsics.

    Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

Conflicts:
	sysdeps/aarch64/fpu/exp10f_advsimd.c
	sysdeps/aarch64/fpu/expm1_advsimd.c
	sysdeps/aarch64/fpu/expm1f_advsimd.c
	sysdeps/aarch64/fpu/log10_advsimd.c
	sysdeps/aarch64/fpu/log2_advsimd.c
	sysdeps/aarch64/fpu/log_advsimd.c
	sysdeps/aarch64/fpu/tan_advsimd.c
	sysdeps/aarch64/fpu/tanf_advsimd.c
	(Already backported by glibc-upstream-2.39-151.patch)
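The failure mode the message describes can be seen in miniature. A hedged sketch (not from the patch; names are illustrative): the GCC-extension form keeps the constant as a whole vector, which the compiler loads with ldr, while the array form is loaded per-lane with vld1q_f64 and read back with vgetq_lane_f64.

#include <arm_neon.h>

static const struct
{
  float64x2_t c_vec;  /* whole-vector constant: compiler emits ldr */
  double c_arr[2];    /* array form: loaded per-lane with ld1 */
} consts = {
  .c_vec = { -0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56 },
  .c_arr = { -0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56 },
};

double
lane0_endian_unsafe (void)
{
  /* GCC extension syntax; per the message above, mixing this with lane
     intrinsics does not respect endianness.  */
  return consts.c_vec[0];
}

double
lane0_endian_safe (void)
{
  float64x2_t v = vld1q_f64 (consts.c_arr);
  return vgetq_lane_f64 (v, 0); /* ACLE: same lane on LE and BE */
}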
diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
index 544a52f6515d3201..6207e7da9531f48d 100644
--- a/sysdeps/aarch64/fpu/asinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@@ -22,6 +22,7 @@
 
 #define A(i) v_f64 (__v_log_data.poly[i])
 #define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
 
 const static struct data
 {
@@ -63,11 +64,15 @@ struct entry
 static inline struct entry
 lookup (uint64x2_t i)
 {
-  float64x2_t e0 = vld1q_f64 (
-      &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
-  float64x2_t e1 = vld1q_f64 (
-      &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
-  return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) };
+  /* Since N is a power of 2, n % N = n & (N - 1).  */
+  struct entry e;
+  uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+  float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+  e.invc = vuzp1q_f64 (e0, e1);
+  e.logc = vuzp2q_f64 (e0, e1);
+  return e;
 }
 
 static inline float64x2_t
diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c
index ec7b59637e973da9..4bee734f00bd6a9b 100644
--- a/sysdeps/aarch64/fpu/cosh_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosh_advsimd.c
@@ -22,7 +22,9 @@
 static const struct data
 {
   float64x2_t poly[3];
-  float64x2_t inv_ln2, ln2, shift, thres;
+  float64x2_t inv_ln2;
+  double ln2[2];
+  float64x2_t shift, thres;
   uint64x2_t index_mask, special_bound;
 } data = {
   .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
@@ -58,8 +60,9 @@ exp_inline (float64x2_t x)
   float64x2_t n = vsubq_f64 (z, d->shift);
 
   /* r = x - n*ln2/N.  */
-  float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
-  r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
+  float64x2_t ln2 = vld1q_f64 (d->ln2);
+  float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
+  r = vfmaq_laneq_f64 (r, n, ln2, 1);
 
   uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
   uint64x2_t i = vandq_u64 (u, d->index_mask);
diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c
index 3e70cbc025248a05..19cbb7d0f42eb4e2 100644
--- a/sysdeps/aarch64/fpu/erf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erf_advsimd.c
@@ -56,8 +56,8 @@ static inline struct entry
 lookup (uint64x2_t i)
 {
   struct entry e;
-  float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])),
-	      e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1]));
+  float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
+	      e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
   e.erf = vuzp1q_f64 (e1, e2);
   e.scale = vuzp2q_f64 (e1, e2);
   return e;
diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
index 548f21a3d68d68d2..f1b3bfe8304c73b5 100644
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@@ -26,7 +26,7 @@ static const struct data
   float64x2_t max, shift;
   float64x2_t p20, p40, p41, p42;
   float64x2_t p51, p52;
-  float64x2_t qr5, qr6, qr7, qr8, qr9;
+  double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
 #if WANT_SIMD_EXCEPT
   float64x2_t uflow_bound;
 #endif
@@ -68,8 +68,10 @@ static inline struct entry
 lookup (uint64x2_t i)
 {
   struct entry e;
-  float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])),
-	      e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1]));
+  float64x2_t e1
+      = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
+  float64x2_t e2
+      = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
   e.erfc = vuzp1q_f64 (e1, e2);
   e.scale = vuzp2q_f64 (e1, e2);
   return e;
@@ -161,16 +163,19 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
   p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
   /* Compute p_i using recurrence relation:
      p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}.  */
-  float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0));
-  p6 = vmulq_laneq_f64 (p6, dat->qr5, 1);
-  float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0));
-  p7 = vmulq_laneq_f64 (p7, dat->qr6, 1);
-  float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0));
-  p8 = vmulq_laneq_f64 (p8, dat->qr7, 1);
-  float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0));
-  p9 = vmulq_laneq_f64 (p9, dat->qr8, 1);
-  float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0));
-  p10 = vmulq_laneq_f64 (p10, dat->qr9, 1);
+  float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6),
+	      qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8),
+	      qr9 = vld1q_f64 (dat->qr9);
+  float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0));
+  p6 = vmulq_laneq_f64 (p6, qr5, 1);
+  float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0));
+  p7 = vmulq_laneq_f64 (p7, qr6, 1);
+  float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0));
+  p8 = vmulq_laneq_f64 (p8, qr7, 1);
+  float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0));
+  p9 = vmulq_laneq_f64 (p9, qr8, 1);
+  float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0));
+  p10 = vmulq_laneq_f64 (p10, qr9, 1);
   /* Compute polynomial in d using pairwise Horner scheme.  */
   float64x2_t p90 = vfmaq_f64 (p9, d, p10);
   float64x2_t p78 = vfmaq_f64 (p7, d, p8);
diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c
index 30b9e48dd40d80a0..ca5bc3ab33c92f83 100644
--- a/sysdeps/aarch64/fpu/erfcf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c
@@ -23,7 +23,8 @@ static const struct data
 {
   uint32x4_t offset, table_scale;
   float32x4_t max, shift;
-  float32x4_t coeffs, third, two_over_five, tenth;
+  float coeffs[4];
+  float32x4_t third, two_over_five, tenth;
 #if WANT_SIMD_EXCEPT
   float32x4_t uflow_bound;
 #endif
@@ -37,7 +38,7 @@ static const struct data
   .shift = V4 (0x1p17f),
   /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
      fmas.  */
-  .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
+  .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
   .third = V4 (0x1.555556p-2f),
   .two_over_five = V4 (-0x1.99999ap-2f),
   .tenth = V4 (-0x1.99999ap-4f),
@@ -60,12 +61,16 @@ static inline struct entry
 lookup (uint32x4_t i)
 {
   struct entry e;
-  float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
-  float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
-  float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
-  float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
-  float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
-  float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+  float32x2_t t0
+      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
+  float32x2_t t1
+      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
+  float32x2_t t2
+      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
+  float32x2_t t3
+      = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
+  float32x4_t e1 = vcombine_f32 (t0, t1);
+  float32x4_t e2 = vcombine_f32 (t2, t3);
   e.erfc = vuzp1q_f32 (e1, e2);
   e.scale = vuzp2q_f32 (e1, e2);
   return e;
@@ -140,10 +145,11 @@ float32x4_t NOINLINE V_NAME_F1 (erfc) (float32x4_t x)
   float32x4_t r2 = vmulq_f32 (r, r);
 
   float32x4_t p1 = r;
-  float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
+  float32x4_t coeffs = vld1q_f32 (dat->coeffs);
+  float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1);
   float32x4_t p3
-      = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
-  float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
+      = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0));
+  float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2);
   p4 = vfmsq_f32 (dat->tenth, r2, p4);
 
   float32x4_t y = vfmaq_f32 (p3, d, p4);
diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c
index c44644a71cffbb62..f2fe6ff236a6ec07 100644
--- a/sysdeps/aarch64/fpu/erff_advsimd.c
+++ b/sysdeps/aarch64/fpu/erff_advsimd.c
@@ -47,12 +47,12 @@ static inline struct entry
 lookup (uint32x4_t i)
 {
   struct entry e;
-  float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
-  float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
-  float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
-  float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
-  float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
-  float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+  float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
+  float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
+  float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
+  float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
+  float32x4_t e1 = vcombine_f32 (t0, t1);
+  float32x4_t e2 = vcombine_f32 (t2, t3);
   e.erf = vuzp1q_f32 (e1, e2);
   e.scale = vuzp2q_f32 (e1, e2);
   return e;
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
index fa3723b10c15eb29..3e3b76c502b01e16 100644
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -22,8 +22,9 @@
 
 static const struct data
 {
-  float64x2_t poly[11];
-  float64x2_t inv_ln2, m_ln2, shift;
+  float64x2_t poly[11], inv_ln2;
+  double m_ln2[2];
+  float64x2_t shift;
   uint64x2_t halff;
   int64x2_t onef;
 #if WANT_SIMD_EXCEPT
@@ -40,7 +41,7 @@ static const struct data
 	    V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
 
   .inv_ln2 = V2 (0x1.71547652b82fep0),
-  .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
+  .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
  .shift = V2 (0x1.8p52),
 
   .halff = V2 (0x3fe0000000000000),
@@ -67,8 +68,10 @@ expm1_inline (float64x2_t x)
      and f = x - i * ln2 (f in [-ln2/2, ln2/2]).  */
   float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
   int64x2_t i = vcvtq_s64_f64 (j);
-  float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0);
-  f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1);
+
+  float64x2_t m_ln2 = vld1q_f64 (d->m_ln2);
+  float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0);
+  f = vfmaq_laneq_f64 (f, j, m_ln2, 1);
   /* Approximate expm1(f) using polynomial.  */
   float64x2_t f2 = vmulq_f64 (f, f);
   float64x2_t f4 = vmulq_f64 (f2, f2);
diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
index a3b0e32f9eb42021..08b06e0a6b34b4f4 100644
--- a/sysdeps/aarch64/fpu/v_expf_inline.h
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@@ -25,7 +25,8 @@
 struct v_expf_data
 {
   float32x4_t poly[5];
-  float32x4_t shift, invln2_and_ln2;
+  float32x4_t shift;
+  float invln2_and_ln2[4];
 };
 
 /* maxerr: 1.45358 +0.5 ulp.  */
@@ -50,10 +51,11 @@ v_expf_inline (float32x4_t x, const struct v_expf_data *d)
   /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
      x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
   float32x4_t n, r, z;
-  z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
+  float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
+  z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0);
   n = vsubq_f32 (z, d->shift);
-  r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
-  r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
+  r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1);
+  r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2);
   uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
   float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
 
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 337ccfbfab555c97..59b552da6b74785e 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -26,7 +26,8 @@
 struct v_expm1f_data
 {
   float32x4_t poly[5];
-  float32x4_t invln2_and_ln2, shift;
+  float invln2_and_ln2[4];
+  float32x4_t shift;
   int32x4_t exponent_bias;
 };
 
@@ -49,11 +50,12 @@ expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
      calling routine should handle special values if required.  */
 
   /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
-  float32x4_t j = vsubq_f32 (
-      vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+  float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
+  float32x4_t j
+      = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
   int32x4_t i = vcvtq_s32_f32 (j);
-  float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
-  f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
+  float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
+  f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
 
   /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
      Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
      Horner.  */