glibc/glibc-RHEL-118273-39.patch
Yuki Inoguchi 9dd92cac18 aarch64: Add GLIBC_2.40 vector functions and performance fixes (RHEL-118273)
This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix builderror with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273

commit dee22d2a81ab59afc165fb6dcb45d723f13582a0
Author: Dylan Fleming <Dylan.Fleming@arm.com>
Date: Wed Jun 18 16:19:22 2025 +0000
AArch64: Optimise SVE FP64 Hyperbolics
Rework SVE FP64 hyperbolics to use the SVE FEXPA
instruction.
Also update the special-case handling for large
inputs to be entirely vectorised.
Performance improvements on Neoverse V1:
cosh_sve: 19% for |x| < 709, 5x otherwise
sinh_sve: 24% for |x| < 709, 5.9x otherwise
tanh_sve: 12% for |x| < 19, 9x otherwise
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
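All three reductions below rest on the same "magic shift" rounding trick: with round-to-nearest in effect, adding a large constant to x * N/ln2 leaves round(x * N/ln2) sitting in the low mantissa bits of the sum, which FEXPA then consumes as table index plus exponent. The patch's shift constants (e.g. 1.5*2^46+1022 in cosh) additionally fold in the exponent bias, and per the helper's comment, biasing by 1022 rather than 1023 is what makes it return exp(x)/2 instead of exp(x) at no cost. A minimal standalone C sketch of the rounding trick itself, not the patch's code (the 1.5*2^52 shift and N = 128 here are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main (void)
{
  double x = 1.0;
  double shift = 0x1.8p+52;                  /* 1.5 * 2^52.  */
  double n_over_ln2 = 0x1.71547652b82fep+7;  /* N/ln2 with N = 128.  */

  /* Round-to-nearest snaps z to an integer, and because z lies in
     [2^52, 2^53) that integer is stored verbatim in the mantissa.  */
  double z = x * n_over_ln2 + shift;

  uint64_t u;
  memcpy (&u, &z, sizeof u);
  printf ("round(x * N/ln2) = %llu\n", (unsigned long long) (u & 0x7ff));
  /* For x = 1.0 this prints 185, i.e. round(184.664...).  */
  return 0;
}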
diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
index e375dd8a3407feb2..3561893ae614e2ea 100644
--- a/sysdeps/aarch64/fpu/cosh_sve.c
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -21,71 +21,99 @@
static const struct data
{
- float64_t poly[3];
- float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ double c0, c2;
+ double c1, c3;
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift;
uint64_t special_bound;
} data = {
- .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
- 0x1.5555576a59599p-5, },
-
- .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
- /* -ln2/N. */
- .ln2_hi = -0x1.62e42fefa39efp-9,
- .ln2_lo = -0x1.abc9e3b39803f3p-64,
- .shift = 0x1.8p+52,
- .thres = 704.0,
-
- /* 0x1.6p9, above which exp overflows. */
- .special_bound = 0x4086000000000000,
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1.fffffffffdbcdp-2,
+ .c1 = 0x1.555555555444cp-3,
+ .c2 = 0x1.555573c6a9f7dp-5,
+ .c3 = 0x1.1111266d28935p-7,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ /* 1/ln2. */
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022. */
+
+ /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows. */
+ .special_bound = 0x40862e37e7d8ba72,
};
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special)
-{
- svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
- svfloat64_t y = svadd_x (pg, half_t, half_over_t);
- return sv_call_f64 (cosh, x, y, special);
-}
-
-/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
- special-case handling or tail. */
+/* Helper for approximating exp(x)/2.
+ Functionally identical to FEXPA exp(x), but an adjustment in
+ the shift value leads to a reduction in the exponent of scale by 1,
+ thus halving the result at no cost. */
static inline svfloat64_t
-exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
{
/* Calculate exp(x). */
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
svfloat64_t n = svsub_x (pg, z, d->shift);
- svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
- r = svmla_x (pg, r, n, d->ln2_lo);
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
- svuint64_t u = svreinterpret_u64 (z);
- svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
- svuint64_t i = svand_x (svptrue_b64 (), u, 0xff);
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
- svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
- y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
- y = svmla_x (pg, sv_f64 (1.0), r, y);
- y = svmul_x (svptrue_b64 (), r, y);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+ svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
+ svfloat64_t p = svmla_x (pg, r, p04, r2);
- /* s = 2^(n/N). */
- u = svld1_gather_index (pg, __v_exp_tail_data, i);
- svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
+ svfloat64_t scale = svexpa (u);
- return svmla_x (pg, s, s, y);
+ return svmla_x (pg, scale, scale, p);
+}
+
+/* Vectorised special case to handle values past where exp_over_two_inline overflows.
+ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
+ the valid range of inputs, and returns inf for anything past that. */
+static svfloat64_t NOINLINE
+special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t,
+ const struct data *d)
+{
+ /* Finish fast path to compute values for non-special cases. */
+ svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25);
+ svfloat64_t y = svadd_x (pg, t, inv_twoexp);
+
+ /* Halve the input value, and then check if any cases
+ are still going to overflow. */
+ ax = svmul_x (special, ax, 0.5);
+ svbool_t is_safe
+ = svcmplt (special, svreinterpret_u64 (ax), d->special_bound);
+
+ /* Computes exp(x/2), and sets any overflowing lanes to inf. */
+ svfloat64_t half_exp = exp_over_two_inline (special, ax, d);
+ half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY));
+
+ /* Construct special case cosh(x) = (exp(x/2)^2)/2. */
+ svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2);
+ svfloat64_t special_y = svmul_x (special, exp, half_exp);
+
+ /* Select correct return values for special and non-special cases. */
+ special_y = svsel (special, special_y, y);
+
+ /* Ensure an input of nan is correctly propagated. */
+ svbool_t is_nan
+ = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000));
+ return svsel (is_nan, ax, svsel (special, special_y, y));
}
/* Approximation for SVE double-precision cosh(x) using exp_inline.
cosh(x) = (exp(x) + exp(-x)) / 2.
- The greatest observed error is in the scalar fall-back region, so is the
- same as the scalar routine, 1.93 ULP:
- _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
- want 0x1.fd774e958236fp+1021.
-
- The greatest observed error in the non-special region is 1.54 ULP:
- _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
- want 0x1.f5e2bb8d5c991p+8. */
+ The greatest observed error in the special-case region is 2.66 + 0.5 ULP:
+ _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023
+ want 0x1.f9b2d3d22399bp+1023
+
+ The greatest observed error in the non-special region is 1.01 + 0.5 ULP:
+ _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3
+ want 0x1.890b225657f82p+3. */
svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
@@ -94,14 +122,13 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
/* Up to the point that exp overflows, we can use it to calculate cosh by
- exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
- svfloat64_t t = exp_inline (ax, pg, d);
+ (exp(|x|)/2) + 1 / (2 * exp(|x|)). */
+ svfloat64_t half_exp = exp_over_two_inline (pg, ax, d);
- /* Fall back to scalar for any special cases. */
+ /* Fall back to an entirely standalone vectorised special case. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, pg, t, special);
+ return special_case (pg, special, ax, half_exp, d);
- svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
- return svadd_x (pg, half_t, half_over_t);
+ svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25);
+ return svadd_x (pg, half_exp, inv_twoexp);
}
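The shape of the new cosh routine is easier to see in scalar form. A plain-C sketch, not the SVE code, with libm's exp() standing in for the FEXPA-based exp_over_two_inline helper:

#include <math.h>

double cosh_sketch (double x)
{
  double ax = fabs (x);
  if (ax <= 0x1.62e37e7d8ba72p+9)  /* ln(2^(1024 - 1/128)): exp is safe.  */
    {
      double half_exp = 0.5 * exp (ax);   /* exp(|x|) / 2.  */
      return half_exp + 0.25 / half_exp;  /* ... + 1 / (2 * exp(|x|)).  */
    }

  /* Special case: exp(|x|) = exp(|x|/2)^2 doubles the usable input range,
     and past that the square overflows to +inf, which is the correctly
     rounded result anyway.  */
  double h = exp (0.5 * ax);
  return (0.5 * h) * h;
}

Note that 0.25 / half_exp mirrors the svdivr_x (pg, half_exp, 0.25) in the vector code: both evaluate 1 / (2 * exp(|x|)).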
diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c
index df5f6c8c06e5b173..ac7b306018bda613 100644
--- a/sysdeps/aarch64/fpu/sinh_sve.c
+++ b/sysdeps/aarch64/fpu/sinh_sve.c
@@ -18,90 +18,153 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
static const struct data
{
- float64_t poly[11];
- float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift;
uint64_t halff;
- int64_t onef;
- uint64_t large_bound;
+ double c2, c4;
+ double inv_ln2;
+ double ln2_hi, ln2_lo;
+ double c0, c1, c3;
+ double shift, special_bound, bound;
+ uint64_t expm1_data[20];
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .inv_ln2 = 0x1.71547652b82fep0,
- .m_ln2_hi = -0x1.62e42fefa39efp-1,
- .m_ln2_lo = -0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
-
+ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
+ .expm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
+ },
+
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
.halff = 0x3fe0000000000000,
- .onef = 0x3ff0000000000000,
- /* 2^9. expm1 helper overflows for large input. */
- .large_bound = 0x4080000000000000,
+ .special_bound = 0x1.62e37e7d8ba72p+9, /* ln(2^(1024 - 1/128)). */
+ .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64. */
};
+/* A specialised FEXPA expm1 that is only valid for positive inputs and
+ has no special cases. Based on the full FEXPA expm1 implemented for
+ _ZGVsMxv_expm1, with a slightly modified file to keep sinh under 3.5ULP. */
static inline svfloat64_t
-expm1_inline (svfloat64_t x, svbool_t pg)
+expm1_inline (svbool_t pg, svfloat64_t x)
{
const struct data *d = ptr_barrier (&data);
- /* Reduce argument:
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where i = round(x / ln2)
- and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
- svfloat64_t j
- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
- svint64_t i = svcvt_s64_x (pg, j);
- svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi);
- f = svmla_x (pg, f, j, d->m_ln2_lo);
- /* Approximate expm1(f) using polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t f8 = svmul_x (pg, f4, f4);
- svfloat64_t p
- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
- /* t = 2^i. */
- svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
- /* expm1(x) ~= p * t + (t - 1). */
- return svmla_x (pg, svsub_x (pg, t, 1.0), p, t);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+
+ /* We want to construct expm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA. */
+ svbool_t is_small = svaclt (pg, x, d->bound);
+
+ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */
+ svuint64_t base_idx = svand_x (pg, u, 0x1f);
+
+ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */
+ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
+ svuint64_t scalem1_lookup
+ = svld1_gather_index (is_small, d->expm1_data, base_idx);
+
+ /* Select the appropriate scale - 1 value based on x. */
+ svfloat64_t scalem1
+ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
+
+ /* return expm1 = scale - 1 + (scale * poly). */
+ return svmla_x (pg, scalem1, scale, p);
}
+/* Vectorised special case to handle values past where expm1_inline overflows.
+ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
+ the valid range of inputs, and returns inf for anything past that. */
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svbool_t pg)
+special_case (svbool_t pg, svbool_t special, svfloat64_t ax,
+ svfloat64_t halfsign, const struct data *d)
{
- return sv_call_f64 (sinh, x, x, pg);
+ /* Halve the input value, and then check if any cases
+ are still going to overflow. */
+ ax = svmul_x (special, ax, 0.5);
+ svbool_t is_safe = svaclt (special, ax, d->special_bound);
+
+ svfloat64_t t = expm1_inline (pg, ax);
+
+ /* Finish fast path to compute values for non-special cases. */
+ svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+ y = svmul_x (pg, y, halfsign);
+
+ /* Compute special lanes, and set remaining overflow lanes to inf. */
+ svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign);
+ svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t);
+
+ svuint64_t signed_inf
+ = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign),
+ sv_u64 (0x7ff0000000000000));
+ special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf));
+
+ /* Join resulting vectors together and return. */
+ return svsel (special, special_y, y);
}
-/* Approximation for SVE double-precision sinh(x) using expm1.
- sinh(x) = (exp(x) - exp(-x)) / 2.
- The greatest observed error is 2.57 ULP:
- _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2
- want 0x1.ab929fc64bd63p-2. */
+/* Approximation for SVE double-precision sinh(x) using FEXPA expm1.
+ Uses sinh(x) = (e^2x - 1) / (2e^x), rewritten for accuracy.
+ The greatest observed error in the non-special region is 2.63 + 0.5 ULP:
+ _ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2
+ want 0x1.c3587faf97b09p-2
+
+ The greatest observed error in the special region is 2.65 + 0.5 ULP:
+ _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023
+ want 0x1.fffd30eea0063p+1023. */
svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacge (pg, x, d->special_bound);
svfloat64_t ax = svabs_x (pg, x);
svuint64_t sign
= sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
- svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound);
-
/* Fall back to scalar variant for all lanes if any are special. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, pg);
+ return special_case (pg, special, ax, halfsign, d);
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
- svfloat64_t t = expm1_inline (ax, pg);
+ svfloat64_t t = expm1_inline (pg, ax);
t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
return svmul_x (pg, t, halfsign);
}
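The rearrangement the sinh routine relies on also reads more clearly in scalar form. A sketch with libm's expm1() in place of the FEXPA-based expm1_inline: writing t = expm1(|x|), sinh(|x|) = (e^|x| - e^-|x|)/2 = (t + t/(t + 1))/2, which keeps full accuracy for small |x| where exp(x) - exp(-x) would cancel catastrophically:

#include <math.h>

double sinh_sketch (double x)
{
  double halfsign = copysign (0.5, x);  /* +/-0.5, as in the SVE code.  */
  double t = expm1 (fabs (x));
  /* t + t/(t + 1) == t(t + 2)/(t + 1) == e^|x| - e^-|x|.  */
  return (t + t / (t + 1.0)) * halfsign;
}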
diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c
index d25e011cea305094..805669845d09e098 100644
--- a/sysdeps/aarch64/fpu/tanh_sve.c
+++ b/sysdeps/aarch64/fpu/tanh_sve.c
@@ -18,83 +18,117 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
static const struct data
{
- float64_t poly[11];
- float64_t inv_ln2, ln2_hi, ln2_lo, shift;
- uint64_t thresh, tiny_bound;
+ double ln2_hi, ln2_lo;
+ double c2, c4;
+ double c0, c1, c3;
+ double two_over_ln2, shift;
+ uint64_t tiny_bound;
+ double large_bound, fexpa_bound;
+ uint64_t e2xm1_data[20];
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .inv_ln2 = 0x1.71547652b82fep0,
- .ln2_hi = -0x1.62e42fefa39efp-1,
- .ln2_lo = -0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
-
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .two_over_ln2 = 0x1.71547652b82fep+1,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
.tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */
- /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
- .thresh = 0x01f241bf835f9d5f,
+ .large_bound = 0x1.30fc1931f09cap+4, /* arctanh(1 - 2^-54). */
+ .fexpa_bound = 0x1.a56ef8ec924ccp-4, /* 19/64 * ln2/2. */
+ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
+ .e2xm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
+ },
};
+/* An expm1-inspired, FEXPA-based helper function that returns an
+ accurate estimate for e^2x - 1, with no special-case handling and no
+ support for negative inputs of x. */
static inline svfloat64_t
-expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
-{
- /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
- the scalar variant of tanh. */
-
- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- svfloat64_t j
- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
- svint64_t i = svcvt_s64_x (pg, j);
- svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
- f = svmla_x (pg, f, j, d->ln2_lo);
-
- /* Approximate expm1(f) using polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t p = svmla_x (
- pg, f, f2,
- sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
-
- /* t = 2 ^ i. */
- svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
- /* expm1(x) = p * t + (t - 1). */
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
-}
-
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+e2xm1_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
{
- return sv_call_f64 (tanh, x, y, special);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->two_over_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ /* r = x - n * ln2/2, r is in [-ln2/(2N), ln2/(2N)]. */
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t r = svadd_x (pg, x, x);
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+
+ /* We want to construct e2xm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA. */
+ svbool_t is_small = svaclt (pg, x, d->fexpa_bound);
+
+ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */
+ svuint64_t base_idx = svand_x (pg, u, 0x1f);
+
+ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */
+ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
+ svuint64_t scalem1_lookup
+ = svld1_gather_index (is_small, d->e2xm1_data, base_idx);
+
+ /* Select the appropriate scale - 1 value based on x. */
+ svfloat64_t scalem1
+ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
+ return svmla_x (pg, scalem1, scale, p);
}
-/* SVE approximation for double-precision tanh(x), using a simplified
- version of expm1. The greatest observed error is 2.77 ULP:
- _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
- want -0x1.bd6a21a163624p-3. */
+/* SVE approximation for double-precision tanh(x), using a modified version of
+ FEXPA expm1 to calculate e^2x - 1.
+ The greatest observed error is 2.79 + 0.5 ULP:
+ _ZGVsMxv_tanh (0x1.fff868eb3c223p-9) got 0x1.fff7be486cae6p-9
+ want 0x1.fff7be486cae9p-9. */
svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
+ svbool_t large = svacge (pg, x, d->large_bound);
- /* Trigger special-cases for tiny, boring and infinity/NaN. */
- svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh);
+ /* We can use tanh(x) = (e^2x - 1) / (e^2x + 1) to approximate tanh.
+ As an additional optimisation, we can ensure more accurate values of e^x
+ by only using positive inputs. So we calculate tanh(|x|), and restore the
+ sign of the input before returning. */
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t sign_bit
+ = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
- svfloat64_t u = svadd_x (pg, x, x);
+ svfloat64_t p = e2xm1_inline (pg, ax, d);
+ svfloat64_t q = svadd_x (pg, p, 2);
- /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- svfloat64_t q = expm1_inline (u, pg, d);
- svfloat64_t qp2 = svadd_x (pg, q, 2);
+ /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly
+ rounded; at this point we can return 1 directly, with sign correction.
+ This will also act as a guard against our approximation overflowing. */
+ svfloat64_t y = svsel (large, sv_f64 (1.0), svdiv_x (pg, p, q));
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svdiv_x (pg, q, qp2), special);
- return svdiv_x (pg, q, qp2);
+ return svreinterpret_f64 (svorr_x (pg, sign_bit, svreinterpret_u64 (y)));
}