This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix builderror with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273
320 lines
12 KiB
Diff
320 lines
12 KiB
Diff
commit 7900ac490db32f6bccff812733f00280dde34e27
|
|
Author: Joe Ramsay <Joe.Ramsay@arm.com>
|
|
Date: Mon Sep 23 15:32:53 2024 +0100
|
|
|
|
AArch64: Improve codegen in users of ADVSIMD expm1f helper
|
|
|
|
Rearrange operations so MOV is not necessary in reduction or around
|
|
the special-case handler. Reduce memory access by using more indexed
|
|
MLAs in polynomial.
|
|
|
|
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
|
|
|
|
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
|
|
index a0616ec7542cbfce..8303ca296e030c2e 100644
|
|
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
|
|
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
|
|
@@ -18,27 +18,18 @@
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
#include "v_math.h"
|
|
-#include "poly_advsimd_f32.h"
|
|
+#include "v_expm1f_inline.h"
|
|
|
|
static const struct data
|
|
{
|
|
- float32x4_t poly[5];
|
|
- float invln2_and_ln2[4];
|
|
- float32x4_t shift;
|
|
- int32x4_t exponent_bias;
|
|
+ struct v_expm1f_data d;
|
|
#if WANT_SIMD_EXCEPT
|
|
uint32x4_t thresh;
|
|
#else
|
|
float32x4_t oflow_bound;
|
|
#endif
|
|
} data = {
|
|
- /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
|
|
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
|
|
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
|
|
- /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
|
|
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
|
|
- .shift = V4 (0x1.8p23f),
|
|
- .exponent_bias = V4 (0x3f800000),
|
|
+ .d = V_EXPM1F_DATA,
|
|
#if !WANT_SIMD_EXCEPT
|
|
/* Value above which expm1f(x) should overflow. Absolute value of the
|
|
underflow bound is greater than this, so it catches both cases - there is
|
|
@@ -55,67 +46,38 @@ static const struct data
|
|
#define TinyBound v_u32 (0x34000000 << 1)
|
|
|
|
static float32x4_t VPCS_ATTR NOINLINE
|
|
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
|
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
|
|
{
|
|
- return v_call_f32 (expm1f, x, y, special);
|
|
+ return v_call_f32 (
|
|
+ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
|
|
}
|
|
|
|
/* Single-precision vector exp(x) - 1 function.
|
|
- The maximum error is 1.51 ULP:
|
|
- _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
|
|
- want 0x1.e2fb94p-2. */
|
|
+ The maximum error is 1.62 ULP:
|
|
+ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
|
|
+ want 0x1.da9f44p-2. */
|
|
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
|
|
{
|
|
const struct data *d = ptr_barrier (&data);
|
|
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
|
|
|
#if WANT_SIMD_EXCEPT
|
|
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
|
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|
|
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
|
|
shift-left by 1, and compare with thresh which was left-shifted offline -
|
|
this is effectively an absolute compare. */
|
|
uint32x4_t special
|
|
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
|
|
- if (__glibc_unlikely (v_any_u32 (special)))
|
|
- x = v_zerofy_f32 (x, special);
|
|
#else
|
|
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
|
|
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
|
|
#endif
|
|
|
|
- /* Reduce argument to smaller range:
|
|
- Let i = round(x / ln2)
|
|
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
|
|
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
|
|
- where 2^i is exact because i is an integer. */
|
|
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
|
|
- float32x4_t j
|
|
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
|
|
- int32x4_t i = vcvtq_s32_f32 (j);
|
|
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
|
|
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
|
|
-
|
|
- /* Approximate expm1(f) using polynomial.
|
|
- Taylor expansion for expm1(x) has the form:
|
|
- x + ax^2 + bx^3 + cx^4 ....
|
|
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
|
|
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
|
|
- float32x4_t p = v_horner_4_f32 (f, d->poly);
|
|
- p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
|
|
-
|
|
- /* Assemble the result.
|
|
- expm1(x) ~= 2^i * (p + 1) - 1
|
|
- Let t = 2^i. */
|
|
- int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
|
|
- float32x4_t t = vreinterpretq_f32_s32 (u);
|
|
-
|
|
if (__glibc_unlikely (v_any_u32 (special)))
|
|
- return special_case (vreinterpretq_f32_u32 (ix),
|
|
- vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
|
|
- special);
|
|
+ return special_case (x, special, d);
|
|
|
|
/* expm1(x) ~= p * t + (t - 1). */
|
|
- return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
|
|
+ return expm1f_inline (x, &d->d);
|
|
}
|
|
libmvec_hidden_def (V_NAME_F1 (expm1))
|
|
HALF_WIDTH_ALIAS_F1 (expm1)
|
|
diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c
|
|
index 6bb7482dc28795c1..c6ed7598e7deca1b 100644
|
|
--- a/sysdeps/aarch64/fpu/sinhf_advsimd.c
|
|
+++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
|
|
@@ -23,15 +23,13 @@
|
|
static const struct data
|
|
{
|
|
struct v_expm1f_data expm1f_consts;
|
|
- uint32x4_t halff;
|
|
#if WANT_SIMD_EXCEPT
|
|
uint32x4_t tiny_bound, thresh;
|
|
#else
|
|
- uint32x4_t oflow_bound;
|
|
+ float32x4_t oflow_bound;
|
|
#endif
|
|
} data = {
|
|
.expm1f_consts = V_EXPM1F_DATA,
|
|
- .halff = V4 (0x3f000000),
|
|
#if WANT_SIMD_EXCEPT
|
|
/* 0x1.6a09e8p-32, below which expm1f underflows. */
|
|
.tiny_bound = V4 (0x2fb504f4),
|
|
@@ -39,14 +37,15 @@ static const struct data
|
|
.thresh = V4 (0x12fbbbb3),
|
|
#else
|
|
/* 0x1.61814ep+6, above which expm1f helper overflows. */
|
|
- .oflow_bound = V4 (0x42b0c0a7),
|
|
+ .oflow_bound = V4 (0x1.61814ep+6),
|
|
#endif
|
|
};
|
|
|
|
static float32x4_t NOINLINE VPCS_ATTR
|
|
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
|
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
|
|
+ uint32x4_t special)
|
|
{
|
|
- return v_call_f32 (sinhf, x, y, special);
|
|
+ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
|
|
}
|
|
|
|
/* Approximation for vector single-precision sinh(x) using expm1.
|
|
@@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
|
|
|
|
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
|
float32x4_t ax = vabsq_f32 (x);
|
|
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
|
- uint32x4_t sign = veorq_u32 (ix, iax);
|
|
- float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
|
|
+ float32x4_t halfsign = vreinterpretq_f32_u32 (
|
|
+ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
|
|
|
|
#if WANT_SIMD_EXCEPT
|
|
- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
|
|
+ uint32x4_t special = vcgeq_u32 (
|
|
+ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
|
|
ax = v_zerofy_f32 (ax, special);
|
|
#else
|
|
- uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
|
|
+ uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
|
|
#endif
|
|
|
|
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
|
|
@@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
|
|
/* Fall back to the scalar variant for any lanes that should trigger an
|
|
exception. */
|
|
if (__glibc_unlikely (v_any_u32 (special)))
|
|
- return special_case (x, vmulq_f32 (t, halfsign), special);
|
|
+ return special_case (x, t, halfsign, special);
|
|
|
|
return vmulq_f32 (t, halfsign);
|
|
}
|
|
diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c
|
|
index 50defd6ef03926f4..3ced9b7a414c812c 100644
|
|
--- a/sysdeps/aarch64/fpu/tanhf_advsimd.c
|
|
+++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
|
|
@@ -28,13 +28,16 @@ static const struct data
|
|
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
|
|
.boring_bound = V4 (0x41102cb3),
|
|
.large_bound = V4 (0x7f800000),
|
|
- .onef = V4 (0x3f800000),
|
|
};
|
|
|
|
static float32x4_t NOINLINE VPCS_ATTR
|
|
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
|
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
|
|
+ float32x4_t q, uint32x4_t special)
|
|
{
|
|
- return v_call_f32 (tanhf, x, y, special);
|
|
+ return v_call_f32 (
|
|
+ tanhf, x,
|
|
+ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
|
|
+ special);
|
|
}
|
|
|
|
/* Approximation for single-precision vector tanh(x), using a simplified
|
|
@@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
|
|
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
|
uint32x4_t sign = veorq_u32 (ix, iax);
|
|
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
|
|
- float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
|
|
+ /* expm1 exponent bias is 1.0f reinterpreted to int. */
|
|
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
|
|
+ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
|
|
|
|
#if WANT_SIMD_EXCEPT
|
|
/* If fp exceptions are to be triggered properly, set all special and boring
|
|
@@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
|
|
|
|
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
|
|
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
|
|
- float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
|
|
+
|
|
if (__glibc_unlikely (v_any_u32 (special)))
|
|
- return special_case (vreinterpretq_f32_u32 (ix),
|
|
- vbslq_f32 (is_boring, boring, y), special);
|
|
+ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
|
|
+ special);
|
|
+
|
|
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
|
|
return vbslq_f32 (is_boring, boring, y);
|
|
}
|
|
libmvec_hidden_def (V_NAME_F1 (tanh))
|
|
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
|
|
index 59b552da6b74785e..1daedfdd51cfc54b 100644
|
|
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
|
|
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
|
|
@@ -21,48 +21,47 @@
|
|
#define AARCH64_FPU_V_EXPM1F_INLINE_H
|
|
|
|
#include "v_math.h"
|
|
-#include "poly_advsimd_f32.h"
|
|
+#include "math_config.h"
|
|
|
|
struct v_expm1f_data
|
|
{
|
|
- float32x4_t poly[5];
|
|
- float invln2_and_ln2[4];
|
|
- float32x4_t shift;
|
|
+ float32x4_t c0, c2;
|
|
int32x4_t exponent_bias;
|
|
+ float c1, c3, inv_ln2, c4;
|
|
+ float ln2_hi, ln2_lo;
|
|
};
|
|
|
|
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
|
|
- log(2)/2]. Exponent bias is asuint(1.0f).
|
|
- invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */
|
|
+ log(2)/2]. Exponent bias is asuint(1.0f). */
|
|
#define V_EXPM1F_DATA \
|
|
{ \
|
|
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
|
|
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
|
|
- .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
|
|
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
|
|
+ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
|
|
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
|
|
+ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
|
|
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
|
|
}
|
|
|
|
static inline float32x4_t
|
|
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
|
|
{
|
|
- /* Helper routine for calculating exp(x) - 1.
|
|
- Copied from v_expm1f_1u6.c, with all special-case handling removed - the
|
|
- calling routine should handle special values if required. */
|
|
+ /* Helper routine for calculating exp(x) - 1. */
|
|
+
|
|
+ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
|
|
+ float32x4_t lane_consts = vld1q_f32 (&d->c1);
|
|
|
|
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
|
|
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
|
|
- float32x4_t j
|
|
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
|
|
+ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
|
|
int32x4_t i = vcvtq_s32_f32 (j);
|
|
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
|
|
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
|
|
+ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
|
|
+ f = vfmsq_lane_f32 (f, j, ln2, 1);
|
|
|
|
- /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
|
|
- Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
|
|
- Horner. */
|
|
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
|
|
float32x4_t f2 = vmulq_f32 (f, f);
|
|
float32x4_t f4 = vmulq_f32 (f2, f2);
|
|
- float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
|
|
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
|
|
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
|
|
+ float32x4_t p = vfmaq_f32 (p01, f2, p23);
|
|
+ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
|
|
p = vfmaq_f32 (f, f2, p);
|
|
|
|
/* t = 2^i. */
|