This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix builderror with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273
commit 95e807209b680257a9afe81a507754f1565dbb4d
Author: Yat Long Poon <yatlong.poon@arm.com>
Date:   Thu Feb 13 18:03:04 2025 +0000

    AArch64: Improve codegen for SVE powf

    Improve memory access with indexed/unpredicated instructions.
    Eliminate register spills.  Speedup on Neoverse V1: 3%.

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
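The central codegen change, visible in sv_powf_core_ext in the diff below, is dropping the add/subtract-Shift rounding trick in favour of a rounding instruction (FRINTA) plus a convert, and switching several intrinsics to don't-care svptrue_b64 () predication so the compiler is free to pick unpredicated forms. A minimal before/after sketch of just the rounding step, assuming an SVE-capable toolchain; the helper names round_via_shift and round_via_rinta are illustrative and not part of the patch:

#include <arm_sve.h>

/* Old scheme: add a large constant so round-to-nearest (x) lands in the low
   mantissa bits, reinterpret the bit pattern to read the integer out, then
   subtract the constant again to recover the rounded value.  Keeps the
   Shift constant live in a register across the computation.  */
static inline svfloat64_t
round_via_shift (svbool_t pg, svfloat64_t x, svuint64_t *ki)
{
  svfloat64_t kd = svadd_x (pg, x, 0x1.8p52);
  *ki = svreinterpret_u64 (kd);
  return svsub_x (pg, kd, 0x1.8p52);
}

/* New scheme: round to nearest (ties away from zero) with FRINTA, then
   convert to an integer.  The svptrue_b64 () predicate marks inactive lanes
   as don't-care, allowing unpredicated instructions.  */
static inline svfloat64_t
round_via_rinta (svfloat64_t x, svuint64_t *ki)
{
  svfloat64_t kd = svrinta_x (svptrue_b64 (), x);
  *ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));
  return kd;
}

The two schemes differ only in how ties are rounded (nearest-even versus away from zero), which is consistent with the slightly different worst-case error quoted in the updated comment of the patch.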
diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c
index 4f6a142325ae719b..08d7019a1855ff3c 100644
--- a/sysdeps/aarch64/fpu/powf_sve.c
+++ b/sysdeps/aarch64/fpu/powf_sve.c
@@ -26,7 +26,6 @@
 #define Tlogc __v_powf_data.logc
 #define Texp __v_powf_data.scale
 #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
-#define Shift 0x1.8p52
 #define Norm 0x1p23f /* 0x4b000000. */
 
 /* Overall ULP error bound for pow is 2.6 ulp
@@ -36,7 +35,7 @@ static const struct data
   double log_poly[4];
   double exp_poly[3];
   float uflow_bound, oflow_bound, small_bound;
-  uint32_t sign_bias, sign_mask, subnormal_bias, off;
+  uint32_t sign_bias, subnormal_bias, off;
 } data = {
   /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of
      V_POWF_EXP2_N. */
@@ -53,7 +52,6 @@ static const struct data
   .small_bound = 0x1p-126f,
   .off = 0x3f35d000,
   .sign_bias = SignBias,
-  .sign_mask = 0x80000000,
   .subnormal_bias = 0x0b800000, /* 23 << 23. */
 };
 
@@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x)
 static inline svbool_t
 sv_zeroinfnan (svbool_t pg, svuint32_t i)
 {
-  return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
+  return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
                   2u * 0x7f800000 - 1);
 }
 
@@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z)
 }
 
 /* Scalar fallback for special case routines with custom signature. */
-static inline svfloat32_t
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
 {
+  /* Special cases of x or y: zero, inf and nan. */
+  svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
+  svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
+  svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
+
   svbool_t p = svpfirst (cmp, svpfalse ());
   while (svptest_any (cmp, p))
     {
@@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
 
   /* Polynomial to approximate log1p(r)/ln2. */
   svfloat64_t logx = A (0);
-  logx = svmla_x (pg, A (1), r, logx);
-  logx = svmla_x (pg, A (2), r, logx);
-  logx = svmla_x (pg, A (3), r, logx);
-  logx = svmla_x (pg, y0, r, logx);
+  logx = svmad_x (pg, r, logx, A (1));
+  logx = svmad_x (pg, r, logx, A (2));
+  logx = svmad_x (pg, r, logx, A (3));
+  logx = svmad_x (pg, r, logx, y0);
   *pylogx = svmul_x (pg, y, logx);
 
   /* z - kd is in [-1, 1] in non-nearest rounding modes. */
-  svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
-  svuint64_t ki = svreinterpret_u64 (kd);
-  kd = svsub_x (pg, kd, Shift);
+  svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
+  svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));
 
   r = svsub_x (pg, *pylogx, kd);
 
   /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
-  svuint64_t t
-      = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
-  svuint64_t ski = svadd_x (pg, ki, sign_bias);
-  t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
+  svuint64_t t = svld1_gather_index (
+      svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
+  svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
+  t = svadd_x (svptrue_b64 (), t,
               svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
   svfloat64_t s = svreinterpret_f64 (t);
 
   svfloat64_t p = C (0);
   p = svmla_x (pg, C (1), p, r);
   p = svmla_x (pg, C (2), p, r);
-  p = svmla_x (pg, s, p, svmul_x (pg, s, r));
+  p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));
 
   return p;
 }
@@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
 {
   const svbool_t ptrue = svptrue_b64 ();
 
-  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
-     order to perform core computation in double precision. */
+  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
+   * in order to perform core computation in double precision. */
   const svbool_t pg_lo = svunpklo (pg);
   const svbool_t pg_hi = svunpkhi (pg);
-  svfloat64_t y_lo = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
-  svfloat64_t y_hi = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
-  svfloat32_t z = svreinterpret_f32 (iz);
-  svfloat64_t z_lo = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
-  svfloat64_t z_hi = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
+  svfloat64_t y_lo
+      = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+  svfloat64_t y_hi
+      = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+  svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
+  svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
   svuint64_t i_lo = svunpklo (i);
   svuint64_t i_hi = svunpkhi (i);
   svint64_t k_lo = svunpklo (k);
@@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
 /* Implementation of SVE powf.
    Provides the same accuracy as AdvSIMD powf, since it relies on the same
    algorithm. The theoretical maximum error is under 2.60 ULPs.
-   Maximum measured error is 2.56 ULPs:
-   SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
-                                                     want 0x1.fd4b06p+127. */
+   Maximum measured error is 2.57 ULPs:
+   SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
+                                                     want 0x1.fff862p+127. */
 svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
@@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
   svuint32_t viy0 = svreinterpret_u32 (y);
 
   /* Negative x cases. */
-  svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
-  svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
+  svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));
 
   /* Set sign_bias and ix depending on sign of x and nature of y. */
-  svbool_t yisnotint_xisneg = svpfalse_b ();
+  svbool_t yint_or_xpos = pg;
   svuint32_t sign_bias = sv_u32 (0);
   svuint32_t vix = vix0;
   if (__glibc_unlikely (svptest_any (pg, xisneg)))
     {
       /* Determine nature of y. */
-      yisnotint_xisneg = svisnotint (xisneg, y);
-      svbool_t yisint_xisneg = svisint (xisneg, y);
+      yint_or_xpos = svisint (xisneg, y);
       svbool_t yisodd_xisneg = svisodd (xisneg, y);
       /* ix set to abs(ix) if y is integer. */
-      vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
+      vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
       /* Set to SignBias if x is negative and y is odd. */
       sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
     }
@@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
   svbool_t cmp = svorr_z (pg, xspecial, yspecial);
 
   /* Small cases of x: |x| < 0x1p-126. */
-  svbool_t xsmall = svaclt (pg, x, d->small_bound);
-  if (__glibc_unlikely (svptest_any (pg, xsmall)))
+  svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
+  if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
     {
       /* Normalize subnormal x so exponent becomes negative. */
       svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
@@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
       vix = svsel (xsmall, vix_norm, vix);
     }
   /* Part of core computation carried in working precision. */
-  svuint32_t tmp = svsub_x (pg, vix, d->off);
-  svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
-                          V_POWF_LOG2_N - 1);
-  svuint32_t top = svand_x (pg, tmp, 0xff800000);
-  svuint32_t iz = svsub_x (pg, vix, top);
-  svint32_t k
-      = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
-
-  /* Compute core in extended precision and return intermediate ylogx results to
-     handle cases of underflow and underflow in exp. */
+  svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
+  svuint32_t i = svand_x (
+      yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+      V_POWF_LOG2_N - 1);
+  svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
+  svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
+  svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
+                         (23 - V_POWF_EXP2_TABLE_BITS));
+
+  /* Compute core in extended precision and return intermediate ylogx results
+   * to handle cases of underflow and underflow in exp. */
   svfloat32_t ylogx;
-  svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
+  svfloat32_t ret
+      = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
 
   /* Handle exp special cases of underflow and overflow. */
-  svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
+  svuint32_t sign
+      = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
   svfloat32_t ret_oflow
-      = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
+      = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
   svfloat32_t ret_uflow = svreinterpret_f32 (sign);
-  ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
-  ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
+  ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
+  ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);
 
   /* Cases of finite y and finite negative x. */
-  ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
+  ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
 
-  if (__glibc_unlikely (svptest_any (pg, cmp)))
-    return sv_call_powf_sc (x, y, ret, cmp);
+  if (__glibc_unlikely (svptest_any (cmp, cmp)))
+    return sv_call_powf_sc (x, y, ret);
 
   return ret;
 }
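The sv_zeroinfnan tweak above is a smaller instance of the same theme: 2*i is formed as i + i so it maps onto a single vector ADD, instead of leaving a multiply by 2 for the compiler to simplify. A self-contained sketch of the idiom (the helper name double_lanes is illustrative, not taken from the patch):

#include <arm_sve.h>

/* Double each 32-bit lane.  Writing i + i maps directly onto one vector ADD;
   the previous svmul_x (pg, i, 2u) form relied on the compiler to
   strength-reduce the multiply.  With _x (don't-care) predication the ADD
   may also be emitted unpredicated.  */
static inline svuint32_t
double_lanes (svbool_t pg, svuint32_t i)
{
  return svadd_x (pg, i, i);
}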