This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix builderror with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273

commit a15b1394b5eba98ffe28a02a392b587e4fe13c0d
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date:   Mon Sep 23 15:30:20 2024 +0100

    AArch64: Improve codegen in SVE F32 logs

    Reduce MOVPRFXs by using unpredicated (non-destructive) instructions
    where possible. Similar to the recent change to AdvSIMD F32 logs,
    adjust special-case arguments and bounds to allow for more optimal
    register usage. For all 3 routines one MOVPRFX remains in the
    reduction, which cannot be avoided as immediate AND and ASR are both
    destructive.

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
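
As background for the MOVPRFX point: most predicated SVE floating-point
instructions are destructive (the destination register must also be a
source), so when both sources stay live the compiler has to copy one of
them first with MOVPRFX. Compilers typically only select the unpredicated,
non-destructive encoding when the predicate is provably all-true, which is
exactly what svptrue_b32 () guarantees. A minimal sketch, not part of the
patch, with illustrative function names:

/* Hedged sketch; compile with e.g. -O2 -march=armv8-a+sve to inspect
   the generated code.  */
#include <arm_sve.h>

svfloat32_t
square_pred (svbool_t pg, svfloat32_t r)
{
  /* Predicated FMUL is destructive (FMUL Zd.S, Pg/M, Zd.S, Zm.S), so if
     r must stay live the compiler copies it first with MOVPRFX.  */
  return svmul_x (pg, r, r);
}

svfloat32_t
square_ptrue (svfloat32_t r)
{
  /* A provably all-true predicate lets the compiler pick the
     unpredicated FMUL Zd.S, Zn.S, Zm.S, which needs no MOVPRFX.  */
  return svmul_x (svptrue_b32 (), r, r);
}
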
diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c
index bdbb49cd32feccb4..7913679f6795502a 100644
--- a/sysdeps/aarch64/fpu/log10f_sve.c
+++ b/sysdeps/aarch64/fpu/log10f_sve.c
@@ -24,6 +24,7 @@ static const struct data
   float poly_0246[4];
   float poly_1357[4];
   float ln2, inv_ln10;
+  uint32_t off, lower;
 } data = {
   .poly_1357 = {
     /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -35,18 +36,23 @@ static const struct data
     -0x1.0fc92cp-4f },
   .ln2 = 0x1.62e43p-1f,
   .inv_ln10 = 0x1.bcb7b2p-2f,
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+  .lower = 0x00800000 - 0x3f2aaaab
 };

-#define Min 0x00800000
-#define Max 0x7f800000
-#define Thres 0x7f000000 /* Max - Min. */
-#define Offset 0x3f2aaaab /* 0.666667. */
+#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. */
 #define MantissaMask 0x007fffff

 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+              svbool_t cmp)
 {
-  return sv_call_f32 (log10f, x, y, special);
+  return sv_call_f32 (
+      log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
 }

 /* Optimised implementation of SVE log10f using the same algorithm and
@@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
 svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
-  svuint32_t ix = svreinterpret_u32 (x);
-  svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
+
+  svuint32_t u_off = svreinterpret_u32 (x);
+
+  u_off = svsub_x (pg, u_off, d->off);
+  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);

   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
-  ix = svsub_x (pg, ix, Offset);
   svfloat32_t n = svcvt_f32_x (
-      pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */
-  ix = svand_x (pg, ix, MantissaMask);
-  ix = svadd_x (pg, ix, Offset);
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */
+  svuint32_t ix = svand_x (pg, u_off, MantissaMask);
+  ix = svadd_x (pg, ix, d->off);
   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);

   /* y = log10(1+r) + n*log10(2)
      log10(1+r) ~ r * InvLn(10) + P(r)
      where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
      log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */
-  svfloat32_t r2 = svmul_x (pg, r, r);
-  svfloat32_t r4 = svmul_x (pg, r2, r2);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+  svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2);
   svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
   svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
   svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
@@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
   hi = svmul_x (pg, hi, d->inv_ln10);

   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
-                         special);
-  return svmla_x (pg, hi, r2, y);
+    return special_case (u_off, hi, r2, y, special);
+  return svmla_x (svptrue_b32 (), hi, r2, y);
 }
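
The svld1rq/svmla_lane pairing visible above is the pairwise Horner scheme
the sources refer to: four coefficients are loaded once into each 128-bit
segment of a vector and the odd-power ones are then selected by lane, so
the even/odd halves of the polynomial have no dependency on each other,
unlike plain Horner. A standalone hedged sketch for an order-3 polynomial,
with illustrative names (not from the patch; the routines use glibc's
sv_f32 wrapper where this uses the ACLE svdup_f32):

/* Evaluates c[0] + c[1]*r + c[2]*r^2 + c[3]*r^3 pairwise, as
   (c0 + c1*r) + r^2*(c2 + c3*r).  */
#include <arm_sve.h>

svfloat32_t
pairwise_horner_order3 (svfloat32_t r, svfloat32_t r2, const float c[4])
{
  /* Load c[0..3] once, replicated into every 128-bit segment.  */
  svfloat32_t c0123 = svld1rq (svptrue_b32 (), c);
  /* The two halves can issue in parallel.  */
  svfloat32_t q01 = svmla_lane (svdup_f32 (c[0]), r, c0123, 1); /* c0 + c1*r. */
  svfloat32_t q23 = svmla_lane (svdup_f32 (c[2]), r, c0123, 3); /* c2 + c3*r. */
  return svmla_x (svptrue_b32 (), q01, r2, q23);
}
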
diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c
index 5031c4248359295e..939d89bfb9a95a11 100644
--- a/sysdeps/aarch64/fpu/log2f_sve.c
+++ b/sysdeps/aarch64/fpu/log2f_sve.c
@@ -23,6 +23,7 @@ static const struct data
 {
   float poly_02468[5];
   float poly_1357[4];
+  uint32_t off, lower;
 } data = {
   .poly_1357 = {
     /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -32,18 +33,23 @@ static const struct data
   },
   .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
                   0x1.9d8ecap-3f, 0x1.9e495p-3f },
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+  .lower = 0x00800000 - 0x3f2aaaab
 };

-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thres (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
 #define MantissaMask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */

 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+              svbool_t cmp)
 {
-  return sv_call_f32 (log2f, x, y, cmp);
+  return sv_call_f32 (
+      log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
 }

 /* Optimised implementation of SVE log2f, using the same algorithm
@@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);

-  svuint32_t u = svreinterpret_u32 (x);
-  svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres);
+  svuint32_t u_off = svreinterpret_u32 (x);
+
+  u_off = svsub_x (pg, u_off, d->off);
+  svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);

   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
-  u = svsub_x (pg, u, Off);
   svfloat32_t n = svcvt_f32_x (
-      pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
-  u = svand_x (pg, u, MantissaMask);
-  u = svadd_x (pg, u, Off);
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+  svuint32_t u = svand_x (pg, u_off, MantissaMask);
+  u = svadd_x (pg, u, d->off);
   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);

   /* y = log2(1+r) + n. */
-  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);

   /* Evaluate polynomial using pairwise Horner scheme. */
   svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
@@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
   y = svmla_x (pg, q_01, r2, y);

   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special);
-  return svmla_x (pg, n, r, y);
+    return special_case (u_off, n, r, y, special);
+  return svmla_x (svptrue_b32 (), n, r, y);
 }
diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
index d64e810cfec9aa19..5b9324678d99455b 100644
--- a/sysdeps/aarch64/fpu/logf_sve.c
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@@ -24,6 +24,7 @@ static const struct data
   float poly_0135[4];
   float poly_246[3];
   float ln2;
+  uint32_t off, lower;
 } data = {
   .poly_0135 = {
     /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
@@ -32,19 +33,24 @@ static const struct data
     -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
   },
   .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
-  .ln2 = 0x1.62e43p-1f
+  .ln2 = 0x1.62e43p-1f,
+  .off = 0x3f2aaaab,
+  /* Lower bound is the smallest positive normal float 0x00800000. For
+     optimised register use subnormals are detected after offset has been
+     subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+  .lower = 0x00800000 - 0x3f2aaaab
 };

-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thresh (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
 #define Mask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */

 static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+              svbool_t cmp)
 {
-  return sv_call_f32 (logf, x, y, cmp);
+  return sv_call_f32 (
+      logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+      svmla_x (svptrue_b32 (), p, r2, y), cmp);
 }

 /* Optimised implementation of SVE logf, using the same algorithm and
@@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);

-  svuint32_t u = svreinterpret_u32 (x);
-  svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
+  svuint32_t u_off = svreinterpret_u32 (x);
+
+  u_off = svsub_x (pg, u_off, d->off);
+  svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);

   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
-  u = svsub_x (pg, u, Off);
   svfloat32_t n = svcvt_f32_x (
-      pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
-  u = svand_x (pg, u, Mask);
-  u = svadd_x (pg, u, Off);
+      pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+
+  svuint32_t u = svand_x (pg, u_off, Mask);
+  u = svadd_x (pg, u, d->off);
   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);

   /* y = log(1+r) + n*ln2. */
-  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
   /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */
   svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
   svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
@@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
   p = svmla_x (pg, r, n, d->ln2);

   if (__glibc_unlikely (svptest_any (pg, cmp)))
-    return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+    return special_case (u_off, p, r2, y, cmp);
   return svmla_x (pg, p, r2, y);
 }
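
The "wraps around" comment shared by all three files can be checked in
scalar code: after the offset has been subtracted from asuint32(x), a
single unsigned compare against Thresh with a pre-biased lower bound still
classifies zero, subnormals, negatives, infinities and NaNs as special,
because subtracting the wrapped lower bound cancels the offset. A hedged
scalar sketch; macro and function names are illustrative, not glibc's:

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

#define OFF 0x3f2aaaabu            /* asuint32 (0.666667f).  */
#define LOWER (0x00800000u - OFF)  /* Smallest normal minus OFF; wraps.  */
#define THRESH 0x7f000000u         /* asuint32 (inf) - 0x00800000.  */

static int
needs_special_case (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);       /* asuint32 (x).  */
  uint32_t u_off = u - OFF;        /* Reused by the log reduction.  */
  /* Equivalent to (u - 0x00800000) >= THRESH, i.e. x is not a positive
     normal number below infinity.  */
  return u_off - LOWER >= THRESH;
}

int
main (void)
{
  assert (!needs_special_case (1.0f));
  assert (needs_special_case (0.0f));      /* Zero.  */
  assert (needs_special_case (0x1p-130f)); /* Subnormal.  */
  assert (needs_special_case (-1.0f));     /* Negative.  */
  assert (needs_special_case (INFINITY));
  assert (needs_special_case (NAN));
  return 0;
}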