This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix builderror with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273
commit 6c22823da57aa5218f717f569c04c9573c0448c5
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date:   Thu Nov 6 18:26:54 2025 +0000

    AArch64: Fix instability in AdvSIMD tan

    Previously presence of special-cases in one lane could affect the
    results in other lanes due to unconditional scalar fallback. The old
    WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
    been removed from AOR, making it easier to spot and fix this. 4%
    improvement in throughput with GCC 14 on Neoverse V1. This bug is
    present as far back as 2.39 (where tan was first introduced).

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

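For context, a minimal standalone sketch of the lane-masked scalar fallback the
fix below relies on. It mirrors what glibc's v_call_f64 helper does in the new
special_case path; the function name lane_fallback and the direct use of
<arm_neon.h> lane intrinsics are illustrative assumptions, not the glibc
implementation. Only lanes flagged in the special mask are recomputed with the
scalar routine, so results in the remaining lanes are unaffected.

    #include <arm_neon.h>
    #include <math.h>

    /* Recompute only the lanes flagged in SPECIAL with the scalar routine F,
       keeping the already-computed vector results in Y for all other lanes.  */
    static float64x2_t
    lane_fallback (double (*f) (double), float64x2_t x, float64x2_t y,
                   uint64x2_t special)
    {
      if (vgetq_lane_u64 (special, 0))
        y = vsetq_lane_f64 (f (vgetq_lane_f64 (x, 0)), y, 0);
      if (vgetq_lane_u64 (special, 1))
        y = vsetq_lane_f64 (f (vgetq_lane_f64 (x, 1)), y, 1);
      return y;
    }

    /* Usage in the shape of the fixed routine below:
       return lane_fallback (tan, x, vdivq_f64 (n, d), special);  */
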
diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index d56a102dd17a3463..c6a5a17126674d7d 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -25,9 +25,7 @@ static const struct data
   float64x2_t poly[9];
   double half_pi[2];
   float64x2_t two_over_pi, shift;
-#if !WANT_SIMD_EXCEPT
   float64x2_t range_val;
-#endif
 } data = {
   /* Coefficients generated using FPMinimax.  */
   .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
@@ -38,20 +36,17 @@ static const struct data
   .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
   .two_over_pi = V2 (0x1.45f306dc9c883p-1),
   .shift = V2 (0x1.8p52),
-#if !WANT_SIMD_EXCEPT
   .range_val = V2 (0x1p23),
-#endif
 };
 
 #define RangeVal 0x4160000000000000 /* asuint64(0x1p23).  */
 #define TinyBound 0x3e50000000000000 /* asuint64(2^-26).  */
-#define Thresh 0x310000000000000 /* RangeVal - TinyBound.  */
 
 /* Special cases (fall back to scalar calls).  */
 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t n, float64x2_t d, uint64x2_t special)
 {
-  return v_call_f64 (tan, x, x, v_u64 (-1));
+  return v_call_f64 (tan, x, vdivq_f64 (n, d), special);
 }
 
 /* Vector approximation for double-precision tan.
@@ -65,14 +60,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
      very large inputs. Fall back to scalar routine for all lanes if any are
      too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
      tiny input to avoid underflow.  */
-#if WANT_SIMD_EXCEPT
-  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
-  /* iax - tiny_bound > range_val - tiny_bound.  */
-  uint64x2_t special
-      = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
-  if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
-#endif
 
   /* q = nearest integer to 2 * x / pi.  */
   float64x2_t q
@@ -81,9 +68,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 
   /* Use q to reduce x to r in [-pi/4, pi/4], by:
      r = x - q * pi/2, in extended precision.  */
-  float64x2_t r = x;
   float64x2_t half_pi = vld1q_f64 (dat->half_pi);
-  r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+  float64x2_t r = vfmsq_laneq_f64 (x, q, half_pi, 0);
   r = vfmsq_laneq_f64 (r, q, half_pi, 1);
   /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
      formula.  */
@@ -114,12 +100,13 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 
   uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
 
-#if !WANT_SIMD_EXCEPT
   uint64x2_t special = vcageq_f64 (x, dat->range_val);
+  float64x2_t swap = vbslq_f64 (no_recip, n, vnegq_f64 (d));
+  d = vbslq_f64 (no_recip, d, n);
+  n = swap;
+
   if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
-#endif
+    return special_case (x, n, d, special);
 
-  return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
-                    vbslq_f64 (no_recip, d, n));
+  return vdivq_f64 (n, d);
 }