glibc/glibc-RHEL-118273-44.patch
Yuki Inoguchi 9dd92cac18 aarch64: Add GLIBC_2.40 vector functions and performance fixes (RHEL-118273)
This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix builderror with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273

commit 6c22823da57aa5218f717f569c04c9573c0448c5
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu Nov 6 18:26:54 2025 +0000
    AArch64: Fix instability in AdvSIMD tan

    Previously presence of special-cases in one lane could affect the
    results in other lanes due to unconditional scalar fallback. The old
    WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
    been removed from AOR, making it easier to spot and fix this. 4%
    improvement in throughput with GCC 14 on Neoverse V1. This bug is
    present as far back as 2.39 (where tan was first introduced).

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
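
As an illustration of the lane-masking idea described above, here is a
minimal sketch, not the glibc implementation: the helper name
lane_masked_fallback is hypothetical, while the real helper used by
special_case is v_call_f64 from sysdeps/aarch64/fpu/v_math.h. After this
patch, special_case receives the vector quotient already computed on the
fast path plus the per-lane special mask, so only flagged lanes are redone
with the scalar routine.

#include <arm_neon.h>
#include <math.h>

/* Sketch of a lane-masked scalar fallback: keep the vector result y for
   ordinary lanes and recompute only the lanes flagged in 'special' with
   scalar tan.  This mirrors the semantics of
   v_call_f64 (tan, x, vdivq_f64 (n, d), special) used after this change;
   it is not the actual glibc code.  */
static float64x2_t
lane_masked_fallback (float64x2_t x, float64x2_t y, uint64x2_t special)
{
  if (vgetq_lane_u64 (special, 0))
    y = vsetq_lane_f64 (tan (vgetq_lane_f64 (x, 0)), y, 0);
  if (vgetq_lane_u64 (special, 1))
    y = vsetq_lane_f64 (tan (vgetq_lane_f64 (x, 1)), y, 1);
  return y;
}

Before this change the fallback was v_call_f64 (tan, x, x, v_u64 (-1)),
i.e. every lane was redone by the scalar routine whenever any lane was
special, which is the cross-lane instability the commit message describes.
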
diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index d56a102dd17a3463..c6a5a17126674d7d 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -25,9 +25,7 @@ static const struct data
   float64x2_t poly[9];
   double half_pi[2];
   float64x2_t two_over_pi, shift;
-#if !WANT_SIMD_EXCEPT
   float64x2_t range_val;
-#endif
 } data = {
   /* Coefficients generated using FPMinimax. */
   .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
@@ -38,20 +36,17 @@ static const struct data
   .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
   .two_over_pi = V2 (0x1.45f306dc9c883p-1),
   .shift = V2 (0x1.8p52),
-#if !WANT_SIMD_EXCEPT
   .range_val = V2 (0x1p23),
-#endif
 };
 
 #define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */
 #define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */
-#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */
 
 /* Special cases (fall back to scalar calls). */
 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t n, float64x2_t d, uint64x2_t special)
 {
-  return v_call_f64 (tan, x, x, v_u64 (-1));
+  return v_call_f64 (tan, x, vdivq_f64 (n, d), special);
 }
 
 /* Vector approximation for double-precision tan.
@@ -65,14 +60,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
      very large inputs. Fall back to scalar routine for all lanes if any are
      too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
      tiny input to avoid underflow. */
-#if WANT_SIMD_EXCEPT
-  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
-  /* iax - tiny_bound > range_val - tiny_bound. */
-  uint64x2_t special
-      = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
-  if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
-#endif
 
   /* q = nearest integer to 2 * x / pi. */
   float64x2_t q
@@ -81,9 +68,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 
   /* Use q to reduce x to r in [-pi/4, pi/4], by:
      r = x - q * pi/2, in extended precision. */
-  float64x2_t r = x;
   float64x2_t half_pi = vld1q_f64 (dat->half_pi);
-  r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+  float64x2_t r = vfmsq_laneq_f64 (x, q, half_pi, 0);
   r = vfmsq_laneq_f64 (r, q, half_pi, 1);
   /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
      formula. */
@@ -114,12 +100,13 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 
   uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
 
-#if !WANT_SIMD_EXCEPT
   uint64x2_t special = vcageq_f64 (x, dat->range_val);
+  float64x2_t swap = vbslq_f64 (no_recip, n, vnegq_f64 (d));
+  d = vbslq_f64 (no_recip, d, n);
+  n = swap;
+
   if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
-#endif
+    return special_case (x, n, d, special);
 
-  return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
-                    vbslq_f64 (no_recip, d, n));
+  return vdivq_f64 (n, d);
 }