glibc/glibc-RHEL-118273-17.patch
Yuki Inoguchi 9dd92cac18 aarch64: Add GLIBC_2.40 vector functions and performance fixes (RHEL-118273)
This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix build error with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273
2025-12-05 16:24:54 +01:00

commit 1cf29fbc5be23db775d1dfa6b332ded6e6554252
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Oct 28 14:58:35 2024 +0000
AArch64: Small optimisation in AdvSIMD erf and erfc

In both routines, reduce register pressure such that GCC 14 emits no
spills for erf and fewer spills for erfc. Also use a more efficient
comparison for the special case in erf.

Benchtests show erf improves by 6.4%, erfc by 1.0%.
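
As a minimal standalone sketch of the two tricks applied in the diff below
(not glibc source; the names poly_step, in_range and c1_c2_mem are
illustrative only): adjacent scalar constants are loaded once with
vld1q_f64 and consumed through lane-indexed FMAs so they share a single
vector register, and vcaleq_f64 compares absolute values directly so the
separate vabsq_f64 step is no longer needed.

#include <arm_neon.h>

/* Two polynomial constants kept as adjacent scalars; one vld1q_f64
   places both in a single vector register instead of two broadcast
   registers.  */
static const double c1_c2_mem[2] = { 0x1.1111111111111p-3,   /* 2/15.  */
                                     0x1.6c16c16c16c17p-5 }; /* 2/45.  */

static float64x2_t
poly_step (float64x2_t r2, float64x2_t p41, float64x2_t p51)
{
  float64x2_t c1_c2 = vld1q_f64 (c1_c2_mem);
  /* p41 + r2 * 2/15 and p51 + r2 * 2/45: the lane index selects which
     of the two packed constants feeds each fused multiply-add.  */
  float64x2_t p4 = vfmaq_laneq_f64 (p41, r2, c1_c2, 0);
  float64x2_t p5 = vfmaq_laneq_f64 (p51, r2, c1_c2, 1);
  return vaddq_f64 (p4, p5);
}

static uint64x2_t
in_range (float64x2_t x, float64x2_t max)
{
  /* vcaleq_f64 tests |x| <= |max| in one instruction, so the separate
     vabsq_f64 the old vcleq_f64 sequence needed is gone.  */
  return vcaleq_f64 (x, max);
}
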
diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c
index 19cbb7d0f42eb4e2..c0116735e408066d 100644
--- a/sysdeps/aarch64/fpu/erf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erf_advsimd.c
@@ -22,19 +22,21 @@
static const struct data
{
float64x2_t third;
- float64x2_t tenth, two_over_five, two_over_fifteen;
- float64x2_t two_over_nine, two_over_fortyfive;
+ float64x2_t tenth, two_over_five, two_over_nine;
+ double two_over_fifteen, two_over_fortyfive;
float64x2_t max, shift;
+ uint64x2_t max_idx;
#if WANT_SIMD_EXCEPT
float64x2_t tiny_bound, huge_bound, scale_minus_one;
#endif
} data = {
+ .max_idx = V2 (768),
.third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
- .two_over_fifteen = V2 (0x1.1111111111111p-3),
+ .two_over_fifteen = 0x1.1111111111111p-3,
.tenth = V2 (-0x1.999999999999ap-4),
.two_over_five = V2 (-0x1.999999999999ap-2),
.two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
- .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
+ .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
.max = V2 (5.9921875), /* 6 - 1/128. */
.shift = V2 (0x1p45),
#if WANT_SIMD_EXCEPT
@@ -87,8 +89,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t a = vabsq_f64 (x);
/* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
to return expected results. */
- uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
- uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
+ uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
+ uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
#if WANT_SIMD_EXCEPT
/* |x| huge or tiny. */
@@ -115,7 +117,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
segfault. */
uint64x2_t i
= vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
- i = vbslq_u64 (a_le_max, i, v_u64 (768));
+ i = vbslq_u64 (a_le_max, i, dat->max_idx);
struct entry e = lookup (i);
float64x2_t r = vsubq_f64 (z, shift);
@@ -125,14 +127,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t d2 = vmulq_f64 (d, d);
float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t two_over_fifteen_and_fortyfive
+ = vld1q_f64 (&dat->two_over_fifteen);
+
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
float64x2_t p1 = r;
float64x2_t p2
= vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
- float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
+ two_over_fifteen_and_fortyfive, 0);
p4 = vfmsq_f64 (dat->tenth, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
+ two_over_fifteen_and_fortyfive, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
float64x2_t p34 = vfmaq_f64 (p3, d, p4);
diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
index f1b3bfe8304c73b5..2f2f755c46e71b58 100644
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@@ -24,8 +24,8 @@ static const struct data
{
uint64x2_t offset, table_scale;
float64x2_t max, shift;
- float64x2_t p20, p40, p41, p42;
- float64x2_t p51, p52;
+ float64x2_t p20, p40, p41, p51;
+ double p42, p52;
double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
@@ -41,9 +41,9 @@ static const struct data
.p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
.p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
.p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
- .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
+ .p42 = 0x1.1111111111111p-3, /* 2/15. */
.p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
- .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
+ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
/* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
.qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
.qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
@@ -157,9 +157,10 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
float64x2_t p1 = r;
float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
- float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
+ float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
p4 = vfmsq_f64 (dat->p40, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */