This combines the following upstream commits: e45af510bc AArch64: Fix instability in AdvSIMD sinh 6c22823da5 AArch64: Fix instability in AdvSIMD tan aebaeb2c33 AArch64: Update math-vector-fortran.h e20ca759af AArch64: add optimised strspn/strcspn aac077645a AArch64: Fix SVE powf routine [BZ #33299] 1e3d1ddf97 AArch64: Optimize SVE exp functions dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics 6849c5b791 AArch64: Improve codegen SVE log1p helper 09795c5612 AArch64: Fix builderror with GCC 12.1/12.2 aa18367c11 AArch64: Improve enabling of SVE for libmvec 691edbdf77 aarch64: fix unwinding in longjmp 4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612) cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines 8f0e7fe61e Aarch64: Improve codegen in SVE asinh c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline f5ff34cb3c AArch64: Improve codegen for SVE erfcf 0b195651db AArch64: Improve codegen for SVE pow 95e807209b AArch64: Improve codegen for SVE powf d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS f86b4cf875 AArch64: Improve codegen in SVE expm1f and users 140b985e5a AArch64: Improve codegen in AdvSIMD asinh 91c1fadba3 AArch64: Improve codegen for SVE log1pf users cff9648d0b AArch64: Improve codegen of AdvSIMD expf family 569cfaaf49 AArch64: Improve codegen in AdvSIMD pow ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper 13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper 2d82d781a5 AArch64: Remove SVE erf and erfc tables 1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc 7b8c134b54 AArch64: Improve codegen in SVE expf & related routines a15b1394b5 AArch64: Improve codegen in SVE F32 logs 5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper 7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper 0fed0b250f aarch64/fpu: Add vector variants of pow 
75207bde68 aarch64/fpu: Add vector variants of cbrt 157f89fa3d aarch64/fpu: Add vector variants of hypot 90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian 87cb1dfcd6 aarch64/fpu: Add vector variants of erfc 3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh eedbbca0bf aarch64/fpu: Add vector variants of sinh 8b67920528 aarch64/fpu: Add vector variants of atanh 81406ea3c5 aarch64/fpu: Add vector variants of asinh b09fee1d21 aarch64/fpu: Add vector variants of acosh bdb5705b7b aarch64/fpu: Add vector variants of cosh cb5d84f1f8 aarch64/fpu: Add vector variants of erf Resolves: RHEL-118273
(diff stats: 46 lines changed, 2.0 KiB)
commit f5ff34cb3c75ec1061c75bb9188b3c1176426947
Author: Yat Long Poon <yatlong.poon@arm.com>
Date:   Thu Feb 13 18:00:50 2025 +0000

    AArch64: Improve codegen for SVE erfcf

    Reduce number of MOV/MOVPRFXs and use unpredicated FMUL.
    Replace MUL with LSL.  Speedup on Neoverse V1: 6%.

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>

diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c
index ecacb933aca40855..e4869263e31e18bc 100644
--- a/sysdeps/aarch64/fpu/erfcf_sve.c
+++ b/sysdeps/aarch64/fpu/erfcf_sve.c
@@ -76,7 +76,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
   svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);

   /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables.  */
-  i = svmul_x (pg, i, 2);
+  i = svlsl_x (svptrue_b32 (), i, 1);
   const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
   svfloat32_t erfcr = svld1_gather_index (pg, p, i);
   svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
@@ -84,15 +84,15 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
   /* erfc(x) ~ erfc(r) - scale * d * poly(r, d).  */
   svfloat32_t r = svsub_x (pg, z, shift);
   svfloat32_t d = svsub_x (pg, a, r);
-  svfloat32_t d2 = svmul_x (pg, d, d);
-  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);

   svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
-  svfloat32_t third = svdup_lane (coeffs, 0);

   svfloat32_t p1 = r;
-  svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
-  svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
+  svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1);
+  svfloat32_t p3
+    = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
   svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
   p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);