glibc/glibc-RHEL-118273-23.patch

Yuki Inoguchi 9dd92cac18 aarch64: Add GLIBC_2.40 vector functions and performance fixes (RHEL-118273)
This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix build error with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273

commit 91c1fadba338752bf514cd4cca057b27b1b10eed
Author: Yat Long Poon <yatlong.poon@arm.com>
Date:   Fri Jan 3 19:09:05 2025 +0000

    AArch64: Improve codegen for SVE log1pf users

    Reduce memory access by using lanewise MLA and reduce number of MOVPRFXs.
    Move log1pf implementation to inline helper function.
    Speedup on Neoverse V1 for log1pf (10%), acoshf (-1%), atanhf (2%), asinhf (2%).

Conflicts:
	sysdeps/aarch64/fpu/log1pf_sve.c
	(Fix up context to apply without the out-of-scope dependency 751a5502.)
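
The "lanewise MLA" in the message refers to svmla_lane_f32 against a quadword
of coefficients loaded once with svld1rq, in place of one broadcast constant
per coefficient. A minimal standalone sketch of the idea (illustrative only:
the coefficients, function name, and pairing are placeholders, not the patched
helper; assumes an SVE-enabled toolchain):

#include <arm_sve.h>

/* Illustrative only: p(x) = c0 + c1*x + c2*x^2 + c3*x^3 with a single
   quadword coefficient load.  svld1rq_f32 replicates c[0..3] across the
   vector, and svmla_lane_f32 selects each coefficient by lane, so no
   per-coefficient dup (and no MOVPRFX to set up a destructive MLA
   operand) is needed.  */
static const float c[4] = { 1.0f, -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f };

static inline svfloat32_t
poly_lanewise (svbool_t pg, svfloat32_t x)
{
  svfloat32_t cs = svld1rq_f32 (svptrue_b32 (), c);
  svfloat32_t x2 = svmul_f32_x (pg, x, x);
  /* Pairwise (Estrin-style) split, as in the patched helper:
     p01 = c0 + c1*x, p23 = c2 + c3*x, p = p01 + p23*x^2.  */
  svfloat32_t p01 = svmla_lane_f32 (svdup_n_f32 (c[0]), x, cs, 1);
  svfloat32_t p23 = svmla_lane_f32 (svdup_n_f32 (c[2]), x, cs, 3);
  return svmla_f32_x (pg, p01, p23, x2);
}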
diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c
index 2110894e629500be..491365e24d692f0f 100644
--- a/sysdeps/aarch64/fpu/acoshf_sve.c
+++ b/sysdeps/aarch64/fpu/acoshf_sve.c
@@ -17,23 +17,26 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+#include "sv_math.h"
+#include "sv_log1pf_inline.h"
+
#define One 0x3f800000
#define Thres 0x20000000 /* asuint(0x1p64) - One. */
-#include "sv_log1pf_inline.h"
-
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special)
{
+ svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f);
+ svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ());
return sv_call_f32 (acoshf, x, y, special);
}
/* Single-precision SVE acosh(x) routine. Implements the same algorithm as
vector acoshf and log1p.
- Maximum error is 2.78 ULPs:
- SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4
- want 0x1.f45b3cp-4. */
+ Maximum error is 2.47 ULPs:
+ SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4
+ want 0x1.e435a2p-4. */
svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
{
svuint32_t ix = svreinterpret_u32 (x);
@@ -41,9 +44,9 @@ svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
- svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
+ svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u));
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
- return y;
+ return special_case (xm1, tmp, special);
+ return sv_log1pf_inline (tmp, pg);
}
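
For reference, the identity this routine builds on (not spelled out in the
patch itself):

/* acosh(x) = ln (x + sqrt (x*x - 1))
            = log1p ((x - 1) + sqrt ((x - 1) * (x + 1)))
   so with xm1 = x - 1 and u = xm1 * (x + 1), the deferred fast-path
   result is exactly sv_log1pf_inline (xm1 + sqrt (u)).  */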
diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c
index d85c3a685c0b83ff..b7f253bf32fb9478 100644
--- a/sysdeps/aarch64/fpu/asinhf_sve.c
+++ b/sysdeps/aarch64/fpu/asinhf_sve.c
@@ -20,20 +20,23 @@
#include "sv_math.h"
#include "sv_log1pf_inline.h"
-#define BigBound (0x5f800000) /* asuint(0x1p64). */
+#define BigBound 0x5f800000 /* asuint(0x1p64). */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special)
{
+ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+ y = svreinterpret_f32 (
+ svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y)));
return sv_call_f32 (asinhf, x, y, special);
}
/* Single-precision SVE asinh(x) routine. Implements the same algorithm as
vector asinhf and log1p.
- Maximum error is 2.48 ULPs:
- SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
- want 0x1.ffbbb8p-4. */
+ Maximum error is 1.92 ULPs:
+ SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2
+ want -0x1.fd0bc8p-2. */
svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
{
svfloat32_t ax = svabs_x (pg, x);
@@ -49,8 +52,6 @@ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
= sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (
- x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))),
- special);
+ return special_case (iax, sign, y, special);
return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y)));
}
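
Likewise for asinh, with ax = |x| and the sign OR-ed back at the end:

/* asinh(|x|) = ln (|x| + sqrt (x*x + 1))
              = log1p (|x| + x*x / (1 + sqrt (x*x + 1)))
   which is why special_case can rebuild x from iax and sign alone.  */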
diff --git a/sysdeps/aarch64/fpu/atanhf_sve.c b/sysdeps/aarch64/fpu/atanhf_sve.c
index dae83041ef7157f0..2d3005bbc88393ec 100644
--- a/sysdeps/aarch64/fpu/atanhf_sve.c
+++ b/sysdeps/aarch64/fpu/atanhf_sve.c
@@ -17,21 +17,25 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+#include "sv_math.h"
#include "sv_log1pf_inline.h"
#define One (0x3f800000)
#define Half (0x3f000000)
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign,
+ svfloat32_t y, svbool_t special)
{
+ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+ y = svmul_x (svptrue_b32 (), halfsign, y);
return sv_call_f32 (atanhf, x, y, special);
}
/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 2.28 ULP:
- _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
- want 0x1.ffbbb6p-5. */
+ The maximum error is 1.99 ULP:
+ _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5
+ want 0x1.f1f4f6p-5. */
svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
{
svfloat32_t ax = svabs_x (pg, x);
@@ -48,7 +52,7 @@ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
y = sv_log1pf_inline (y, pg);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmul_x (pg, halfsign, y), special);
+ return special_case (iax, sign, halfsign, y, special);
return svmul_x (pg, halfsign, y);
}
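
And for atanh, where halfsign carries both the 0.5 factor and the sign of x
(the reduction of y is computed before the hunk shown here):

/* atanh(x) = 0.5 * ln ((1 + |x|) / (1 - |x|))
            = 0.5 * log1p (2*|x| / (1 - |x|))  */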
diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
index f645cc997e430bcb..4f17c44e2d96039a 100644
--- a/sysdeps/aarch64/fpu/log1pf_sve.c
+++ b/sysdeps/aarch64/fpu/log1pf_sve.c
@@ -18,30 +18,13 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f32.h"
-
-static const struct data
-{
- float poly[8];
- float ln2, exp_bias;
- uint32_t four, three_quarters;
-} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as
- this can be fmov-ed directly instead of including it in
- the main load-and-mla polynomial schedule. */
- 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
- 0x1.abcb6p-4f, -0x1.6f0d5ep-5f},
- .ln2 = 0x1.62e43p-1f,
- .exp_bias = 0x1p-23f,
- .four = 0x40800000,
- .three_quarters = 0x3f400000};
-
-#define SignExponentMask 0xff800000
+#include "sv_log1pf_inline.h"
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t special)
{
- return sv_call_f32 (log1pf, x, y, special);
+ return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()),
+ special);
}
/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
@@ -50,51 +33,12 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
want 0x1.9f323ep-2. */
svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
{
- const struct data *d = ptr_barrier (&data);
/* x < -1, Inf/Nan. */
svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000);
special = svorn_z (pg, special, svcmpge (pg, x, -1));
- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
- svfloat32_t m = svadd_x (pg, x, 1);
-
- /* Choose k to scale x to the range [-1/4, 1/2]. */
- svint32_t k
- = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
- sv_s32 (SignExponentMask));
-
- /* Scale x by exponent manipulation. */
- svfloat32_t m_scale = svreinterpret_f32 (
- svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
-
- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
- m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25));
-
- /* Evaluate polynomial on reduced interval. */
- svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale),
- ms4 = svmul_x (pg, ms2, ms2);
- svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly);
- p = svmad_x (pg, m_scale, p, -0.5);
- p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
-
- /* The scale factor to be applied back at the end - by multiplying float(k)
- by 2^-23 we get the unbiased exponent of k. */
- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias);
-
- /* Apply the scaling back. */
- svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2);
-
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
+ return special_case (x, special);
- return y;
+ return sv_log1pf_inline (x, pg);
}
diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
index b94b2da055a6c59b..850297d61556740c 100644
--- a/sysdeps/aarch64/fpu/sv_log1pf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
@@ -22,55 +22,76 @@
#include "sv_math.h"
#include "vecmath_config.h"
-#include "poly_sve_f32.h"
+
+#define SignExponentMask 0xff800000
static const struct sv_log1pf_data
{
- float32_t poly[9];
- float32_t ln2;
- float32_t scale_back;
+ float c0, c2, c4, c6;
+ float c1, c3, c5, c7;
+ float ln2, exp_bias, quarter;
+ uint32_t four, three_quarters;
} sv_log1pf_data = {
- /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
- .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
- -0x1.6f0d5ep-5f },
- .scale_back = 0x1.0p-23f,
- .ln2 = 0x1.62e43p-1f,
+ /* Do not store first term of polynomial, which is -0.5, as
+ this can be fmov-ed directly instead of including it in
+ the main load-and-mla polynomial schedule. */
+ .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f,
+ .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f,
+ .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f,
+ .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000,
+ .three_quarters = 0x3f400000,
};
-static inline svfloat32_t
-eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg)
-{
- svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1]));
- svfloat32_t m2 = svmul_x (pg, m, m);
- svfloat32_t q = svmla_x (pg, m, m2, p_12);
- svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2);
- p = svmul_x (pg, m2, p);
-
- return svmla_x (pg, q, m2, p);
-}
-
static inline svfloat32_t
sv_log1pf_inline (svfloat32_t x, svbool_t pg)
{
const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);
- svfloat32_t m = svadd_x (pg, x, 1.0f);
-
- svint32_t ks = svsub_x (pg, svreinterpret_s32 (m),
- svreinterpret_s32 (svdup_f32 (0.75f)));
- ks = svand_x (pg, ks, 0xff800000);
- svuint32_t k = svreinterpret_u32 (ks);
- svfloat32_t s = svreinterpret_f32 (
- svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k));
-
- svfloat32_t m_scale
- = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k));
- m_scale
- = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
- svfloat32_t p = eval_poly (m_scale, d->poly, pg);
- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back);
- return svmla_x (pg, p, scale_back, d->ln2);
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
+ svfloat32_t m = svadd_x (pg, x, 1);
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
+ svint32_t k
+ = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
+ sv_s32 (SignExponentMask));
+
+ /* Scale x by exponent manipulation. */
+ svfloat32_t m_scale = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
+ svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2);
+ m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2));
+
+ /* Evaluate polynomial on reduced interval. */
+ svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale);
+
+ svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1);
+ svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0);
+ svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1);
+ svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2);
+ svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3);
+
+ svfloat32_t p = svmla_x (pg, p45, p67, ms2);
+ p = svmla_x (pg, p23, p, ms2);
+ p = svmla_x (pg, p01, p, ms2);
+
+ p = svmad_x (pg, m_scale, p, -0.5);
+ p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
+ svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1);
+ return svmla_lane_f32 (p, scale_back, fconst, 0);
}
#endif
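
The range reduction documented in the new comment can be exercised in scalar
code. A standalone sketch (hypothetical helper, not glibc code; it ignores
x <= -1, NaN/Inf and subnormal inputs, and uses libm's log1pf where the
polynomial would sit):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the reduction: write x + 1 = m * 2^k with m in [0.75, 1.5),
   so log1p(x) = log1p(m - 1) + k*ln(2), and m - 1 lies in [-0.25, 0.5],
   the interval the polynomial is fitted on.  */
static float
log1pf_reduced (float x)
{
  float m = x + 1.0f;
  uint32_t im, ix, is;
  memcpy (&im, &m, sizeof im);
  memcpy (&ix, &x, sizeof ix);
  /* k is kept as sign+exponent bits: subtracting asuint(0.75) and masking
     picks the k for which m * 2^-k lands in [0.75, 1.5).  */
  uint32_t k = (im - 0x3f400000) & 0xff800000;
  /* Scale x (not m) by 2^-k via exponent manipulation...  */
  ix -= k;
  float m_scale, s;
  memcpy (&m_scale, &ix, sizeof m_scale);
  /* ...then add back the scaled 1: s = 4 * 2^-k stays normalised, so
     m * 2^-k - 1 = x * 2^-k + (0.25 * s - 1).  */
  is = 0x40800000 - k;
  memcpy (&s, &is, sizeof s);
  m_scale += 0.25f * s - 1.0f;
  /* Multiplying float(k) by 2^-23 recovers k's unbiased exponent.  */
  float kf = (float) (int32_t) k * 0x1p-23f;
  return log1pf (m_scale) + kf * 0x1.62e43p-1f; /* ln(2).  */
}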