glibc/glibc-RHEL-118273-14.patch
Yuki Inoguchi 9dd92cac18 aarch64: Add GLIBC_2.40 vector functions and performance fixes (RHEL-118273)
This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix builderror with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273
2025-12-05 16:24:54 +01:00

commit 5bc100bd4b7e00db3009ae93d25d303341545d23
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:32:14 2024 +0100
AArch64: Improve codegen in users of AdvSIMD log1pf helper
log1pf is quite register-intensive - use fewer registers for the
polynomial, and make various changes to shorten dependency chains in
parent routines. There is now no spilling with GCC 14. Accuracy moves
around a little - comments adjusted accordingly but does not require
regen-ulps.
Use the helper in log1pf as well, instead of having separate
implementations. The more accurate polynomial means special-casing can
be simplified, and the shorter dependency chain avoids the usual dance
around v0, which is otherwise difficult.
There is a small duplication of vectors containing 1.0f (or 0x3f800000) -
GCC is not currently able to efficiently handle values which fit in FMOV
but not MOVI, and are reinterpreted to integer. There may be potential
for more optimisation if this is fixed.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Conflicts:
sysdeps/aarch64/fpu/log1pf_advsimd.c
(Fixup context to apply without out-of-scope dependency 751a5502)
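To make the FMOV/MOVI remark in the commit message concrete: 1.0f is encodable as an FMOV immediate, but its bit pattern 0x3f800000 is not a valid MOVI immediate, so the workaround is to keep one copy and reinterpret it. The following is a minimal illustrative sketch, not part of the patch, with invented function names:

#include <arm_neon.h>

/* Illustrative only: keep a single copy of 1.0f and reinterpret it,
   rather than materialising both a float vector and an integer vector
   of the same bit pattern.  Reinterpret casts emit no instructions;
   this mirrors what the patched acoshf does with d->one.  */
static inline float32x4_t
one_as_float (uint32x4_t one_bits) /* one_bits = vdupq_n_u32 (0x3f800000) */
{
  return vreinterpretq_f32_u32 (one_bits);
}

static inline uint32x4_t
one_as_bits (float32x4_t one)      /* one = vdupq_n_f32 (1.0f) */
{
  return vreinterpretq_u32_f32 (one);
}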
diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
index 8916dcbf409922a9..004474acf9e9322b 100644
--- a/sysdeps/aarch64/fpu/acoshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -25,35 +25,32 @@ const static struct data
{
struct v_log1pf_data log1pf_consts;
uint32x4_t one;
- uint16x4_t thresh;
-} data = {
- .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
- .one = V4 (0x3f800000),
- .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
-};
+} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
+
+#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
- const struct v_log1pf_data d)
+ const struct v_log1pf_data *d)
{
return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
}
/* Vector approximation for single-precision acosh, based on log1p. Maximum
error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
- is 2.78 ULP:
- __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
- want 0x1.ef9ea2p-3.
+ is 3.00 ULP:
+ _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
+ want 0x1.ef0a7cp-4.
With exceptions disabled, we can compute u with a shorter dependency chain,
- which gives maximum error of 3.07 ULP:
- __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
- want 0x1.fbc7f4p-4. */
+ which gives maximum error of 3.22 ULP:
+ _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
+ want 0x1.fdcdd2p-5. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
#if WANT_SIMD_EXCEPT
/* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
@@ -64,15 +61,16 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
#else
- float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
- float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+ float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+ float32x4_t u
+ = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
#endif
float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, y, special, d->log1pf_consts);
- return log1pf_inline (y, d->log1pf_consts);
+ return special_case (x, y, special, &d->log1pf_consts);
+ return log1pf_inline (y, &d->log1pf_consts);
}
libmvec_hidden_def (V_NAME_F1 (acosh))
HALF_WIDTH_ALIAS_F1 (acosh)
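For reference, the identity driving the non-exceptions path above can be written in scalar C. This is an illustrative sketch, not code from the patch; xm1 and u mirror the vector temporaries of the same names:

#include <math.h>

/* For x >= 1:
     acosh(x) = log(x + sqrt(x*x - 1))
              = log1p((x - 1) + sqrt((x - 1) * (x + 1))).
   Factoring x*x - 1 as (x - 1) * (x + 1) avoids the cancellation of
   computing x*x - 1 directly when x is close to 1.  */
static float
acoshf_scalar_sketch (float x)
{
  float xm1 = x - 1.0f;
  float u = xm1 * (x + 1.0f);
  return log1pf (xm1 + sqrtf (u));
}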
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
index 09fd8a614305563d..eb789b91b600af52 100644
--- a/sysdeps/aarch64/fpu/asinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
@@ -20,16 +20,16 @@
#include "v_math.h"
#include "v_log1pf_inline.h"
-#define SignMask v_u32 (0x80000000)
-
const static struct data
{
struct v_log1pf_data log1pf_consts;
+ float32x4_t one;
uint32x4_t big_bound;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound;
#endif
} data = {
+ .one = V4 (1),
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
#if WANT_SIMD_EXCEPT
@@ -38,20 +38,27 @@ const static struct data
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
+ uint32x4_t special, const struct data *d)
{
- return v_call_f32 (asinhf, x, y, special);
+ return v_call_f32 (
+ asinhf, x,
+ vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
+ special);
}
/* Single-precision implementation of vector asinh(x), using vector log1p.
- Worst-case error is 2.66 ULP, at roughly +/-0.25:
- __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
+ Worst-case error is 2.59 ULP:
+ _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
+ want 0x1.d449c4p-3. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
- uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
- float32x4_t ax = vreinterpretq_f32_u32 (iax);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
float32x4_t special_arg = x;
#if WANT_SIMD_EXCEPT
@@ -68,13 +75,13 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
/* asinh(x) = log(x + sqrt(x * x + 1)).
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
float32x4_t d
- = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
- float32x4_t y = log1pf_inline (
- vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
+ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
- return vbslq_f32 (SignMask, x, y);
+ return special_case (special_arg, sign, y, special, dat);
+ return vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
}
libmvec_hidden_def (V_NAME_F1 (asinh))
HALF_WIDTH_ALIAS_F1 (asinh)
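The patch replaces the former BSL-based sign selection with two EORs, so the sign can be re-attached after log1pf_inline without lengthening the critical path. In scalar terms the trick looks like this (sketch only, based on the formula quoted in the diff):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* asinh is odd, so the routine works on |x| and restores the sign at
   the end: sign = bits(x) ^ bits(|x|) isolates the sign bit, and
   XORing it into the result re-attaches it.  */
static float
asinhf_scalar_sketch (float x)
{
  float ax = fabsf (x);
  uint32_t ix, iax;
  memcpy (&ix, &x, sizeof ix);
  memcpy (&iax, &ax, sizeof iax);
  uint32_t sign = ix ^ iax;

  /* For positive x: asinh(x) = log1p(x + x*x / (1 + sqrt(x*x + 1))).  */
  float d = 1.0f + sqrtf (ax * ax + 1.0f);
  float y = log1pf (ax + (ax * ax) / d);

  uint32_t iy;
  memcpy (&iy, &y, sizeof iy);
  iy ^= sign;
  memcpy (&y, &iy, sizeof y);
  return y;
}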
diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c
index ae488f7b54ddce26..818b6c92adcd48bb 100644
--- a/sysdeps/aarch64/fpu/atanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c
@@ -40,15 +40,17 @@ const static struct data
#define Half v_u32 (0x3f000000)
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
+ uint32x4_t special)
{
- return v_call_f32 (atanhf, x, y, special);
+ return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
+ vmulq_f32 (halfsign, y), special);
}
/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 3.08 ULP:
- __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
- want 0x1.ffcb82p-5. */
+ The maximum error is 2.93 ULP:
+ _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
+ want 0x1.f4dcf8p-5. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -68,11 +70,19 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
uint32x4_t special = vcgeq_u32 (iax, d->one);
#endif
- float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
- y = log1pf_inline (y, d->log1pf_consts);
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
+ vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
+ y = log1pf_inline (y, &d->log1pf_consts);
+ /* If exceptions not required, pass ax to special-case for shorter dependency
+ chain. If exceptions are required ax will have been zerofied, so have to
+ pass x. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (halfsign, y), special);
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special);
+#else
+ return special_case (ax, halfsign, y, special);
+#endif
return vmulq_f32 (halfsign, y);
}
libmvec_hidden_def (V_NAME_F1 (atanh))
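As in asinh, the leading 0.5 and the sign of x are folded into a single multiplier (halfsign). A scalar sketch of the underlying identity (illustrative, not from the patch):

#include <math.h>

/* atanh(x) = 0.5 * log((1 + x) / (1 - x))
            = copysign(0.5, x) * log1p(2|x| / (1 - |x|)),
   since atanh is odd and (1 + x) / (1 - x) = 1 + 2x / (1 - x).  */
static float
atanhf_scalar_sketch (float x)
{
  float halfsign = copysignf (0.5f, x);
  float ax = fabsf (x);
  return halfsign * log1pf ((2.0f * ax) / (1.0f - ax));
}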
diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
index dc15334a8537b1fc..f2d47962fe13fbdd 100644
--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
@@ -18,113 +18,78 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_log1pf_inline.h"
+
+#if WANT_SIMD_EXCEPT
const static struct data
{
- float32x4_t poly[8], ln2;
- uint32x4_t tiny_bound, minus_one, four, thresh;
- int32x4_t three_quarters;
+ uint32x4_t minus_one, thresh;
+ struct v_log1pf_data d;
} data = {
- .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
- (1, -0.5) are not stored as they can be generated more
- efficiently. */
- V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
- .ln2 = V4 (0x1.62e43p-1f),
- .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
- .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */
+ .d = V_LOG1PF_CONSTANTS_TABLE,
+ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
.minus_one = V4 (0xbf800000),
- .four = V4 (0x40800000),
- .three_quarters = V4 (0x3f400000)
};
-static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *p)
-{
- /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */
- float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
- float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
- float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
- float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
-
- float32x4_t m2 = vmulq_f32 (m, m);
- float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
- float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
- float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
-
- float32x4_t m4 = vmulq_f32 (m2, m2);
- float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
- return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
-}
+/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
+# define TinyBound v_u32 (0x34000000)
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
{
- return v_call_f32 (log1pf, x, y, special);
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
+ return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
}
-/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
- is roughly 2.02 ULP:
- log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.69 ULP:
+ _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
+ want 0x1.cfcbdcp-3. */
VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
uint32x4_t special_cases
- = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
+ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
vcgeq_u32 (ix, d->minus_one));
- float32x4_t special_arg = x;
-#if WANT_SIMD_EXCEPT
if (__glibc_unlikely (v_any_u32 (special_cases)))
- /* Side-step special lanes so fenv exceptions are not triggered
- inadvertently. */
- x = v_zerofy_f32 (x, special_cases);
-#endif
+ return special_case (x, special_cases, d);
- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
+ return log1pf_inline (x, &d->d);
+}
- float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+#else
- /* Choose k to scale x to the range [-1/4, 1/2]. */
- int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
- v_s32 (0xff800000));
- uint32x4_t ku = vreinterpretq_u32_s32 (k);
+const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
- /* Scale x by exponent manipulation. */
- float32x4_t m_scale
- = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp)
+{
+ return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
+}
- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
- m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.63 ULP:
+ _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
+ want 0x1.fdcb16p-3. */
+VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
+{
+ uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
+ vcaleq_f32 (x, v_f32 (0x1p127f)));
- /* Evaluate polynomial on the reduced interval. */
- float32x4_t p = eval_poly (m_scale, d->poly);
+ if (__glibc_unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases);
- /* The scale factor to be applied back at the end - by multiplying float(k)
- by 2^-23 we get the unbiased exponent of k. */
- float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
+ return log1pf_inline (x, ptr_barrier (&data));
+}
- /* Apply the scaling back. */
- float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
+#endif
- if (__glibc_unlikely (v_any_u32 (special_cases)))
- return special_case (special_arg, y, special_cases);
- return y;
-}
libmvec_hidden_def (V_NAME_F1 (log1p))
HALF_WIDTH_ALIAS_F1 (log1p)
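In the rewritten no-exceptions path, the special-case test is a single predicate pair: vornq_u32 (a, b) computes a | ~b, so the expression above flags lanes where x <= -1 or where |x| <= 0x1p127 does not hold (huge values, infinities, NaNs). A scalar equivalent of the test (sketch only):

#include <math.h>

/* Mirrors vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
                      vcaleq_f32 (x, v_f32 (0x1p127f))).
   Note that !(|x| <= 0x1p127) is also true for NaN, so NaN lanes fall
   through to the scalar log1pf fallback.  */
static int
log1pf_is_special_sketch (float x)
{
  return x <= -1.0f || !(fabsf (x) <= 0x1p127f);
}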
diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h
index 643a6cdcfc498970..73e45a942e24a26f 100644
--- a/sysdeps/aarch64/fpu/v_log1pf_inline.h
+++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h
@@ -25,54 +25,81 @@
struct v_log1pf_data
{
- float32x4_t poly[8], ln2;
uint32x4_t four;
int32x4_t three_quarters;
+ float c0, c3, c5, c7;
+ float32x4_t c4, c6, c1, c2, ln2;
};
/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
(1, -0.5) are not stored as they can be generated more efficiently. */
#define V_LOG1PF_CONSTANTS_TABLE \
{ \
- .poly \
- = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
- .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
- .three_quarters = V4 (0x3f400000) \
+ .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
+ .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
+ .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
+ .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
}
static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *c)
+eval_poly (float32x4_t m, const struct v_log1pf_data *d)
{
- /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
- uses split Estrin, but this way reduces register pressure in the calling
- routine). */
- float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float32x4_t c0357 = vld1q_f32 (&d->c0);
+ float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
float32x4_t m2 = vmulq_f32 (m, m);
- q = vfmaq_f32 (m, m2, q);
- float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
+ float32x4_t p = vfmaq_f32 (p45, m2, p67);
+ p = vfmaq_f32 (p23, m2, p);
+ p = vfmaq_f32 (d->c1, m, p);
p = vmulq_f32 (m2, p);
- return vfmaq_f32 (q, m2, p);
+ p = vfmaq_f32 (m, m2, p);
+ return vfmaq_f32 (p, m2, q);
}
static inline float32x4_t
-log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
+log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
{
- /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
- special-case handling. See that file for details of the algorithm. */
+ /* Helper for calculating log(x + 1). */
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
v_s32 (0xff800000));
uint32x4_t ku = vreinterpretq_u32_s32 (k);
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+
+ /* Scale x by exponent manipulation. */
float32x4_t m_scale
= vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
- float32x4_t p = eval_poly (m_scale, d.poly);
+
+ /* Evaluate polynomial on the reduced interval. */
+ float32x4_t p = eval_poly (m_scale, d);
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
- return vfmaq_f32 (p, scale_back, d.ln2);
+
+ /* Apply the scaling back. */
+ return vfmaq_f32 (p, scale_back, d->ln2);
}
#endif
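Finally, the register savings in the new eval_poly come from the coefficient layout: c0, c3, c5 and c7 are stored as four adjacent scalars, so a single vld1q_f32 brings all of them into one vector register and vfmaq_laneq_f32 multiplies by one lane at a time. A minimal self-contained sketch of that pattern (struct and function names here are illustrative, not the patch's):

#include <arm_neon.h>

/* Four contiguous scalar coefficients load as one vector, instead of
   occupying four registers of broadcast constants.  */
struct packed_coeffs
{
  float c0, c3, c5, c7;
};

static inline float32x4_t
fma_by_lane_sketch (float32x4_t acc, float32x4_t m,
                    const struct packed_coeffs *d)
{
  float32x4_t c0357 = vld1q_f32 (&d->c0);
  acc = vfmaq_laneq_f32 (acc, m, c0357, 0); /* acc += m * c0.  */
  acc = vfmaq_laneq_f32 (acc, m, c0357, 1); /* acc += m * c3.  */
  return acc;
}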