commit 90a6ca8b28bf34e361e577e526e1b0f4c39a32a5 Author: Joe Ramsay Date: Thu May 2 16:43:13 2024 +0100 aarch64: Fix AdvSIMD libmvec routines for big-endian Previously many routines used * to load from vector types stored in the data table. This is emitted as ldr, which byte-swaps the entire vector register, and causes bugs for big-endian when not all lanes contain the same value. When a vector is to be used this way, it has been replaced with an array and the load with an explicit ld1 intrinsic, which byte-swaps only within lanes. As well, many routines previously used non-standard GCC syntax for vector operations such as indexing into vector types with [] and assembling vectors using {}. This syntax should not be mixed with ACLE, as the former does not respect endianness whereas the latter does. Such examples have been replaced with, for instance, vcombine_* and vgetq_lane* intrinsics. Helpers which only use the GCC syntax, such as the v_call helpers, do not need changing as they do not use intrinsics. 
Reviewed-by: Szabolcs Nagy Conflicts: sysdeps/aarch64/fpu/exp10f_advsimd.c sysdeps/aarch64/fpu/expm1_advsimd.c sysdeps/aarch64/fpu/expm1f_advsimd.c sysdeps/aarch64/fpu/log10_advsimd.c sysdeps/aarch64/fpu/log2_advsimd.c sysdeps/aarch64/fpu/log_advsimd.c sysdeps/aarch64/fpu/tan_advsimd.c sysdeps/aarch64/fpu/tanf_advsimd.c (Already backported by glibc-upstream-2.39-151.patch) diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c index 544a52f6515d3201..6207e7da9531f48d 100644 --- a/sysdeps/aarch64/fpu/asinh_advsimd.c +++ b/sysdeps/aarch64/fpu/asinh_advsimd.c @@ -22,6 +22,7 @@ #define A(i) v_f64 (__v_log_data.poly[i]) #define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) const static struct data { @@ -63,11 +64,15 @@ struct entry static inline struct entry lookup (uint64x2_t i) { - float64x2_t e0 = vld1q_f64 ( - &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc); - float64x2_t e1 = vld1q_f64 ( - &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc); - return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) }; + /* Since N is a power of 2, n % N = n & (N - 1). 
*/ + struct entry e; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; } static inline float64x2_t diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c index ec7b59637e973da9..4bee734f00bd6a9b 100644 --- a/sysdeps/aarch64/fpu/cosh_advsimd.c +++ b/sysdeps/aarch64/fpu/cosh_advsimd.c @@ -22,7 +22,9 @@ static const struct data { float64x2_t poly[3]; - float64x2_t inv_ln2, ln2, shift, thres; + float64x2_t inv_ln2; + double ln2[2]; + float64x2_t shift, thres; uint64x2_t index_mask, special_bound; } data = { .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3), @@ -58,8 +60,9 @@ exp_inline (float64x2_t x) float64x2_t n = vsubq_f64 (z, d->shift); /* r = x - n*ln2/N. 
*/ - float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0); - r = vfmaq_laneq_f64 (r, n, d->ln2, 1); + float64x2_t ln2 = vld1q_f64 (d->ln2); + float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0); + r = vfmaq_laneq_f64 (r, n, ln2, 1); uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); uint64x2_t i = vandq_u64 (u, d->index_mask); diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c index 3e70cbc025248a05..19cbb7d0f42eb4e2 100644 --- a/sysdeps/aarch64/fpu/erf_advsimd.c +++ b/sysdeps/aarch64/fpu/erf_advsimd.c @@ -56,8 +56,8 @@ static inline struct entry lookup (uint64x2_t i) { struct entry e; - float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])), - e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1])); + float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf), + e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf); e.erf = vuzp1q_f64 (e1, e2); e.scale = vuzp2q_f64 (e1, e2); return e; diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c index 548f21a3d68d68d2..f1b3bfe8304c73b5 100644 --- a/sysdeps/aarch64/fpu/erfc_advsimd.c +++ b/sysdeps/aarch64/fpu/erfc_advsimd.c @@ -26,7 +26,7 @@ static const struct data float64x2_t max, shift; float64x2_t p20, p40, p41, p42; float64x2_t p51, p52; - float64x2_t qr5, qr6, qr7, qr8, qr9; + double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2]; #if WANT_SIMD_EXCEPT float64x2_t uflow_bound; #endif @@ -68,8 +68,10 @@ static inline struct entry lookup (uint64x2_t i) { struct entry e; - float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])), - e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1])); + float64x2_t e1 + = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); + float64x2_t e2 + = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); e.erfc = vuzp1q_f64 (e1, e2); e.scale = vuzp2q_f64 (e1, e2); return e; @@ -161,16 +163,19 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x) p5 = vmulq_f64 (r, 
vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5)); /* Compute p_i using recurrence relation: p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ - float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0)); - p6 = vmulq_laneq_f64 (p6, dat->qr5, 1); - float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0)); - p7 = vmulq_laneq_f64 (p7, dat->qr6, 1); - float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0)); - p8 = vmulq_laneq_f64 (p8, dat->qr7, 1); - float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0)); - p9 = vmulq_laneq_f64 (p9, dat->qr8, 1); - float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0)); - p10 = vmulq_laneq_f64 (p10, dat->qr9, 1); + float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6), + qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8), + qr9 = vld1q_f64 (dat->qr9); + float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0)); + p6 = vmulq_laneq_f64 (p6, qr5, 1); + float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0)); + p7 = vmulq_laneq_f64 (p7, qr6, 1); + float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0)); + p8 = vmulq_laneq_f64 (p8, qr7, 1); + float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0)); + p9 = vmulq_laneq_f64 (p9, qr8, 1); + float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0)); + p10 = vmulq_laneq_f64 (p10, qr9, 1); /* Compute polynomial in d using pairwise Horner scheme. 
*/ float64x2_t p90 = vfmaq_f64 (p9, d, p10); float64x2_t p78 = vfmaq_f64 (p7, d, p8); diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c index 30b9e48dd40d80a0..ca5bc3ab33c92f83 100644 --- a/sysdeps/aarch64/fpu/erfcf_advsimd.c +++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c @@ -23,7 +23,8 @@ static const struct data { uint32x4_t offset, table_scale; float32x4_t max, shift; - float32x4_t coeffs, third, two_over_five, tenth; + float coeffs[4]; + float32x4_t third, two_over_five, tenth; #if WANT_SIMD_EXCEPT float32x4_t uflow_bound; #endif @@ -37,7 +38,7 @@ static const struct data .shift = V4 (0x1p17f), /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and fmas. */ - .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, + .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, .third = V4 (0x1.555556p-2f), .two_over_five = V4 (-0x1.99999ap-2f), .tenth = V4 (-0x1.99999ap-4f), @@ -60,12 +61,16 @@ static inline struct entry lookup (uint32x4_t i) { struct entry e; - float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0])); - float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1])); - float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2])); - float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3])); - float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 }); - float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 }); + float32x2_t t0 + = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); + float32x2_t t1 + = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); + float32x2_t t2 + = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); + float32x2_t t3 + = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); e.erfc = vuzp1q_f32 (e1, e2); e.scale = vuzp2q_f32 (e1, e2); return e; @@ -140,10 +145,11 @@ float32x4_t 
NOINLINE V_NAME_F1 (erfc) (float32x4_t x) float32x4_t r2 = vmulq_f32 (r, r); float32x4_t p1 = r; - float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1); + float32x4_t coeffs = vld1q_f32 (dat->coeffs); + float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1); float32x4_t p3 - = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0)); - float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2); + = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0)); + float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2); p4 = vfmsq_f32 (dat->tenth, r2, p4); float32x4_t y = vfmaq_f32 (p3, d, p4); diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c index c44644a71cffbb62..f2fe6ff236a6ec07 100644 --- a/sysdeps/aarch64/fpu/erff_advsimd.c +++ b/sysdeps/aarch64/fpu/erff_advsimd.c @@ -47,12 +47,12 @@ static inline struct entry lookup (uint32x4_t i) { struct entry e; - float64_t t0 = *((float64_t *) (__erff_data.tab + i[0])); - float64_t t1 = *((float64_t *) (__erff_data.tab + i[1])); - float64_t t2 = *((float64_t *) (__erff_data.tab + i[2])); - float64_t t3 = *((float64_t *) (__erff_data.tab + i[3])); - float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 }); - float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 }); + float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf); + float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf); + float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf); + float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); e.erf = vuzp1q_f32 (e1, e2); e.scale = vuzp2q_f32 (e1, e2); return e; diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c index fa3723b10c15eb29..3e3b76c502b01e16 100644 --- a/sysdeps/aarch64/fpu/sinh_advsimd.c +++ 
b/sysdeps/aarch64/fpu/sinh_advsimd.c @@ -22,8 +22,9 @@ static const struct data { - float64x2_t poly[11]; - float64x2_t inv_ln2, m_ln2, shift; + float64x2_t poly[11], inv_ln2; + double m_ln2[2]; + float64x2_t shift; uint64x2_t halff; int64x2_t onef; #if WANT_SIMD_EXCEPT @@ -40,7 +41,7 @@ static const struct data V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, .inv_ln2 = V2 (0x1.71547652b82fep0), - .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56}, + .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56}, .shift = V2 (0x1.8p52), .halff = V2 (0x3fe0000000000000), @@ -67,8 +68,10 @@ expm1_inline (float64x2_t x) and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); int64x2_t i = vcvtq_s64_f64 (j); - float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0); - f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1); + + float64x2_t m_ln2 = vld1q_f64 (d->m_ln2); + float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0); + f = vfmaq_laneq_f64 (f, j, m_ln2, 1); /* Approximate expm1(f) using polynomial. */ float64x2_t f2 = vmulq_f64 (f, f); float64x2_t f4 = vmulq_f64 (f2, f2); diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h index a3b0e32f9eb42021..08b06e0a6b34b4f4 100644 --- a/sysdeps/aarch64/fpu/v_expf_inline.h +++ b/sysdeps/aarch64/fpu/v_expf_inline.h @@ -25,7 +25,8 @@ struct v_expf_data { float32x4_t poly[5]; - float32x4_t shift, invln2_and_ln2; + float32x4_t shift; + float invln2_and_ln2[4]; }; /* maxerr: 1.45358 +0.5 ulp. */ @@ -50,10 +51,11 @@ v_expf_inline (float32x4_t x, const struct v_expf_data *d) /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ float32x4_t n, r, z; - z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0); + float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); + z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0); n = vsubq_f32 (z, d->shift); - r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1); - r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2); + r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1); + r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2); uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h index 337ccfbfab555c97..59b552da6b74785e 100644 --- a/sysdeps/aarch64/fpu/v_expm1f_inline.h +++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h @@ -26,7 +26,8 @@ struct v_expm1f_data { float32x4_t poly[5]; - float32x4_t invln2_and_ln2, shift; + float invln2_and_ln2[4]; + float32x4_t shift; int32x4_t exponent_bias; }; @@ -49,11 +50,12 @@ expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) calling routine should handle special values if required. */ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ - float32x4_t j = vsubq_f32 ( - vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift); + float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); + float32x4_t j + = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift); int32x4_t i = vcvtq_s32_f32 (j); - float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1); - f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2); + float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1); + f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2); /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses