glibc/glibc-RHEL-118273-44.patch
Yuki Inoguchi 9dd92cac18 aarch64: Add GLIBC_2.40 vector functions and performance fixes (RHEL-118273)
This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix builderror with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273

commit 6c22823da57aa5218f717f569c04c9573c0448c5
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu Nov 6 18:26:54 2025 +0000
    AArch64: Fix instability in AdvSIMD tan

    Previously presence of special-cases in one lane could affect the
    results in other lanes due to unconditional scalar fallback. The old
    WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
    been removed from AOR, making it easier to spot and fix this. 4%
    improvement in throughput with GCC 14 on Neoverse V1. This bug is
    present as far back as 2.39 (where tan was first introduced).

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
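
As an illustration of the lane-masking idea described above, here is a
minimal sketch, not the glibc implementation: the helper name
lane_masked_fallback is hypothetical, while the real helper used by
special_case is v_call_f64 from sysdeps/aarch64/fpu/v_math.h. After this
patch, special_case receives the vector quotient already computed on the
fast path plus the per-lane special mask, so only flagged lanes are redone
with the scalar routine.

#include <arm_neon.h>
#include <math.h>

/* Sketch of a lane-masked scalar fallback: keep the vector result y for
   ordinary lanes and recompute only the lanes flagged in 'special' with
   scalar tan.  This mirrors the semantics of
   v_call_f64 (tan, x, vdivq_f64 (n, d), special) used after this change;
   it is not the actual glibc code.  */
static float64x2_t
lane_masked_fallback (float64x2_t x, float64x2_t y, uint64x2_t special)
{
  if (vgetq_lane_u64 (special, 0))
    y = vsetq_lane_f64 (tan (vgetq_lane_f64 (x, 0)), y, 0);
  if (vgetq_lane_u64 (special, 1))
    y = vsetq_lane_f64 (tan (vgetq_lane_f64 (x, 1)), y, 1);
  return y;
}

Before this change the fallback was v_call_f64 (tan, x, x, v_u64 (-1)),
i.e. every lane was redone by the scalar routine whenever any lane was
special, which is the cross-lane instability the commit message describes.
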
diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index d56a102dd17a3463..c6a5a17126674d7d 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -25,9 +25,7 @@ static const struct data
   float64x2_t poly[9];
   double half_pi[2];
   float64x2_t two_over_pi, shift;
-#if !WANT_SIMD_EXCEPT
   float64x2_t range_val;
-#endif
 } data = {
   /* Coefficients generated using FPMinimax. */
   .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
@@ -38,20 +36,17 @@ static const struct data
   .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
   .two_over_pi = V2 (0x1.45f306dc9c883p-1),
   .shift = V2 (0x1.8p52),
-#if !WANT_SIMD_EXCEPT
   .range_val = V2 (0x1p23),
-#endif
 };
 
 #define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */
 #define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */
-#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */
 
 /* Special cases (fall back to scalar calls). */
 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t n, float64x2_t d, uint64x2_t special)
 {
-  return v_call_f64 (tan, x, x, v_u64 (-1));
+  return v_call_f64 (tan, x, vdivq_f64 (n, d), special);
 }
 
 /* Vector approximation for double-precision tan.
@@ -65,14 +60,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
      very large inputs. Fall back to scalar routine for all lanes if any are
      too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
      tiny input to avoid underflow. */
-#if WANT_SIMD_EXCEPT
-  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
-  /* iax - tiny_bound > range_val - tiny_bound. */
-  uint64x2_t special
-      = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
-  if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
-#endif
 
   /* q = nearest integer to 2 * x / pi. */
   float64x2_t q
@@ -81,9 +68,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 
   /* Use q to reduce x to r in [-pi/4, pi/4], by:
      r = x - q * pi/2, in extended precision. */
-  float64x2_t r = x;
   float64x2_t half_pi = vld1q_f64 (dat->half_pi);
-  r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+  float64x2_t r = vfmsq_laneq_f64 (x, q, half_pi, 0);
   r = vfmsq_laneq_f64 (r, q, half_pi, 1);
   /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
      formula. */
@@ -114,12 +100,13 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 
   uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
 
-#if !WANT_SIMD_EXCEPT
   uint64x2_t special = vcageq_f64 (x, dat->range_val);
+  float64x2_t swap = vbslq_f64 (no_recip, n, vnegq_f64 (d));
+  d = vbslq_f64 (no_recip, d, n);
+  n = swap;
+
   if (__glibc_unlikely (v_any_u64 (special)))
-    return special_case (x);
-#endif
+    return special_case (x, n, d, special);
 
-  return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
-                    vbslq_f64 (no_recip, d, n));
+  return vdivq_f64 (n, d);
 }