aarch64: Add GLIBC_2.40 vector functions and performance fixes (RHEL-118273)

This combines the following upstream commits:

e45af510bc AArch64: Fix instability in AdvSIMD sinh
6c22823da5 AArch64: Fix instability in AdvSIMD tan
aebaeb2c33 AArch64: Update math-vector-fortran.h
e20ca759af AArch64: add optimised strspn/strcspn
aac077645a AArch64: Fix SVE powf routine [BZ #33299]
1e3d1ddf97 AArch64: Optimize SVE exp functions
dee22d2a81 AArch64: Optimise SVE FP64 Hyperbolics
6849c5b791 AArch64: Improve codegen SVE log1p helper
09795c5612 AArch64: Fix build error with GCC 12.1/12.2
aa18367c11 AArch64: Improve enabling of SVE for libmvec
691edbdf77 aarch64: fix unwinding in longjmp
4352e2cc93 aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
cf56eb28fa AArch64: Optimize algorithm in users of SVE expf helper
ce2f26a22e AArch64: Remove PTR_ARG/SIZE_ARG defines
8f0e7fe61e Aarch64: Improve codegen in SVE asinh
c0ff447edf Aarch64: Improve codegen in SVE exp and users, and update expf_inline
f5ff34cb3c AArch64: Improve codegen for SVE erfcf
0b195651db AArch64: Improve codegen for SVE pow
95e807209b AArch64: Improve codegen for SVE powf
d3f2b71ef1 aarch64: Fix tests not compatible with targets supporting GCS
f86b4cf875 AArch64: Improve codegen in SVE expm1f and users
140b985e5a AArch64: Improve codegen in AdvSIMD asinh
91c1fadba3 AArch64: Improve codegen for SVE log1pf users
cff9648d0b AArch64: Improve codegen of AdvSIMD expf family
569cfaaf49 AArch64: Improve codegen in AdvSIMD pow
ca0c0d0f26 AArch64: Improve codegen in users of ADVSIMD log1p helper
13a7ef5999 AArch64: Improve codegen in users of ADVSIMD expm1 helper
2d82d781a5 AArch64: Remove SVE erf and erfc tables
1cf29fbc5b AArch64: Small optimisation in AdvSIMD erf and erfc
7b8c134b54 AArch64: Improve codegen in SVE expf & related routines
a15b1394b5 AArch64: Improve codegen in SVE F32 logs
5bc100bd4b AArch64: Improve codegen in users of AdvSIMD log1pf helper
7900ac490d AArch64: Improve codegen in users of ADVSIMD expm1f helper
0fed0b250f aarch64/fpu: Add vector variants of pow
75207bde68 aarch64/fpu: Add vector variants of cbrt
157f89fa3d aarch64/fpu: Add vector variants of hypot
90a6ca8b28 aarch64: Fix AdvSIMD libmvec routines for big-endian
87cb1dfcd6 aarch64/fpu: Add vector variants of erfc
3d3a4fb8e4 aarch64/fpu: Add vector variants of tanh
eedbbca0bf aarch64/fpu: Add vector variants of sinh
8b67920528 aarch64/fpu: Add vector variants of atanh
81406ea3c5 aarch64/fpu: Add vector variants of asinh
b09fee1d21 aarch64/fpu: Add vector variants of acosh
bdb5705b7b aarch64/fpu: Add vector variants of cosh
cb5d84f1f8 aarch64/fpu: Add vector variants of erf

Resolves: RHEL-118273
Author: Yuki Inoguchi
Date: 2025-11-12 13:08:45 -05:00
Committed-by: Florian Weimer
Parent: 7361fbbfab
Commit: 9dd92cac18

45 changed files with 29319 additions and 0 deletions
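
The new symbols can be reached through compiler auto-vectorization (GCC maps scalar math calls to the _ZGV* names via the declarations in bits/math-vector.h when vectorizing with fast-math semantics), or by calling the vector-PCS entry points directly. Below is a minimal sketch of a direct call; the file name and test values are illustrative, not part of this change, and it assumes GCC on aarch64 linking against a glibc that provides the GLIBC_2.40 libmvec symbols (gcc -O2 vec_hypot_demo.c -lmvec -lm):

/* vec_hypot_demo.c - hypothetical demo, not part of the patch set.  */
#include <arm_neon.h>
#include <stdio.h>

/* Vector-PCS prototype of the AdvSIMD double-precision hypot added here.  */
__attribute__ ((aarch64_vector_pcs))
float64x2_t _ZGVnN2vv_hypot (float64x2_t, float64x2_t);

int
main (void)
{
  float64x2_t x = { 3.0, 5.0 };
  float64x2_t y = { 4.0, 12.0 };
  float64x2_t r = _ZGVnN2vv_hypot (x, y);
  printf ("%g %g\n", r[0], r[1]); /* Expect 5 and 13.  */
  return 0;
}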

glibc-RHEL-118273-1.patch (new file, 4742 lines; diff suppressed because it is too large)

glibc-RHEL-118273-10.patch (new file, 514 lines)

commit 157f89fa3d616729c8d7797168a9b3eaaa6ebf6e
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Apr 30 13:49:58 2024 +0100
aarch64/fpu: Add vector variants of hypot
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index e8af35099d7b9f8f..06657782a1ee7106 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -13,6 +13,7 @@ libmvec-supported-funcs = acos \
exp10 \
exp2 \
expm1 \
+ hypot \
log \
log10 \
log1p \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 3cb1b82bd2785a4b..aedae9457b148983 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -109,6 +109,11 @@ libmvec {
_ZGVnN4v_erfcf;
_ZGVsMxv_erfc;
_ZGVsMxv_erfcf;
+ _ZGVnN4vv_hypotf;
+ _ZGVnN2vv_hypotf;
+ _ZGVnN2vv_hypot;
+ _ZGVsMxvv_hypotf;
+ _ZGVsMxvv_hypot;
_ZGVnN2v_sinh;
_ZGVnN2v_sinhf;
_ZGVnN4v_sinhf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 383c4369729a3452..a8889a92fd041585 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -31,6 +31,7 @@ libmvec_hidden_proto (V_NAME_F1(exp10));
libmvec_hidden_proto (V_NAME_F1(exp2));
libmvec_hidden_proto (V_NAME_F1(exp));
libmvec_hidden_proto (V_NAME_F1(expm1));
+libmvec_hidden_proto (V_NAME_F2(hypot));
libmvec_hidden_proto (V_NAME_F1(log10));
libmvec_hidden_proto (V_NAME_F1(log1p));
libmvec_hidden_proto (V_NAME_F1(log2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index e29b2d1c09273969..ca3017733959702f 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -89,6 +89,10 @@
# define __DECL_SIMD_expm1 __DECL_SIMD_aarch64
# undef __DECL_SIMD_expm1f
# define __DECL_SIMD_expm1f __DECL_SIMD_aarch64
+# undef __DECL_SIMD_hypot
+# define __DECL_SIMD_hypot __DECL_SIMD_aarch64
+# undef __DECL_SIMD_hypotf
+# define __DECL_SIMD_hypotf __DECL_SIMD_aarch64
# undef __DECL_SIMD_log
# define __DECL_SIMD_log __DECL_SIMD_aarch64
# undef __DECL_SIMD_logf
@@ -162,6 +166,7 @@ __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
@@ -186,6 +191,7 @@ __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
@@ -215,6 +221,7 @@ __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxvv_hypotf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
@@ -239,6 +246,7 @@ __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxvv_hypot (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/hypot_advsimd.c b/sysdeps/aarch64/fpu/hypot_advsimd.c
new file mode 100644
index 0000000000000000..e4e279fa0c362336
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypot_advsimd.c
@@ -0,0 +1,97 @@
+/* Double-precision vector (Advanced SIMD) hypot function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+ uint64x2_t tiny_bound, thres;
+} data = {
+ .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */
+ .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
+};
+#else
+static const struct data
+{
+ uint64x2_t tiny_bound;
+ uint32x4_t thres;
+} data = {
+ .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */
+ .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
+};
+#endif
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum,
+ uint32x2_t special)
+{
+ return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special));
+}
+
+/* Vector implementation of double-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222)
+ got 0x1.6a1b19400964ep-204
+ want 0x1.6a1b19400964dp-204. */
+#if WANT_SIMD_EXCEPT
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ float64x2_t ay = vabsq_f64 (y);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (ax);
+ uint64x2_t iy = vreinterpretq_u64_f64 (ay);
+
+ /* Extreme values, NaNs, and infinities should be handled by the scalar
+ fallback for correct flag handling. */
+ uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres);
+ uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres);
+ ax = v_zerofy_f64 (ax, specialx);
+ ay = v_zerofy_f64 (ay, specialy);
+ uint32x2_t special = vaddhn_u64 (specialx, specialy);
+
+ float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay);
+
+ if (__glibc_unlikely (v_any_u32h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f64 (sqsum);
+}
+#else
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y);
+
+ uint32x2_t special = vcge_u32 (
+ vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
+ vget_low_u32 (d->thres));
+
+ if (__glibc_unlikely (v_any_u32h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f64 (sqsum);
+}
+#endif
diff --git a/sysdeps/aarch64/fpu/hypot_sve.c b/sysdeps/aarch64/fpu/hypot_sve.c
new file mode 100644
index 0000000000000000..74417040acb2f32f
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypot_sve.c
@@ -0,0 +1,54 @@
+/* Double-precision vector (SVE) hypot function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+static const struct data
+{
+ uint64_t tiny_bound, thres;
+} data = {
+ .tiny_bound = 0x0c80000000000000, /* asuint (0x1p-102). */
+ .thres = 0x7300000000000000, /* asuint (inf) - tiny_bound. */
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t sqsum, svfloat64_t x, svfloat64_t y, svbool_t pg,
+ svbool_t special)
+{
+ return sv_call2_f64 (hypot, x, y, svsqrt_x (pg, sqsum), special);
+}
+
+/* SVE implementation of double-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVsMxvv_hypot (-0x1.6a22d0412cdd3p+352, 0x1.d3d89bd66fb1ap+330)
+ got 0x1.6a22d0412cfp+352
+ want 0x1.6a22d0412cf01p+352. */
+svfloat64_t SV_NAME_D2 (hypot) (svfloat64_t x, svfloat64_t y, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y);
+
+ svbool_t special = svcmpge (
+ pg, svsub_x (pg, svreinterpret_u64 (sqsum), d->tiny_bound), d->thres);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (sqsum, x, y, pg, special);
+ return svsqrt_x (pg, sqsum);
+}
diff --git a/sysdeps/aarch64/fpu/hypotf_advsimd.c b/sysdeps/aarch64/fpu/hypotf_advsimd.c
new file mode 100644
index 0000000000000000..34818b021abce1b7
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypotf_advsimd.c
@@ -0,0 +1,98 @@
+/* Single-precision vector (Advanced SIMD) hypot function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+ uint32x4_t tiny_bound, thres;
+} data = {
+ .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */
+ .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
+};
+#else
+static const struct data
+{
+ uint32x4_t tiny_bound;
+ uint16x8_t thres;
+} data = {
+ .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */
+ .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
+};
+#endif
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
+ uint16x4_t special)
+{
+ return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special));
+}
+
+/* Vector implementation of single-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13
+ want 0x1.6a41dp-13. */
+#if WANT_SIMD_EXCEPT
+
+float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ay = vabsq_f32 (y);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (ax);
+ uint32x4_t iy = vreinterpretq_u32_f32 (ay);
+
+ /* Extreme values, NaNs, and infinities should be handled by the scalar
+ fallback for correct flag handling. */
+ uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres);
+ uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres);
+ ax = v_zerofy_f32 (ax, specialx);
+ ay = v_zerofy_f32 (ay, specialy);
+ uint16x4_t special = vaddhn_u32 (specialx, specialy);
+
+ float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay);
+
+ if (__glibc_unlikely (v_any_u16h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f32 (sqsum);
+}
+#else
+
+float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);
+
+ uint16x4_t special = vcge_u16 (
+ vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
+ vget_low_u16 (d->thres));
+
+ if (__glibc_unlikely (v_any_u16h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f32 (sqsum);
+}
+#endif
+libmvec_hidden_def (V_NAME_F2 (hypot))
+HALF_WIDTH_ALIAS_F2(hypot)
diff --git a/sysdeps/aarch64/fpu/hypotf_sve.c b/sysdeps/aarch64/fpu/hypotf_sve.c
new file mode 100644
index 0000000000000000..3a403de66eb091f4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/hypotf_sve.c
@@ -0,0 +1,48 @@
+/* Single-precision vector (SVE) hypot function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+#define TinyBound 0x0c800000 /* asuint (0x1p-102). */
+#define Thres 0x73000000 /* 0x7f800000 - TinyBound. */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t sqsum, svfloat32_t x, svfloat32_t y, svbool_t pg,
+ svbool_t special)
+{
+ return sv_call2_f32 (hypotf, x, y, svsqrt_x (pg, sqsum), special);
+}
+
+/* SVE implementation of single-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVsMxvv_hypotf (0x1.6a213cp-19, -0x1.32b982p-26) got 0x1.6a2346p-19
+ want 0x1.6a2344p-19. */
+svfloat32_t SV_NAME_F2 (hypot) (svfloat32_t x, svfloat32_t y,
+ const svbool_t pg)
+{
+ svfloat32_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y);
+
+ svbool_t special = svcmpge (
+ pg, svsub_x (pg, svreinterpret_u32 (sqsum), TinyBound), Thres);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (sqsum, x, y, pg, special);
+
+ return svsqrt_x (pg, sqsum);
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index f2d8714075ab99b8..417125be476cd75f 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -38,6 +38,7 @@ VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1)
+VPCS_VECTOR_WRAPPER_ff (hypot_advsimd, _ZGVnN2vv_hypot)
VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 37873d5e432ae9e8..31ebf18705f68856 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -57,6 +57,7 @@ SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1)
+SVE_VECTOR_WRAPPER_ff (hypot_sve, _ZGVsMxvv_hypot)
SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 08e33115b9dc6f5e..dab0f1cfcb79a305 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -38,6 +38,7 @@ VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f)
+VPCS_VECTOR_WRAPPER_ff (hypotf_advsimd, _ZGVnN4vv_hypotf)
VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 025daa662efd6f7f..2aa6cbcc28d69cf8 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -57,6 +57,7 @@ SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f)
+SVE_VECTOR_WRAPPER_ff (hypotf_sve, _ZGVsMxvv_hypotf)
SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 055da83d639a2430..17723d0c9e2dfcf5 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1174,10 +1174,18 @@ double: 1
float: 1
ldouble: 1
+Function: "hypot_advsimd":
+double: 1
+float: 1
+
Function: "hypot_downward":
double: 1
ldouble: 1
+Function: "hypot_sve":
+double: 1
+float: 1
+
Function: "hypot_towardzero":
double: 1
ldouble: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 26c3fbf18b2f12a9..1184374efd25cfa6 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -89,6 +89,8 @@ GLIBC_2.40 _ZGVnN2v_sinh F
GLIBC_2.40 _ZGVnN2v_sinhf F
GLIBC_2.40 _ZGVnN2v_tanh F
GLIBC_2.40 _ZGVnN2v_tanhf F
+GLIBC_2.40 _ZGVnN2vv_hypot F
+GLIBC_2.40 _ZGVnN2vv_hypotf F
GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_atanhf F
@@ -97,6 +99,7 @@ GLIBC_2.40 _ZGVnN4v_erfcf F
GLIBC_2.40 _ZGVnN4v_erff F
GLIBC_2.40 _ZGVnN4v_sinhf F
GLIBC_2.40 _ZGVnN4v_tanhf F
+GLIBC_2.40 _ZGVnN4vv_hypotf F
GLIBC_2.40 _ZGVsMxv_acosh F
GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_asinh F
@@ -113,3 +116,5 @@ GLIBC_2.40 _ZGVsMxv_sinh F
GLIBC_2.40 _ZGVsMxv_sinhf F
GLIBC_2.40 _ZGVsMxv_tanh F
GLIBC_2.40 _ZGVsMxv_tanhf F
+GLIBC_2.40 _ZGVsMxvv_hypot F
+GLIBC_2.40 _ZGVsMxvv_hypotf F
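
The core of both hypot variants in this patch is a single unsigned compare that routes underflow-prone, overflowing, infinite, and NaN inputs to the scalar fallback, while the common case takes a bare square root. A scalar sketch of that check, reusing the constants from hypot_sve.c above — illustrative only, with the vector lane handling stripped out:

#include <math.h>
#include <stdint.h>
#include <string.h>

static uint64_t
asuint64 (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

static double
hypot_sketch (double x, double y)
{
  double sqsum = fma (x, x, y * y);               /* x*x + y*y, fused.  */
  const uint64_t tiny_bound = 0x0c80000000000000; /* From hypot_sve.c.  */
  const uint64_t thres = 0x7300000000000000;
  /* One unsigned compare catches both tails: values below tiny_bound
     wrap around to huge numbers, values up at inf/NaN exceed
     tiny_bound + thres.  */
  if (asuint64 (sqsum) - tiny_bound >= thres)
    return hypot (x, y);                          /* Scalar fallback.  */
  return sqrt (sqsum);
}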

glibc-RHEL-118273-11.patch (new file, 715 lines)

commit 75207bde6870eb4b258e16fbb41252b2e6377675
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Apr 30 13:49:59 2024 +0100
aarch64/fpu: Add vector variants of cbrt
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 06657782a1ee7106..990d1135b93485c5 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -5,6 +5,7 @@ libmvec-supported-funcs = acos \
atan \
atanh \
atan2 \
+ cbrt \
cos \
cosh \
erf \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index aedae9457b148983..36a9e4df1e058c46 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -94,6 +94,11 @@ libmvec {
_ZGVnN4v_atanhf;
_ZGVsMxv_atanh;
_ZGVsMxv_atanhf;
+ _ZGVnN2v_cbrt;
+ _ZGVnN2v_cbrtf;
+ _ZGVnN4v_cbrtf;
+ _ZGVsMxv_cbrt;
+ _ZGVsMxv_cbrtf;
_ZGVnN2v_cosh;
_ZGVnN2v_coshf;
_ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index a8889a92fd041585..54858efd8aa0ff82 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -23,6 +23,7 @@ libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(asinh));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(atanh));
+libmvec_hidden_proto (V_NAME_F1(cbrt));
libmvec_hidden_proto (V_NAME_F1(cos));
libmvec_hidden_proto (V_NAME_F1(cosh));
libmvec_hidden_proto (V_NAME_F1(erf));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index ca3017733959702f..b1c024fe13a7dc32 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -57,6 +57,10 @@
# define __DECL_SIMD_atan2 __DECL_SIMD_aarch64
# undef __DECL_SIMD_atan2f
# define __DECL_SIMD_atan2f __DECL_SIMD_aarch64
+# undef __DECL_SIMD_cbrt
+# define __DECL_SIMD_cbrt __DECL_SIMD_aarch64
+# undef __DECL_SIMD_cbrtf
+# define __DECL_SIMD_cbrtf __DECL_SIMD_aarch64
# undef __DECL_SIMD_cos
# define __DECL_SIMD_cos __DECL_SIMD_aarch64
# undef __DECL_SIMD_cosf
@@ -158,6 +162,7 @@ __vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
@@ -183,6 +188,7 @@ __vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
@@ -213,6 +219,7 @@ __sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanhf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_cbrtf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
@@ -238,6 +245,7 @@ __sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atanh (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_cbrt (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/cbrt_advsimd.c b/sysdeps/aarch64/fpu/cbrt_advsimd.c
new file mode 100644
index 0000000000000000..adfbb60cd3918c95
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrt_advsimd.c
@@ -0,0 +1,121 @@
+/* Double-precision vector (AdvSIMD) cbrt function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+const static struct data
+{
+ float64x2_t poly[4], one_third, shift;
+ int64x2_t exp_bias;
+ uint64x2_t abs_mask, tiny_bound;
+ uint32x4_t thresh;
+ double table[5];
+} data = {
+ .shift = V2 (0x1.8p52),
+ .poly = { /* Generated with fpminimax in [0.5, 1]. */
+ V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1),
+ V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) },
+ .exp_bias = V2 (1022),
+ .abs_mask = V2(0x7fffffffffffffff),
+ .tiny_bound = V2(0x0010000000000000), /* Smallest normal. */
+ .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound. */
+ .one_third = V2(0x1.5555555555555p-2),
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
+ 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 }
+};
+
+#define MantissaMask v_u64 (0x000fffffffffffff)
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint32x2_t special)
+{
+ return v_call_f64 (cbrt, x, y, vmovl_u32 (special));
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order polynomial
+ and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat
+ according to the exponent, for instance an error observed for double value
+ m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an
+ integer.
+ __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
+ want 0x1.965fe72821e99p+0. */
+VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ /* Subnormal, +/-0 and special values. */
+ uint32x2_t special
+ = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh));
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexp, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5));
+ int64x2_t exp_bias = d->exp_bias;
+ uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
+ int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);
+
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
+ Newton iterations. */
+ float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
+ float64x2_t one_third = d->one_third;
+ /* Two iterations of Newton's method for iteratively approximating cbrt. */
+ float64x2_t m_by_3 = vmulq_f64 (m, one_third);
+ float64x2_t two_thirds = vaddq_f64 (one_third, one_third);
+ float64x2_t a
+ = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p);
+ a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
+ an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+
+ float64x2_t ef = vcvtq_f64_s64 (e);
+ float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third));
+ int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3)));
+ int64x2_t ey = vcvtq_s64_f64 (eb3f);
+
+ float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] };
+ my = vmulq_f64 (my, a);
+
+ /* Vector version of ldexp. */
+ float64x2_t y = vreinterpretq_f64_s64 (
+ vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52));
+ y = vmulq_f64 (y, my);
+
+ if (__glibc_unlikely (v_any_u32h (special)))
+ return special_case (x, vbslq_f64 (d->abs_mask, y, x), special);
+
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
diff --git a/sysdeps/aarch64/fpu/cbrt_sve.c b/sysdeps/aarch64/fpu/cbrt_sve.c
new file mode 100644
index 0000000000000000..fc976eda2a6018f7
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrt_sve.c
@@ -0,0 +1,128 @@
+/* Double-precision vector (SVE) cbrt function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+const static struct data
+{
+ float64_t poly[4];
+ float64_t table[5];
+ float64_t one_third, two_thirds, shift;
+ int64_t exp_bias;
+ uint64_t tiny_bound, thresh;
+} data = {
+ /* Generated with FPMinimax in [0.5, 1]. */
+ .poly = { 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1,
+ 0x1.2c74eaa3ba428p-3, },
+ /* table[i] = 2^((i - 2) / 3). */
+ .table = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
+ 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0, },
+ .one_third = 0x1.5555555555555p-2,
+ .two_thirds = 0x1.5555555555555p-1,
+ .shift = 0x1.8p52,
+ .exp_bias = 1022,
+ .tiny_bound = 0x0010000000000000, /* Smallest normal. */
+ .thresh = 0x7fe0000000000000, /* asuint64 (infinity) - tiny_bound. */
+};
+
+#define MantissaMask 0x000fffffffffffff
+#define HalfExp 0x3fe0000000000000
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (cbrt, x, y, special);
+}
+
+static inline svfloat64_t
+shifted_lookup (const svbool_t pg, const float64_t *table, svint64_t i)
+{
+ return svld1_gather_index (pg, table, svadd_x (pg, i, 2));
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order
+ polynomial and two Newton iterations. Greatest observed error is 1.79 ULP.
+ Errors repeat according to the exponent, for instance an error observed for
+ double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i
+ is an integer.
+ _ZGVsMxv_cbrt (0x0.3fffb8d4413f3p-1022) got 0x1.965f53b0e5d97p-342
+ want 0x1.965f53b0e5d95p-342. */
+svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t iax = svreinterpret_u64 (ax);
+ svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+
+ /* Subnormal, +/-0 and special values. */
+ svbool_t special = svcmpge (pg, svsub_x (pg, iax, d->tiny_bound), d->thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexp, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ svfloat64_t m = svreinterpret_f64 (svorr_x (
+ pg, svand_x (pg, svreinterpret_u64 (x), MantissaMask), HalfExp));
+ svint64_t e
+ = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, iax, 52)), d->exp_bias);
+
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+ for Newton iterations. */
+ svfloat64_t p
+ = sv_pairwise_poly_3_f64_x (pg, m, svmul_x (pg, m, m), d->poly);
+
+ /* Two iterations of Newton's method for iteratively approximating cbrt. */
+ svfloat64_t m_by_3 = svmul_x (pg, m, d->one_third);
+ svfloat64_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p,
+ d->two_thirds);
+ a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, a, a)), a, d->two_thirds);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ svfloat64_t eb3f = svmul_x (pg, svcvt_f64_x (pg, e), d->one_third);
+ svint64_t ey = svcvt_s64_x (pg, eb3f);
+ svint64_t em3 = svmls_x (pg, e, ey, 3);
+
+ svfloat64_t my = shifted_lookup (pg, d->table, em3);
+ my = svmul_x (pg, my, a);
+
+ /* Vector version of ldexp. */
+ svfloat64_t y = svscale_x (pg, my, ey);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)),
+ special);
+
+ /* Copy sign. */
+ return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/cbrtf_advsimd.c b/sysdeps/aarch64/fpu/cbrtf_advsimd.c
new file mode 100644
index 0000000000000000..27debb8b57c8c3e2
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrtf_advsimd.c
@@ -0,0 +1,123 @@
+/* Single-precision vector (AdvSIMD) cbrt function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+const static struct data
+{
+ float32x4_t poly[4], one_third;
+ float table[5];
+} data = {
+ .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with
+ FPMinimax. */
+ V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1),
+ V4 (0x1.2c74c2p-3) },
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+ .one_third = V4 (0x1.555556p-2f),
+};
+
+#define SignMask v_u32 (0x80000000)
+#define SmallestNormal v_u32 (0x00800000)
+#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal. */
+#define MantissaMask v_u32 (0x007fffff)
+#define HalfExp v_u32 (0x3f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special)
+{
+ return v_call_f32 (cbrtf, x, y, vmovl_u16 (special));
+}
+
+static inline float32x4_t
+shifted_lookup (const float *table, int32x4_t i)
+{
+ return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2],
+ table[i[3] + 2] };
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration
+ with initial guess obtained by a low-order polynomial. Greatest error
+ is 1.64 ULP. This is observed for every value where the mantissa is
+ 0x1.85a2aa and the exponent is a multiple of 3, for example:
+ _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1
+ want 0x1.267932p+1. */
+VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
+ /* Subnormal, +/-0 and special values. */
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexpf, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5));
+ int32x4_t e
+ = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126));
+
+ /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+ the less accurate the next stage of the algorithm needs to be. An order-4
+ polynomial is enough for one Newton iteration. */
+ float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly);
+
+ float32x4_t one_third = d->one_third;
+ float32x4_t two_thirds = vaddq_f32 (one_third, one_third);
+
+ /* One iteration of Newton's method for iteratively approximating cbrt. */
+ float32x4_t m_by_3 = vmulq_f32 (m, one_third);
+ float32x4_t a
+ = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third);
+ int32x4_t ey = vcvtq_s32_f32 (ef);
+ int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3)));
+
+ float32x4_t my = shifted_lookup (d->table, em3);
+ my = vmulq_f32 (my, a);
+
+ /* Vector version of ldexpf. */
+ float32x4_t y
+ = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23));
+ y = vmulq_f32 (y, my);
+
+ if (__glibc_unlikely (v_any_u16h (special)))
+ return special_case (x, vbslq_f32 (SignMask, x, y), special);
+
+ /* Copy sign. */
+ return vbslq_f32 (SignMask, x, y);
+}
+libmvec_hidden_def (V_NAME_F1 (cbrt))
+HALF_WIDTH_ALIAS_F1 (cbrt)
diff --git a/sysdeps/aarch64/fpu/cbrtf_sve.c b/sysdeps/aarch64/fpu/cbrtf_sve.c
new file mode 100644
index 0000000000000000..23c220c202244c1f
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cbrtf_sve.c
@@ -0,0 +1,122 @@
+/* Single-precision vector (SVE) cbrt function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+
+const static struct data
+{
+ float32_t poly[4];
+ float32_t table[5];
+ float32_t one_third, two_thirds;
+} data = {
+ /* Very rough approximation of cbrt(x) in [0.5, 1], generated with FPMinimax.
+ */
+ .poly = { 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1,
+ 0x1.2c74c2p-3, },
+ /* table[i] = 2^((i - 2) / 3). */
+ .table = { 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+ .one_third = 0x1.555556p-2f,
+ .two_thirds = 0x1.555556p-1f,
+};
+
+#define SmallestNormal 0x00800000
+#define Thresh 0x7f000000 /* asuint(INFINITY) - SmallestNormal. */
+#define MantissaMask 0x007fffff
+#define HalfExp 0x3f000000
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (cbrtf, x, y, special);
+}
+
+static inline svfloat32_t
+shifted_lookup (const svbool_t pg, const float32_t *table, svint32_t i)
+{
+ return svld1_gather_index (pg, table, svadd_x (pg, i, 2));
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration
+ with initial guess obtained by a low-order polynomial. Greatest error
+ is 1.64 ULP. This is observed for every value where the mantissa is
+ 0x1.85a2aa and the exponent is a multiple of 3, for example:
+ _ZGVsMxv_cbrtf (0x1.85a2aap+3) got 0x1.267936p+1
+ want 0x1.267932p+1. */
+svfloat32_t SV_NAME_F1 (cbrt) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+
+ /* Subnormal, +/-0 and special values. */
+ svbool_t special = svcmpge (pg, svsub_x (pg, iax, SmallestNormal), Thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexpf, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ svfloat32_t m = svreinterpret_f32 (svorr_x (
+ pg, svand_x (pg, svreinterpret_u32 (x), MantissaMask), HalfExp));
+ svint32_t e = svsub_x (pg, svreinterpret_s32 (svlsr_x (pg, iax, 23)), 126);
+
+ /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+ the less accurate the next stage of the algorithm needs to be. An order-4
+ polynomial is enough for one Newton iteration. */
+ svfloat32_t p
+ = sv_pairwise_poly_3_f32_x (pg, m, svmul_x (pg, m, m), d->poly);
+
+ /* One iteration of Newton's method for iteratively approximating cbrt. */
+ svfloat32_t m_by_3 = svmul_x (pg, m, d->one_third);
+ svfloat32_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p,
+ d->two_thirds);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ svfloat32_t ef = svmul_x (pg, svcvt_f32_x (pg, e), d->one_third);
+ svint32_t ey = svcvt_s32_x (pg, ef);
+ svint32_t em3 = svmls_x (pg, e, ey, 3);
+
+ svfloat32_t my = shifted_lookup (pg, d->table, em3);
+ my = svmul_x (pg, my, a);
+
+ /* Vector version of ldexpf. */
+ svfloat32_t y = svscale_x (pg, my, ey);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)),
+ special);
+
+ /* Copy sign. */
+ return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 417125be476cd75f..1877db3ac6932037 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -30,6 +30,7 @@ VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER (atanh_advsimd, _ZGVnN2v_atanh)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
+VPCS_VECTOR_WRAPPER (cbrt_advsimd, _ZGVnN2v_cbrt)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 31ebf18705f68856..b702f942dea0749f 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -49,6 +49,7 @@ SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER (atanh_sve, _ZGVsMxv_atanh)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
+SVE_VECTOR_WRAPPER (cbrt_sve, _ZGVsMxv_cbrt)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index dab0f1cfcb79a305..9cb451b4f045e625 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -30,6 +30,7 @@ VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER (atanhf_advsimd, _ZGVnN4v_atanhf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
+VPCS_VECTOR_WRAPPER (cbrtf_advsimd, _ZGVnN4v_cbrtf)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 2aa6cbcc28d69cf8..5b3dd22916d2a50d 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -49,6 +49,7 @@ SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER (atanhf_sve, _ZGVsMxv_atanhf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
+SVE_VECTOR_WRAPPER (cbrtf_sve, _ZGVsMxv_cbrtf)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 17723d0c9e2dfcf5..a67cd7cd7399c533 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -477,11 +477,19 @@ double: 4
float: 1
ldouble: 1
+Function: "cbrt_advsimd":
+double: 1
+float: 1
+
Function: "cbrt_downward":
double: 4
float: 1
ldouble: 1
+Function: "cbrt_sve":
+double: 1
+float: 1
+
Function: "cbrt_towardzero":
double: 3
float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 1184374efd25cfa6..89ac1dfa36279eb0 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -79,6 +79,8 @@ GLIBC_2.40 _ZGVnN2v_asinh F
GLIBC_2.40 _ZGVnN2v_asinhf F
GLIBC_2.40 _ZGVnN2v_atanh F
GLIBC_2.40 _ZGVnN2v_atanhf F
+GLIBC_2.40 _ZGVnN2v_cbrt F
+GLIBC_2.40 _ZGVnN2v_cbrtf F
GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
@@ -94,6 +96,7 @@ GLIBC_2.40 _ZGVnN2vv_hypotf F
GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_atanhf F
+GLIBC_2.40 _ZGVnN4v_cbrtf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erfcf F
GLIBC_2.40 _ZGVnN4v_erff F
@@ -106,6 +109,8 @@ GLIBC_2.40 _ZGVsMxv_asinh F
GLIBC_2.40 _ZGVsMxv_asinhf F
GLIBC_2.40 _ZGVsMxv_atanh F
GLIBC_2.40 _ZGVsMxv_atanhf F
+GLIBC_2.40 _ZGVsMxv_cbrt F
+GLIBC_2.40 _ZGVsMxv_cbrtf F
GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F
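
The assembly scheme spelled out in the comments above (shared by all four cbrt variants) condenses to: decompose |x| = m * 2^e, refine a rough guess with Newton steps a <- 2a/3 + m/(3a^2), then rebuild the exponent from the five-entry table of 2^((i - 2) / 3). A scalar sketch under those assumptions — a constant initial guess stands in for the patch's polynomial, and zeros, subnormals and non-finite inputs (handled by the scalar fallback in the real code) are omitted:

#include <math.h>

static const double table[5] = { /* table[i] = 2^((i - 2) / 3).  */
  0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
  0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0
};

static double
cbrt_sketch (double x)
{
  int e;
  double m = frexp (fabs (x), &e);    /* |x| = m * 2^e, m in [0.5, 1).  */

  double a = 0.7;                     /* Crude stand-in for the poly guess.  */
  for (int i = 0; i < 4; i++)         /* Newton iterations on a^3 = m.  */
    a = (2.0 / 3.0) * a + m / (3.0 * a * a);

  int ey = (int) nearbyint (e / 3.0); /* round (e / 3).  */
  int rem = e - 3 * ey;               /* Remainder, an integer in [-2, 2].  */
  double y = ldexp (a * table[rem + 2], ey);
  return copysign (y, x);             /* Reattach the sign of x.  */
}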

glibc-RHEL-118273-12.patch (new file, 2511 lines; diff suppressed because it is too large)

glibc-RHEL-118273-13.patch (new file, 319 lines)

commit 7900ac490db32f6bccff812733f00280dde34e27
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:32:53 2024 +0100
AArch64: Improve codegen in users of ADVSIMD expm1f helper
Rearrange operations so MOV is not necessary in reduction or around
the special-case handler. Reduce memory access by using more indexed
MLAs in polynomial.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
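
For reference, the expm1f scheme implemented by the v_expm1f_inline.h helper that this patch reshuffles: reduce x = j*ln2 + f with f in [-ln2/2, ln2/2], approximate expm1(f) = f + f^2*P(f) with a small polynomial, then reconstruct via t = 2^j as expm1(x) = p*t + (t - 1). A scalar sketch using the coefficients visible in the diff below — valid only away from the overflow/underflow bounds that the vector code checks first:

#include <math.h>

static float
expm1f_sketch (float x)
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f, ln2_lo = 0x1.7f7d1cp-20f;
  const float c[5] = { 0x1.fffffep-2f, 0x1.5554aep-3f, 0x1.555736p-5f,
                       0x1.12287cp-7f, 0x1.6b55a2p-10f };

  float j = nearbyintf (x * inv_ln2); /* j = round (x / ln2).  */
  float f = fmaf (j, -ln2_hi, x);     /* f = x - j*ln2, subtracted in  */
  f = fmaf (j, -ln2_lo, f);           /* two steps for extra precision.  */

  /* P(f) = c0 + c1*f + c2*f^2 + c3*f^3 + c4*f^4, by Horner.  */
  float p = c[4];
  for (int i = 3; i >= 0; i--)
    p = fmaf (p, f, c[i]);
  p = fmaf (f * f, p, f);             /* expm1(f) ~= f + f^2 * P(f).  */

  float t = ldexpf (1.0f, (int) j);   /* t = 2^j, exact.  */
  return fmaf (p, t, t - 1.0f);       /* expm1(x) = p*t + (t - 1).  */
}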
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
index a0616ec7542cbfce..8303ca296e030c2e 100644
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@@ -18,27 +18,18 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_expm1f_inline.h"
static const struct data
{
- float32x4_t poly[5];
- float invln2_and_ln2[4];
- float32x4_t shift;
- int32x4_t exponent_bias;
+ struct v_expm1f_data d;
#if WANT_SIMD_EXCEPT
uint32x4_t thresh;
#else
float32x4_t oflow_bound;
#endif
} data = {
- /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
- /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
- .shift = V4 (0x1.8p23f),
- .exponent_bias = V4 (0x3f800000),
+ .d = V_EXPM1F_DATA,
#if !WANT_SIMD_EXCEPT
/* Value above which expm1f(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
@@ -55,67 +46,38 @@ static const struct data
#define TinyBound v_u32 (0x34000000 << 1)
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
{
- return v_call_f32 (expm1f, x, y, special);
+ return v_call_f32 (
+ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
}
/* Single-precision vector exp(x) - 1 function.
- The maximum error is 1.51 ULP:
- _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
- want 0x1.e2fb94p-2. */
+ The maximum error is 1.62 ULP:
+ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
+ want 0x1.da9f44p-2. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
#if WANT_SIMD_EXCEPT
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint32x4_t special
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
- if (__glibc_unlikely (v_any_u32 (special)))
- x = v_zerofy_f32 (x, special);
#else
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
#endif
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- float32x4_t j
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
- int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float32x4_t p = v_horner_4_f32 (f, d->poly);
- p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
- float32x4_t t = vreinterpretq_f32_s32 (u);
-
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
- special);
+ return special_case (x, special, d);
/* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+ return expm1f_inline (x, &d->d);
}
libmvec_hidden_def (V_NAME_F1 (expm1))
HALF_WIDTH_ALIAS_F1 (expm1)
diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c
index 6bb7482dc28795c1..c6ed7598e7deca1b 100644
--- a/sysdeps/aarch64/fpu/sinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
@@ -23,15 +23,13 @@
static const struct data
{
struct v_expm1f_data expm1f_consts;
- uint32x4_t halff;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound, thresh;
#else
- uint32x4_t oflow_bound;
+ float32x4_t oflow_bound;
#endif
} data = {
.expm1f_consts = V_EXPM1F_DATA,
- .halff = V4 (0x3f000000),
#if WANT_SIMD_EXCEPT
/* 0x1.6a09e8p-32, below which expm1f underflows. */
.tiny_bound = V4 (0x2fb504f4),
@@ -39,14 +37,15 @@ static const struct data
.thresh = V4 (0x12fbbbb3),
#else
/* 0x1.61814ep+6, above which expm1f helper overflows. */
- .oflow_bound = V4 (0x42b0c0a7),
+ .oflow_bound = V4 (0x1.61814ep+6),
#endif
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+ uint32x4_t special)
{
- return v_call_f32 (sinhf, x, y, special);
+ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
}
/* Approximation for vector single-precision sinh(x) using expm1.
@@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
uint32x4_t ix = vreinterpretq_u32_f32 (x);
float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t sign = veorq_u32 (ix, iax);
- float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+ float32x4_t halfsign = vreinterpretq_f32_u32 (
+ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
#if WANT_SIMD_EXCEPT
- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+ uint32x4_t special = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
ax = v_zerofy_f32 (ax, special);
#else
- uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+ uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
#endif
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
@@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
/* Fall back to the scalar variant for any lanes that should trigger an
exception. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (t, halfsign), special);
+ return special_case (x, t, halfsign, special);
return vmulq_f32 (t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c
index 50defd6ef03926f4..3ced9b7a414c812c 100644
--- a/sysdeps/aarch64/fpu/tanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
@@ -28,13 +28,16 @@ static const struct data
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
.boring_bound = V4 (0x41102cb3),
.large_bound = V4 (0x7f800000),
- .onef = V4 (0x3f800000),
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+ float32x4_t q, uint32x4_t special)
{
- return v_call_f32 (tanhf, x, y, special);
+ return v_call_f32 (
+ tanhf, x,
+ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+ special);
}
/* Approximation for single-precision vector tanh(x), using a simplified
@@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t sign = veorq_u32 (ix, iax);
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
- float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+ /* expm1 exponent bias is 1.0f reinterpreted to int. */
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered properly, set all special and boring
@@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
- float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vbslq_f32 (is_boring, boring, y), special);
+ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+ special);
+
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
return vbslq_f32 (is_boring, boring, y);
}
libmvec_hidden_def (V_NAME_F1 (tanh))
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 59b552da6b74785e..1daedfdd51cfc54b 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -21,48 +21,47 @@
#define AARCH64_FPU_V_EXPM1F_INLINE_H
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "math_config.h"
struct v_expm1f_data
{
- float32x4_t poly[5];
- float invln2_and_ln2[4];
- float32x4_t shift;
+ float32x4_t c0, c2;
int32x4_t exponent_bias;
+ float c1, c3, inv_ln2, c4;
+ float ln2_hi, ln2_lo;
};
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
- log(2)/2]. Exponent bias is asuint(1.0f).
- invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */
+ log(2)/2]. Exponent bias is asuint(1.0f). */
#define V_EXPM1F_DATA \
{ \
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
- .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
}
static inline float32x4_t
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
{
- /* Helper routine for calculating exp(x) - 1.
- Copied from v_expm1f_1u6.c, with all special-case handling removed - the
- calling routine should handle special values if required. */
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+ float32x4_t lane_consts = vld1q_f32 (&d->c1);
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- float32x4_t j
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
+ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
+ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
+ f = vfmsq_lane_f32 (f, j, ln2, 1);
- /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
- Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
- Horner. */
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
float32x4_t f2 = vmulq_f32 (f, f);
float32x4_t f4 = vmulq_f32 (f2, f2);
- float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+ float32x4_t p = vfmaq_f32 (p01, f2, p23);
+ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
p = vfmaq_f32 (f, f2, p);
/* t = 2^i. */

glibc-RHEL-118273-14.patch Normal file

@@ -0,0 +1,495 @@
commit 5bc100bd4b7e00db3009ae93d25d303341545d23
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:32:14 2024 +0100
AArch64: Improve codegen in users of AdvSIMD log1pf helper
log1pf is quite register-intensive - use fewer registers for the
polynomial, and make various changes to shorten dependency chains in
parent routines. There is now no spilling with GCC 14. Accuracy moves
around a little - comments are adjusted accordingly, but this does not
require regen-ulps.
Use the helper in log1pf as well, instead of having separate
implementations. The more accurate polynomial means special-casing can
be simplified, and the shorter dependency chain avoids the usual dance
around v0, which is otherwise difficult.
There is a small duplication of vectors containing 1.0f (or 0x3f800000) -
GCC is not currently able to efficiently handle values which fit in FMOV
but not MOVI, and are reinterpreted to integer. There may be potential
for more optimisation if this is fixed.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Conflicts:
sysdeps/aarch64/fpu/log1pf_advsimd.c
(Fix up context to apply without out-of-scope dependency 751a5502)
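The algorithm the reworked helper implements is easiest to see in scalar form. Below is a minimal sketch (an illustration only: log1pf_sketch and the low-order stand-in polynomial are ours; the real helper uses a fitted degree-8 polynomial and NEON intrinsics): write x + 1 = (1 + m) * 2^k with the reduced m in [-0.25, 0.5], so log1p(x) = log1p(m) + k*ln2. Scaling x rather than m, and folding the 2^-k - 1 correction in via the always-normal factor s = 4 * 2^-k, is what keeps the vector version's dependency chain short.

/* Scalar sketch of the reduction the AdvSIMD log1pf helper vectorises.
   Illustration only, not the glibc code. */
#include <stdint.h>
#include <string.h>

static float
log1pf_sketch (float x)
{
  uint32_t xi, mi;
  float m = x + 1.0f;
  memcpy (&xi, &x, sizeof (xi));
  memcpy (&mi, &m, sizeof (mi));

  /* k (stored as k << 23) chosen so that (x + 1) * 2^-k is in [0.75, 1.5). */
  uint32_t kbits = (mi - 0x3f400000u) & 0xff800000u;

  /* Scale x, not m, by 2^-k via exponent manipulation, then add the exactly
     representable correction 2^-k - 1 = 0.25f * s - 1. */
  float m_scale, s;
  uint32_t m_scale_i = xi - kbits, si = 0x40800000u - kbits; /* s = 4*2^-k. */
  memcpy (&m_scale, &m_scale_i, sizeof (m_scale));
  memcpy (&s, &si, sizeof (s));
  float mr = m_scale + (0.25f * s - 1.0f); /* (x + 1) * 2^-k - 1. */

  /* Stand-in polynomial: log1p(mr) ~ mr - mr^2/2 + mr^3/3. */
  float p = mr - 0.5f * mr * mr + (1.0f / 3.0f) * mr * mr * mr;

  float kf = (float) ((int32_t) kbits >> 23); /* two's-complement k. */
  return p + kf * 0x1.62e43p-1f; /* + k * ln2. */
}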
diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
index 8916dcbf409922a9..004474acf9e9322b 100644
--- a/sysdeps/aarch64/fpu/acoshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -25,35 +25,32 @@ const static struct data
{
struct v_log1pf_data log1pf_consts;
uint32x4_t one;
- uint16x4_t thresh;
-} data = {
- .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
- .one = V4 (0x3f800000),
- .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
-};
+} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
+
+#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
- const struct v_log1pf_data d)
+ const struct v_log1pf_data *d)
{
return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
}
/* Vector approximation for single-precision acosh, based on log1p. Maximum
error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
- is 2.78 ULP:
- __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
- want 0x1.ef9ea2p-3.
+ is 3.00 ULP:
+ _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
+ want 0x1.ef0a7cp-4.
With exceptions disabled, we can compute u with a shorter dependency chain,
- which gives maximum error of 3.07 ULP:
- __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
- want 0x1.fbc7f4p-4. */
+ which gives maximum error of 3.22 ULP:
+ _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
+ want 0x1.fdcdd2p-5. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
#if WANT_SIMD_EXCEPT
/* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
@@ -64,15 +61,16 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
#else
- float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
- float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+ float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+ float32x4_t u
+ = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
#endif
float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, y, special, d->log1pf_consts);
- return log1pf_inline (y, d->log1pf_consts);
+ return special_case (x, y, special, &d->log1pf_consts);
+ return log1pf_inline (y, &d->log1pf_consts);
}
libmvec_hidden_def (V_NAME_F1 (acosh))
HALF_WIDTH_ALIAS_F1 (acosh)
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
index 09fd8a614305563d..eb789b91b600af52 100644
--- a/sysdeps/aarch64/fpu/asinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
@@ -20,16 +20,16 @@
#include "v_math.h"
#include "v_log1pf_inline.h"
-#define SignMask v_u32 (0x80000000)
-
const static struct data
{
struct v_log1pf_data log1pf_consts;
+ float32x4_t one;
uint32x4_t big_bound;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound;
#endif
} data = {
+ .one = V4 (1),
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
#if WANT_SIMD_EXCEPT
@@ -38,20 +38,27 @@ const static struct data
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
+ uint32x4_t special, const struct data *d)
{
- return v_call_f32 (asinhf, x, y, special);
+ return v_call_f32 (
+ asinhf, x,
+ vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
+ special);
}
/* Single-precision implementation of vector asinh(x), using vector log1p.
- Worst-case error is 2.66 ULP, at roughly +/-0.25:
- __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
+ Worst-case error is 2.59 ULP:
+ _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
+ want 0x1.d449c4p-3. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
- uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
- float32x4_t ax = vreinterpretq_f32_u32 (iax);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
float32x4_t special_arg = x;
#if WANT_SIMD_EXCEPT
@@ -68,13 +75,13 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
/* asinh(x) = log(x + sqrt(x * x + 1)).
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
float32x4_t d
- = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
- float32x4_t y = log1pf_inline (
- vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
+ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
- return vbslq_f32 (SignMask, x, y);
+ return special_case (special_arg, sign, y, special, dat);
+ return vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
}
libmvec_hidden_def (V_NAME_F1 (asinh))
HALF_WIDTH_ALIAS_F1 (asinh)
diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c
index ae488f7b54ddce26..818b6c92adcd48bb 100644
--- a/sysdeps/aarch64/fpu/atanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c
@@ -40,15 +40,17 @@ const static struct data
#define Half v_u32 (0x3f000000)
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
+ uint32x4_t special)
{
- return v_call_f32 (atanhf, x, y, special);
+ return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
+ vmulq_f32 (halfsign, y), special);
}
/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 3.08 ULP:
- __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
- want 0x1.ffcb82p-5. */
+ The maximum error is 2.93 ULP:
+ _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
+ want 0x1.f4dcf8p-5. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -68,11 +70,19 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
uint32x4_t special = vcgeq_u32 (iax, d->one);
#endif
- float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
- y = log1pf_inline (y, d->log1pf_consts);
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
+ vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
+ y = log1pf_inline (y, &d->log1pf_consts);
+ /* If exceptions not required, pass ax to special-case for shorter dependency
+ chain. If exceptions are required ax will have been zerofied, so have to
+ pass x. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (halfsign, y), special);
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special);
+#else
+ return special_case (ax, halfsign, y, special);
+#endif
return vmulq_f32 (halfsign, y);
}
libmvec_hidden_def (V_NAME_F1 (atanh))
diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
index dc15334a8537b1fc..f2d47962fe13fbdd 100644
--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
@@ -18,113 +18,78 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_log1pf_inline.h"
+
+#if WANT_SIMD_EXCEPT
const static struct data
{
- float32x4_t poly[8], ln2;
- uint32x4_t tiny_bound, minus_one, four, thresh;
- int32x4_t three_quarters;
+ uint32x4_t minus_one, thresh;
+ struct v_log1pf_data d;
} data = {
- .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
- (1, -0.5) are not stored as they can be generated more
- efficiently. */
- V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
- .ln2 = V4 (0x1.62e43p-1f),
- .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
- .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */
+ .d = V_LOG1PF_CONSTANTS_TABLE,
+ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
.minus_one = V4 (0xbf800000),
- .four = V4 (0x40800000),
- .three_quarters = V4 (0x3f400000)
};
-static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *p)
-{
- /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */
- float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
- float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
- float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
- float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
-
- float32x4_t m2 = vmulq_f32 (m, m);
- float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
- float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
- float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
-
- float32x4_t m4 = vmulq_f32 (m2, m2);
- float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
- return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
-}
+/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
+# define TinyBound v_u32 (0x34000000)
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
{
- return v_call_f32 (log1pf, x, y, special);
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
+ return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
}
-/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
- is roughly 2.02 ULP:
- log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.69 ULP:
+ _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
+ want 0x1.cfcbdcp-3. */
VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
uint32x4_t special_cases
- = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
+ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
vcgeq_u32 (ix, d->minus_one));
- float32x4_t special_arg = x;
-#if WANT_SIMD_EXCEPT
if (__glibc_unlikely (v_any_u32 (special_cases)))
- /* Side-step special lanes so fenv exceptions are not triggered
- inadvertently. */
- x = v_zerofy_f32 (x, special_cases);
-#endif
+ return special_case (x, special_cases, d);
- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
+ return log1pf_inline (x, &d->d);
+}
- float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+#else
- /* Choose k to scale x to the range [-1/4, 1/2]. */
- int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
- v_s32 (0xff800000));
- uint32x4_t ku = vreinterpretq_u32_s32 (k);
+const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
- /* Scale x by exponent manipulation. */
- float32x4_t m_scale
- = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp)
+{
+ return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
+}
- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
- m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.63 ULP:
+ _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
+ want 0x1.fdcb16p-3. */
+VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
+{
+ uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
+ vcaleq_f32 (x, v_f32 (0x1p127f)));
- /* Evaluate polynomial on the reduced interval. */
- float32x4_t p = eval_poly (m_scale, d->poly);
+ if (__glibc_unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases);
- /* The scale factor to be applied back at the end - by multiplying float(k)
- by 2^-23 we get the unbiased exponent of k. */
- float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
+ return log1pf_inline (x, ptr_barrier (&data));
+}
- /* Apply the scaling back. */
- float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
+#endif
- if (__glibc_unlikely (v_any_u32 (special_cases)))
- return special_case (special_arg, y, special_cases);
- return y;
-}
libmvec_hidden_def (V_NAME_F1 (log1p))
HALF_WIDTH_ALIAS_F1 (log1p)
diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h
index 643a6cdcfc498970..73e45a942e24a26f 100644
--- a/sysdeps/aarch64/fpu/v_log1pf_inline.h
+++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h
@@ -25,54 +25,81 @@
struct v_log1pf_data
{
- float32x4_t poly[8], ln2;
uint32x4_t four;
int32x4_t three_quarters;
+ float c0, c3, c5, c7;
+ float32x4_t c4, c6, c1, c2, ln2;
};
/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
(1, -0.5) are not stored as they can be generated more efficiently. */
#define V_LOG1PF_CONSTANTS_TABLE \
{ \
- .poly \
- = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
- .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
- .three_quarters = V4 (0x3f400000) \
+ .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
+ .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
+ .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
+ .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
}
static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *c)
+eval_poly (float32x4_t m, const struct v_log1pf_data *d)
{
- /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
- uses split Estrin, but this way reduces register pressure in the calling
- routine). */
- float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float32x4_t c0357 = vld1q_f32 (&d->c0);
+ float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
float32x4_t m2 = vmulq_f32 (m, m);
- q = vfmaq_f32 (m, m2, q);
- float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
+ float32x4_t p = vfmaq_f32 (p45, m2, p67);
+ p = vfmaq_f32 (p23, m2, p);
+ p = vfmaq_f32 (d->c1, m, p);
p = vmulq_f32 (m2, p);
- return vfmaq_f32 (q, m2, p);
+ p = vfmaq_f32 (m, m2, p);
+ return vfmaq_f32 (p, m2, q);
}
static inline float32x4_t
-log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
+log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
{
- /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
- special-case handling. See that file for details of the algorithm. */
+ /* Helper for calculating log(x + 1). */
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
v_s32 (0xff800000));
uint32x4_t ku = vreinterpretq_u32_s32 (k);
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+
+ /* Scale x by exponent manipulation. */
float32x4_t m_scale
= vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
- float32x4_t p = eval_poly (m_scale, d.poly);
+
+ /* Evaluate polynomial on the reduced interval. */
+ float32x4_t p = eval_poly (m_scale, d);
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
- return vfmaq_f32 (p, scale_back, d.ln2);
+
+ /* Apply the scaling back. */
+ return vfmaq_f32 (p, scale_back, d->ln2);
}
#endif

glibc-RHEL-118273-15.patch Normal file

@@ -0,0 +1,261 @@
commit a15b1394b5eba98ffe28a02a392b587e4fe13c0d
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:30:20 2024 +0100
AArch64: Improve codegen in SVE F32 logs
Reduce MOVPRFXs by using unpredicated (non-destructive) instructions
where possible. Similar to the recent change to AdvSIMD F32 logs,
adjust special-case arguments and bounds to allow for more optimal
register usage. For all 3 routines one MOVPRFX remains in the
reduction, which cannot be avoided as immediate AND and ASR are both
destructive.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
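The rearranged bound check is the key idea in all three routines below; here is a scalar sketch (the names OFF/LOWER/THRESH and logf_is_special are ours, for illustration). Since the reduction needs asuint(x) - OFF anyway, specials - zero, negatives, subnormals, inf and NaN - fall out of a single unsigned compare against bounds pre-adjusted by the same offset, with the subtraction deliberately wrapping around.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define OFF 0x3f2aaaabu /* asuint (2/3), the reduction offset. */
#define LOWER (0x00800000u - OFF) /* smallest normal minus OFF (wraps). */
#define THRESH 0x7f000000u /* asuint (inf) - smallest normal. */

static bool
logf_is_special (float x)
{
  uint32_t u_off;
  memcpy (&u_off, &x, sizeof (u_off));
  u_off -= OFF; /* already needed by the log reduction itself. */
  /* Equivalent to asuint (x) - 0x00800000 >= THRESH, but reuses u_off. */
  return u_off - LOWER >= THRESH;
}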
diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c
index bdbb49cd32feccb4..7913679f6795502a 100644
--- a/sysdeps/aarch64/fpu/log10f_sve.c
+++ b/sysdeps/aarch64/fpu/log10f_sve.c
@@ -24,6 +24,7 @@ static const struct data
float poly_0246[4];
float poly_1357[4];
float ln2, inv_ln10;
+ uint32_t off, lower;
} data = {
.poly_1357 = {
/* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -35,18 +36,23 @@ static const struct data
-0x1.0fc92cp-4f },
.ln2 = 0x1.62e43p-1f,
.inv_ln10 = 0x1.bcb7b2p-2f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min 0x00800000
-#define Max 0x7f800000
-#define Thres 0x7f000000 /* Max - Min. */
-#define Offset 0x3f2aaaab /* 0.666667. */
+#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. */
#define MantissaMask 0x007fffff
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (log10f, x, y, special);
+ return sv_call_f32 (
+ log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE log10f using the same algorithm and
@@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t ix = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
+
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- ix = svsub_x (pg, ix, Offset);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */
- ix = svand_x (pg, ix, MantissaMask);
- ix = svadd_x (pg, ix, Offset);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */
+ svuint32_t ix = svand_x (pg, u_off, MantissaMask);
+ ix = svadd_x (pg, ix, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);
/* y = log10(1+r) + n*log10(2)
log10(1+r) ~ r * InvLn(10) + P(r)
where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t r4 = svmul_x (pg, r2, r2);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2);
svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
@@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
hi = svmul_x (pg, hi, d->inv_ln10);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
- special);
- return svmla_x (pg, hi, r2, y);
+ return special_case (u_off, hi, r2, y, special);
+ return svmla_x (svptrue_b32 (), hi, r2, y);
}
diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c
index 5031c4248359295e..939d89bfb9a95a11 100644
--- a/sysdeps/aarch64/fpu/log2f_sve.c
+++ b/sysdeps/aarch64/fpu/log2f_sve.c
@@ -23,6 +23,7 @@ static const struct data
{
float poly_02468[5];
float poly_1357[4];
+ uint32_t off, lower;
} data = {
.poly_1357 = {
/* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -32,18 +33,23 @@ static const struct data
},
.poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
0x1.9d8ecap-3f, 0x1.9e495p-3f },
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thres (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
#define MantissaMask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (log2f, x, y, cmp);
+ return sv_call_f32 (
+ log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE log2f, using the same algorithm
@@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t u = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres);
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_x (pg, u, Off);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
- u = svand_x (pg, u, MantissaMask);
- u = svadd_x (pg, u, Off);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+ svuint32_t u = svand_x (pg, u_off, MantissaMask);
+ u = svadd_x (pg, u, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log2(1+r) + n. */
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
/* Evaluate polynomial using pairwise Horner scheme. */
svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
@@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
y = svmla_x (pg, q_01, r2, y);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special);
- return svmla_x (pg, n, r, y);
+ return special_case (u_off, n, r, y, special);
+ return svmla_x (svptrue_b32 (), n, r, y);
}
diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
index d64e810cfec9aa19..5b9324678d99455b 100644
--- a/sysdeps/aarch64/fpu/logf_sve.c
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@@ -24,6 +24,7 @@ static const struct data
float poly_0135[4];
float poly_246[3];
float ln2;
+ uint32_t off, lower;
} data = {
.poly_0135 = {
/* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
@@ -32,19 +33,24 @@ static const struct data
-0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
},
.poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
- .ln2 = 0x1.62e43p-1f
+ .ln2 = 0x1.62e43p-1f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thresh (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
#define Mask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (logf, x, y, cmp);
+ return sv_call_f32 (
+ logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE logf, using the same algorithm and
@@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t u = svreinterpret_u32 (x);
- svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_x (pg, u, Off);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
- u = svand_x (pg, u, Mask);
- u = svadd_x (pg, u, Off);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+
+ svuint32_t u = svand_x (pg, u_off, Mask);
+ u = svadd_x (pg, u, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log(1+r) + n*ln2. */
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
/* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */
svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
@@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
p = svmla_x (pg, r, n, d->ln2);
if (__glibc_unlikely (svptest_any (pg, cmp)))
- return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+ return special_case (u_off, p, r2, y, cmp);
return svmla_x (pg, p, r2, y);
}

glibc-RHEL-118273-16.patch Normal file

@@ -0,0 +1,467 @@
commit 7b8c134b5460ed933d610fa92ed1227372b68fdc
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Sep 23 15:26:12 2024 +0100
AArch64: Improve codegen in SVE expf & related routines
Reduce MOV and MOVPRFX by improving special-case handling. Use inline
helper to duplicate the entire computation between the special- and
non-special case branches, removing the contention for z0 between x
and the return value.
Also rearrange some MLAs and MLSs - by making the multiplicand the
destination we can avoid a MOVPRFX in several cases. Also change which
constants go in the vector used for lanewise ops - the last lane is no
longer wasted.
Spotted that shift was incorrect in exp2f and exp10f, w.r.t. the
comment that explains it. Fixed - worst-case ULP for exp2f moves
around but it doesn't change significantly for either routine.
Worst-case error for coshf increases due to passing x to exp rather
than abs(x) - the comment is updated, but this does not require regen-ulps.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
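The expf scheme that sv_expf_inline now carries is, in scalar terms, the following (sketch only: expf_sketch and the degree-3 stand-in polynomial are ours; the SVE code uses a fitted degree-5 polynomial and obtains the scale from the FEXPA instruction rather than from integer shifts).

#include <math.h>
#include <stdint.h>
#include <string.h>

static float
expf_sketch (float x) /* assumes |x| below the overflow threshold. */
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f, ln2_lo = 0x1.7f7d1cp-20f;

  /* x = n*ln2 + r, with r in [-ln2/2, ln2/2]. */
  float n = roundf (x * inv_ln2);
  float r = x - n * ln2_hi; /* hi/lo split keeps r accurate. */
  r -= n * ln2_lo;

  /* scale = 2^n, built directly in the exponent field. */
  uint32_t si = (uint32_t) ((int32_t) n + 127) << 23;
  float scale;
  memcpy (&scale, &si, sizeof (scale));

  /* poly(r) ~ exp(r) - 1; degree-3 stand-in for the fitted polynomial. */
  float poly = r + 0.5f * r * r + (1.0f / 6.0f) * r * r * r;
  return scale + scale * poly; /* exp(x) = scale * (1 + poly(r)). */
}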
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
index e5d8a299c6aa7ceb..7ad6efa0fc218278 100644
--- a/sysdeps/aarch64/fpu/coshf_sve.c
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -23,37 +23,42 @@
static const struct data
{
struct sv_expf_data expf_consts;
- uint32_t special_bound;
+ float special_bound;
} data = {
.expf_consts = SV_EXPF_DATA,
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
- .special_bound = 0x42ad496c,
+ .special_bound = 0x1.5a92d8p+6,
};
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
+ svbool_t pg)
{
- return sv_call_f32 (coshf, x, y, pg);
+ return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e),
+ pg);
}
/* Single-precision vector cosh, using vector expf.
- Maximum error is 1.89 ULP:
- _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
- want 0x1.f00adcp+127. */
+ Maximum error is 2.77 ULP:
+ _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
+ want 0x1.e4594cp+2. */
svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svfloat32_t ax = svabs_x (pg, x);
- svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
+ svbool_t special = svacge (pg, x, d->special_bound);
- /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
- svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
- svfloat32_t half_t = svmul_x (pg, t, 0.5);
- svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.
+ Note that x is passed to exp here, rather than |x|. This is to avoid using
+ destructive unary ABS for better register usage. However it means the
+ routine is not exactly symmetrical, as the exp helper is slightly less
+ accurate in the negative range. */
+ svfloat32_t e = expf_inline (x, pg, &d->expf_consts);
+ svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5);
+ svfloat32_t half_over_e = svdivr_x (pg, e, 0.5);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+ return special_case (x, half_e, half_over_e, special);
- return svadd_x (pg, half_t, half_over_t);
+ return svadd_x (svptrue_b32 (), half_e, half_over_e);
}
diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c
index e09b2f3b2705515a..8aa3fa9c4335cfb8 100644
--- a/sysdeps/aarch64/fpu/exp10f_sve.c
+++ b/sysdeps/aarch64/fpu/exp10f_sve.c
@@ -18,74 +18,83 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f32.h"
-/* For x < -SpecialBound, the result is subnormal and not handled correctly by
+/* For x < -Thres, the result is subnormal and not handled correctly by
FEXPA. */
-#define SpecialBound 37.9
+#define Thres 37.9
static const struct data
{
- float poly[5];
- float shift, log10_2, log2_10_hi, log2_10_lo, special_bound;
+ float log2_10_lo, c0, c2, c4;
+ float c1, c3, log10_2;
+ float shift, log2_10_hi, thres;
} data = {
/* Coefficients generated using Remez algorithm with minimisation of relative
error.
rel error: 0x1.89dafa3p-24
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
maxerr: 0.52 +0.5 ulp. */
- .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f,
- 0x1.12b41ap-1f },
+ .c0 = 0x1.26bb16p+1f,
+ .c1 = 0x1.5350d2p+1f,
+ .c2 = 0x1.04744ap+1f,
+ .c3 = 0x1.2d8176p+0f,
+ .c4 = 0x1.12b41ap-1f,
/* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
- .shift = 0x1.903f8p17f,
+ .shift = 0x1.803f8p17f,
.log10_2 = 0x1.a934fp+1,
.log2_10_hi = 0x1.344136p-2,
.log2_10_lo = -0x1.ec10cp-27,
- .special_bound = SpecialBound,
+ .thres = Thres,
};
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+static inline svfloat32_t
+sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
{
- return sv_call_f32 (exp10f, x, y, special);
-}
-
-/* Single-precision SVE exp10f routine. Implements the same algorithm
- as AdvSIMD exp10f.
- Worst case error is 1.02 ULPs.
- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
- want 0x1.ba5f9cp-1. */
-svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
-{
- const struct data *d = ptr_barrier (&data);
/* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
- /* Load some constants in quad-word chunks to minimise memory access (last
- lane is wasted). */
- svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);
/* n = round(x/(log10(2)/N)). */
svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0);
- svfloat32_t n = svsub_x (pg, z, shift);
+ svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);
/* r = x - n*log10(2)/N. */
- svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1);
- r = svmls_lane (r, n, log10_2_and_inv, 2);
+ svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
+ r = svmls_lane (r, n, lane_consts, 0);
- svbool_t special = svacgt (pg, x, d->special_bound);
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t poly
- = svmla_x (pg, svmul_x (pg, r, d->poly[0]),
- sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2);
-
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (pg, scale, scale, poly), special);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
return svmla_x (pg, scale, scale, poly);
}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp10f routine. Implements the same algorithm
+ as AdvSIMD exp10f.
+ Worst case error is 1.02 ULPs.
+ _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
+ want 0x1.ba5f9cp-1. */
+svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (__glibc_unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp10f_inline (x, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
index 8a686e3e054cb7f5..c6216bed9e9e7538 100644
--- a/sysdeps/aarch64/fpu/exp2f_sve.c
+++ b/sysdeps/aarch64/fpu/exp2f_sve.c
@@ -24,54 +24,64 @@
static const struct data
{
- float poly[5];
+ float c0, c2, c4, c1, c3;
float shift, thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. */
- .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f,
- 0x1.59977ap-10f },
+ /* Coefficients copied from the polynomial in AdvSIMD variant. */
+ .c0 = 0x1.62e422p-1f,
+ .c1 = 0x1.ebf9bcp-3f,
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = 0x1.3ce9e4p-7f,
+ .c4 = 0x1.59977ap-10f,
/* 1.5*2^17 + 127. */
- .shift = 0x1.903f8p17f,
+ .shift = 0x1.803f8p17f,
/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
correctly by FEXPA. */
.thres = Thres,
};
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
- return sv_call_f32 (exp2f, x, y, special);
-}
-
-/* Single-precision SVE exp2f routine. Implements the same algorithm
- as AdvSIMD exp2f.
- Worst case error is 1.04 ULPs.
- SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
- want 0x1.ba7ebp+0. */
-svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+static inline svfloat32_t
+sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
{
- const struct data *d = ptr_barrier (&data);
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
- svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svadd_x (pg, x, shift);
- svfloat32_t n = svsub_x (pg, z, shift);
- svfloat32_t r = svsub_x (pg, x, n);
+ svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift);
+ svfloat32_t r = svsub_x (svptrue_b32 (), x, n);
- svbool_t special = svacgt (pg, x, d->thres);
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* Polynomial evaluation: poly(r) ~ exp2(r)-1.
Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for
coefficients 1 to 4, and apply most significant coefficient directly. */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1);
- svfloat32_t p0 = svmul_x (pg, r, d->poly[0]);
+ svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
+ svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
+ svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
svfloat32_t poly = svmla_x (pg, p0, r2, p14);
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (pg, scale, scale, poly), special);
-
return svmla_x (pg, scale, scale, poly);
}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp2f routine. Implements the same algorithm
+ as AdvSIMD exp2f.
+ Worst case error is 1.04 ULPs.
+ _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
+ want 0x1.ba6a64p-1. */
+svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (__glibc_unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp2f_inline (x, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
index 3ba79bc4f11a05f9..da93e01b87e0e890 100644
--- a/sysdeps/aarch64/fpu/expf_sve.c
+++ b/sysdeps/aarch64/fpu/expf_sve.c
@@ -18,33 +18,25 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
+#include "sv_expf_inline.h"
+
+/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+ correctly by FEXPA. */
+#define Thres 0x1.5d5e2ap+6f
static const struct data
{
- float poly[5];
- float inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ struct sv_expf_data d;
+ float thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. */
- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f,
- 0x1.0e4020p-7f },
- .inv_ln2 = 0x1.715476p+0f,
- .ln2_hi = 0x1.62e4p-1f,
- .ln2_lo = 0x1.7f7d1cp-20f,
- /* 1.5*2^17 + 127. */
- .shift = 0x1.903f8p17f,
- /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
- correctly by FEXPA. */
- .thres = 0x1.5d5e2ap+6f,
+ .d = SV_EXPF_DATA,
+ .thres = Thres,
};
-#define C(i) sv_f32 (d->poly[i])
-#define ExponentBias 0x3f800000
-
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
{
- return sv_call_f32 (expf, x, y, special);
+ return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special);
}
/* Optimised single-precision SVE exp function.
@@ -54,36 +46,8 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
-
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-
- /* Load some constants in quad-word chunks to minimise memory access (last
- lane is wasted). */
- svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2);
-
- /* n = round(x/(ln2/N)). */
- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0);
- svfloat32_t n = svsub_x (pg, z, d->shift);
-
- /* r = x - n*ln2/N. */
- svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1);
- r = svmls_lane (r, n, invln2_and_ln2, 2);
-
- /* scale = 2^(n/N). */
svbool_t is_special_case = svacgt (pg, x, d->thres);
- svfloat32_t scale = svexpa (svreinterpret_u32 (z));
-
- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
- svfloat32_t p34 = svmla_x (pg, C (3), C (4), r);
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_x (pg, r, C (0));
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
if (__glibc_unlikely (svptest_any (pg, is_special_case)))
- return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case);
-
- return svmla_x (pg, scale, scale, poly);
+ return special_case (x, is_special_case, &d->d);
+ return expf_inline (x, pg, &d->d);
}
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index 23963b5f8ec89ead..6166df65533555a6 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -24,19 +24,20 @@
struct sv_expf_data
{
- float poly[5];
- float inv_ln2, ln2_hi, ln2_lo, shift;
+ float c1, c3, inv_ln2;
+ float ln2_lo, c0, c2, c4;
+ float ln2_hi, shift;
};
/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
#define SV_EXPF_DATA \
{ \
- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
- 0x1.0e4020p-7f }, \
- \
- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
- .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
+ /* Coefficients copied from the polynomial in AdvSIMD variant. */ \
+ .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \
+ .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .shift = 0x1.803f8p17f, \
}
#define C(i) sv_f32 (d->poly[i])
@@ -47,26 +48,25 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- /* Load some constants in quad-word chunks to minimise memory access. */
- svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);
/* n = round(x/(ln2/N)). */
- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
+ svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
svfloat32_t n = svsub_x (pg, z, d->shift);
/* r = x - n*ln2/N. */
- svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
- r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
+ svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
+ r = svmls_lane (r, n, lane_consts, 0);
/* scale = 2^(n/N). */
- svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
- svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
- svfloat32_t r2 = svmul_f32_x (pg, r, r);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
+ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
svfloat32_t poly = svmla_x (pg, p0, r2, p14);
return svmla_x (pg, scale, scale, poly);

glibc-RHEL-118273-17.patch Normal file

@@ -0,0 +1,124 @@
commit 1cf29fbc5be23db775d1dfa6b332ded6e6554252
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Mon Oct 28 14:58:35 2024 +0000
AArch64: Small optimisation in AdvSIMD erf and erfc
In both routines, reduce register pressure such that GCC 14 emits no
spills for erf and fewer spills for erfc. Also use a more efficient
comparison for the special case in erf.
Benchtests show erf improves by 6.4%, erfc by 1.0%.
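The main trick below is demoting paired constants from full vectors to adjacent scalars that are loaded as one quadword and consumed by indexed FMA. A minimal self-contained sketch (fma_pair_demo and the accumulator arguments are ours, for illustration - only the two constants come from the routine):

#include <arm_neon.h> /* AArch64 only. */

static const double pair[2] = { 0x1.1111111111111p-3,   /* 2/15. */
                                0x1.6c16c16c16c17p-5 }; /* 2/45. */

static float64x2_t
fma_pair_demo (float64x2_t p4, float64x2_t p5, float64x2_t r2)
{
  /* One register now holds both constants... */
  float64x2_t c = vld1q_f64 (pair);
  /* ...and indexed FMLA picks out each lane, so no per-constant vector
     has to stay live across the whole routine. */
  p4 = vfmaq_laneq_f64 (p4, r2, c, 0); /* p4 += r2 * 2/15. */
  p5 = vfmaq_laneq_f64 (p5, r2, c, 1); /* p5 += r2 * 2/45. */
  return vaddq_f64 (p4, p5);
}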
diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c
index 19cbb7d0f42eb4e2..c0116735e408066d 100644
--- a/sysdeps/aarch64/fpu/erf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erf_advsimd.c
@@ -22,19 +22,21 @@
static const struct data
{
float64x2_t third;
- float64x2_t tenth, two_over_five, two_over_fifteen;
- float64x2_t two_over_nine, two_over_fortyfive;
+ float64x2_t tenth, two_over_five, two_over_nine;
+ double two_over_fifteen, two_over_fortyfive;
float64x2_t max, shift;
+ uint64x2_t max_idx;
#if WANT_SIMD_EXCEPT
float64x2_t tiny_bound, huge_bound, scale_minus_one;
#endif
} data = {
+ .max_idx = V2 (768),
.third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
- .two_over_fifteen = V2 (0x1.1111111111111p-3),
+ .two_over_fifteen = 0x1.1111111111111p-3,
.tenth = V2 (-0x1.999999999999ap-4),
.two_over_five = V2 (-0x1.999999999999ap-2),
.two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
- .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
+ .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
.max = V2 (5.9921875), /* 6 - 1/128. */
.shift = V2 (0x1p45),
#if WANT_SIMD_EXCEPT
@@ -87,8 +89,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t a = vabsq_f64 (x);
/* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
to return expected results. */
- uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
- uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
+ uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
+ uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
#if WANT_SIMD_EXCEPT
/* |x| huge or tiny. */
@@ -115,7 +117,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
segfault. */
uint64x2_t i
= vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
- i = vbslq_u64 (a_le_max, i, v_u64 (768));
+ i = vbslq_u64 (a_le_max, i, dat->max_idx);
struct entry e = lookup (i);
float64x2_t r = vsubq_f64 (z, shift);
@@ -125,14 +127,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t d2 = vmulq_f64 (d, d);
float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t two_over_fifteen_and_fortyfive
+ = vld1q_f64 (&dat->two_over_fifteen);
+
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
float64x2_t p1 = r;
float64x2_t p2
= vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
- float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
+ two_over_fifteen_and_fortyfive, 0);
p4 = vfmsq_f64 (dat->tenth, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
+ two_over_fifteen_and_fortyfive, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
float64x2_t p34 = vfmaq_f64 (p3, d, p4);
diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
index f1b3bfe8304c73b5..2f2f755c46e71b58 100644
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@@ -24,8 +24,8 @@ static const struct data
{
uint64x2_t offset, table_scale;
float64x2_t max, shift;
- float64x2_t p20, p40, p41, p42;
- float64x2_t p51, p52;
+ float64x2_t p20, p40, p41, p51;
+ double p42, p52;
double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
@@ -41,9 +41,9 @@ static const struct data
.p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
.p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
.p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
- .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
+ .p42 = 0x1.1111111111111p-3, /* 2/15. */
.p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
- .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
+ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
/* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
.qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
.qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
@@ -157,9 +157,10 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
float64x2_t p1 = r;
float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
- float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
+ float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
p4 = vfmsq_f64 (dat->p40, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */

glibc-RHEL-118273-18.patch Normal file
File diff suppressed because it is too large.

glibc-RHEL-118273-19.patch Normal file

@@ -0,0 +1,461 @@
commit 13a7ef5999de56add448a24fefb0250236271a06
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Mon Dec 9 15:58:47 2024 +0000
AArch64: Improve codegen in users of ADVSIMD expm1 helper
Add an inline helper for expm1 and rearrange operations so a MOV
is not necessary in the reduction or around the special-case handler.
Reduce memory access by using more indexed MLAs in the polynomial.
Speedup on Neoverse V1 for expm1 (19%), sinh (8.5%), and tanh (7.5%).
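In scalar terms the helper computes the following (sketch only: expm1_sketch and the degree-3 stand-in are ours; the real helper uses the degree-12 polynomial from V_EXPM1_DATA): reduce with i = round(x/ln2) and f = x - i*ln2, approximate p ~ expm1(f), then assemble expm1(x) = 2^i*(p + 1) - 1 as p*t + (t - 1) with t = 2^i - the form that needs no MOV around the final FMA.

#include <math.h>
#include <stdint.h>
#include <string.h>

static double
expm1_sketch (double x) /* assumes |x| below the overflow threshold. */
{
  const double inv_ln2 = 0x1.71547652b82fep0;
  const double ln2_hi = 0x1.62e42fefa39efp-1;
  const double ln2_lo = 0x1.abc9e3b39803fp-56;

  double n = round (x * inv_ln2);
  double f = x - n * ln2_hi - n * ln2_lo; /* f in [-ln2/2, ln2/2]. */

  /* p ~ expm1(f); degree-3 stand-in for the fitted polynomial. */
  double p = f + 0.5 * f * f + (1.0 / 6.0) * f * f * f;

  double t; /* t = 2^i, exact. */
  uint64_t ti = (uint64_t) ((int64_t) n + 1023) << 52;
  memcpy (&t, &ti, sizeof (t));
  return p * t + (t - 1.0); /* 2^i * (p + 1) - 1. */
}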
diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
index 3db3b80c49292947..f2042db8bcc8466a 100644
--- a/sysdeps/aarch64/fpu/expm1_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
@@ -18,31 +18,18 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_expm1_inline.h"
static const struct data
{
- float64x2_t poly[11];
- float64x2_t invln2;
- double ln2[2];
- float64x2_t shift;
- int64x2_t exponent_bias;
+ struct v_expm1_data d;
#if WANT_SIMD_EXCEPT
uint64x2_t thresh, tiny_bound;
#else
float64x2_t oflow_bound;
#endif
} data = {
- /* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
- .invln2 = V2 (0x1.71547652b82fep0),
- .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
- .shift = V2 (0x1.8p52),
- .exponent_bias = V2 (0x3ff0000000000000),
+ .d = V_EXPM1_DATA,
#if WANT_SIMD_EXCEPT
/* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
compare. */
@@ -58,67 +45,36 @@ static const struct data
};
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, uint64x2_t special, const struct data *d)
{
- return v_call_f64 (expm1, x, y, special);
+ return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d),
+ special);
}
/* Double-precision vector exp(x) - 1 function.
- The maximum error observed error is 2.18 ULP:
- _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
- want 0x1.a8b9ea8d66e2p-2. */
+ The maximum observed error is 2.05 ULP:
+ _ZGVnN2v_expm1(0x1.634902eaff3adp-2) got 0x1.a8b636e2a9388p-2
+ want 0x1.a8b636e2a9386p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
-
#if WANT_SIMD_EXCEPT
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint64x2_t special
= vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
- if (__glibc_unlikely (v_any_u64 (special)))
- x = v_zerofy_f64 (x, special);
#else
/* Large input, NaNs and Infs. */
uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
#endif
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (n);
- float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
- float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
- f = vfmsq_laneq_f64 (f, n, ln2, 1);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t f8 = vmulq_f64 (f4, f4);
- float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
- float64x2_t t = vreinterpretq_f64_s64 (u);
-
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (vreinterpretq_f64_u64 (ix),
- vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t),
- special);
+ return special_case (x, special, d);
/* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+ return expm1_inline (x, &d->d);
}
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
index 3e3b76c502b01e16..7adf771517de2507 100644
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -18,72 +18,31 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_expm1_inline.h"
static const struct data
{
- float64x2_t poly[11], inv_ln2;
- double m_ln2[2];
- float64x2_t shift;
+ struct v_expm1_data d;
uint64x2_t halff;
- int64x2_t onef;
#if WANT_SIMD_EXCEPT
uint64x2_t tiny_bound, thresh;
#else
- uint64x2_t large_bound;
+ float64x2_t large_bound;
#endif
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
-
- .inv_ln2 = V2 (0x1.71547652b82fep0),
- .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
- .shift = V2 (0x1.8p52),
-
+ .d = V_EXPM1_DATA,
.halff = V2 (0x3fe0000000000000),
- .onef = V2 (0x3ff0000000000000),
#if WANT_SIMD_EXCEPT
/* 2^-26, below which sinh(x) rounds to x. */
.tiny_bound = V2 (0x3e50000000000000),
/* asuint(large_bound) - asuint(tiny_bound). */
.thresh = V2 (0x0230000000000000),
#else
-/* 2^9. expm1 helper overflows for large input. */
- .large_bound = V2 (0x4080000000000000),
+ /* 2^9. expm1 helper overflows for large input. */
+ .large_bound = V2 (0x1p+9),
#endif
};
-static inline float64x2_t
-expm1_inline (float64x2_t x)
-{
- const struct data *d = ptr_barrier (&data);
-
- /* Reduce argument:
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where i = round(x / ln2)
- and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
- float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (j);
-
- float64x2_t m_ln2 = vld1q_f64 (d->m_ln2);
- float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0);
- f = vfmaq_laneq_f64 (f, j, m_ln2, 1);
- /* Approximate expm1(f) using polynomial. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t f8 = vmulq_f64 (f4, f4);
- float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
- /* t = 2^i. */
- float64x2_t t = vreinterpretq_f64_u64 (
- vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef)));
- /* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
-}
-
static float64x2_t NOINLINE VPCS_ATTR
special_case (float64x2_t x)
{
@@ -92,23 +51,23 @@ special_case (float64x2_t x)
/* Approximation for vector double-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
- The greatest observed error is 2.57 ULP:
- _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
- want 0x1.ab34e59d678d9p-2. */
+ The greatest observed error is 2.52 ULP:
+ _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
+ want -0x1.ac2f05bb66fc9p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t ax = vabsq_f64 (x);
- uint64x2_t sign
- = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
- float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff));
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ float64x2_t halfsign = vreinterpretq_f64_u64 (
+ vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
#if WANT_SIMD_EXCEPT
uint64x2_t special = vcgeq_u64 (
vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
#else
- uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound);
+ uint64x2_t special = vcageq_f64 (x, d->large_bound);
#endif
/* Fall back to scalar variant for all lanes if any of them are special. */
@@ -118,7 +77,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
- float64x2_t t = expm1_inline (ax);
+ float64x2_t t = expm1_inline (ax, &d->d);
t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
return vmulq_f64 (t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/tanh_advsimd.c b/sysdeps/aarch64/fpu/tanh_advsimd.c
index 1da1dfa5dbe418b6..402ba9d8ad2478a8 100644
--- a/sysdeps/aarch64/fpu/tanh_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanh_advsimd.c
@@ -18,68 +18,30 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#include "v_expm1_inline.h"
static const struct data
{
- float64x2_t poly[11];
- float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
- uint64x2_t onef;
+ struct v_expm1_data d;
uint64x2_t thresh, tiny_bound;
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
- V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
- V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
- V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
- V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
-
- .inv_ln2 = V2 (0x1.71547652b82fep0),
- .ln2_hi = V2 (-0x1.62e42fefa39efp-1),
- .ln2_lo = V2 (-0x1.abc9e3b39803fp-56),
- .shift = V2 (0x1.8p52),
-
- .onef = V2 (0x3ff0000000000000),
+ .d = V_EXPM1_DATA,
.tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
/* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
.thresh = V2 (0x01f241bf835f9d5f),
};
-static inline float64x2_t
-expm1_inline (float64x2_t x, const struct data *d)
-{
- /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
- the scalar variant of tanh. */
-
- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
- int64x2_t i = vcvtq_s64_f64 (j);
- float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi);
- f = vfmaq_f64 (f, j, d->ln2_lo);
-
- /* Approximate expm1(f) using polynomial. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t f4 = vmulq_f64 (f2, f2);
- float64x2_t p = vfmaq_f64 (
- f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly));
-
- /* t = 2 ^ i. */
- float64x2_t t = vreinterpretq_f64_u64 (
- vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef));
- /* expm1(x) = p * t + (t - 1). */
- return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t);
-}
-
static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, float64x2_t q, float64x2_t qp2,
+ uint64x2_t special)
{
- return v_call_f64 (tanh, x, y, special);
+ return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special);
}
/* Vector approximation for double-precision tanh(x), using a simplified
- version of expm1. The greatest observed error is 2.77 ULP:
- _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
- want -0x1.bd6a21a163624p-3. */
+ version of expm1. The greatest observed error is 2.70 ULP:
+ _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3
+ want -0x1.be5452a6459fbp-3. */
float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -100,10 +62,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
u = vaddq_f64 (u, u);
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- float64x2_t q = expm1_inline (u, d);
- float64x2_t qp2 = vaddq_f64 (q, v_f64 (2));
+ float64x2_t q = expm1_inline (u, &d->d);
+ float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0));
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, vdivq_f64 (q, qp2), special);
+ return special_case (x, q, qp2, special);
return vdivq_f64 (q, qp2);
}
diff --git a/sysdeps/aarch64/fpu/v_expm1_inline.h b/sysdeps/aarch64/fpu/v_expm1_inline.h
new file mode 100644
index 0000000000000000..a925183d4e5e4623
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_expm1_inline.h
@@ -0,0 +1,97 @@
+/* Double-precision inline helper for vector (Advanced SIMD) expm1 function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_EXPM1_INLINE_H
+#define AARCH64_FPU_V_EXPM1_INLINE_H
+
+#include "v_math.h"
+
+struct v_expm1_data
+{
+ float64x2_t c2, c4, c6, c8;
+ float64x2_t invln2;
+ int64x2_t exponent_bias;
+ double c1, c3, c5, c7, c9, c10;
+ double ln2[2];
+};
+
+/* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */
+#define V_EXPM1_DATA \
+ { \
+ .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \
+ .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \
+ .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \
+ .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \
+ .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \
+ .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \
+ .invln2 = V2 (0x1.71547652b82fep0), \
+ .exponent_bias = V2 (0x3ff0000000000000), \
+ }
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct v_expm1_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2));
+ int64x2_t i = vcvtq_s64_f64 (n);
+ float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0);
+ f = vfmsq_laneq_f64 (f, n, ln2, 1);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t lane_consts_13 = vld1q_f64 (&d->c1);
+ float64x2_t lane_consts_57 = vld1q_f64 (&d->c5);
+ float64x2_t lane_consts_910 = vld1q_f64 (&d->c9);
+ float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, f2, p23);
+ float64x2_t p47 = vfmaq_f64 (p45, f2, p67);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1);
+ p = vfmaq_f64 (p47, f4, p);
+ p = vfmaq_f64 (p03, f4, p);
+
+ p = vfmaq_f64 (f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+ float64x2_t t = vreinterpretq_f64_s64 (u);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 1daedfdd51cfc54b..c1fb88b5e027b322 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -21,7 +21,6 @@
#define AARCH64_FPU_V_EXPM1F_INLINE_H
#include "v_math.h"
-#include "math_config.h"
struct v_expm1f_data
{

glibc-RHEL-118273-2.patch Normal file (862 lines)

@@ -0,0 +1,862 @@
commit bdb5705b7bab618ed4445f4b17d4b1e4fbbf94a7
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:39 2024 +0000
aarch64/fpu: Add vector variants of cosh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
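The vector routines below get cosh from a single exponential. In scalar form
the identity they rely on is the following (a minimal sketch for illustration;
cosh_sketch is a hypothetical name, not part of the patch):

#include <math.h>

/* cosh(x) = (e^x + e^-x) / 2 = e^|x| / 2 + 1 / (2 * e^|x|), so one exp
   evaluation suffices; overflow of exp(|x|) is what special_bound in the
   vector code guards against.  */
static double
cosh_sketch (double x)
{
  double t = exp (fabs (x));
  return 0.5 * t + 0.5 / t;
}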
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 320b6ed43a9a454c..019c3a51880e2306 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -3,6 +3,7 @@ libmvec-supported-funcs = acos \
atan \
atan2 \
cos \
+ cosh \
erf \
exp \
exp10 \
@@ -32,7 +33,8 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
erf_data \
erff_data \
sv_erf_data \
- sv_erff_data
+ sv_erff_data \
+ v_exp_tail_data
endif
sve-cflags = -march=armv8-a+sve
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index d7b1e87191b66439..884b4b57f097635f 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -79,6 +79,11 @@ libmvec {
_ZGVsMxv_tan;
}
GLIBC_2.40 {
+ _ZGVnN2v_cosh;
+ _ZGVnN2v_coshf;
+ _ZGVnN4v_coshf;
+ _ZGVsMxv_cosh;
+ _ZGVsMxv_coshf;
_ZGVnN2v_erf;
_ZGVnN2v_erff;
_ZGVnN4v_erff;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index d8d88de2181569f9..c63b2948d4938b0d 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -21,6 +21,7 @@ libmvec_hidden_proto (V_NAME_F1(acos));
libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(cos));
+libmvec_hidden_proto (V_NAME_F1(cosh));
libmvec_hidden_proto (V_NAME_F1(erf));
libmvec_hidden_proto (V_NAME_F1(exp10));
libmvec_hidden_proto (V_NAME_F1(exp2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 71f53363a071126d..8ca55098706a54c2 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -49,6 +49,10 @@
# define __DECL_SIMD_cos __DECL_SIMD_aarch64
# undef __DECL_SIMD_cosf
# define __DECL_SIMD_cosf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_cosh
+# define __DECL_SIMD_cosh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_coshf
+# define __DECL_SIMD_coshf __DECL_SIMD_aarch64
# undef __DECL_SIMD_erf
# define __DECL_SIMD_erf __DECL_SIMD_aarch64
# undef __DECL_SIMD_erff
@@ -124,6 +128,7 @@ __vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
@@ -141,6 +146,7 @@ __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
@@ -163,6 +169,7 @@ __sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
@@ -180,6 +187,7 @@ __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c
new file mode 100644
index 0000000000000000..ec7b59637e973da9
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cosh_advsimd.c
@@ -0,0 +1,108 @@
+/* Double-precision vector (AdvSIMD) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+
+static const struct data
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2, ln2, shift, thres;
+ uint64x2_t index_mask, special_bound;
+} data = {
+ .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
+ V2 (0x1.5555576a59599p-5), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */
+ /* -ln2/N. */
+ .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64},
+ .shift = V2 (0x1.8p+52),
+ .thres = V2 (704.0),
+
+ .index_mask = V2 (0xff),
+ /* 0x1.6p9, above which exp overflows. */
+ .special_bound = V2 (0x4086000000000000),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from v_exp_tail, with no
+ special-case handling or tail. */
+static inline float64x2_t
+exp_inline (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* n = round(x/(ln2/N)). */
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n*ln2/N. */
+ float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
+ r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
+
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
+ uint64x2_t i = vandq_u64 (u, d->index_mask);
+
+ /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
+ float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r);
+ y = vfmaq_f64 (d->poly[0], y, r);
+ y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r);
+
+ /* s = 2^(n/N). */
+ u = v_lookup_u64 (__v_exp_tail_data, i);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ return vfmaq_f64 (s, y, s);
+}
+
+/* Approximation for vector double-precision cosh(x) using exp_inline.
+ cosh(x) = (exp(x) + exp(-x)) / 2.
+ The greatest observed error is in the scalar fall-back region, so is the
+ same as the scalar routine, 1.93 ULP:
+ _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
+ want 0x1.fdf28623ef923p+1021.
+
+ The greatest observed error in the non-special region is 1.54 ULP:
+ _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
+ want 0x1.f711dcb0c77b1p+7. */
+float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t special
+ = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound);
+
+ /* Up to the point that exp overflows, we can use it to calculate cosh by
+ exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
+ float64x2_t t = exp_inline (ax);
+ float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+ float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+
+ /* Fall back to scalar for any special cases. */
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, vaddq_f64 (half_t, half_over_t), special);
+
+ return vaddq_f64 (half_t, half_over_t);
+}
diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
new file mode 100644
index 0000000000000000..919f34604a452b4a
--- /dev/null
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -0,0 +1,105 @@
+/* Double-precision vector (SVE) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+static const struct data
+{
+ float64_t poly[3];
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ uint64_t index_mask, special_bound;
+} data = {
+ .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
+ 0x1.5555576a59599p-5, },
+
+ .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
+ /* -ln2/N. */
+ .ln2_hi = -0x1.62e42fefa39efp-9,
+ .ln2_lo = -0x1.abc9e3b39803f3p-64,
+ .shift = 0x1.8p+52,
+ .thres = 704.0,
+
+ .index_mask = 0xff,
+ /* 0x1.6p9, above which exp overflows. */
+ .special_bound = 0x4086000000000000,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
+ special-case handling or tail. */
+static inline svfloat64_t
+exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+{
+ /* Calculate exp(x). */
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
+ r = svmla_x (pg, r, n, d->ln2_lo);
+
+ svuint64_t u = svreinterpret_u64 (z);
+ svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
+ svuint64_t i = svand_x (pg, u, d->index_mask);
+
+ svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
+ y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
+ y = svmla_x (pg, sv_f64 (1.0), r, y);
+ y = svmul_x (pg, r, y);
+
+ /* s = 2^(n/N). */
+ u = svld1_gather_index (pg, __v_exp_tail_data, i);
+ svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
+
+ return svmla_x (pg, s, s, y);
+}
+
+/* Approximation for SVE double-precision cosh(x) using exp_inline.
+ cosh(x) = (exp(x) + exp(-x)) / 2.
+ The greatest observed error is in the scalar fall-back region, so is the
+ same as the scalar routine, 1.93 ULP:
+ _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
+ want 0x1.fd774e958236fp+1021.
+
+ The greatest observed error in the non-special region is 1.54 ULP:
+ _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
+ want 0x1.f5e2bb8d5c991p+8. */
+svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
+
+ /* Up to the point that exp overflows, we can use it to calculate cosh by
+ exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
+ svfloat64_t t = exp_inline (ax, pg, d);
+ svfloat64_t half_t = svmul_x (pg, t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
+
+ /* Fall back to scalar for any special cases. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+
+ return svadd_x (pg, half_t, half_over_t);
+}
diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c
new file mode 100644
index 0000000000000000..c1ab4923b826569b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/coshf_advsimd.c
@@ -0,0 +1,84 @@
+/* Single-precision vector (AdvSIMD) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_expf_inline.h"
+#include "v_math.h"
+
+static const struct data
+{
+ struct v_expf_data expf_consts;
+ uint32x4_t tiny_bound, special_bound;
+} data = {
+ .expf_consts = V_EXPF_DATA,
+ .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .special_bound = V4 (0x42ad496c),
+};
+
+#if !WANT_SIMD_EXCEPT
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (coshf, x, y, special);
+}
+#endif
+
+/* Single-precision vector cosh, using vector expf.
+ Maximum error is 2.38 ULP:
+ _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
+ want 0x1.6a4922p+4. */
+float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered correctly, fall back to the scalar
+ variant for all inputs if any input is a special value or above the bound
+ at which expf overflows. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return v_call_f32 (coshf, x, x, v_u32 (-1));
+
+ uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound);
+ /* If any input is tiny, avoid underflow exception by fixing tiny lanes of
+ input to 0, which will generate no exceptions. */
+ if (__glibc_unlikely (v_any_u32 (tiny)))
+ ax = v_zerofy_f32 (ax, tiny);
+#endif
+
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+ float32x4_t half_t = vmulq_n_f32 (t, 0.5);
+ float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
+
+#if WANT_SIMD_EXCEPT
+ if (__glibc_unlikely (v_any_u32 (tiny)))
+ return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
+#else
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, vaddq_f32 (half_t, half_over_t), special);
+#endif
+
+ return vaddq_f32 (half_t, half_over_t);
+}
+libmvec_hidden_def (V_NAME_F1 (cosh))
+HALF_WIDTH_ALIAS_F1 (cosh)
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
new file mode 100644
index 0000000000000000..e5d8a299c6aa7ceb
--- /dev/null
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -0,0 +1,59 @@
+/* Single-precision vector (SVE) cosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "sv_expf_inline.h"
+
+static const struct data
+{
+ struct sv_expf_data expf_consts;
+ uint32_t special_bound;
+} data = {
+ .expf_consts = SV_EXPF_DATA,
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .special_bound = 0x42ad496c,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+{
+ return sv_call_f32 (coshf, x, y, pg);
+}
+
+/* Single-precision vector cosh, using vector expf.
+ Maximum error is 1.89 ULP:
+ _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
+ want 0x1.f00adcp+127. */
+svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
+
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
+ svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
+ svfloat32_t half_t = svmul_x (pg, t, 0.5);
+ svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+
+ return svadd_x (pg, half_t, half_over_t);
+}
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
new file mode 100644
index 0000000000000000..23963b5f8ec89ead
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -0,0 +1,75 @@
+/* SVE helper for single-precision routines which depend on exp
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_SV_EXPF_INLINE_H
+#define AARCH64_FPU_SV_EXPF_INLINE_H
+
+#include "sv_math.h"
+
+struct sv_expf_data
+{
+ float poly[5];
+ float inv_ln2, ln2_hi, ln2_lo, shift;
+};
+
+/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
+ compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
+#define SV_EXPF_DATA \
+ { \
+ .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
+ 0x1.0e4020p-7f }, \
+ \
+ .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
+ .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
+ }
+
+#define C(i) sv_f32 (d->poly[i])
+
+static inline svfloat32_t
+expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
+{
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+
+ /* Load some constants in quad-word chunks to minimise memory access. */
+ svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
+
+ /* n = round(x/(ln2/N)). */
+ svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
+ svfloat32_t n = svsub_x (pg, z, d->shift);
+
+ /* r = x - n*ln2/N. */
+ svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
+ r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
+
+ /* scale = 2^(n/N). */
+ svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
+ svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
+ svfloat32_t r2 = svmul_f32_x (pg, r, r);
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+ svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+
+ return svmla_x (pg, scale, scale, poly);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 41fdb92d7ea6e707..b37cb7d5e9c0d96a 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
+VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf)
VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 8e3d64da420348a7..011f07d2c15b148f 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
+SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf)
SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 33ae92878f774ac3..35452991431e238a 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
+VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff)
VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index ac0464f196e7972f..bbc74ede88c9e6c8 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
+SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff)
SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
diff --git a/sysdeps/aarch64/fpu/v_exp_tail_data.c b/sysdeps/aarch64/fpu/v_exp_tail_data.c
new file mode 100644
index 0000000000000000..151e97c21bbc11ae
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_exp_tail_data.c
@@ -0,0 +1,110 @@
+/* Lookup table for high-precision exp(x, tail) function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "vecmath_config.h"
+
+/* 2^(j/N), j=0..N, N=2^8=256. */
+const uint64_t __v_exp_tail_data[] = {
+ 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
+ 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
+ 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
+ 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
+ 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
+ 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
+ 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
+ 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
+ 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
+ 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
+ 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
+ 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
+ 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
+ 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
+ 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
+ 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
+ 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
+ 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
+ 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
+ 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
+ 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
+ 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
+ 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
+ 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
+ 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
+ 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
+ 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
+ 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
+ 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
+ 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
+ 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
+ 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
+ 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
+ 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
+ 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
+ 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
+ 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
+ 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
+ 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
+ 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
+ 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
+ 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
+ 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
+ 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
+ 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
+ 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
+ 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
+ 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
+ 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
+ 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
+ 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
+ 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
+ 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
+ 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
+ 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
+ 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
+ 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
+ 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
+ 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
+ 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
+ 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
+ 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
+ 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
+ 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
+ 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
+ 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
+ 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
+ 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
+ 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
+ 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
+ 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
+ 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
+ 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
+ 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
+ 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
+ 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
+ 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
+ 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
+ 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
+ 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
+ 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
+ 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
+ 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
+ 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
+ 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
+ 0x3feff9d96b2a23d9,
+};
diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
new file mode 100644
index 0000000000000000..a3b0e32f9eb42021
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@@ -0,0 +1,71 @@
+/* Helper for single-precision AdvSIMD routines which depend on exp
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_EXPF_INLINE_H
+#define AARCH64_FPU_V_EXPF_INLINE_H
+
+#include "v_math.h"
+
+struct v_expf_data
+{
+ float32x4_t poly[5];
+ float32x4_t shift, invln2_and_ln2;
+};
+
+/* maxerr: 1.45358 +0.5 ulp. */
+#define V_EXPF_DATA \
+ { \
+ .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
+ V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
+ .shift = V4 (0x1.8p23f), \
+ .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ }
+
+#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
+#define C(i) d->poly[i]
+
+static inline float32x4_t
+v_expf_inline (float32x4_t x, const struct v_expf_data *d)
+{
+ /* Helper routine for calculating exp(x).
+ Copied from v_expf.c, with all special-case handling removed - the
+ calling routine should handle special values if required. */
+
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t n, r, z;
+ z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
+ n = vsubq_f32 (z, d->shift);
+ r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
+ r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+
+ /* Custom order-4 Estrin avoids building high order monomial. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p, q, poly;
+ p = vfmaq_f32 (C (1), C (0), r);
+ q = vfmaq_f32 (C (3), C (2), r);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (C (4), r);
+ poly = vfmaq_f32 (p, q, r2);
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
index 409c0c9bd9b85422..3f0b5f476433ca06 100644
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -59,6 +59,8 @@ extern const struct v_log_data
} table[1 << V_LOG_TABLE_BITS];
} __v_log_data attribute_hidden;
+#define V_EXP_TAIL_TABLE_BITS 8
+extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] attribute_hidden;
#define V_EXP_TABLE_BITS 7
extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] attribute_hidden;
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index f1103a245645476b..48d747ad5793be96 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -701,11 +701,19 @@ double: 2
float: 2
ldouble: 2
+Function: "cosh_advsimd":
+double: 2
+float: 2
+
Function: "cosh_downward":
double: 3
float: 1
ldouble: 3
+Function: "cosh_sve":
+double: 2
+float: 2
+
Function: "cosh_towardzero":
double: 3
float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 6193518fb001cc92..f66da42c3630bf48 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -73,8 +73,13 @@ GLIBC_2.39 _ZGVsMxv_tan F
GLIBC_2.39 _ZGVsMxv_tanf F
GLIBC_2.39 _ZGVsMxvv_atan2 F
GLIBC_2.39 _ZGVsMxvv_atan2f F
+GLIBC_2.40 _ZGVnN2v_cosh F
+GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
+GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
+GLIBC_2.40 _ZGVsMxv_cosh F
+GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F
GLIBC_2.40 _ZGVsMxv_erff F

glibc-RHEL-118273-20.patch Normal file (359 lines)

@@ -0,0 +1,359 @@
commit ca0c0d0f26fbf75b9cacc65122b457e8fdec40b8
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Mon Dec 9 15:55:39 2024 +0000
AArch64: Improve codegen in users of ADVSIMD log1p helper
Add inline helper for log1p and rearrange operations so MOV
is not necessary in reduction or around the special-case handler.
Reduce memory access by using more indexed MLAs in polynomial.
Speedup on Neoverse V1 for log1p (3.5%), acosh (7.5%) and atanh (10%).
Conflicts:
sysdeps/aarch64/fpu/log1p_advsimd.c
(Fixup context to apply without out-of-scope dependency 751a5502)
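For reference, the identity that lets atanh reuse the log1p helper, in scalar
form (an illustrative sketch; atanh_sketch is a hypothetical name, not part of
the patch):

#include <math.h>

/* atanh(x) = 0.5 * log ((1 + x) / (1 - x))
            = 0.5 * log1p (2x / (1 - x)) for |x| < 1.
   The vector routine computes this on |x| and restores the sign via the
   halfsign factor.  */
static double
atanh_sketch (double x)
{
  double ax = fabs (x);
  return copysign (0.5 * log1p (2.0 * ax / (1.0 - ax)), x);
}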
diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c
index c88283cf1191f4eb..a98f4a2e4d8cbf42 100644
--- a/sysdeps/aarch64/fpu/acosh_advsimd.c
+++ b/sysdeps/aarch64/fpu/acosh_advsimd.c
@@ -54,9 +54,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
#endif
- float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
- float64x2_t y;
- y = vaddq_f64 (x, v_f64 (1));
+ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0));
+ float64x2_t y = vaddq_f64 (x, v_f64 (1.0));
y = vmulq_f64 (y, xm1);
y = vsqrtq_f64 (y);
y = vaddq_f64 (xm1, y);
diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c
index 3c3d0bd6ad41396d..eb9769aeac29cf15 100644
--- a/sysdeps/aarch64/fpu/atanh_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanh_advsimd.c
@@ -23,15 +23,19 @@
const static struct data
{
struct v_log1p_data log1p_consts;
- uint64x2_t one, half;
+ uint64x2_t one;
+ uint64x2_t sign_mask;
} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
.one = V2 (0x3ff0000000000000),
- .half = V2 (0x3fe0000000000000) };
+ .sign_mask = V2 (0x8000000000000000) };
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
+ uint64x2_t special, const struct data *d)
{
- return v_call_f64 (atanh, x, y, special);
+ y = log1p_inline (y, &d->log1p_consts);
+ return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
+ vmulq_f64 (halfsign, y), special);
}
/* Approximation for vector double-precision atanh(x) using modified log1p.
@@ -43,11 +47,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
+ float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
float64x2_t ax = vabsq_f64 (x);
uint64x2_t ia = vreinterpretq_u64_f64 (ax);
- uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
uint64x2_t special = vcgeq_u64 (ia, d->one);
- float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
#if WANT_SIMD_EXCEPT
ax = v_zerofy_f64 (ax, special);
@@ -55,10 +58,15 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
float64x2_t y;
y = vaddq_f64 (ax, ax);
- y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
- y = log1p_inline (y, &d->log1p_consts);
+ y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, vmulq_f64 (y, halfsign), special);
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special, d);
+#else
+ return special_case (ax, halfsign, y, special, d);
+#endif
+
+ y = log1p_inline (y, &d->log1p_consts);
return vmulq_f64 (y, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c
index ffc418fc9c24be28..9d18578ce6497787 100644
--- a/sysdeps/aarch64/fpu/log1p_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1p_advsimd.c
@@ -17,43 +17,26 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "v_math.h"
-#include "poly_advsimd_f64.h"
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
const static struct data
{
- float64x2_t poly[19], ln2[2];
- uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one;
- int64x2_t one_top;
-} data = {
- /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2),
- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3),
- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3),
- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4),
- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4),
- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4),
- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4),
- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5),
- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4),
- V2 (-0x1.cfa7385bdb37ep-6) },
- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) },
- /* top32(asuint64(sqrt(2)/2)) << 32. */
- .hf_rt2_top = V2 (0x3fe6a09e00000000),
- /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
- .one_m_hf_rt2_top = V2 (0x00095f6200000000),
- .umask = V2 (0x000fffff00000000),
- .one_top = V2 (0x3ff),
- .inf = V2 (0x7ff0000000000000),
- .minus_one = V2 (0xbff0000000000000)
-};
+ struct v_log1p_data d;
+ uint64x2_t inf, minus_one;
+} data = { .d = V_LOG1P_CONSTANTS_TABLE,
+ .inf = V2 (0x7ff0000000000000),
+ .minus_one = V2 (0xbff0000000000000) };
#define BottomMask v_u64 (0xffffffff)
-static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, uint64x2_t cmp, const struct data *d)
{
- return v_call_f64 (log1p, x, y, special);
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float64x2_t x_nospecial = v_zerofy_f64 (x, cmp);
+ return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp);
}
/* Vector log1p approximation using polynomial on reduced interval. Routine is
@@ -66,64 +49,12 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
const struct data *d = ptr_barrier (&data);
uint64x2_t ix = vreinterpretq_u64_f64 (x);
uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
- uint64x2_t special = vcgeq_u64 (ia, d->inf);
-#if WANT_SIMD_EXCEPT
- special = vorrq_u64 (special,
- vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1))));
- if (__glibc_unlikely (v_any_u64 (special)))
- x = v_zerofy_f64 (x, special);
-#else
- special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1)));
-#endif
+ uint64x2_t special_cases
+ = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one));
- /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
- is in [sqrt(2)/2, sqrt(2)]):
- log1p(x) = k*log(2) + log1p(f).
+ if (__glibc_unlikely (v_any_u64 (special_cases)))
+ return special_case (x, special_cases, d);
- f may not be representable exactly, so we need a correction term:
- let m = round(1 + x), c = (1 + x) - m.
- c << m: at very small x, log1p(x) ~ x, hence:
- log(1+x) - log(m) ~ c/m.
-
- We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
-
- /* Obtain correctly scaled k by manipulation in the exponent.
- The scalar algorithm casts down to 32-bit at this point to calculate k and
- u_red. We stay in double-width to obtain f and k, using the same constants
- as the scalar algorithm but shifted left by 32. */
- float64x2_t m = vaddq_f64 (x, v_f64 (1));
- uint64x2_t mi = vreinterpretq_u64_f64 (m);
- uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
-
- int64x2_t ki
- = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
- float64x2_t k = vcvtq_f64_s64 (ki);
-
- /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
- uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
- uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
-
- /* Correction term c/m. */
- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
-
- /* Approximate log1p(x) on the reduced input using a polynomial. Because
- log1p(0)=0 we choose an approximation of the form:
- x + C0*x^2 + C1*x^3 + C2x^4 + ...
- Hence approximation has the form f + f^2 * P(f)
- where P(x) = C0 + C1*x + C2x^2 + ...
- Assembling this all correctly is dealt with at the final step. */
- float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
-
- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
- float64x2_t y = vaddq_f64 (ylo, yhi);
-
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p),
- special);
-
- return vfmaq_f64 (y, f2, p);
+ return log1p_inline (x, &d->d);
}
diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h
index 242e43b6eecc0b6e..834ff65adf34ed4a 100644
--- a/sysdeps/aarch64/fpu/v_log1p_inline.h
+++ b/sysdeps/aarch64/fpu/v_log1p_inline.h
@@ -21,29 +21,30 @@
#define AARCH64_FPU_V_LOG1P_INLINE_H
#include "v_math.h"
-#include "poly_advsimd_f64.h"
struct v_log1p_data
{
- float64x2_t poly[19], ln2[2];
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
int64x2_t one_top;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
+ double ln2[2];
};
/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
#define V_LOG1P_CONSTANTS_TABLE \
{ \
- .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \
- V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \
- V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \
- V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \
- V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \
- V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \
- V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \
- V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \
- V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \
- V2 (-0x1.cfa7385bdb37ep-6) }, \
- .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \
+ .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \
+ .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \
+ .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \
+ .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \
+ .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \
+ .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \
+ .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \
+ .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \
+ .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \
+ .c18 = -0x1.cfa7385bdb37ep-6, \
+ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \
.hf_rt2_top = V2 (0x3fe6a09e00000000), \
.one_m_hf_rt2_top = V2 (0x00095f6200000000), \
.umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
@@ -51,19 +52,45 @@ struct v_log1p_data
#define BottomMask v_u64 (0xffffffff)
+static inline float64x2_t
+eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1718 = vld1q_f64 (&d->c17);
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1);
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0);
+ float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1);
+ p = vfmaq_f64 (p1415, m2, p);
+ p = vfmaq_f64 (p1213, m2, p);
+ p = vfmaq_f64 (p1011, m2, p);
+ p = vfmaq_f64 (p89, m2, p);
+ p = vfmaq_f64 (p67, m2, p);
+ p = vfmaq_f64 (p45, m2, p);
+ p = vfmaq_f64 (p23, m2, p);
+ return vfmaq_f64 (p01, m2, p);
+}
+
static inline float64x2_t
log1p_inline (float64x2_t x, const struct v_log1p_data *d)
{
- /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
- modifications:
+ /* Helper for calculating log(x + 1):
- No special-case handling - this should be dealt with by the caller.
- - Pairwise Horner polynomial evaluation for improved accuracy.
- Optionally simulate the shortcut for k=0, used in the scalar routine,
- using v_sel, for improved accuracy when the argument to log1p is close to
- 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
- the source of the caller before including this file.
- See v_log1pf_2u1.c for details of the algorithm. */
- float64x2_t m = vaddq_f64 (x, v_f64 (1));
+ using v_sel, for improved accuracy when the argument to log1p is close
+ to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1
+ in the source of the caller before including this file. */
+ float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
uint64x2_t mi = vreinterpretq_u64_f64 (m);
uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
@@ -74,14 +101,14 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d)
/* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
- float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
/* Correction term c/m. */
- float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
#ifndef WANT_V_LOG1P_K0_SHORTCUT
-#error \
- "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+# error \
+ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
#elif WANT_V_LOG1P_K0_SHORTCUT
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
that the approximation is solely the polynomial. */
@@ -92,11 +119,12 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d)
/* Approximate log1p(f) on the reduced input using a polynomial. */
float64x2_t f2 = vmulq_f64 (f, f);
- float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
+ float64x2_t p = eval_poly (f, f2, d);
/* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
- float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
- float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
+ float64x2_t ln2 = vld1q_f64 (&d->ln2[0]);
+ float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1);
+ float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0);
return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
}
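
The layout change above — even coefficients kept as splatted vectors, odd coefficients stored as adjacent scalar pairs — exists so that a single vld1q_f64 can feed two vfmaq_laneq_f64 instructions. A minimal standalone sketch of that scheme (made-up degree-3 coefficients, not the glibc polynomial; AArch64 only):

#include <arm_neon.h>
#include <stdio.h>

/* Odd coefficients kept adjacent so one LDR loads both (made-up values). */
static const double odd_coeffs[2] = { 0.5, 0.25 }; /* c1, c3 */

static float64x2_t
poly_eval (float64x2_t x)
{
  float64x2_t c0 = vdupq_n_f64 (1.0);       /* made-up even coefficients */
  float64x2_t c2 = vdupq_n_f64 (0.125);
  float64x2_t c13 = vld1q_f64 (odd_coeffs); /* one load feeds two FMAs */
  float64x2_t x2 = vmulq_f64 (x, x);
  /* Pairwise Horner: p(x) = (c0 + c1*x) + x^2*(c2 + c3*x). */
  float64x2_t p01 = vfmaq_laneq_f64 (c0, x, c13, 0);
  float64x2_t p23 = vfmaq_laneq_f64 (c2, x, c13, 1);
  return vfmaq_f64 (p01, x2, p23);
}

int
main (void)
{
  float64x2_t y = poly_eval (vdupq_n_f64 (0.5));
  printf ("%a\n", vgetq_lane_f64 (y, 0));
  return 0;
}

Because the lane index is an immediate, each pair of odd coefficients costs one load and no extra DUPs, which is the memory-access saving these codegen patches target.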

216
glibc-RHEL-118273-21.patch Normal file

@@ -0,0 +1,216 @@
commit 569cfaaf4984ae70b23c61ee28a609b5aef93fea
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Mon Dec 9 15:53:04 2024 +0000
AArch64: Improve codegen in AdvSIMD pow
Remove spurious ADRP. Improve memory access by shuffling constants and
using more indexed MLAs.
A few more optimisations with no impact on accuracy:
- force FMA contraction
- switch from shift-aided rint to the rint instruction
Between 1% and 5% throughput improvement on Neoverse V1, depending on the
benchmark.
diff --git a/sysdeps/aarch64/fpu/pow_advsimd.c b/sysdeps/aarch64/fpu/pow_advsimd.c
index 3c91e3e183599e3e..81e134ac2f0bd2f5 100644
--- a/sysdeps/aarch64/fpu/pow_advsimd.c
+++ b/sysdeps/aarch64/fpu/pow_advsimd.c
@@ -22,9 +22,6 @@
/* Defines parameters of the approximation and scalar fallback. */
#include "finite_pow.h"
-#define VecSmallExp v_u64 (SmallExp)
-#define VecThresExp v_u64 (ThresExp)
-
#define VecSmallPowX v_u64 (SmallPowX)
#define VecThresPowX v_u64 (ThresPowX)
#define VecSmallPowY v_u64 (SmallPowY)
@@ -32,36 +29,48 @@
static const struct data
{
- float64x2_t log_poly[6];
- float64x2_t exp_poly[3];
- float64x2_t ln2_hi, ln2_lo;
- float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n, small_powx;
uint64x2_t inf;
+ float64x2_t small_powx;
+ uint64x2_t offset, mask;
+ uint64x2_t mask_sub_0, mask_sub_1;
+ float64x2_t log_c0, log_c2, log_c4, log_c5;
+ double log_c1, log_c3;
+ double ln2_lo, ln2_hi;
+ uint64x2_t small_exp, thres_exp;
+ double ln2_lo_n, ln2_hi_n;
+ double inv_ln2_n, exp_c2;
+ float64x2_t exp_c0, exp_c1;
} data = {
+ /* Power threshold. */
+ .inf = V2 (0x7ff0000000000000),
+ .small_powx = V2 (0x1p-126),
+ .offset = V2 (Off),
+ .mask = V2 (0xfffULL << 52),
+ .mask_sub_0 = V2 (1ULL << 52),
+ .mask_sub_1 = V2 (52ULL << 52),
/* Coefficients copied from v_pow_log_data.c
relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
Coefficients are scaled to match the scaling during evaluation. */
- .log_poly
- = { V2 (0x1.555555555556p-2 * -2), V2 (-0x1.0000000000006p-2 * -2),
- V2 (0x1.999999959554ep-3 * 4), V2 (-0x1.555555529a47ap-3 * 4),
- V2 (0x1.2495b9b4845e9p-3 * -8), V2 (-0x1.0002b8b263fc3p-3 * -8) },
- .ln2_hi = V2 (0x1.62e42fefa3800p-1),
- .ln2_lo = V2 (0x1.ef35793c76730p-45),
+ .log_c0 = V2 (0x1.555555555556p-2 * -2),
+ .log_c1 = -0x1.0000000000006p-2 * -2,
+ .log_c2 = V2 (0x1.999999959554ep-3 * 4),
+ .log_c3 = -0x1.555555529a47ap-3 * 4,
+ .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8),
+ .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8),
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
/* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
(0.550 without fma) if |x| < ln2/512. */
- .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3),
- V2 (0x1.5555576a5adcep-5) },
- .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */
- .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */
- .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */
- .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45),
- .small_powx = V2 (0x1p-126),
- .inf = V2 (0x7ff0000000000000)
+ .exp_c0 = V2 (0x1.fffffffffffd4p-2),
+ .exp_c1 = V2 (0x1.5555571d6ef9p-3),
+ .exp_c2 = 0x1.5555576a5adcep-5,
+ .small_exp = V2 (0x3c90000000000000),
+ .thres_exp = V2 (0x03f0000000000000),
+ .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */
+ .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */
+ .ln2_lo_n = -0x1.c610ca86c3899p-45,
};
-#define A(i) data.log_poly[i]
-#define C(i) data.exp_poly[i]
-
/* This version implements an algorithm close to scalar pow but
- does not implement the trick in the exp's specialcase subroutine to avoid
double-rounding,
@@ -91,10 +100,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off));
- int64x2_t k
- = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
- uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52)));
+ uint64x2_t tmp = vsubq_u64 (ix, d->offset);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
float64x2_t kd = vcvtq_f64_s64 (k);
/* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
@@ -105,9 +113,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
|z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
/* k*Ln2 + log(c) + r. */
- float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi);
+ float64x2_t ln2 = vld1q_f64 (&d->ln2_lo);
+ float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1);
float64x2_t t2 = vaddq_f64 (t1, r);
- float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo);
+ float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0);
float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r);
/* Evaluation is optimized assuming superscalar pipelined execution. */
float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r);
@@ -118,9 +127,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r);
float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2);
/* p = log1p(r) - r - A[0]*r*r. */
- float64x2_t a56 = vfmaq_f64 (A (4), r, A (5));
- float64x2_t a34 = vfmaq_f64 (A (2), r, A (3));
- float64x2_t a12 = vfmaq_f64 (A (0), r, A (1));
+ float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1);
+ float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5);
+ float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1);
+ float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0);
float64x2_t p = vfmaq_f64 (a34, ar2, a56);
p = vfmaq_f64 (a12, ar2, p);
p = vmulq_f64 (ar3, p);
@@ -140,28 +150,28 @@ exp_special_case (float64x2_t x, float64x2_t xtail)
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
static inline float64x2_t
-v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
+v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d)
{
/* Fallback to scalar exp_inline for all lanes if any lane
contains value of x s.t. |x| <= 2^-54 or >= 512. */
- uint64x2_t abstop
- = vshrq_n_u64 (vandq_u64 (vreinterpretq_u64_f64 (x), d->inf), 52);
- uint64x2_t uoflowx
- = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp);
+ uint64x2_t uoflowx = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp),
+ d->thres_exp);
if (__glibc_unlikely (v_any_u64 (uoflowx)))
- return exp_special_case (x, xtail);
+ return exp_special_case (x, vnegq_f64 (neg_xtail));
/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
/* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
- float64x2_t z = vmulq_f64 (d->inv_ln2_n, x);
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
- float64x2_t kd = vaddq_f64 (z, d->shift);
- uint64x2_t ki = vreinterpretq_u64_f64 (kd);
- kd = vsubq_f64 (kd, d->shift);
- float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n);
- r = vfmsq_f64 (r, kd, d->ln2_lo_n);
+ float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n);
+ float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0);
+ float64x2_t kd = vrndnq_f64 (z);
+ uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z));
+ float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n);
+ float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1);
+ r = vfmsq_laneq_f64 (r, kd, ln2_n, 0);
/* The code assumes 2^-200 < |xtail| < 2^-8/N. */
- r = vaddq_f64 (r, xtail);
+ r = vsubq_f64 (r, neg_xtail);
/* 2^(k/N) ~= scale. */
uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
@@ -170,8 +180,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
sbits = vaddq_u64 (sbits, top);
/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t tmp = vfmaq_f64 (C (1), r, C (2));
- tmp = vfmaq_f64 (C (0), r, tmp);
+ float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1);
+ tmp = vfmaq_f64 (d->exp_c0, r, tmp);
tmp = vfmaq_f64 (r, r2, tmp);
float64x2_t scale = vreinterpretq_f64_u64 (sbits);
/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
@@ -230,8 +240,8 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
{
/* Normalize subnormal x so exponent becomes negative. */
uint64x2_t vix_norm = vreinterpretq_u64_f64 (
- vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (v_u64 (1ULL << 52)))));
- vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52));
+ vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0))));
+ vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1);
vix = vbslq_u64 (sub_x, vix_norm, vix);
}
}
@@ -242,8 +252,7 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
/* Vector Exp(y_loghi, y_loglo). */
float64x2_t vehi = vmulq_f64 (y, vhi);
- float64x2_t velo = vmulq_f64 (y, vlo);
float64x2_t vemi = vfmsq_f64 (vehi, y, vhi);
- velo = vsubq_f64 (velo, vemi);
- return v_exp_inline (vehi, velo, d);
+ float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo);
+ return v_exp_inline (vehi, neg_velo, d);
}
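
The "shift-aided rint" this patch drops, and the rint/convert instructions that replace it, can be contrasted in scalar code. A sketch, not the glibc code, assuming the default round-to-nearest mode and |z| < 2^51:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  double z = 123.6;

  /* Old idiom: shift-aided rint.  Adding 0x1.8p52 pushes the integer
     part of z into the low mantissa bits (valid for |z| < 2^51), so the
     rounded value and its integer bits come from one addition.  */
  double shift = 0x1.8p52;
  double kd = (z + shift) - shift;
  double biased = z + shift;
  uint64_t ki;
  memcpy (&ki, &biased, sizeof ki);   /* integer k in the low bits.  */

  /* New idiom: dedicated instructions.  The vector code uses vrndnq_f64
     and vcvtaq_s64_f64; rint/llrint are the closest scalar analogues in
     the default rounding mode.  */
  double kd2 = rint (z);
  long long ki2 = llrint (z);

  printf ("%g %g %llu %lld\n", kd, kd2,
          (unsigned long long) (ki & 0x7ff), ki2);
  return 0;
}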

501
glibc-RHEL-118273-22.patch Normal file

@@ -0,0 +1,501 @@
commit cff9648d0b50d19cdaf685f6767add040d4e1a8e
Author: Joana Cruz <Joana.Cruz@arm.com>
Date: Tue Dec 17 14:50:33 2024 +0000
AArch64: Improve codegen of AdvSIMD expf family
Load the polynomial evaluation coefficients into 2 vectors and use lanewise MLAs.
Also use intrinsics instead of native operations.
expf: 3% improvement in the throughput microbenchmark on Neoverse V1;
exp2f: 5%, exp10f: 13%, coshf: 14%.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c
index c1ab4923b826569b..cd5c86652129ea9c 100644
--- a/sysdeps/aarch64/fpu/coshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/coshf_advsimd.c
@@ -23,19 +23,27 @@
static const struct data
{
struct v_expf_data expf_consts;
- uint32x4_t tiny_bound, special_bound;
+ uint32x4_t tiny_bound;
+ float32x4_t bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special_bound;
+#endif
} data = {
.expf_consts = V_EXPF_DATA,
.tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .bound = V4 (0x1.5a92d8p+6),
+#if WANT_SIMD_EXCEPT
.special_bound = V4 (0x42ad496c),
+#endif
};
#if !WANT_SIMD_EXCEPT
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
+ uint32x4_t special)
{
- return v_call_f32 (coshf, x, y, special);
+ return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
}
#endif
@@ -47,14 +55,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
-
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered correctly, fall back to the scalar
variant for all inputs if any input is a special value or above the bound
at which expf overflows. */
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
if (__glibc_unlikely (v_any_u32 (special)))
return v_call_f32 (coshf, x, x, v_u32 (-1));
@@ -63,10 +70,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
input to 0, which will generate no exceptions. */
if (__glibc_unlikely (v_any_u32 (tiny)))
ax = v_zerofy_f32 (ax, tiny);
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+#else
+ uint32x4_t special = vcageq_f32 (x, d->bound);
+ float32x4_t t = v_expf_inline (x, &d->expf_consts);
#endif
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
- float32x4_t t = v_expf_inline (ax, &d->expf_consts);
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
@@ -75,7 +85,7 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
#else
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vaddq_f32 (half_t, half_over_t), special);
+ return special_case (x, half_t, half_over_t, special);
#endif
return vaddq_f32 (half_t, half_over_t);
diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
index cf53e73290fcedb6..55d9cd83f2968ab9 100644
--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
@@ -18,16 +18,15 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
#define ScaleBound 192.0f
static const struct data
{
- float32x4_t poly[5];
- float log10_2_and_inv[4];
- float32x4_t shift;
-
+ float32x4_t c0, c1, c3;
+ float log10_2_high, log10_2_low, c2, c4;
+ float32x4_t inv_log10_2, special_bound;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh;
#endif
@@ -37,19 +36,24 @@ static const struct data
rel error: 0x1.89dafa3p-24
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
maxerr: 1.85943 +0.5 ulp. */
- .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
- V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
- .shift = V4 (0x1.8p23f),
-
- /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
- .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
+ .c0 = V4 (0x1.26bb16p+1f),
+ .c1 = V4 (0x1.5350d2p+1f),
+ .c2 = 0x1.04744ap+1f,
+ .c3 = V4 (0x1.2d8176p+0f),
+ .c4 = 0x1.12b41ap-1f,
+ .inv_log10_2 = V4 (0x1.a934fp+1),
+ .log10_2_high = 0x1.344136p-2,
+ .log10_2_low = 0x1.ec10cp-27,
+ /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+ .special_bound = V4 (126.0f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.scale_thresh = V4 (ScaleBound)
#endif
};
-#define ExponentBias v_u32 (0x3f800000)
-
#if WANT_SIMD_EXCEPT
# define SpecialBound 38.0f /* rint(log10(2^127)). */
@@ -67,17 +71,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
+# define SpecialBound 126.0f
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@@ -112,23 +114,23 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
- float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv);
- float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0);
- float32x4_t n = vsubq_f32 (z, d->shift);
- float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1);
- r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2);
- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
+ r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t poly
- = vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
- v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);
+ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
+ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
+ float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
+ float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c
index 69e0b193a1a91249..a4220da63c624490 100644
--- a/sysdeps/aarch64/fpu/exp2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c
@@ -21,24 +21,28 @@
static const struct data
{
- float32x4_t poly[5];
- uint32x4_t exponent_bias;
+ float32x4_t c1, c3;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
- float32x4_t special_bound, scale_thresh;
+ float32x4_t scale_thresh, special_bound;
#endif
+ float c0, c2, c4, zero;
} data = {
/* maxerr: 1.962 ulp. */
- .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
- V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
+ .c0 = 0x1.59977ap-10f,
+ .c1 = V4 (0x1.3ce9e4p-7f),
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = V4 (0x1.ebf9bcp-3f),
+ .c4 = 0x1.62e422p-1f,
.exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
#endif
};
-#define C(i) d->poly[i]
-
#if WANT_SIMD_EXCEPT
# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
@@ -55,16 +59,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@@ -80,13 +81,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly;
- uint32x4_t cmp, e;
#if WANT_SIMD_EXCEPT
/* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
- cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
float32x4_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
special_case to fix special lanes later. This is only necessary if fenv
@@ -95,23 +94,24 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
x = vbslq_f32 (cmp, v_f32 (1), x);
#endif
- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
- n = vrndaq_f32 (x);
- r = vsubq_f32 (x, n);
- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = vsubq_f32 (x, n);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t c024 = vld1q_f32 (&d->c0);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_laneq_f32 (r, c024, 2);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
index 5c9cb726205ece6e..70f137e2e5b46207 100644
--- a/sysdeps/aarch64/fpu/expf_advsimd.c
+++ b/sysdeps/aarch64/fpu/expf_advsimd.c
@@ -21,20 +21,25 @@
static const struct data
{
- float32x4_t poly[5];
- float32x4_t inv_ln2, ln2_hi, ln2_lo;
- uint32x4_t exponent_bias;
+ float32x4_t c1, c3, c4, inv_ln2;
+ float ln2_hi, ln2_lo, c0, c2;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
#endif
} data = {
/* maxerr: 1.45358 +0.5 ulp. */
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
+ .c0 = 0x1.0e4020p-7f,
+ .c1 = V4 (0x1.573e2ep-5f),
+ .c2 = 0x1.555e66p-3f,
+ .c3 = V4 (0x1.fffdb6p-2f),
+ .c4 = V4 (0x1.ffffecp-1f),
.inv_ln2 = V4 (0x1.715476p+0f),
- .ln2_hi = V4 (0x1.62e4p-1f),
- .ln2_lo = V4 (0x1.7f7d1cp-20f),
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
.exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
@@ -59,19 +64,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
+ // (s2 + p*s2)*s1 = s2(p+1)s1
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
/* Similar to r1 but avoids double rounding in the subnormal range. */
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
@@ -84,12 +87,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly;
- uint32x4_t cmp, e;
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
#if WANT_SIMD_EXCEPT
/* asuint(x) - TinyBound >= BigBound - TinyBound. */
- cmp = vcgeq_u32 (
+ uint32x4_t cmp = vcgeq_u32 (
vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
TinyBound),
SpecialBound);
@@ -103,22 +105,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
- r = vfmsq_f32 (x, n, d->ln2_hi);
- r = vfmsq_f32 (r, n, d->ln2_lo);
- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
index 08b06e0a6b34b4f4..eacd2af24161fe3a 100644
--- a/sysdeps/aarch64/fpu/v_expf_inline.h
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@@ -24,50 +24,45 @@
struct v_expf_data
{
- float32x4_t poly[5];
- float32x4_t shift;
- float invln2_and_ln2[4];
+ float ln2_hi, ln2_lo, c0, c2;
+ float32x4_t inv_ln2, c1, c3, c4;
+ /* asuint(1.0f). */
+ uint32x4_t exponent_bias;
};
/* maxerr: 1.45358 +0.5 ulp. */
#define V_EXPF_DATA \
{ \
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
- .shift = V4 (0x1.8p23f), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \
+ .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \
}
-#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
-#define C(i) d->poly[i]
-
static inline float32x4_t
v_expf_inline (float32x4_t x, const struct v_expf_data *d)
{
- /* Helper routine for calculating exp(x).
+ /* Helper routine for calculating exp(ax).
Copied from v_expf.c, with all special-case handling removed - the
calling routine should handle special values if required. */
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- float32x4_t n, r, z;
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0);
- n = vsubq_f32 (z, d->shift);
- r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1);
- r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2);
- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+ /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
/* Custom order-4 Estrin avoids building high order monomial. */
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t p, q, poly;
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
return vfmaq_f32 (scale, poly, scale);
}
-
#endif
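
The expf path above reduces x = n*ln2 + r with r in [-ln2/2, ln2/2], then computes exp(x) = 2^n * (1 + poly(r)), applying 2^n with integer arithmetic on the exponent field (the vshlq_n_u32/exponent_bias step); exp2f and exp10f do the analogous reduction in their own bases. A scalar sketch of the expf case (the polynomial is a Taylor stand-in, not the real minimax coefficients; valid for |n| < 126):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float
expf_sketch (float x)
{
  const float inv_ln2 = 0x1.715476p+0f;
  const float ln2_hi = 0x1.62e4p-1f, ln2_lo = 0x1.7f7d1cp-20f;

  float n = rintf (x * inv_ln2);
  float r = x - n * ln2_hi;           /* two-step subtraction keeps r accurate */
  r = r - n * ln2_lo;

  /* Stand-in for the degree-4 minimax poly: exp(r) - 1 ~ r + r^2/2 + r^3/6. */
  float poly = r + 0.5f * r * r + (1.0f / 6.0f) * r * r * r;

  /* scale = 2^n via exponent-field arithmetic, valid for |n| < 126. */
  uint32_t e = ((uint32_t) (int32_t) n) << 23;
  uint32_t sbits = e + 0x3f800000u;   /* exponent_bias = asuint(1.0f) */
  float scale;
  memcpy (&scale, &sbits, sizeof scale);

  return scale + poly * scale;        /* 2^n * (1 + poly(r)) */
}

int
main (void)
{
  printf ("%a vs %a\n", expf_sketch (1.0f), expf (1.0f));
  return 0;
}

When n falls outside the exponent-arithmetic range, the routines above split 2^n into s1*s2 in special_case instead; the sketch omits that path.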

362
glibc-RHEL-118273-23.patch Normal file

@@ -0,0 +1,362 @@
commit 91c1fadba338752bf514cd4cca057b27b1b10eed
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Fri Jan 3 19:09:05 2025 +0000
AArch64: Improve codegen for SVE log1pf users
Reduce memory access by using lanewise MLA and reduce the number of MOVPRFXs.
Move the log1pf implementation to an inline helper function.
Speedup on Neoverse V1 for log1pf (10%), acoshf (-1%), atanhf (2%), asinhf (2%).
Conflicts:
sysdeps/aarch64/fpu/log1pf_sve.c
(Fixup context to apply without out-of-scope dependency 751a5502)
diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c
index 2110894e629500be..491365e24d692f0f 100644
--- a/sysdeps/aarch64/fpu/acoshf_sve.c
+++ b/sysdeps/aarch64/fpu/acoshf_sve.c
@@ -17,23 +17,26 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+#include "sv_math.h"
+#include "sv_log1pf_inline.h"
+
#define One 0x3f800000
#define Thres 0x20000000 /* asuint(0x1p64) - One. */
-#include "sv_log1pf_inline.h"
-
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special)
{
+ svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f);
+ svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ());
return sv_call_f32 (acoshf, x, y, special);
}
/* Single-precision SVE acosh(x) routine. Implements the same algorithm as
vector acoshf and log1p.
- Maximum error is 2.78 ULPs:
- SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4
- want 0x1.f45b3cp-4. */
+ Maximum error is 2.47 ULPs:
+ SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4
+ want 0x1.e435a2p-4. */
svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
{
svuint32_t ix = svreinterpret_u32 (x);
@@ -41,9 +44,9 @@ svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
- svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
+ svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u));
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
- return y;
+ return special_case (xm1, tmp, special);
+ return sv_log1pf_inline (tmp, pg);
}
diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c
index d85c3a685c0b83ff..b7f253bf32fb9478 100644
--- a/sysdeps/aarch64/fpu/asinhf_sve.c
+++ b/sysdeps/aarch64/fpu/asinhf_sve.c
@@ -20,20 +20,23 @@
#include "sv_math.h"
#include "sv_log1pf_inline.h"
-#define BigBound (0x5f800000) /* asuint(0x1p64). */
+#define BigBound 0x5f800000 /* asuint(0x1p64). */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special)
{
+ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+ y = svreinterpret_f32 (
+ svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y)));
return sv_call_f32 (asinhf, x, y, special);
}
/* Single-precision SVE asinh(x) routine. Implements the same algorithm as
vector asinhf and log1p.
- Maximum error is 2.48 ULPs:
- SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
- want 0x1.ffbbb8p-4. */
+ Maximum error is 1.92 ULPs:
+ SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2
+ want -0x1.fd0bc8p-2. */
svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
{
svfloat32_t ax = svabs_x (pg, x);
@@ -49,8 +52,6 @@ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
= sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (
- x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))),
- special);
+ return special_case (iax, sign, y, special);
return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y)));
}
diff --git a/sysdeps/aarch64/fpu/atanhf_sve.c b/sysdeps/aarch64/fpu/atanhf_sve.c
index dae83041ef7157f0..2d3005bbc88393ec 100644
--- a/sysdeps/aarch64/fpu/atanhf_sve.c
+++ b/sysdeps/aarch64/fpu/atanhf_sve.c
@@ -17,21 +17,25 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+#include "sv_math.h"
#include "sv_log1pf_inline.h"
#define One (0x3f800000)
#define Half (0x3f000000)
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign,
+ svfloat32_t y, svbool_t special)
{
+ svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign));
+ y = svmul_x (svptrue_b32 (), halfsign, y);
return sv_call_f32 (atanhf, x, y, special);
}
/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 2.28 ULP:
- _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
- want 0x1.ffbbb6p-5. */
+ The maximum error is 1.99 ULP:
+ _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5
+ want 0x1.f1f4f6p-5. */
svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
{
svfloat32_t ax = svabs_x (pg, x);
@@ -48,7 +52,7 @@ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
y = sv_log1pf_inline (y, pg);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmul_x (pg, halfsign, y), special);
+ return special_case (iax, sign, halfsign, y, special);
return svmul_x (pg, halfsign, y);
}
diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
index f645cc997e430bcb..4f17c44e2d96039a 100644
--- a/sysdeps/aarch64/fpu/log1pf_sve.c
+++ b/sysdeps/aarch64/fpu/log1pf_sve.c
@@ -18,30 +18,13 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f32.h"
-
-static const struct data
-{
- float poly[8];
- float ln2, exp_bias;
- uint32_t four, three_quarters;
-} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as
- this can be fmov-ed directly instead of including it in
- the main load-and-mla polynomial schedule. */
- 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
- 0x1.abcb6p-4f, -0x1.6f0d5ep-5f},
- .ln2 = 0x1.62e43p-1f,
- .exp_bias = 0x1p-23f,
- .four = 0x40800000,
- .three_quarters = 0x3f400000};
-
-#define SignExponentMask 0xff800000
+#include "sv_log1pf_inline.h"
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t special)
{
- return sv_call_f32 (log1pf, x, y, special);
+ return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()),
+ special);
}
/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
@@ -50,51 +33,12 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
want 0x1.9f323ep-2. */
svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
{
- const struct data *d = ptr_barrier (&data);
/* x < -1, Inf/Nan. */
svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000);
special = svorn_z (pg, special, svcmpge (pg, x, -1));
- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
- svfloat32_t m = svadd_x (pg, x, 1);
-
- /* Choose k to scale x to the range [-1/4, 1/2]. */
- svint32_t k
- = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
- sv_s32 (SignExponentMask));
-
- /* Scale x by exponent manipulation. */
- svfloat32_t m_scale = svreinterpret_f32 (
- svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
-
- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
- m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25));
-
- /* Evaluate polynomial on reduced interval. */
- svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale),
- ms4 = svmul_x (pg, ms2, ms2);
- svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly);
- p = svmad_x (pg, m_scale, p, -0.5);
- p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
-
- /* The scale factor to be applied back at the end - by multiplying float(k)
- by 2^-23 we get the unbiased exponent of k. */
- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias);
-
- /* Apply the scaling back. */
- svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2);
-
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
+ return special_case (x, special);
- return y;
+ return sv_log1pf_inline (x, pg);
}
diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
index b94b2da055a6c59b..850297d61556740c 100644
--- a/sysdeps/aarch64/fpu/sv_log1pf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
@@ -22,55 +22,76 @@
#include "sv_math.h"
#include "vecmath_config.h"
-#include "poly_sve_f32.h"
+
+#define SignExponentMask 0xff800000
static const struct sv_log1pf_data
{
- float32_t poly[9];
- float32_t ln2;
- float32_t scale_back;
+ float c0, c2, c4, c6;
+ float c1, c3, c5, c7;
+ float ln2, exp_bias, quarter;
+ uint32_t four, three_quarters;
} sv_log1pf_data = {
- /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
- .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
- -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
- -0x1.6f0d5ep-5f },
- .scale_back = 0x1.0p-23f,
- .ln2 = 0x1.62e43p-1f,
+ /* Do not store first term of polynomial, which is -0.5, as
+ this can be fmov-ed directly instead of including it in
+ the main load-and-mla polynomial schedule. */
+ .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f,
+ .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f,
+ .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f,
+ .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000,
+ .three_quarters = 0x3f400000,
};
-static inline svfloat32_t
-eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg)
-{
- svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1]));
- svfloat32_t m2 = svmul_x (pg, m, m);
- svfloat32_t q = svmla_x (pg, m, m2, p_12);
- svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2);
- p = svmul_x (pg, m2, p);
-
- return svmla_x (pg, q, m2, p);
-}
-
static inline svfloat32_t
sv_log1pf_inline (svfloat32_t x, svbool_t pg)
{
const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);
- svfloat32_t m = svadd_x (pg, x, 1.0f);
-
- svint32_t ks = svsub_x (pg, svreinterpret_s32 (m),
- svreinterpret_s32 (svdup_f32 (0.75f)));
- ks = svand_x (pg, ks, 0xff800000);
- svuint32_t k = svreinterpret_u32 (ks);
- svfloat32_t s = svreinterpret_f32 (
- svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k));
-
- svfloat32_t m_scale
- = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k));
- m_scale
- = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
- svfloat32_t p = eval_poly (m_scale, d->poly, pg);
- svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back);
- return svmla_x (pg, p, scale_back, d->ln2);
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then scale by
+ k*log(2). Instead of doing this directly, we use an intermediate
+ scale factor s = 4*k*log(2) to ensure the scale is representable
+ as a normalised fp32 number. */
+ svfloat32_t m = svadd_x (pg, x, 1);
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
+ svint32_t k
+ = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
+ sv_s32 (SignExponentMask));
+
+ /* Scale x by exponent manipulation. */
+ svfloat32_t m_scale = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
+ svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2);
+ m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2));
+
+ /* Evaluate polynomial on reduced interval. */
+ svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale);
+
+ svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1);
+ svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0);
+ svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1);
+ svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2);
+ svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3);
+
+ svfloat32_t p = svmla_x (pg, p45, p67, ms2);
+ p = svmla_x (pg, p23, p, ms2);
+ p = svmla_x (pg, p01, p, ms2);
+
+ p = svmad_x (pg, m_scale, p, -0.5);
+ p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p));
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
+ svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1);
+ return svmla_lane_f32 (p, scale_back, fconst, 0);
}
#endif
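
The reduction in sv_log1pf_inline writes x + 1 = (1 + m) * 2^k with m in [-0.25, 0.5], extracting k purely with integer arithmetic on the float bit pattern. A scalar sketch of that reduction (stand-in polynomial, no special-case handling, not the glibc code):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t asu (float f) { uint32_t u; memcpy (&u, &f, 4); return u; }
static float asf (uint32_t u) { float f; memcpy (&f, &u, 4); return f; }

static float
log1pf_sketch (float x)
{
  float m = x + 1.0f;

  /* Choose k from the sign+exponent bits of m - 0.75 (as bits). */
  int32_t k = (int32_t) ((asu (m) - 0x3f400000u) & 0xff800000u);

  /* m_scale = x / 2^k via exponent manipulation. */
  float m_scale = asf (asu (x) - (uint32_t) k);

  /* s = 4 / 2^k as a normalised float; folding in s/4 - 1 turns m_scale
     into (x + 1)/2^k - 1, i.e. the reduced m. */
  float s = asf (0x40800000u - (uint32_t) k);
  m_scale = m_scale + (-1.0f + s * 0.25f);

  /* Stand-in for the degree-8 minimax polynomial on [-0.25, 0.5]. */
  float p = m_scale - 0.5f * m_scale * m_scale
            + (1.0f / 3.0f) * m_scale * m_scale * m_scale;

  /* float(k) * 2^-23 recovers the unbiased exponent of k. */
  float scale_back = (float) k * 0x1p-23f;
  return p + scale_back * 0x1.62e43p-1f;   /* + k*log(2) */
}

int
main (void)
{
  printf ("%a vs %a\n", log1pf_sketch (3.0f), log1pf (3.0f));
  return 0;
}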

258
glibc-RHEL-118273-24.patch Normal file

@@ -0,0 +1,258 @@
commit 140b985e5a2071000122b3cb63ebfe88cf21dd29
Author: Luna Lamb <luna.lamb@arm.com>
Date: Fri Jan 3 19:00:12 2025 +0000
AArch64: Improve codegen in AdvSIMD asinh
Improves memory access and removes spills.
Load the polynomial evaluation coefficients into 2 vectors and use lanewise
MLAs. Reduces MOVs 6->3, LDR 11->5, STR/STP 2->0, ADRP 3->2.
diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
index 6207e7da9531f48d..2739f98b390edca7 100644
--- a/sysdeps/aarch64/fpu/asinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@@ -20,41 +20,71 @@
#include "v_math.h"
#include "poly_advsimd_f64.h"
-#define A(i) v_f64 (__v_log_data.poly[i])
-#define N (1 << V_LOG_TABLE_BITS)
-#define IndexMask (N - 1)
-
const static struct data
{
- float64x2_t poly[18];
- uint64x2_t off, huge_bound, abs_mask;
- float64x2_t ln2, tiny_bound;
+ uint64x2_t huge_bound, abs_mask, off, mask;
+#if WANT_SIMD_EXCEPT
+ float64x2_t tiny_bound;
+#endif
+ float64x2_t lc0, lc2;
+ double lc1, lc3, ln2, lc4;
+
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
+ double c1, c3, c5, c7, c9, c11, c13, c15;
+
} data = {
- .off = V2 (0x3fe6900900000000),
- .ln2 = V2 (0x1.62e42fefa39efp-1),
- .huge_bound = V2 (0x5fe0000000000000),
+
+#if WANT_SIMD_EXCEPT
.tiny_bound = V2 (0x1p-26),
- .abs_mask = V2 (0x7fffffffffffffff),
+#endif
/* Even terms of polynomial s.t. asinh(x) is approximated by
asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
- .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4),
- V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6),
- V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6),
- V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7),
- V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7),
- V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8),
- V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9),
- V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12),
- V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) },
+
+ .c0 = V2 (-0x1.55555555554a7p-3),
+ .c1 = 0x1.3333333326c7p-4,
+ .c2 = V2 (-0x1.6db6db68332e6p-5),
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c4 = V2 (-0x1.6e8b8b654a621p-6),
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c6 = V2 (-0x1.c9871d10885afp-7),
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c8 = V2 (-0x1.3ddca533e9f54p-7),
+ .c9 = 0x1.0becef748dafcp-7,
+ .c10 = V2 (-0x1.b90c7099dd397p-8),
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c12 = V2 (-0x1.d217026a669ecp-9),
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c14 = V2 (-0x1.e0f37daef9127p-11),
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c16 = V2 (-0x1.021a48685e287p-14),
+ .c17 = V2 (0x1.93d4ba83d34dap-18),
+
+ .lc0 = V2 (-0x1.ffffffffffff7p-2),
+ .lc1 = 0x1.55555555170d4p-2,
+ .lc2 = V2 (-0x1.0000000399c27p-2),
+ .lc3 = 0x1.999b2e90e94cap-3,
+ .lc4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+
+ .off = V2 (0x3fe6900900000000),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ .mask = V2 (0xfffULL << 52),
};
static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
+ uint64x2_t special)
{
+ /* Copy sign. */
+ y = vbslq_f64 (abs_mask, y, x);
return v_call_f64 (asinh, x, y, special);
}
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
struct entry
{
float64x2_t invc;
@@ -76,27 +106,34 @@ lookup (uint64x2_t i)
}
static inline float64x2_t
-log_inline (float64x2_t x, const struct data *d)
+log_inline (float64x2_t xm, const struct data *d)
{
- /* Double-precision vector log, copied from ordinary vector log with some
- cosmetic modification and special-cases removed. */
- uint64x2_t ix = vreinterpretq_u64_f64 (x);
- uint64x2_t tmp = vsubq_u64 (ix, d->off);
- int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
- uint64x2_t iz
- = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52)));
+
+ uint64x2_t u = vreinterpretq_u64_f64 (xm);
+ uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
+ uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask));
float64x2_t z = vreinterpretq_f64_u64 (iz);
- struct entry e = lookup (tmp);
+
+ struct entry e = lookup (u_off);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
float64x2_t kd = vcvtq_f64_s64 (k);
- float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+
+ /* hi = r + log(c) + k*Ln2. */
+ float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2);
+ float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0);
+
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t odd_coeffs = vld1q_f64 (&d->lc1);
float64x2_t r2 = vmulq_f64 (r, r);
- float64x2_t y = vfmaq_f64 (A (2), A (3), r);
- float64x2_t p = vfmaq_f64 (A (0), A (1), r);
- y = vfmaq_f64 (y, A (4), r2);
- y = vfmaq_f64 (p, y, r2);
- y = vfmaq_f64 (hi, y, r2);
- return y;
+ float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1);
+ float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0);
+ y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1);
+ y = vfmaq_f64 (p, r2, y);
+ return vfmaq_f64 (hi, y, r2);
}
/* Double-precision implementation of vector asinh(x).
@@ -106,23 +143,24 @@ log_inline (float64x2_t x, const struct data *d)
asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
= sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
where log(x) is an optimized log approximation, and P(x) is a polynomial
- shared with the scalar routine. The greatest observed error 3.29 ULP, in
+ shared with the scalar routine. The greatest observed error 2.79 ULP, in
|x| >= 1:
- __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
- want 0x1.ffffcfd0e2352p-1. */
+ _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1
+ want 0x1.ffffd003219ddp-1. */
VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
-
float64x2_t ax = vabsq_f64 (x);
- uint64x2_t iax = vreinterpretq_u64_f64 (ax);
uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
- uint64x2_t special = vcgeq_u64 (iax, d->huge_bound);
#if WANT_SIMD_EXCEPT
+ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
+ uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound));
uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
special = vorrq_u64 (special, tiny);
+#else
+ uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound));
#endif
/* Option 1: |x| >= 1.
@@ -147,19 +185,45 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
overflow, and tiny lanes, which will underflow, by setting them to 0. They
will be fixed later, either by selecting x or falling back to the scalar
special-case. The largest observed error in this region is 1.47 ULPs:
- __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
- want 0x1.c1d6bf874019cp-1. */
+ _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+ want 0x1.c1d6bf874019cp-1. */
float64x2_t option_2 = v_f64 (0);
+
if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1))))
{
+
#if WANT_SIMD_EXCEPT
ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
#endif
- float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2),
- z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2),
- z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8);
- float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly);
- option_2 = vfmaq_f64 (ax, p, x3);
+ float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2);
+ /* Order-17 Pairwise Horner scheme. */
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1);
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1);
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1);
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1);
+ float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17);
+
+ float64x2_t p = vfmaq_f64 (p1415, z2, p1617);
+ p = vfmaq_f64 (p1213, z2, p);
+ p = vfmaq_f64 (p1011, z2, p);
+ p = vfmaq_f64 (p89, z2, p);
+
+ p = vfmaq_f64 (p67, z2, p);
+ p = vfmaq_f64 (p45, z2, p);
+
+ p = vfmaq_f64 (p23, z2, p);
+
+ p = vfmaq_f64 (p01, z2, p);
+ option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2));
#if WANT_SIMD_EXCEPT
option_2 = vbslq_f64 (tiny, x, option_2);
#endif
@@ -167,10 +231,10 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
/* Choose the right option for each lane. */
float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
- /* Copy sign. */
- y = vbslq_f64 (d->abs_mask, y, x);
-
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, y, special);
- return y;
+ {
+ return special_case (x, y, d->abs_mask, special);
+ }
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
}
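Note: the "Order-17 Pairwise Horner scheme" in the hunk above pairs adjacent coefficients with one FMA each and then folds the pairs together through powers of the squared argument, shortening the dependency chain relative to plain Horner. A minimal scalar sketch of the same idea (a toy degree-5 polynomial with a hypothetical coefficient array, not the patch's degree-17 case):

static double
pairwise_horner_5 (double x, const double c[6])
{
  double x2 = x * x;
  /* Independent FMAs: all three pairs can issue in parallel.  */
  double p01 = c[0] + x * c[1];
  double p23 = c[2] + x * c[3];
  double p45 = c[4] + x * c[5];
  /* Fold the pairs together with x^2.  */
  return p01 + x2 * (p23 + x2 * p45);
}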
221
glibc-RHEL-118273-25.patch Normal file
@@ -0,0 +1,221 @@
commit f86b4cf87581cf1e45702b07880679ffa0b1f47a
Author: Luna Lamb <luna.lamb@arm.com>
Date: Fri Jan 3 20:15:17 2025 +0000
AArch64: Improve codegen in SVE expm1f and users
Use unpredicated muls, use absolute compare and improve memory access.
Expm1f, sinhf and tanhf show 7%, 5% and 1% improvement in throughput
microbenchmark on Neoverse V1.
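As a standalone illustration of the first point (a hedged sketch, not patch code): when every lane of the result is consumed, passing an all-true predicate to the _x intrinsic forms frees the compiler to pick the unpredicated FMUL encoding and drop the MOVPRFX that a governing predicate can otherwise force:

#include <arm_sve.h>

/* Governed by pg: may cost an extra MOVPRFX.  */
svfloat32_t square_pred (svbool_t pg, svfloat32_t f)
{
  return svmul_x (pg, f, f);
}

/* All-true predicate: eligible for the unpredicated FMUL form.  */
svfloat32_t square_unpred (svfloat32_t f)
{
  return svmul_x (svptrue_b32 (), f, f);
}

The "absolute compare" part corresponds to the svacgt/svaclt calls visible in the tanhf hunk below, which fold the fabs into the comparison itself.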
diff --git a/sysdeps/aarch64/fpu/expm1f_sve.c b/sysdeps/aarch64/fpu/expm1f_sve.c
index 7c852125cdbd0a2b..05a66400d477b819 100644
--- a/sysdeps/aarch64/fpu/expm1f_sve.c
+++ b/sysdeps/aarch64/fpu/expm1f_sve.c
@@ -18,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f32.h"
/* Largest value of x for which expm1(x) should round to -1. */
#define SpecialBound 0x1.5ebc4p+6f
@@ -28,20 +27,17 @@ static const struct data
/* These 4 are grouped together so they can be loaded as one quadword, then
used with _lane forms of svmla/svmls. */
float c2, c4, ln2_hi, ln2_lo;
- float c0, c1, c3, inv_ln2, special_bound, shift;
+ float c0, inv_ln2, c1, c3, special_bound;
} data = {
/* Generated using fpminimax. */
.c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3,
.c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7,
- .c4 = 0x1.6b55a2p-10,
+ .c4 = 0x1.6b55a2p-10, .inv_ln2 = 0x1.715476p+0f,
+ .special_bound = SpecialBound, .ln2_lo = 0x1.7f7d1cp-20f,
+ .ln2_hi = 0x1.62e4p-1f,
- .special_bound = SpecialBound, .shift = 0x1.8p23f,
- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,
- .ln2_lo = 0x1.7f7d1cp-20f,
};
-#define C(i) sv_f32 (d->c##i)
-
static svfloat32_t NOINLINE
special_case (svfloat32_t x, svbool_t pg)
{
@@ -71,9 +67,8 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
- j = svsub_x (pg, j, d->shift);
- svint32_t i = svcvt_s32_x (pg, j);
+ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2);
+ j = svrinta_x (pg, j);
svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
f = svmls_lane (f, j, lane_constants, 3);
@@ -83,17 +78,17 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
- svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
svfloat32_t p = svmla_x (pg, p12, f2, p34);
- p = svmla_x (pg, C (0), f, p);
+
+ p = svmla_x (pg, sv_f32 (d->c0), f, p);
p = svmla_x (pg, f, f2, p);
/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
- svfloat32_t t = svreinterpret_f32 (
- svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000));
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
+ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
}
diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c
index 6c204b57a2aa18d3..50dd386774b005ca 100644
--- a/sysdeps/aarch64/fpu/sinhf_sve.c
+++ b/sysdeps/aarch64/fpu/sinhf_sve.c
@@ -63,5 +63,5 @@ svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
if (__glibc_unlikely (svptest_any (pg, special)))
return special_case (x, svmul_x (pg, t, halfsign), special);
- return svmul_x (pg, t, halfsign);
+ return svmul_x (svptrue_b32 (), t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/sv_expm1f_inline.h b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
index 5b7245122294e1b4..e46ddda5437dc826 100644
--- a/sysdeps/aarch64/fpu/sv_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
@@ -27,21 +27,18 @@ struct sv_expm1f_data
/* These 4 are grouped together so they can be loaded as one quadword, then
used with _lane forms of svmla/svmls. */
float32_t c2, c4, ln2_hi, ln2_lo;
- float32_t c0, c1, c3, inv_ln2, shift;
+ float c0, inv_ln2, c1, c3, special_bound;
};
/* Coefficients generated using fpminimax. */
#define SV_EXPM1F_DATA \
{ \
- .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \
- .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .inv_ln2 = 0x1.715476p+0f, \
+ .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, \
\
- .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
- .ln2_lo = 0x1.7f7d1cp-20f, \
+ .c4 = 0x1.6b55a2p-10, .ln2_lo = 0x1.7f7d1cp-20f, .ln2_hi = 0x1.62e4p-1f, \
}
-#define C(i) sv_f32 (d->c##i)
-
static inline svfloat32_t
expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
{
@@ -55,9 +52,8 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
- j = svsub_x (pg, j, d->shift);
- svint32_t i = svcvt_s32_x (pg, j);
+ svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2);
+ j = svrinta_x (pg, j);
svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
f = svmls_lane (f, j, lane_constants, 3);
@@ -67,18 +63,18 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
- svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
- svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f);
svfloat32_t p = svmla_x (pg, p12, f2, p34);
- p = svmla_x (pg, C (0), f, p);
+ p = svmla_x (pg, sv_f32 (d->c0), f, p);
p = svmla_x (pg, f, f2, p);
/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
- svfloat32_t t = svscale_x (pg, sv_f32 (1), i);
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j));
+ return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t);
}
#endif
diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c
index 0b94523cf5074200..80dd679346f13f37 100644
--- a/sysdeps/aarch64/fpu/tanhf_sve.c
+++ b/sysdeps/aarch64/fpu/tanhf_sve.c
@@ -19,20 +19,27 @@
#include "sv_expm1f_inline.h"
+/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). */
+#define BoringBound 0x1.205966p+3f
+
static const struct data
{
struct sv_expm1f_data expm1f_consts;
- uint32_t boring_bound, onef;
+ uint32_t onef, special_bound;
+ float boring_bound;
} data = {
.expm1f_consts = SV_EXPM1F_DATA,
- /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
- .boring_bound = 0x41102cb3,
.onef = 0x3f800000,
+ .special_bound = 0x7f800000,
+ .boring_bound = BoringBound,
};
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring,
+ svfloat32_t boring, svfloat32_t q, svbool_t special)
{
+ svfloat32_t y
+ = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0)));
return sv_call_f32 (tanhf, x, y, special);
}
@@ -47,15 +54,16 @@ svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
svfloat32_t ax = svabs_x (pg, x);
svuint32_t iax = svreinterpret_u32 (ax);
svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
- svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound);
svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
-
- svbool_t special = svcmpgt (pg, iax, 0x7f800000);
+ svbool_t special = svcmpgt (pg, iax, d->special_bound);
+ svbool_t is_boring = svacgt (pg, x, d->boring_bound);
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts);
- svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+ svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg,
+ &d->expm1f_consts);
+
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svsel_f32 (is_boring, boring, y), special);
+ return special_case (x, pg, is_boring, boring, q, special);
+ svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
return svsel_f32 (is_boring, boring, y);
}
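For reference, the q/(q+2) form computed above is standard algebra on the identity quoted in the comment (since e^{2x} = expm1(2x) + 1):

\tanh(x) \;=\; \frac{e^{2x} - 1}{e^{2x} + 1}
        \;=\; \frac{\operatorname{expm1}(2x)}{\operatorname{expm1}(2x) + 2}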
125
glibc-RHEL-118273-26.patch Normal file
@@ -0,0 +1,125 @@
commit d3f2b71ef1d146137a25dd1367d97a14fac341c6
Author: Yury Khrustalev <yury.khrustalev@arm.com>
Date: Tue Nov 26 11:38:30 2024 +0000
aarch64: Fix tests not compatible with targets supporting GCS
- Add GCS marking to some of the tests when target supports GCS
- Fix tst-ro-dynamic-mod.map linker script to avoid removing
GNU properties
- Add header with macros for GNU properties
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
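For orientation, the NT_GNU_PROPERTY_TYPE_0 note that the GNU_PROPERTY macro below assembles has the following layout (a descriptive sketch only; the struct and field names are illustrative, while the values mirror the macro's .word directives):

/* One AArch64 feature property, 8-byte aligned, as emitted below.  */
struct gnu_property_note
{
  unsigned int n_namesz;  /* 4: sizeof "GNU" including the NUL.  */
  unsigned int n_descsz;  /* 16: size of the single property.  */
  unsigned int n_type;    /* 5: NT_GNU_PROPERTY_TYPE_0.  */
  char n_name[4];         /* "GNU".  */
  unsigned int pr_type;   /* FEATURE_1_AND (0xc0000000).  */
  unsigned int pr_datasz; /* 4.  */
  unsigned int pr_data;   /* e.g. FEATURE_1_BTI | FEATURE_1_PAC | FEATURE_1_GCS.  */
  unsigned int pr_pad;    /* 0: pads the descriptor to 8 bytes.  */
};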
diff --git a/elf/tst-asm-helper.h b/elf/tst-asm-helper.h
new file mode 100644
index 0000000000000000..6f91ac2ddc54d3f9
--- /dev/null
+++ b/elf/tst-asm-helper.h
@@ -0,0 +1,49 @@
+/* Test header that defines macros for GNU properties that need to be
+ used in some test assembly files where sysdep.h cannot be included
+ for some reason.
+ Copyright (C) 2024-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+#define FEATURE_1_GCS 4
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
+
+/* Add GNU property note with the supported features to all asm code
+ where sysdep.h is included. */
+#if HAVE_AARCH64_BTI && HAVE_AARCH64_PAC_RET
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC|FEATURE_1_GCS)
+#elif HAVE_AARCH64_BTI
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_GCS)
+#endif
diff --git a/elf/tst-big-note-lib.S b/elf/tst-big-note-lib.S
index 5eb1e03cfbe2cee8..cfd31137e85a1335 100644
--- a/elf/tst-big-note-lib.S
+++ b/elf/tst-big-note-lib.S
@@ -20,6 +20,8 @@
On a typical Linux system with 8MiB "ulimit -s", that was enough
to trigger stack overflow in open_verify. */
+#include "tst-asm-helper.h"
+
#define NOTE_SIZE 8*1024*1024
.pushsection .note.big,"a"
diff --git a/elf/tst-ro-dynamic-mod.map b/elf/tst-ro-dynamic-mod.map
index 2fe4a2998cddd587..2a158480c07d9691 100644
--- a/elf/tst-ro-dynamic-mod.map
+++ b/elf/tst-ro-dynamic-mod.map
@@ -3,14 +3,13 @@ SECTIONS
. = SIZEOF_HEADERS;
.dynamic : { *(.dynamic) } :text :dynamic
.rodata : { *(.data*) *(.bss*) } :text
- /DISCARD/ : {
- *(.note.gnu.property)
- }
- .note : { *(.note.*) } :text :note
+ .note : { *(.note) } :text :note
+ .note.gnu.property : { *(.note.gnu.property) } :text :gnu_property
}
PHDRS
{
text PT_LOAD FLAGS(5) FILEHDR PHDRS;
dynamic PT_DYNAMIC FLAGS(4);
note PT_NOTE FLAGS(4);
+ gnu_property PT_GNU_PROPERTY FLAGS(4);
}
diff --git a/sysdeps/aarch64/tst-vpcs-mod.S b/sysdeps/aarch64/tst-vpcs-mod.S
index 19b01c3c3859e13b..b3b5824eda1fb076 100644
--- a/sysdeps/aarch64/tst-vpcs-mod.S
+++ b/sysdeps/aarch64/tst-vpcs-mod.S
@@ -17,6 +17,8 @@
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */
+#include "tst-asm-helper.h"
+
.variant_pcs vpcs_call
.global vpcs_call
.type vpcs_call, %function
@@ -121,7 +123,7 @@ vpcs_call_regs:
/* Emulate a BL using B, but save x30 before the branch. */
adr x30, .L_return_addr
stp x30, x29, [x1, 240]
- b vpcs_call
+ bl vpcs_call
.L_return_addr:
/* Restore callee-saved registers. */
241
glibc-RHEL-118273-27.patch Normal file
@@ -0,0 +1,241 @@
commit 95e807209b680257a9afe81a507754f1565dbb4d
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Thu Feb 13 18:03:04 2025 +0000
AArch64: Improve codegen for SVE powf
Improve memory access with indexed/unpredicated instructions.
Eliminate register spills. Speedup on Neoverse V1: 3%.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
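A recurring micro-optimisation in this patch and the next few, shown here as a standalone sketch (hypothetical helper names): multiplications by a power of two are strength-reduced to an add or a shift, avoiding the predicated integer MUL:

#include <arm_sve.h>

/* Before: predicated MUL by the constant 2.  */
svuint32_t twice_mul (svbool_t pg, svuint32_t i)
{
  return svmul_x (pg, i, 2u);
}

/* After: the same value as a plain ADD ...  */
svuint32_t twice_add (svbool_t pg, svuint32_t i)
{
  return svadd_x (pg, i, i);
}

/* ... or, with an all-true predicate, as an unpredicated LSL.  */
svuint32_t twice_lsl (svuint32_t i)
{
  return svlsl_x (svptrue_b32 (), i, 1);
}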
diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c
index 4f6a142325ae719b..08d7019a1855ff3c 100644
--- a/sysdeps/aarch64/fpu/powf_sve.c
+++ b/sysdeps/aarch64/fpu/powf_sve.c
@@ -26,7 +26,6 @@
#define Tlogc __v_powf_data.logc
#define Texp __v_powf_data.scale
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
-#define Shift 0x1.8p52
#define Norm 0x1p23f /* 0x4b000000. */
/* Overall ULP error bound for pow is 2.6 ulp
@@ -36,7 +35,7 @@ static const struct data
double log_poly[4];
double exp_poly[3];
float uflow_bound, oflow_bound, small_bound;
- uint32_t sign_bias, sign_mask, subnormal_bias, off;
+ uint32_t sign_bias, subnormal_bias, off;
} data = {
/* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of
V_POWF_EXP2_N. */
@@ -53,7 +52,6 @@ static const struct data
.small_bound = 0x1p-126f,
.off = 0x3f35d000,
.sign_bias = SignBias,
- .sign_mask = 0x80000000,
.subnormal_bias = 0x0b800000, /* 23 << 23. */
};
@@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x)
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint32_t i)
{
- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
+ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
2u * 0x7f800000 - 1);
}
@@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z)
}
/* Scalar fallback for special case routines with custom signature. */
-static inline svfloat32_t
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
{
+ /* Special cases of x or y: zero, inf and nan. */
+ svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
+ svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
+ svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
+
svbool_t p = svpfirst (cmp, svpfalse ());
while (svptest_any (cmp, p))
{
@@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
/* Polynomial to approximate log1p(r)/ln2. */
svfloat64_t logx = A (0);
- logx = svmla_x (pg, A (1), r, logx);
- logx = svmla_x (pg, A (2), r, logx);
- logx = svmla_x (pg, A (3), r, logx);
- logx = svmla_x (pg, y0, r, logx);
+ logx = svmad_x (pg, r, logx, A (1));
+ logx = svmad_x (pg, r, logx, A (2));
+ logx = svmad_x (pg, r, logx, A (3));
+ logx = svmad_x (pg, r, logx, y0);
*pylogx = svmul_x (pg, y, logx);
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
- svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- kd = svsub_x (pg, kd, Shift);
+ svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
+ svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));
r = svsub_x (pg, *pylogx, kd);
/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
- svuint64_t t
- = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
- svuint64_t ski = svadd_x (pg, ki, sign_bias);
- t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
+ svuint64_t t = svld1_gather_index (
+ svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
+ svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
+ t = svadd_x (svptrue_b64 (), t,
+ svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
svfloat64_t s = svreinterpret_f64 (t);
svfloat64_t p = C (0);
p = svmla_x (pg, C (1), p, r);
p = svmla_x (pg, C (2), p, r);
- p = svmla_x (pg, s, p, svmul_x (pg, s, r));
+ p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));
return p;
}
@@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
{
const svbool_t ptrue = svptrue_b64 ();
- /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
- order to perform core computation in double precision. */
+ /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
+ * in order to perform core computation in double precision. */
const svbool_t pg_lo = svunpklo (pg);
const svbool_t pg_hi = svunpkhi (pg);
- svfloat64_t y_lo = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
- svfloat64_t y_hi = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
- svfloat32_t z = svreinterpret_f32 (iz);
- svfloat64_t z_lo = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
- svfloat64_t z_hi = svcvt_f64_x (
- ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
+ svfloat64_t y_lo
+ = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+ svfloat64_t y_hi
+ = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+ svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
+ svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
svuint64_t i_lo = svunpklo (i);
svuint64_t i_hi = svunpkhi (i);
svint64_t k_lo = svunpklo (k);
@@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
/* Implementation of SVE powf.
Provides the same accuracy as AdvSIMD powf, since it relies on the same
algorithm. The theoretical maximum error is under 2.60 ULPs.
- Maximum measured error is 2.56 ULPs:
- SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
- want 0x1.fd4b06p+127. */
+ Maximum measured error is 2.57 ULPs:
+ SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
+ want 0x1.fff862p+127. */
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
@@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
svuint32_t viy0 = svreinterpret_u32 (y);
/* Negative x cases. */
- svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
- svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
+ svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));
/* Set sign_bias and ix depending on sign of x and nature of y. */
- svbool_t yisnotint_xisneg = svpfalse_b ();
+ svbool_t yint_or_xpos = pg;
svuint32_t sign_bias = sv_u32 (0);
svuint32_t vix = vix0;
if (__glibc_unlikely (svptest_any (pg, xisneg)))
{
/* Determine nature of y. */
- yisnotint_xisneg = svisnotint (xisneg, y);
- svbool_t yisint_xisneg = svisint (xisneg, y);
+ yint_or_xpos = svisint (xisneg, y);
svbool_t yisodd_xisneg = svisodd (xisneg, y);
/* ix set to abs(ix) if y is integer. */
- vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
+ vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
/* Set to SignBias if x is negative and y is odd. */
sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
}
@@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
svbool_t cmp = svorr_z (pg, xspecial, yspecial);
/* Small cases of x: |x| < 0x1p-126. */
- svbool_t xsmall = svaclt (pg, x, d->small_bound);
- if (__glibc_unlikely (svptest_any (pg, xsmall)))
+ svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
+ if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
{
/* Normalize subnormal x so exponent becomes negative. */
svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
@@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
vix = svsel (xsmall, vix_norm, vix);
}
/* Part of core computation carried in working precision. */
- svuint32_t tmp = svsub_x (pg, vix, d->off);
- svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
- V_POWF_LOG2_N - 1);
- svuint32_t top = svand_x (pg, tmp, 0xff800000);
- svuint32_t iz = svsub_x (pg, vix, top);
- svint32_t k
- = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
-
- /* Compute core in extended precision and return intermediate ylogx results to
- handle cases of underflow and underflow in exp. */
+ svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
+ svuint32_t i = svand_x (
+ yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+ V_POWF_LOG2_N - 1);
+ svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
+ svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
+ svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
+ (23 - V_POWF_EXP2_TABLE_BITS));
+
+ /* Compute core in extended precision and return intermediate ylogx results
+ * to handle cases of underflow and underflow in exp. */
svfloat32_t ylogx;
- svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
+ svfloat32_t ret
+ = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
/* Handle exp special cases of underflow and overflow. */
- svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
+ svuint32_t sign
+ = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
svfloat32_t ret_oflow
- = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
+ = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
svfloat32_t ret_uflow = svreinterpret_f32 (sign);
- ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
- ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
+ ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
+ ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);
/* Cases of finite y and finite negative x. */
- ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
+ ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
- if (__glibc_unlikely (svptest_any (pg, cmp)))
- return sv_call_powf_sc (x, y, ret, cmp);
+ if (__glibc_unlikely (svptest_any (cmp, cmp)))
+ return sv_call_powf_sc (x, y, ret);
return ret;
}
401
glibc-RHEL-118273-28.patch Normal file
@@ -0,0 +1,401 @@
commit 0b195651db3ae793187c7dd6d78b5a7a8da9d5e6
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Thu Feb 13 18:02:01 2025 +0000
AArch64: Improve codegen for SVE pow
Move constants to struct. Improve memory access with indexed/unpredicated
instructions. Eliminate register spills. Speedup on Neoverse V1: 24%.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
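The key pattern behind moving the constants into a struct, sketched with a hypothetical two-member pair: members laid out adjacently are fetched with a single LD1RQ broadcast and consumed through the _lane FMLA forms, so each pair of constants costs one load and no per-constant MOV:

#include <arm_sve.h>

struct pair { double ln2_hi, ln2_lo; }; /* must be adjacent in memory */

static svfloat64_t
mla_pair (svfloat64_t acc, svfloat64_t k, const struct pair *d)
{
  /* Broadcast {ln2_hi, ln2_lo} into every 128-bit segment.  */
  svfloat64_t ln2 = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
  acc = svmla_lane_f64 (acc, k, ln2, 0); /* acc += k * ln2_hi */
  return svmla_lane_f64 (acc, k, ln2, 1); /* acc += k * ln2_lo */
}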
diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c
index 4c0bf8956c584be7..4242d22a491ed17e 100644
--- a/sysdeps/aarch64/fpu/pow_sve.c
+++ b/sysdeps/aarch64/fpu/pow_sve.c
@@ -44,19 +44,18 @@
/* Data is defined in v_pow_log_data.c. */
#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
-#define A __v_pow_log_data.poly
#define Off 0x3fe6955500000000
/* Data is defined in v_pow_exp_data.c. */
#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
-#define C __v_pow_exp_data.poly
#define SmallExp 0x3c9 /* top12(0x1p-54). */
#define BigExp 0x408 /* top12(512.). */
#define ThresExp 0x03f /* BigExp - SmallExp. */
#define HugeExp 0x409 /* top12(1024.). */
/* Constants associated with pow. */
+#define SmallBoundX 0x1p-126
#define SmallPowX 0x001 /* top12(0x1p-126). */
#define BigPowX 0x7ff /* top12(INFINITY). */
#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
@@ -64,6 +63,31 @@
#define BigPowY 0x43e /* top12(0x1.749p62). */
#define ThresPowY 0x080 /* BigPowY - SmallPowY. */
+static const struct data
+{
+ double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo;
+ double log_c1, log_c3, log_c5, off;
+ double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo;
+ double exp_c0, exp_c1;
+} data = {
+ .log_c0 = -0x1p-1,
+ .log_c1 = -0x1.555555555556p-1,
+ .log_c2 = 0x1.0000000000006p-1,
+ .log_c3 = 0x1.999999959554ep-1,
+ .log_c4 = -0x1.555555529a47ap-1,
+ .log_c5 = -0x1.2495b9b4845e9p0,
+ .log_c6 = 0x1.0002b8b263fc3p0,
+ .off = Off,
+ .exp_c0 = 0x1.fffffffffffd4p-2,
+ .exp_c1 = 0x1.5555571d6ef9p-3,
+ .exp_c2 = 0x1.5555576a5adcep-5,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP,
+ .ln2_over_n_hi = 0x1.62e42fefc0000p-9,
+ .ln2_over_n_lo = -0x1.c610ca86c3899p-45,
+};
+
/* Check if x is an integer. */
static inline svbool_t
sv_isint (svbool_t pg, svfloat64_t x)
@@ -82,7 +106,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x)
static inline svbool_t
sv_isodd (svbool_t pg, svfloat64_t x)
{
- svfloat64_t y = svmul_x (pg, x, 0.5);
+ svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5);
return sv_isnotint (pg, y);
}
@@ -121,7 +145,7 @@ zeroinfnan (uint64_t i)
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint64_t i)
{
- return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1),
+ return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
2 * asuint64 (INFINITY) - 1);
}
@@ -174,16 +198,17 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
additional 15 bits precision. IX is the bit representation of x, but
normalized in the subnormal range using the sign bit for the exponent. */
static inline svfloat64_t
-sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
+sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
+ const struct data *d)
{
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_x (pg, ix, Off);
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
sv_u64 (N_LOG - 1));
svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52)));
+ svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52));
svfloat64_t z = svreinterpret_f64 (iz);
svfloat64_t kd = svcvt_f64_x (pg, k);
@@ -199,40 +224,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
|z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */
svfloat64_t r = svmad_x (pg, z, invc, -1.0);
/* k*Ln2 + log(c) + r. */
- svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi);
+
+ svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0);
svfloat64_t t2 = svadd_x (pg, t1, r);
- svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo);
+ svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1);
svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);
/* Evaluation is optimized assuming superscalar pipelined execution. */
- svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */
- svfloat64_t ar2 = svmul_x (pg, r, ar);
- svfloat64_t ar3 = svmul_x (pg, r, ar2);
+
+ svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0);
+ svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0);
+ svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar);
+ svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2);
/* k*Ln2 + log(c) + r + A[0]*r*r. */
svfloat64_t hi = svadd_x (pg, t2, ar2);
- svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r);
+ svfloat64_t lo3 = svmls_x (pg, ar2, ar, r);
svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
/* p = log1p(r) - r - A[0]*r*r. */
/* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
A[6])))). */
- svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]);
- svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]);
- svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]);
+
+ svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4);
+ svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1);
+ svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0);
+ svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1);
svfloat64_t p = svmla_x (pg, a34, ar2, a56);
p = svmla_x (pg, a12, ar2, p);
- p = svmul_x (pg, ar3, p);
+ p = svmul_x (svptrue_b64 (), ar3, p);
svfloat64_t lo = svadd_x (
- pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
+ pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
svfloat64_t y = svadd_x (pg, hi, lo);
*tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
return y;
}
+static inline svfloat64_t
+sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
+ svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits,
+ svuint64_t *ki, const struct data *d)
+{
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
+ svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2);
+ svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0);
+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+ svfloat64_t kd = svrinta_x (pg, z);
+ *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd));
+
+ svfloat64_t ln2_over_n_hilo
+ = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi);
+ svfloat64_t r = x;
+ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0);
+ r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1);
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r = svadd_x (pg, r, xtail);
+ /* 2^(k/N) ~= scale. */
+ svuint64_t idx = svand_x (pg, *ki, N_EXP - 1);
+ svuint64_t top
+ = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
+ *sbits = svadd_x (pg, *sbits, top);
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1);
+ *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp);
+ *tmp = svmla_x (pg, r, r2, *tmp);
+ svfloat64_t scale = svreinterpret_f64 (*sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ z = svmla_x (pg, scale, scale, *tmp);
+ return z;
+}
+
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
static inline svfloat64_t
sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
- svuint64_t sign_bias)
+ svuint64_t sign_bias, const struct data *d)
{
/* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
and other cases of large values of x (scale * (1 + TMP) oflow). */
@@ -240,73 +310,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
/* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */
svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);
- /* Conditions special, uflow and oflow are all expressed as uoflow &&
- something, hence do not bother computing anything if no lane in uoflow is
- true. */
- svbool_t special = svpfalse_b ();
- svbool_t uflow = svpfalse_b ();
- svbool_t oflow = svpfalse_b ();
+ svfloat64_t tmp;
+ svuint64_t sbits, ki;
if (__glibc_unlikely (svptest_any (pg, uoflow)))
{
+ svfloat64_t z
+ = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
+
/* |x| is tiny (|x| <= 0x1p-54). */
- uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
+ svbool_t uflow
+ = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
uflow = svand_z (pg, uoflow, uflow);
/* |x| is huge (|x| >= 1024). */
- oflow = svcmpge (pg, abstop, HugeExp);
+ svbool_t oflow = svcmpge (pg, abstop, HugeExp);
oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
+
/* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
- or underflow. */
- special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+ or underflow. */
+ svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+
+ /* Update result with special and large cases. */
+ z = sv_call_specialcase (tmp, sbits, ki, z, special);
+
+ /* Handle underflow and overflow. */
+ svbool_t x_is_neg = svcmplt (pg, x, 0);
+ svuint64_t sign_mask
+ = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
+ svfloat64_t res_uoflow
+ = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
+ res_uoflow = svreinterpret_f64 (
+ svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
+ /* Avoid spurious underflow for tiny x. */
+ svfloat64_t res_spurious_uflow
+ = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
+
+ z = svsel (oflow, res_uoflow, z);
+ z = svsel (uflow, res_spurious_uflow, z);
+ return z;
}
- /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
- /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
- svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2);
- /* z - kd is in [-1, 1] in non-nearest rounding modes. */
- svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift);
- svfloat64_t kd = svadd_x (pg, z, shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- kd = svsub_x (pg, kd, shift);
- svfloat64_t r = x;
- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi);
- r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo);
- /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
- r = svadd_x (pg, r, xtail);
- /* 2^(k/N) ~= scale. */
- svuint64_t idx = svand_x (pg, ki, N_EXP - 1);
- svuint64_t top
- = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
- /* This is only a valid scale when -1023*N < k < 1024*N. */
- svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
- sbits = svadd_x (pg, sbits, top);
- /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]);
- tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp);
- tmp = svmla_x (pg, r, r2, tmp);
- svfloat64_t scale = svreinterpret_f64 (sbits);
- /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
- is no spurious underflow here even without fma. */
- z = svmla_x (pg, scale, scale, tmp);
-
- /* Update result with special and large cases. */
- if (__glibc_unlikely (svptest_any (pg, special)))
- z = sv_call_specialcase (tmp, sbits, ki, z, special);
-
- /* Handle underflow and overflow. */
- svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63);
- svbool_t x_is_neg = svcmpne (pg, sign_bit, 0);
- svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
- svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
- res_uoflow = svreinterpret_f64 (
- svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
- z = svsel (oflow, res_uoflow, z);
- /* Avoid spurious underflow for tiny x. */
- svfloat64_t res_spurious_uflow
- = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
- z = svsel (uflow, res_spurious_uflow, z);
-
- return z;
+ return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
}
static inline double
@@ -341,47 +384,39 @@ pow_sc (double x, double y)
svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
/* This preamble handles special case conditions used in the final scalar
fallbacks. It also updates ix and sign_bias, that are used in the core
computation too, i.e., exp( y * log (x) ). */
svuint64_t vix0 = svreinterpret_u64 (x);
svuint64_t viy0 = svreinterpret_u64 (y);
- svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52);
/* Negative x cases. */
- svuint64_t sign_bit = svlsr_m (pg, vix0, 63);
- svbool_t xisneg = svcmpeq (pg, sign_bit, 1);
+ svbool_t xisneg = svcmplt (pg, x, 0);
/* Set sign_bias and ix depending on sign of x and nature of y. */
- svbool_t yisnotint_xisneg = svpfalse_b ();
+ svbool_t yint_or_xpos = pg;
svuint64_t sign_bias = sv_u64 (0);
svuint64_t vix = vix0;
- svuint64_t vtopx1 = vtopx0;
if (__glibc_unlikely (svptest_any (pg, xisneg)))
{
/* Determine nature of y. */
- yisnotint_xisneg = sv_isnotint (xisneg, y);
- svbool_t yisint_xisneg = sv_isint (xisneg, y);
+ yint_or_xpos = sv_isint (xisneg, y);
svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
/* ix set to abs(ix) if y is integer. */
- vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff);
- vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff);
+ vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff);
/* Set to SignBias if x is negative and y is odd. */
sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
}
- /* Special cases of x or y: zero, inf and nan. */
- svbool_t xspecial = sv_zeroinfnan (pg, vix0);
- svbool_t yspecial = sv_zeroinfnan (pg, viy0);
- svbool_t special = svorr_z (pg, xspecial, yspecial);
-
/* Small cases of x: |x| < 0x1p-126. */
- svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff);
- svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX);
- if (__glibc_unlikely (svptest_any (pg, xsmall)))
+ svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX);
+ if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
{
/* Normalize subnormal x so exponent becomes negative. */
- svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0);
+ svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52);
+ svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0);
svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
@@ -391,20 +426,24 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
/* y_hi = log(ix, &y_lo). */
svfloat64_t vlo;
- svfloat64_t vhi = sv_log_inline (pg, vix, &vlo);
+ svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d);
/* z = exp(y_hi, y_lo, sign_bias). */
- svfloat64_t vehi = svmul_x (pg, y, vhi);
- svfloat64_t velo = svmul_x (pg, y, vlo);
- svfloat64_t vemi = svmls_x (pg, vehi, y, vhi);
- velo = svsub_x (pg, velo, vemi);
- svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias);
+ svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi);
+ svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi);
+ svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo);
+ svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d);
/* Cases of finite y and finite negative x. */
- vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz);
+ vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan ("")));
+
+ /* Special cases of x or y: zero, inf and nan. */
+ svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0);
+ svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0);
+ svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial);
/* Cases of zero/inf/nan x or y. */
- if (__glibc_unlikely (svptest_any (pg, special)))
+ if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
vz = sv_call2_f64 (pow_sc, x, y, vz, special);
return vz;

45
glibc-RHEL-118273-29.patch Normal file
@@ -0,0 +1,45 @@
commit f5ff34cb3c75ec1061c75bb9188b3c1176426947
Author: Yat Long Poon <yatlong.poon@arm.com>
Date: Thu Feb 13 18:00:50 2025 +0000
AArch64: Improve codegen for SVE erfcf
Reduce number of MOV/MOVPRFXs and use unpredicated FMUL.
Replace MUL with LSL. Speedup on Neoverse V1: 6%.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c
index ecacb933aca40855..e4869263e31e18bc 100644
--- a/sysdeps/aarch64/fpu/erfcf_sve.c
+++ b/sysdeps/aarch64/fpu/erfcf_sve.c
@@ -76,7 +76,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
- i = svmul_x (pg, i, 2);
+ i = svlsl_x (svptrue_b32 (), i, 1);
const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
svfloat32_t erfcr = svld1_gather_index (pg, p, i);
svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
@@ -84,15 +84,15 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
/* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
svfloat32_t r = svsub_x (pg, z, shift);
svfloat32_t d = svsub_x (pg, a, r);
- svfloat32_t d2 = svmul_x (pg, d, d);
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
- svfloat32_t third = svdup_lane (coeffs, 0);
svfloat32_t p1 = r;
- svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
- svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
+ svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1);
+ svfloat32_t p3
+ = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);
873
glibc-RHEL-118273-3.patch Normal file
@@ -0,0 +1,873 @@
commit b09fee1d21650428a6a3335408a46ebe1165d30d
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:40 2024 +0000
aarch64/fpu: Add vector variants of acosh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
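The log1p-based formulation used in all four implementations below rests on a standard rewrite (noted here for reference):

\operatorname{acosh}(x) \;=\; \ln\bigl(x + \sqrt{x^2 - 1}\bigr)
  \;=\; \operatorname{log1p}\bigl(u + \sqrt{u\,(u + 2)}\bigr),
\qquad u = x - 1,

which is exactly the xm1 + sqrt(xm1 * (x + 1)) argument handed to log1p in each file.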
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 019c3a51880e2306..2e5bbb5a07f4c9b0 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,4 +1,5 @@
libmvec-supported-funcs = acos \
+ acosh \
asin \
atan \
atan2 \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 884b4b57f097635f..60e1cdeacec3f77e 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -79,6 +79,11 @@ libmvec {
_ZGVsMxv_tan;
}
GLIBC_2.40 {
+ _ZGVnN2v_acosh;
+ _ZGVnN2v_acoshf;
+ _ZGVnN4v_acoshf;
+ _ZGVsMxv_acosh;
+ _ZGVsMxv_acoshf;
_ZGVnN2v_cosh;
_ZGVnN2v_coshf;
_ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c
new file mode 100644
index 0000000000000000..c88283cf1191f4eb
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acosh_advsimd.c
@@ -0,0 +1,67 @@
+/* Double-precision vector (Advanced SIMD) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_V_LOG1P_K0_SHORTCUT 1
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one, thresh;
+} data = {
+ .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .thresh = V2 (0x1ff0000000000000) /* asuint64(0x1p511) - asuint64(1). */
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special,
+ const struct v_log1p_data *d)
+{
+ return v_call_f64 (acosh, x, log1p_inline (y, d), special);
+}
+
+/* Vector approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.02 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ _ZGVnN2v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5
+ want 0x1.f2d6d823bc9e2p-5. */
+VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh);
+ float64x2_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ if (__glibc_unlikely (v_any_u64 (special)))
+ x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
+#endif
+
+ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
+ float64x2_t y;
+ y = vaddq_f64 (x, v_f64 (1));
+ y = vmulq_f64 (y, xm1);
+ y = vsqrtq_f64 (y);
+ y = vaddq_f64 (xm1, y);
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (special_arg, y, special, &d->log1p_consts);
+ return log1p_inline (y, &d->log1p_consts);
+}
diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c
new file mode 100644
index 0000000000000000..3e4faaa5ca686c18
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acosh_sve.c
@@ -0,0 +1,51 @@
+/* Double-precision vector (SVE) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_SV_LOG1P_K0_SHORTCUT 1
+#include "sv_log1p_inline.h"
+
+#define One (0x3ff0000000000000)
+#define Thres (0x1ff0000000000000) /* asuint64 (0x1p511) - One. */
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (acosh, x, y, special);
+}
+
+/* SVE approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.19 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
+ want 0x1.ed23399f51373p-2. */
+svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
+{
+ /* (ix - One) >= (BigBound - One). */
+ svuint64_t ix = svreinterpret_u64 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
+
+ svfloat64_t xm1 = svsub_x (pg, x, 1.0);
+ svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0));
+ svfloat64_t y = svadd_x (pg, xm1, svsqrt_x (pg, u));
+
+ /* Fall back to scalar routine for special lanes. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, sv_log1p_inline (y, pg), special);
+ return sv_log1p_inline (y, pg);
+}
diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
new file mode 100644
index 0000000000000000..8916dcbf409922a9
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -0,0 +1,78 @@
+/* Single-precision vector (Advanced SIMD) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_log1pf_inline.h"
+
+#define SquareLim 0x1p64
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+ uint16x4_t thresh;
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .one = V4 (0x3f800000),
+ .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
+ const struct v_log1pf_data d)
+{
+ return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
+}
+
+/* Vector approximation for single-precision acosh, based on log1p. Maximum
+ error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
+ is 2.78 ULP:
+ __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
+ want 0x1.ef9ea2p-3.
+ With exceptions disabled, we can compute u with a shorter dependency chain,
+ which gives maximum error of 3.07 ULP:
+ __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
+ want 0x1.fbc7f4p-4. */
+
+VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+
+#if WANT_SIMD_EXCEPT
+ /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
+ only xm1 to calculate u, as operating on x will trigger invalid for NaN.
+ Widening sign-extend special predicate in order to mask with it. */
+ uint32x4_t p
+ = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special)));
+ float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
+ float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
+#else
+ float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
+ float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+#endif
+
+ float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
+
+ if (__glibc_unlikely (v_any_u16h (special)))
+ return special_case (x, y, special, d->log1pf_consts);
+ return log1pf_inline (y, d->log1pf_consts);
+}
+libmvec_hidden_def (V_NAME_F1 (acosh))
+HALF_WIDTH_ALIAS_F1 (acosh)
diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c
new file mode 100644
index 0000000000000000..2110894e629500be
--- /dev/null
+++ b/sysdeps/aarch64/fpu/acoshf_sve.c
@@ -0,0 +1,49 @@
+/* Single-precision vector (SVE) acosh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define One 0x3f800000
+#define Thres 0x20000000 /* asuint(0x1p64) - One. */
+
+#include "sv_log1pf_inline.h"
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (acoshf, x, y, special);
+}
+
+/* Single-precision SVE acosh(x) routine. Implements the same algorithm as
+ vector acoshf and log1p.
+
+ Maximum error is 2.78 ULPs:
+ SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4
+ want 0x1.f45b3cp-4. */
+svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
+{
+ svuint32_t ix = svreinterpret_u32 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
+
+ svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
+ svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
+ svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, y, special);
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index c63b2948d4938b0d..22fec4de77395e60 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
libmvec_hidden_proto (V_NAME_F1(acos));
+libmvec_hidden_proto (V_NAME_F1(acosh));
libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(cos));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 8ca55098706a54c2..841330956c102ff1 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -33,6 +33,10 @@
# define __DECL_SIMD_acos __DECL_SIMD_aarch64
# undef __DECL_SIMD_acosf
# define __DECL_SIMD_acosf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_acosh
+# define __DECL_SIMD_acosh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_acoshf
+# define __DECL_SIMD_acoshf __DECL_SIMD_aarch64
# undef __DECL_SIMD_asin
# define __DECL_SIMD_asin __DECL_SIMD_aarch64
# undef __DECL_SIMD_asinf
@@ -125,6 +129,7 @@ typedef __SVBool_t __sv_bool_t;
__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
@@ -143,6 +148,7 @@ __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
@@ -166,6 +172,7 @@ __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
__sv_f32_t _ZGVsMxvv_atan2f (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_acoshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
@@ -184,6 +191,7 @@ __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
__sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_acosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h
new file mode 100644
index 0000000000000000..da019674f94dbac7
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h
@@ -0,0 +1,109 @@
+/* Helper for double-precision SVE routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_SV_LOG1P_INLINE_H
+#define AARCH64_FPU_SV_LOG1P_INLINE_H
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct sv_log1p_data
+{
+ double poly[19], ln2[2];
+ uint64_t hf_rt2_top;
+ uint64_t one_m_hf_rt2_top;
+ uint32_t bottom_mask;
+ int64_t one_top;
+} sv_log1p_data = {
+ /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1].
+ */
+ .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
+ 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
+ -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
+ 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
+ -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
+ 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
+ -0x1.cfa7385bdb37ep-6 },
+ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 },
+ .hf_rt2_top = 0x3fe6a09e00000000,
+ .one_m_hf_rt2_top = 0x00095f6200000000,
+ .bottom_mask = 0xffffffff,
+ .one_top = 0x3ff
+};
+
+static inline svfloat64_t
+sv_log1p_inline (svfloat64_t x, const svbool_t pg)
+{
+ /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which
+ differs from v_log1p_2u5.c by:
+ - No special-case handling - this should be dealt with by the caller.
+ - Pairwise Horner polynomial evaluation for improved accuracy.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+ using svsel, for improved accuracy when the argument to log1p is close
+ to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1
+ in the source of the caller before including this file.
+ See sv_log1p_2u1.c for details of the algorithm. */
+ const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data);
+ svfloat64_t m = svadd_x (pg, x, 1);
+ svuint64_t mi = svreinterpret_u64 (m);
+ svuint64_t u = svadd_x (pg, mi, d->one_m_hf_rt2_top);
+
+ svint64_t ki
+ = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), d->one_top);
+ svfloat64_t k = svcvt_f64_x (pg, ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ svuint64_t utop
+ = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hf_rt2_top);
+ svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, d->bottom_mask));
+ svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1);
+
+ /* Correction term c/m. */
+ svfloat64_t c = svsub_x (pg, x, svsub_x (pg, m, 1));
+ svfloat64_t cm;
+
+#ifndef WANT_SV_LOG1P_K0_SHORTCUT
+#error \
+ "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_SV_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ svbool_t knot0 = svcmpne (pg, k, 0);
+ cm = svdiv_z (knot0, c, m);
+ if (__glibc_likely (!svptest_all (pg, knot0)))
+ {
+ f = svsel (knot0, f, x);
+ }
+#else
+ /* No shortcut. */
+ cm = svdiv_x (pg, c, m);
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
+ svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]);
+ svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]);
+
+ return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
+}
+
+#endif
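
The reduction above is easier to follow in scalar form. Below is a minimal
sketch of the same bit manipulation, not part of the patch: the constants are
copied from the table above, and libm's log1p stands in for the degree-19
polynomial.

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Scalar sketch of sv_log1p_inline's reduction: write 1 + x as
   m ~= 2^k * (1 + f) with 1 + f in [sqrt(2)/2, sqrt(2)), so that
   log1p(x) ~= k*ln2 + log1p(f) + c/m,
   where c = x - (m - 1) recovers the rounding error of m = 1 + x.  */
static double
log1p_sketch (double x)
{
  double m = x + 1.0;
  uint64_t mi;
  memcpy (&mi, &m, sizeof mi);
  uint64_t u = mi + 0x00095f6200000000; /* one_m_hf_rt2_top.  */
  int64_t k = (int64_t) (u >> 52) - 0x3ff;
  /* Splice the reduced exponent onto m's mantissa bits.  */
  uint64_t u_red = ((u & 0x000fffff00000000) + 0x3fe6a09e00000000)
                   | (mi & 0xffffffff);
  double f;
  memcpy (&f, &u_red, sizeof f);
  f -= 1.0;
  double cm = (x - (m - 1.0)) / m; /* Correction term c/m.  */
  return (double) k * 0x1.62e42fefa39efp-1 + log1p (f) + cm;
}
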
diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
new file mode 100644
index 0000000000000000..b94b2da055a6c59b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h
@@ -0,0 +1,76 @@
+/* Helper for single-precision SVE routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_SV_LOG1PF_INLINE_H
+#define AARCH64_FPU_SV_LOG1PF_INLINE_H
+
+#include "sv_math.h"
+#include "vecmath_config.h"
+#include "poly_sve_f32.h"
+
+static const struct sv_log1pf_data
+{
+ float32_t poly[9];
+ float32_t ln2;
+ float32_t scale_back;
+} sv_log1pf_data = {
+ /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
+ .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
+ -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
+ -0x1.6f0d5ep-5f },
+ .scale_back = 0x1.0p-23f,
+ .ln2 = 0x1.62e43p-1f,
+};
+
+static inline svfloat32_t
+eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg)
+{
+ svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1]));
+ svfloat32_t m2 = svmul_x (pg, m, m);
+ svfloat32_t q = svmla_x (pg, m, m2, p_12);
+ svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2);
+ p = svmul_x (pg, m2, p);
+
+ return svmla_x (pg, q, m2, p);
+}
+
+static inline svfloat32_t
+sv_log1pf_inline (svfloat32_t x, svbool_t pg)
+{
+ const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);
+
+ svfloat32_t m = svadd_x (pg, x, 1.0f);
+
+ svint32_t ks = svsub_x (pg, svreinterpret_s32 (m),
+ svreinterpret_s32 (svdup_f32 (0.75f)));
+ ks = svand_x (pg, ks, 0xff800000);
+ svuint32_t k = svreinterpret_u32 (ks);
+ svfloat32_t s = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k));
+
+ svfloat32_t m_scale
+ = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k));
+ m_scale
+ = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
+ svfloat32_t p = eval_poly (m_scale, d->poly, pg);
+ svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back);
+ return svmla_x (pg, p, scale_back, d->ln2);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index b37cb7d5e9c0d96a..f4ce1d70096888aa 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -24,6 +24,7 @@
#define VEC_TYPE float64x2_t
VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
+VPCS_VECTOR_WRAPPER (acosh_advsimd, _ZGVnN2v_acosh)
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 011f07d2c15b148f..0e973cc9d7ade813 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -43,6 +43,7 @@
}
SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
+SVE_VECTOR_WRAPPER (acosh_sve, _ZGVsMxv_acosh)
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 35452991431e238a..0ce026b5ea96a064 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -24,6 +24,7 @@
#define VEC_TYPE float32x4_t
VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
+VPCS_VECTOR_WRAPPER (acoshf_advsimd, _ZGVnN4v_acoshf)
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index bbc74ede88c9e6c8..398b7373e800cd5b 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -43,6 +43,7 @@
}
SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
+SVE_VECTOR_WRAPPER (acoshf_sve, _ZGVsMxv_acoshf)
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h
new file mode 100644
index 0000000000000000..242e43b6eecc0b6e
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_log1p_inline.h
@@ -0,0 +1,103 @@
+/* Helper for double-precision Advanced SIMD routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_LOG1P_INLINE_H
+#define AARCH64_FPU_V_LOG1P_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+struct v_log1p_data
+{
+ float64x2_t poly[19], ln2[2];
+ uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
+ int64x2_t one_top;
+};
+
+/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
+#define V_LOG1P_CONSTANTS_TABLE \
+ { \
+ .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \
+ V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \
+ V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \
+ V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \
+ V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \
+ V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \
+ V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \
+ V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \
+ V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \
+ V2 (-0x1.cfa7385bdb37ep-6) }, \
+ .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \
+ .hf_rt2_top = V2 (0x3fe6a09e00000000), \
+ .one_m_hf_rt2_top = V2 (0x00095f6200000000), \
+ .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
+ }
+
+#define BottomMask v_u64 (0xffffffff)
+
+static inline float64x2_t
+log1p_inline (float64x2_t x, const struct v_log1p_data *d)
+{
+ /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
+ modifications:
+ - No special-case handling - this should be dealt with by the caller.
+ - Pairwise Horner polynomial evaluation for improved accuracy.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+ using v_sel, for improved accuracy when the argument to log1p is close to
+ 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
+ the source of the caller before including this file.
+ See v_log1pf_2u1.c for details of the algorithm. */
+ float64x2_t m = vaddq_f64 (x, v_f64 (1));
+ uint64x2_t mi = vreinterpretq_u64_f64 (m);
+ uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
+
+ int64x2_t ki
+ = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
+ float64x2_t k = vcvtq_f64_s64 (ki);
+
+ /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
+ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
+ uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
+
+ /* Correction term c/m. */
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
+
+#ifndef WANT_V_LOG1P_K0_SHORTCUT
+#error \
+ "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_V_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ uint64x2_t k0 = vceqzq_f64 (k);
+ cm = v_zerofy_f64 (cm, k0);
+ f = vbslq_f64 (k0, x, f);
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
+ float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
+ float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
+ return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
+}
+
+#endif
diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h
new file mode 100644
index 0000000000000000..643a6cdcfc498970
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h
@@ -0,0 +1,78 @@
+/* Helper for single-precision Advanced SIMD routines which depend on log1p
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_LOG1PF_INLINE_H
+#define AARCH64_FPU_V_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+struct v_log1pf_data
+{
+ float32x4_t poly[8], ln2;
+ uint32x4_t four;
+ int32x4_t three_quarters;
+};
+
+/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
+ (1, -0.5) are not stored as they can be generated more efficiently. */
+#define V_LOG1PF_CONSTANTS_TABLE \
+ { \
+ .poly \
+ = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
+ V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
+ V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
+ }
+
+static inline float32x4_t
+eval_poly (float32x4_t m, const float32x4_t *c)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
+ uses split Estrin, but this way reduces register pressure in the calling
+ routine). */
+ float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
+ float32x4_t m2 = vmulq_f32 (m, m);
+ q = vfmaq_f32 (m, m2, q);
+ float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
+ p = vmulq_f32 (m2, p);
+ return vfmaq_f32 (q, m2, p);
+}
+
+static inline float32x4_t
+log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
+{
+ /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
+ special-case handling. See that file for details of the algorithm. */
+ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+ int32x4_t k
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
+ v_s32 (0xff800000));
+ uint32x4_t ku = vreinterpretq_u32_s32 (k);
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
+ float32x4_t m_scale
+ = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+ m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+ float32x4_t p = eval_poly (m_scale, d.poly);
+ float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
+ return vfmaq_f32 (p, scale_back, d.ln2);
+}
+
+#endif
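
The single-precision variant above packs the same idea into exponent-mask
tricks. A scalar rendering, illustrative only and not part of the patch
(libm's log1pf stands in for the polynomial):

#include <math.h>
#include <stdint.h>
#include <string.h>

static uint32_t asuint (float f) { uint32_t u; memcpy (&u, &f, 4); return u; }
static float asfloat (uint32_t u) { float f; memcpy (&f, &u, 4); return f; }

/* Scalar sketch of log1pf_inline: choose k = e * 2^23 so that
   (1 + x) * 2^-e lands near [0.75, 1.5), then
   log1pf(x) ~= e*ln2 + log1pf((1 + x) * 2^-e - 1).  */
static float
log1pf_sketch (float x)
{
  float m = x + 1.0f;
  uint32_t k = (asuint (m) - asuint (0.75f)) & 0xff800000;
  float s = asfloat (asuint (4.0f) - k);      /* s = 2^(2-e).  */
  float m_scale = asfloat (asuint (x) - k)    /* x * 2^-e ...  */
                  + (0.25f * s - 1.0f);       /* ... + (2^-e - 1).  */
  float e = (float) (int32_t) k * 0x1.0p-23f;
  return log1pf (m_scale) + e * 0x1.62e43p-1f;
}
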
diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h
index d4d78bc4027abebb..12824fce8c698cf4 100644
--- a/sysdeps/aarch64/fpu/v_math.h
+++ b/sysdeps/aarch64/fpu/v_math.h
@@ -108,6 +108,11 @@ v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
p[2] ? f (x1[2], x2[2]) : y[2],
p[3] ? f (x1[3], x2[3]) : y[3] };
}
+static inline float32x4_t
+v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
+{
+ return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
+}
static inline float64x2_t
v_f64 (double x)
@@ -167,5 +172,10 @@ v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
return (float64x2_t){ p[0] ? f (x1[0], x2[0]) : y[0],
p[1] ? f (x1[1], x2[1]) : y[1] };
}
+static inline float64x2_t
+v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
+{
+ return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
+}
#endif
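
The new v_zerofy helpers clear selected lanes with a single BIC on the bit
pattern; this is how the k0 shortcut above zeroes the correction term. A
small standalone usage example (assumes an AArch64 compiler with NEON):

#include <arm_neon.h>
#include <stdio.h>

/* Same definition as the v_zerofy_f64 added above.  */
static float64x2_t
v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
{
  return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
}

int
main (void)
{
  float64x2_t cm = vsetq_lane_f64 (-2.0, vdupq_n_f64 (3.5), 1);
  float64x2_t k = vsetq_lane_f64 (1.0, vdupq_n_f64 (0.0), 1);
  /* Zero the correction term in lanes where k == 0.  */
  float64x2_t y = v_zerofy_f64 (cm, vceqzq_f64 (k));
  /* Prints "0 -2": lane 0 (k == 0) is cleared, lane 1 is kept.  */
  printf ("%g %g\n", vgetq_lane_f64 (y, 0), vgetq_lane_f64 (y, 1));
  return 0;
}
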
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 48d747ad5793be96..1646cdbdd22d93d9 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -34,11 +34,19 @@ double: 2
float: 2
ldouble: 4
+Function: "acosh_advsimd":
+double: 2
+float: 2
+
Function: "acosh_downward":
double: 2
float: 2
ldouble: 3
+Function: "acosh_sve":
+double: 2
+float: 2
+
Function: "acosh_towardzero":
double: 2
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index f66da42c3630bf48..f5aaa519f2c8663e 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -73,12 +73,17 @@ GLIBC_2.39 _ZGVsMxv_tan F
GLIBC_2.39 _ZGVsMxv_tanf F
GLIBC_2.39 _ZGVsMxvv_atan2 F
GLIBC_2.39 _ZGVsMxvv_atan2f F
+GLIBC_2.40 _ZGVnN2v_acosh F
+GLIBC_2.40 _ZGVnN2v_acoshf F
GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
+GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
+GLIBC_2.40 _ZGVsMxv_acosh F
+GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F

303
glibc-RHEL-118273-30.patch Normal file

@@ -0,0 +1,303 @@
commit c0ff447edf19bd4630fe79adf5e8b896405b059f
Author: Luna Lamb <luna.lamb@arm.com>
Date: Thu Feb 13 17:54:46 2025 +0000
Aarch64: Improve codegen in SVE exp and users, and update expf_inline
Use unpredicated muls and improve memory access.
7%, 3% and 1% improvement in the throughput microbenchmark on Neoverse V1,
for exp, exp2 and cosh respectively.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
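
The recurring change below replaces svmul_x (pg, ...) with
svmul_x (svptrue_b64 (), ...). A minimal sketch of the two shapes, compiled
with -march=armv8-a+sve (illustrative, not part of the patch):

#include <arm_sve.h>

/* With a runtime predicate, compilers typically keep the predicated FMUL
   (and may need a MOVPRFX), even though the _x form makes inactive lanes
   don't-care.  */
svfloat64_t
square_predicated (svbool_t pg, svfloat64_t r)
{
  return svmul_x (pg, r, r);
}

/* With an explicit all-true predicate, the unpredicated FMUL encoding can
   be used, which is the point of this patch.  */
svfloat64_t
square_unpredicated (svfloat64_t r)
{
  return svmul_x (svptrue_b64 (), r, r);
}
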
diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
index 919f34604a452b4a..e375dd8a3407feb2 100644
--- a/sysdeps/aarch64/fpu/cosh_sve.c
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -23,7 +23,7 @@ static const struct data
{
float64_t poly[3];
float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
- uint64_t index_mask, special_bound;
+ uint64_t special_bound;
} data = {
.poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
0x1.5555576a59599p-5, },
@@ -35,14 +35,16 @@ static const struct data
.shift = 0x1.8p+52,
.thres = 704.0,
- .index_mask = 0xff,
/* 0x1.6p9, above which exp overflows. */
.special_bound = 0x4086000000000000,
};
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special)
{
+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
+ svfloat64_t y = svadd_x (pg, half_t, half_over_t);
return sv_call_f64 (cosh, x, y, special);
}
@@ -60,12 +62,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
svuint64_t u = svreinterpret_u64 (z);
svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
- svuint64_t i = svand_x (pg, u, d->index_mask);
+ svuint64_t i = svand_x (svptrue_b64 (), u, 0xff);
svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
y = svmla_x (pg, sv_f64 (1.0), r, y);
- y = svmul_x (pg, r, y);
+ y = svmul_x (svptrue_b64 (), r, y);
/* s = 2^(n/N). */
u = svld1_gather_index (pg, __v_exp_tail_data, i);
@@ -94,12 +96,12 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
/* Up to the point that exp overflows, we can use it to calculate cosh by
exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
svfloat64_t t = exp_inline (ax, pg, d);
- svfloat64_t half_t = svmul_x (pg, t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
/* Fall back to scalar for any special cases. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+ return special_case (x, pg, t, special);
+ svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
return svadd_x (pg, half_t, half_over_t);
}
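
For reference, the identity these cosh changes rely on, in scalar form (a
sketch only; the vector routine computes exp via exp_inline and defers
special cases to the fallback above):

#include <math.h>

/* For |x| below the overflow threshold, cosh(x) = t/2 + 0.5/t with
   t = exp(|x|); svdivr_x (pg, t, 0.5) is the 0.5/t term.  */
static double
cosh_sketch (double x)
{
  double t = exp (fabs (x));
  return 0.5 * t + 0.5 / t;
}
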
diff --git a/sysdeps/aarch64/fpu/exp10_sve.c b/sysdeps/aarch64/fpu/exp10_sve.c
index ddf64708cb1773cd..bfd3fb9e1948a3b8 100644
--- a/sysdeps/aarch64/fpu/exp10_sve.c
+++ b/sysdeps/aarch64/fpu/exp10_sve.c
@@ -18,21 +18,23 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
#define SpecialBound 307.0 /* floor (log10 (2^1023)). */
static const struct data
{
- double poly[5];
+ double c1, c3, c2, c4, c0;
double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound;
} data = {
/* Coefficients generated using Remez algorithm.
rel error: 0x1.9fcb9b3p-60
abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ]
max ulp err 0.52 +0.5. */
- .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1,
- 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 },
+ .c0 = 0x1.26bb1bbb55516p1,
+ .c1 = 0x1.53524c73cd32ap1,
+ .c2 = 0x1.0470591daeafbp1,
+ .c3 = 0x1.2bd77b1361ef6p0,
+ .c4 = 0x1.142b5d54e9621p-1,
/* 1.5*2^46+1023. This value is further explained below. */
.shift = 0x1.800000000ffc0p+46,
.log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */
@@ -70,9 +72,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, d->scale_thres);
- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
return svsel (p_cmp, r1, r0);
}
@@ -103,11 +105,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg)
comes at significant performance cost. */
svuint64_t u = svreinterpret_u64 (z);
svfloat64_t scale = svexpa (u);
-
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
/* Approximate exp10(r) using polynomial. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2,
- sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1));
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ svfloat64_t p14 = svmla_x (pg, p12, p34, r2);
+
+ svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14);
/* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound
multiplication may overflow, so use special case routine. */
diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c
index 22848ebfa5ac21d8..5dfb77cdbc2f6a51 100644
--- a/sysdeps/aarch64/fpu/exp2_sve.c
+++ b/sysdeps/aarch64/fpu/exp2_sve.c
@@ -18,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
#define N (1 << V_EXP_TABLE_BITS)
@@ -27,15 +26,15 @@
static const struct data
{
- double poly[4];
+ double c0, c2;
+ double c1, c3;
double shift, big_bound, uoflow_bound;
} data = {
/* Coefficients are computed using Remez algorithm with
minimisation of the absolute error. */
- .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5,
- 0x1.3b2abf5571ad8p-7 },
- .shift = 0x1.8p52 / N,
- .uoflow_bound = UOFlowBound,
+ .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3,
+ .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7,
+ .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound,
.big_bound = BigBound,
};
@@ -67,9 +66,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
return svsel (p_cmp, r1, r0);
}
@@ -99,11 +98,14 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
/* Approximate exp2(r) using polynomial. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly);
- svfloat64_t y = svmul_x (pg, r, p);
-
+ /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+ svfloat64_t p = svmla_x (pg, p01, p23, r2);
+ svfloat64_t y = svmul_x (svptrue_b64 (), r, p);
/* Assemble exp2(x) = exp2(r) * scale. */
if (__glibc_unlikely (svptest_any (pg, special)))
return special_case (pg, scale, y, kd, d);
diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c
index aabaaa1d61dbab27..b2421d493f2e119f 100644
--- a/sysdeps/aarch64/fpu/exp_sve.c
+++ b/sysdeps/aarch64/fpu/exp_sve.c
@@ -21,12 +21,15 @@
static const struct data
{
- double poly[4];
+ double c0, c2;
+ double c1, c3;
double ln2_hi, ln2_lo, inv_ln2, shift, thres;
+
} data = {
- .poly = { /* ulp error: 0.53. */
- 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5,
- 0x1.1111266d28935p-7 },
+ .c0 = 0x1.fffffffffdbcdp-2,
+ .c1 = 0x1.555555555444cp-3,
+ .c2 = 0x1.555573c6a9f7dp-5,
+ .c3 = 0x1.1111266d28935p-7,
.ln2_hi = 0x1.62e42fefa3800p-1,
.ln2_lo = 0x1.ef35793c76730p-45,
/* 1/ln2. */
@@ -36,7 +39,6 @@ static const struct data
.thres = 704.0,
};
-#define C(i) sv_f64 (d->poly[i])
#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
@@ -56,20 +58,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
svuint64_t b
= svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
- /* Set s1 to generate overflow depending on sign of exponent n. */
- svfloat64_t s1 = svreinterpret_f64 (
- svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */
- /* Offset s to avoid overflow in final result if n is below threshold. */
+ /* Set s1 to generate overflow depending on sign of exponent n,
+ ie. s1 = 0x70...0 - b. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold.
+ ie. s2 = as_u64 (s) - 0x3010...0 + b. */
svfloat64_t s2 = svreinterpret_f64 (
- svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2),
- b)); /* as_u64 (s) - 0x3010...0 + b. */
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
/* |n| > 1280 => 2^(n) overflows. */
svbool_t p_cmp = svacgt (pg, n, 1280.0);
- svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
- svfloat64_t r0 = svmul_x (pg, r2, s1);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
return svsel (p_cmp, r1, r0);
}
@@ -103,16 +105,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
svuint64_t u = svreinterpret_u64 (z);
svfloat64_t n = svsub_x (pg, z, d->shift);
-
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
/* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */
svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
svfloat64_t r = svmls_lane (x, n, ln2, 0);
r = svmls_lane (r, n, ln2, 1);
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */
- svfloat64_t r2 = svmul_x (pg, r, r);
- svfloat64_t p01 = svmla_x (pg, C (0), C (1), r);
- svfloat64_t p23 = svmla_x (pg, C (2), C (3), r);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
svfloat64_t y = svmla_x (pg, r, p04, r2);
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index 6166df65533555a6..75781fb4ddcb9790 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -61,7 +61,7 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
/* scale = 2^(n/N). */
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */
svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
@@ -71,5 +71,4 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
return svmla_x (pg, scale, scale, poly);
}
-
#endif

194
glibc-RHEL-118273-31.patch Normal file

@@ -0,0 +1,194 @@
commit 8f0e7fe61e0a2ad5ed777933703ce09053810ec4
Author: Luna Lamb <luna.lamb@arm.com>
Date: Thu Feb 13 17:52:09 2025 +0000
Aarch64: Improve codegen in SVE asinh
Use unpredicated muls, use lanewise MLAs, and improve memory access.
1% regression in throughput microbenchmark on Neoverse V1.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
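
The key idiom in this patch is packing coefficient pairs so that one svld1rq
broadcast feeds two lanewise multiply-adds. A minimal sketch of one
pairwise-Horner step under that layout (coefficients c1/c3 copied from the
diff below; the even coefficients are passed in):

#include <arm_sve.h>

/* c1 and c3 adjacent, so svld1rq broadcasts both into each 128-bit
   segment at once.  */
static const double c13[2] = { 0x1.3333333326c7p-4, 0x1.f1c71b26fb40dp-6 };

/* One pairwise-Horner step: (e0 + c1*x2) + x4 * (e1 + c3*x2).  */
svfloat64_t
pairwise_step (svbool_t pg, svfloat64_t x2, svfloat64_t x4,
               svfloat64_t e0, svfloat64_t e1)
{
  svfloat64_t c = svld1rq (svptrue_b64 (), c13);
  svfloat64_t p01 = svmla_lane (e0, x2, c, 0); /* e0 + x2 * c1  */
  svfloat64_t p23 = svmla_lane (e1, x2, c, 1); /* e1 + x2 * c3  */
  return svmla_x (pg, p01, x4, p23);
}
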
diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c
index 28dc5c458750bac4..fe8715e06c92ac51 100644
--- a/sysdeps/aarch64/fpu/asinh_sve.c
+++ b/sysdeps/aarch64/fpu/asinh_sve.c
@@ -18,36 +18,49 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
#define SignMask (0x8000000000000000)
#define One (0x3ff0000000000000)
#define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). */
+#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1)
static const struct data
{
- double poly[18];
- double ln2, p3, p1, p4, p0, p2;
- uint64_t n;
- uint64_t off;
+ double even_coeffs[9];
+ double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17;
+ uint64_t off, mask;
} data = {
- /* Polynomial generated using Remez on [2^-26, 1]. */
- .poly
- = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
- 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
- -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
- 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
- -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
- 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 },
+ /* Polynomial generated using Remez on [2^-26, 1]. */
+ .even_coeffs ={
+ -0x1.55555555554a7p-3,
+ -0x1.6db6db68332e6p-5,
+ -0x1.6e8b8b654a621p-6,
+ -0x1.c9871d10885afp-7,
+ -0x1.3ddca533e9f54p-7,
+ -0x1.b90c7099dd397p-8,
+ -0x1.d217026a669ecp-9,
+ -0x1.e0f37daef9127p-11,
+ -0x1.021a48685e287p-14, },
+
+ .c1 = 0x1.3333333326c7p-4,
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c9 = 0x1.0becef748dafcp-7,
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c17 = 0x1.93d4ba83d34dap-18,
+
.ln2 = 0x1.62e42fefa39efp-1,
.p0 = -0x1.ffffffffffff7p-2,
.p1 = 0x1.55555555170d4p-2,
.p2 = -0x1.0000000399c27p-2,
.p3 = 0x1.999b2e90e94cap-3,
.p4 = -0x1.554e550bd501ep-3,
- .n = 1 << V_LOG_TABLE_BITS,
- .off = 0x3fe6900900000000
+ .off = 0x3fe6900900000000,
+ .mask = 0xfffULL << 52,
};
static svfloat64_t NOINLINE
@@ -64,11 +77,10 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
of the algorithm used. */
svuint64_t ix = svreinterpret_u64 (x);
- svuint64_t tmp = svsub_x (pg, ix, d->off);
- svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)),
- (d->n - 1) << 1);
- svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
- svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+ svuint64_t i_off = svsub_x (pg, ix, d->off);
+ svuint64_t i
+ = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask);
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask));
svfloat64_t z = svreinterpret_f64 (iz);
svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
@@ -78,14 +90,14 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1);
svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
- svfloat64_t kd = svcvt_f64_x (pg, k);
+ svfloat64_t kd
+ = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52));
svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0);
- svfloat64_t r2 = svmul_x (pg, r, r);
-
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1);
-
svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0);
+
y = svmla_lane (y, r2, p1_p4, 1);
y = svmla_x (pg, p, r2, y);
y = svmla_x (pg, hi, r2, y);
@@ -111,7 +123,6 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
svuint64_t iax = svbic_x (pg, ix, SignMask);
svuint64_t sign = svand_x (pg, ix, SignMask);
svfloat64_t ax = svreinterpret_f64 (iax);
-
svbool_t ge1 = svcmpge (pg, iax, One);
svbool_t special = svcmpge (pg, iax, Thres);
@@ -120,7 +131,7 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
svfloat64_t option_1 = sv_f64 (0);
if (__glibc_likely (svptest_any (pg, ge1)))
{
- svfloat64_t x2 = svmul_x (pg, ax, ax);
+ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
option_1 = __sv_log_inline (
svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg);
}
@@ -130,21 +141,53 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
The largest observed error in this region is 1.51 ULPs:
_ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
want 0x1.c1e649ee2681dp-1. */
+
svfloat64_t option_2 = sv_f64 (0);
if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1))))
{
- svfloat64_t x2 = svmul_x (pg, ax, ax);
- svfloat64_t x4 = svmul_x (pg, x2, x2);
- svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly);
- option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
+ svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
+ svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2);
+ /* Order-17 Pairwise Horner scheme. */
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
+ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
+ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
+
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1);
+ svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0);
+ svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1);
+ svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0);
+ svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1);
+ svfloat64_t p1213
+ = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0);
+ svfloat64_t p1415
+ = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1);
+ svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17);
+
+ svfloat64_t p = svmla_x (pg, p1415, x4, p1617);
+ p = svmla_x (pg, p1213, x4, p);
+ p = svmla_x (pg, p1011, x4, p);
+ p = svmla_x (pg, p89, x4, p);
+
+ p = svmla_x (pg, p67, x4, p);
+ p = svmla_x (pg, p45, x4, p);
+
+ p = svmla_x (pg, p23, x4, p);
+
+ p = svmla_x (pg, p01, x4, p);
+
+ option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax));
}
- /* Choose the right option for each lane. */
- svfloat64_t y = svsel (ge1, option_1, option_2);
-
if (__glibc_unlikely (svptest_any (pg, special)))
return special_case (
- x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)),
+ x,
+ svreinterpret_f64 (sveor_x (
+ pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)),
special);
+
+ /* Choose the right option for each lane. */
+ svfloat64_t y = svsel (ge1, option_1, option_2);
return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
}

531
glibc-RHEL-118273-32.patch Normal file

@@ -0,0 +1,531 @@
commit ce2f26a22e6b6f5c108d156afd9b43a452bb024c
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Tue Dec 31 18:07:36 2024 +0000
AArch64: Remove PTR_ARG/SIZE_ARG defines
This series removes various ILP32 defines that are no longer needed.
Remove PTR_ARG/SIZE_ARG.
Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
Conflicts:
sysdeps/aarch64/dl-start.S
(Fixup context to apply without out-of-scope dependency 01f52b11de)
sysdeps/aarch64/multiarch/memcpy_thunderx.S
(Dropped by upstream commit e162ab2)
sysdeps/aarch64/multiarch/memcpy_oryon1.S
(Skipped: file from 4dc83cac is out-of-scope)
sysdeps/aarch64/multiarch/memset_oryon1.S
(Skipped: file from 2f1f7a5f is out-of-scope)
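
For context: under the ILP32 ABI, pointer and size arguments arrive as
32-bit values whose upper halves must be zeroed before 64-bit address
arithmetic; on LP64 the macros expanded to nothing, which is why every use
can simply be deleted. Their definition was approximately the following
(from memory, illustrative):

/* sysdeps/aarch64/sysdep.h (approximate): writing a W register
   zero-extends into the corresponding X register.  */
#ifdef __ILP32__
# define PTR_ARG(n)  mov w##n, w##n
# define SIZE_ARG(n) mov w##n, w##n
#else
# define PTR_ARG(n)
# define SIZE_ARG(n)
#endif
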
diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 7b6add751e6bd96b..452ba0da6d788ce8 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -47,8 +47,6 @@ ENTRY (__longjmp)
cfi_offset(d14, JB_D14<<3)
cfi_offset(d15, JB_D15<<3)
- PTR_ARG (0)
-
#if IS_IN(libc)
/* Disable ZA state of SME in libc.a and libc.so, but not in ld.so. */
# if HAVE_AARCH64_PAC_RET
diff --git a/sysdeps/aarch64/__mtag_tag_region.S b/sysdeps/aarch64/__mtag_tag_region.S
index 22e8d8b75372c8aa..90ac17ced4801f21 100644
--- a/sysdeps/aarch64/__mtag_tag_region.S
+++ b/sysdeps/aarch64/__mtag_tag_region.S
@@ -40,9 +40,6 @@
#define zva_val x4
ENTRY (__libc_mtag_tag_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
add dstend, dstin, count
cmp count, 96
diff --git a/sysdeps/aarch64/__mtag_tag_zero_region.S b/sysdeps/aarch64/__mtag_tag_zero_region.S
index 566698e9146e7da8..e975a2f8bdb85ae0 100644
--- a/sysdeps/aarch64/__mtag_tag_zero_region.S
+++ b/sysdeps/aarch64/__mtag_tag_zero_region.S
@@ -40,9 +40,6 @@
#define zva_val x4
ENTRY (__libc_mtag_tag_zero_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
add dstend, dstin, count
cmp count, 96
diff --git a/sysdeps/aarch64/dl-start.S b/sysdeps/aarch64/dl-start.S
index d645484e79858013..b7ac6c31432e07c9 100644
--- a/sysdeps/aarch64/dl-start.S
+++ b/sysdeps/aarch64/dl-start.S
@@ -26,7 +26,6 @@ ENTRY (_start)
mov x30, #0
mov x0, sp
- PTR_ARG (0)
bl _dl_start
/* Returns user entry point in x0. */
mov PTR_REG (21), PTR_REG (0)
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 9b253b39dd1d9d46..0aeaf64edd2594f1 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -75,7 +75,6 @@
.align 2
_dl_tlsdesc_return:
BTI_C
- PTR_ARG (0)
ldr PTR_REG (0), [x0, #PTR_SIZE]
RET
cfi_endproc
@@ -99,7 +98,6 @@ _dl_tlsdesc_undefweak:
BTI_C
str x1, [sp, #-16]!
cfi_adjust_cfa_offset (16)
- PTR_ARG (0)
ldr PTR_REG (0), [x0, #PTR_SIZE]
mrs x1, tpidr_el0
sub PTR_REG (0), PTR_REG (0), PTR_REG (1)
@@ -145,7 +143,6 @@ _dl_tlsdesc_undefweak:
.align 2
_dl_tlsdesc_dynamic:
BTI_C
- PTR_ARG (0)
/* Save just enough registers to support fast path, if we fall
into slow path we will save additional registers. */
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index a9fa40519c78b7df..7173c7fafa7d6eb5 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -57,8 +57,6 @@
exactly which byte matched. */
ENTRY (MEMCHR)
- PTR_ARG (0)
- SIZE_ARG (2)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 5afa79494bf9cb7f..68dfa604f4b1bd43 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -44,10 +44,6 @@
ENTRY (memcmp)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp limit, 16
b.lo L(less16)
ldp data1, data3, [src1]
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index f21c21d3f2a21d89..fba93faeba52447f 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -70,10 +70,6 @@
from the end. */
ENTRY (MEMCPY)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
add srcend, src, count
add dstend, dstin, count
cmp count, 128
@@ -187,10 +183,6 @@ libc_hidden_builtin_def (MEMCPY)
ENTRY (MEMMOVE)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
add srcend, src, count
add dstend, dstin, count
cmp count, 128
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index c5274f5ebf595268..1bd3e230ca197581 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -55,8 +55,6 @@
exactly which byte matched. */
ENTRY (__memrchr)
- PTR_ARG (0)
- SIZE_ARG (2)
add end, srcin, cntin
sub endm1, end, 1
bic src, endm1, 15
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 71814d0b2f6dd3a7..496ad332882a7e3d 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -40,9 +40,6 @@
#define dstend2 x5
ENTRY (MEMSET)
- PTR_ARG (0)
- SIZE_ARG (2)
-
dup v0.16B, valw
cmp count, 16
b.lo L(set_small)
diff --git a/sysdeps/aarch64/multiarch/memchr_nosimd.S b/sysdeps/aarch64/multiarch/memchr_nosimd.S
index 0a65139b0810e95b..b47059de1ee61f71 100644
--- a/sysdeps/aarch64/multiarch/memchr_nosimd.S
+++ b/sysdeps/aarch64/multiarch/memchr_nosimd.S
@@ -60,9 +60,6 @@
ENTRY (__memchr_nosimd)
- PTR_ARG (0)
- SIZE_ARG (2)
-
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(none_chr)
diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
index d826aafd80ed7b0b..fa693f7c3a5c28a3 100644
--- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -96,10 +96,6 @@
ENTRY (__memcpy_a64fx)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cntb vlen
cmp n, vlen, lsl 1
b.hi L(copy_small)
@@ -236,10 +232,6 @@ END (__memcpy_a64fx)
ENTRY_ALIGN (__memmove_a64fx, 4)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
/* Fast case for up to 2 vectors. */
cntb vlen
cmp n, vlen, lsl 1
diff --git a/sysdeps/aarch64/multiarch/memcpy_mops.S b/sysdeps/aarch64/multiarch/memcpy_mops.S
index b094af3d22bc4aeb..2c426f008e699101 100644
--- a/sysdeps/aarch64/multiarch/memcpy_mops.S
+++ b/sysdeps/aarch64/multiarch/memcpy_mops.S
@@ -26,10 +26,6 @@
*/
ENTRY (__memcpy_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */
.inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */
diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S
index 3ce49d79ecdb94e0..26375b47174f1ba8 100644
--- a/sysdeps/aarch64/multiarch/memcpy_sve.S
+++ b/sysdeps/aarch64/multiarch/memcpy_sve.S
@@ -61,10 +61,6 @@
.arch armv8.2-a+sve
ENTRY (__memcpy_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp count, 128
b.hi L(copy_long)
cntb vlen
@@ -144,10 +140,6 @@ END (__memcpy_sve)
ENTRY (__memmove_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
cmp count, 128
b.hi L(move_long)
cntb vlen
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index 5d8438a82ea2a3be..02ea27f356fe8ea1 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -67,10 +67,6 @@
ENTRY (__memmove_thunderx)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
sub tmp1, dstin, src
cmp count, 96
ccmp tmp1, count, 2, hi
diff --git a/sysdeps/aarch64/multiarch/memmove_mops.S b/sysdeps/aarch64/multiarch/memmove_mops.S
index 7df0d22454ead375..229fccd9d5a7abd2 100644
--- a/sysdeps/aarch64/multiarch/memmove_mops.S
+++ b/sysdeps/aarch64/multiarch/memmove_mops.S
@@ -26,10 +26,6 @@
*/
ENTRY (__memmove_mops)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */
.inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index 2e6d882fc931a882..9ea329a82ae7d0f6 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -48,8 +48,6 @@
#define BTI_C
ENTRY (__memset_a64fx)
- PTR_ARG (0)
- SIZE_ARG (2)
cntb vector_length
dup z0.b, valw
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 6d714ed0e1b396ef..5c33280e0f8bf85a 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -28,9 +28,6 @@
ENTRY (__memset_emag)
- PTR_ARG (0)
- SIZE_ARG (2)
-
bfi valw, valw, 8, 8
bfi valw, valw, 16, 16
bfi val, val, 32, 32
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
index 7b215501376cbe03..93f3bfb8cf7238a5 100644
--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -28,9 +28,6 @@
ENTRY (__memset_kunpeng)
- PTR_ARG (0)
- SIZE_ARG (2)
-
dup v0.16B, valw
add dstend, dstin, count
diff --git a/sysdeps/aarch64/multiarch/memset_mops.S b/sysdeps/aarch64/multiarch/memset_mops.S
index e879c81ab2d047b1..f13a0b561078137e 100644
--- a/sysdeps/aarch64/multiarch/memset_mops.S
+++ b/sysdeps/aarch64/multiarch/memset_mops.S
@@ -26,9 +26,6 @@
*/
ENTRY (__memset_mops)
- PTR_ARG (0)
- SIZE_ARG (2)
-
mov x3, x0
.inst 0x19c10443 /* setp [x3]!, x2!, x1 */
.inst 0x19c14443 /* setm [x3]!, x2!, x1 */
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 67dcc94adc587928..3118cd00663b0b25 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -87,7 +87,6 @@
character, return the length, if not, continue in the main loop. */
ENTRY (__strlen_asimd)
- PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
cmp tmp1, MIN_PAGE_SIZE - 32
b.hi L(page_cross)
diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S
index 43fdb1b2fb1b7b78..92dc34e3e9a2650c 100644
--- a/sysdeps/aarch64/setjmp.S
+++ b/sysdeps/aarch64/setjmp.S
@@ -34,8 +34,6 @@ END (_setjmp)
libc_hidden_def (_setjmp)
ENTRY (__sigsetjmp)
- PTR_ARG (0)
-
1:
stp x19, x20, [x0, #JB_X19<<3]
stp x21, x22, [x0, #JB_X21<<3]
diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index ca4c99e6bf9ac960..bc57283361e172ab 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -52,7 +52,6 @@
If it is not a multiple of 4, there was no match. */
ENTRY (strchr)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index e1a1c7eb4383e0f6..09e092bf5f847a7f 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -51,7 +51,6 @@
exactly which byte matched. */
ENTRY (__strchrnul)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S
index 47f6fb1448c464bf..7bf87073be304e0f 100644
--- a/sysdeps/aarch64/strcmp.S
+++ b/sysdeps/aarch64/strcmp.S
@@ -62,8 +62,6 @@
NUL too in big-endian, byte-reverse the data before the NUL check. */
ENTRY(strcmp)
- PTR_ARG (0)
- PTR_ARG (1)
sub off2, src2, src1
mov zeroones, REP8_01
and tmp, src1, 7
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index 705354060055a45e..62fb0248fa5a7ba3 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -69,8 +69,6 @@
exactly which byte matched. */
ENTRY (STRCPY)
- PTR_ARG (0)
- PTR_ARG (1)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index 352fb40d3abbb44b..0d10b6efb7b31e54 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -49,7 +49,6 @@
identifies the first zero byte. */
ENTRY (STRLEN)
- PTR_ARG (0)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index e4fb3506a80756b3..2a2264c0e5427225 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -49,8 +49,6 @@
identifies the first zero byte. */
ENTRY (__strnlen)
- PTR_ARG (0)
- SIZE_ARG (1)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
index e52c9b275347978c..402bce444ef3bb28 100644
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@@ -55,7 +55,6 @@
if the relevant byte matched the NUL end of string. */
ENTRY (strrchr)
- PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
movi vrepmask.16b, 0x33
diff --git a/sysdeps/unix/sysv/linux/aarch64/clone.S b/sysdeps/unix/sysv/linux/aarch64/clone.S
index 0e7ee24e68c85377..fed19acc2f78351f 100644
--- a/sysdeps/unix/sysv/linux/aarch64/clone.S
+++ b/sysdeps/unix/sysv/linux/aarch64/clone.S
@@ -33,12 +33,6 @@
*/
.text
ENTRY(__clone)
- PTR_ARG (0)
- PTR_ARG (1)
- PTR_ARG (3)
- PTR_ARG (4)
- PTR_ARG (5)
- PTR_ARG (6)
/* Save args for the child. */
mov x10, x0
mov x11, x2
diff --git a/sysdeps/unix/sysv/linux/aarch64/clone3.S b/sysdeps/unix/sysv/linux/aarch64/clone3.S
index 92d69a5430518cbc..9b00b6b8853e9b8b 100644
--- a/sysdeps/unix/sysv/linux/aarch64/clone3.S
+++ b/sysdeps/unix/sysv/linux/aarch64/clone3.S
@@ -36,10 +36,6 @@
.text
ENTRY(__clone3)
- PTR_ARG (0)
- PTR_ARG (1)
- PTR_ARG (3)
- PTR_ARG (4)
/* Save args for the child. */
mov x10, x0 /* cl_args */
mov x11, x2 /* func */
diff --git a/sysdeps/unix/sysv/linux/aarch64/getcontext.S b/sysdeps/unix/sysv/linux/aarch64/getcontext.S
index e5b69c9a82b7a448..862bd67aa484ae1a 100644
--- a/sysdeps/unix/sysv/linux/aarch64/getcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/getcontext.S
@@ -30,7 +30,6 @@
.text
ENTRY(__getcontext)
- PTR_ARG (0)
/* The saved context will return to the getcontext() call point
with a return value of 0 */
str xzr, [x0, oX0 + 0 * SZREG]
diff --git a/sysdeps/unix/sysv/linux/aarch64/setcontext.S b/sysdeps/unix/sysv/linux/aarch64/setcontext.S
index ba659438c564dc3b..8c072781cdf98c2b 100644
--- a/sysdeps/unix/sysv/linux/aarch64/setcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/setcontext.S
@@ -34,7 +34,6 @@
.text
ENTRY (__setcontext)
- PTR_ARG (0)
/* Save a copy of UCP. */
mov x9, x0
diff --git a/sysdeps/unix/sysv/linux/aarch64/swapcontext.S b/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
index f049140d35b79ba6..7000f220368bb094 100644
--- a/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
@@ -27,7 +27,6 @@
.text
ENTRY(__swapcontext)
- PTR_ARG (0)
/* Set the value returned when swapcontext() returns in this context.
And set up x1 to become the return address of the caller, so we
can return there with a normal RET instead of an indirect jump. */

113
glibc-RHEL-118273-33.patch Normal file

@@ -0,0 +1,113 @@
commit cf56eb28fa277d9dbb301654682ca89f71c30a48
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Tue Mar 18 17:07:31 2025 +0000
AArch64: Optimize algorithm in users of SVE expf helper
The polynomial order was unnecessarily high; reducing it unlocks multiple
optimizations.
Max error for the new SVE expf is 0.88 +0.5 ULP.
Max error for the new SVE coshf is 2.56 +0.5 ULP.
Performance improvement on Neoverse V1: expf (30%), coshf (26%).
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
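
In other words, FEXPA already supplies 2^n exactly, so once |r| is reduced
to at most ln2/128 the quadratic r + 0.5*r^2 reaches the quoted accuracy. A
condensed sketch of the resulting kernel (constants copied from the diff
below; special-case handling elided, illustrative only):

#include <arm_sve.h>

svfloat32_t
expf_sketch (svfloat32_t x, svbool_t pg)
{
  /* z = x/ln2 + shift.  The shift (1.5*2^17 + 127) quantizes n to
     multiples of 1/64 and pre-biases the bits that FEXPA consumes.  */
  svfloat32_t z = svmla_x (pg, svdup_f32 (0x1.803f8p17f), x, 0x1.715476p+0f);
  svfloat32_t n = svsub_x (pg, z, 0x1.803f8p17f);
  /* r = x - n*ln2, in two steps for accuracy; |r| <= ln2/128.  */
  svfloat32_t r = svmls_x (pg, x, n, 0x1.62e4p-1f);
  r = svmls_x (pg, r, n, 0x1.7f7d1cp-20f);
  /* scale = 2^n via FEXPA; a short polynomial is now sufficient.  */
  svfloat32_t scale = svexpa (svreinterpret_u32 (z));
  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
  svfloat32_t poly = svmla_x (pg, r, r2, 0.5f);
  return svmla_x (pg, scale, scale, poly);
}
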
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
index 7ad6efa0fc218278..508c0790ee89e0cd 100644
--- a/sysdeps/aarch64/fpu/coshf_sve.c
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -39,9 +39,9 @@ special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
}
/* Single-precision vector cosh, using vector expf.
- Maximum error is 2.77 ULP:
- _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
- want 0x1.e4594cp+2. */
+ Maximum error is 2.56 +0.5 ULP:
+ _ZGVsMxv_coshf(-0x1.5b40f4p+1) got 0x1.e47748p+2
+ want 0x1.e4774ep+2. */
svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
index da93e01b87e0e890..aee86a203379efb3 100644
--- a/sysdeps/aarch64/fpu/expf_sve.c
+++ b/sysdeps/aarch64/fpu/expf_sve.c
@@ -40,9 +40,9 @@ special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
}
/* Optimised single-precision SVE exp function.
- Worst-case error is 1.04 ulp:
- SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
- want 0x1.ba74bap+4. */
+ Worst-case error is 0.88 +0.50 ULP:
+ _ZGVsMxv_expf(-0x1.bba276p-6) got 0x1.f25288p-1
+ want 0x1.f2528ap-1. */
svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index 75781fb4ddcb9790..01fbb4d4c046eb3b 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -24,50 +24,40 @@
struct sv_expf_data
{
- float c1, c3, inv_ln2;
- float ln2_lo, c0, c2, c4;
- float ln2_hi, shift;
+ float ln2_hi, ln2_lo, c1, null;
+ float inv_ln2, shift;
};
-/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
+/* Shift is 1.5*2^17 + 127. */
#define SV_EXPF_DATA \
{ \
- /* Coefficients copied from the polynomial in AdvSIMD variant. */ \
- .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \
- .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \
- .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
- .shift = 0x1.803f8p17f, \
+ .c1 = 0.5f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
+ .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
}
-#define C(i) sv_f32 (d->poly[i])
-
static inline svfloat32_t
expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
{
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_hi);
/* n = round(x/(ln2/N)). */
svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
svfloat32_t n = svsub_x (pg, z, d->shift);
/* r = x - n*ln2/N. */
- svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
+ svfloat32_t r = x;
r = svmls_lane (r, n, lane_consts, 0);
+ r = svmls_lane (r, n, lane_consts, 1);
/* scale = 2^(n/N). */
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
- /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */
- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ /* poly(r) = exp(r) - 1 ~= r + 0.5 r^2. */
svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+ svfloat32_t poly = svmla_lane (r, r2, lane_consts, 2);
return svmla_x (pg, scale, scale, poly);
}

217
glibc-RHEL-118273-34.patch Normal file

@@ -0,0 +1,217 @@
commit 4352e2cc934b2874dba37397157bf890fcee455a
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Fri Mar 28 14:27:45 2025 -0300
aarch64: Fix _dl_tlsdesc_dynamic unwind for pac-ret (BZ 32612)
When libgcc is built with pac-ret, it needs to authenticate the
unwinding frame based on CFI information. The _dl_tlsdesc_dynamic
function uses a custom calling convention, where it is responsible for
saving and restoring all registers it might use (even volatile ones).
The pac-ret support added by 1be3d6eb823d8b952fa54b7bbc90cbecb8981380
covered only the slow path, but the fast path also emits DWARF
register rule instructions (cfi_adjust_cfa_offset) since it needs to
save/restore some auxiliary registers. This does not seem to be fully
supported by either libgcc or the AArch64 ABI [1].
Instead, move paciasp/autiasp to the function prologue/epilogue so they
are used on both the fast and slow paths.
I also corrected the _dl_tlsdesc_dynamic comment description; it had
been copied from the i386 implementation without any adjustment.
Checked on aarch64-linux-gnu with a toolchain built with
--enable-standard-branch-protection on a system with pac-ret
support.
[1] https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst#id1
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
Conflicts:
sysdeps/unix/sysv/linux/aarch64/Makefile
(Fixup context to apply without out-of-scope dependency f4d00dd60d)
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 0aeaf64edd2594f1..36195c956855e024 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -119,20 +119,19 @@ _dl_tlsdesc_undefweak:
object referenced by the argument.
ptrdiff_t
- __attribute__ ((__regparm__ (1)))
_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
{
struct tlsdesc_dynamic_arg *td = tdp->arg;
- dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
+ dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer() + TCBHEAD_DTV);
if (__builtin_expect (td->gen_count <= dtv[0].counter
&& (dtv[td->tlsinfo.ti_module].pointer.val
!= TLS_DTV_UNALLOCATED),
1))
return dtv[td->tlsinfo.ti_module].pointer.val
+ td->tlsinfo.ti_offset
- - __thread_pointer;
+ - __thread_pointer();
- return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+ return __tls_get_addr (&td->tlsinfo) - __thread_pointer();
}
*/
@@ -142,7 +141,12 @@ _dl_tlsdesc_undefweak:
cfi_startproc
.align 2
_dl_tlsdesc_dynamic:
+# if HAVE_AARCH64_PAC_RET
+ PACIASP
+ cfi_window_save
+# else
BTI_C
+# endif
/* Save just enough registers to support fast path, if we fall
into slow path we will save additional registers. */
@@ -173,6 +177,10 @@ _dl_tlsdesc_dynamic:
1:
ldp x3, x4, [sp, #16]
ldp x1, x2, [sp], #32
+# if HAVE_AARCH64_PAC_RET
+ AUTIASP
+ cfi_window_save
+# endif
cfi_adjust_cfa_offset (-32)
RET
2:
@@ -182,10 +190,6 @@ _dl_tlsdesc_dynamic:
/* Save the remaining registers that we must treat as caller save. */
cfi_restore_state
-# if HAVE_AARCH64_PAC_RET
- PACIASP
- cfi_window_save
-# endif
# define NSAVEXREGPAIRS 8
stp x29, x30, [sp,#-16*NSAVEXREGPAIRS]!
cfi_adjust_cfa_offset (16*NSAVEXREGPAIRS)
@@ -236,10 +240,6 @@ _dl_tlsdesc_dynamic:
cfi_adjust_cfa_offset (-16*NSAVEXREGPAIRS)
cfi_restore (x29)
cfi_restore (x30)
-# if HAVE_AARCH64_PAC_RET
- AUTIASP
- cfi_window_save
-# endif
b 1b
cfi_endproc
.size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
diff --git a/sysdeps/unix/sysv/linux/aarch64/Makefile b/sysdeps/unix/sysv/linux/aarch64/Makefile
index 40b9a2e5dea1ea89..607a0c56d8dfad8d 100644
--- a/sysdeps/unix/sysv/linux/aarch64/Makefile
+++ b/sysdeps/unix/sysv/linux/aarch64/Makefile
@@ -1,3 +1,16 @@
+ifeq ($(subdir),elf)
+tests += \
+ tst-tlsdesc-pac \
+ # tests
+modules-names += \
+ tst-tlsdesc-pac-mod \
+ # modules-names
+
+LDFLAGS-tst-tlsdesc-pac = -rdynamic
+
+$(objpfx)tst-tlsdesc-pac.out: $(objpfx)tst-tlsdesc-pac-mod.so
+endif
+
ifeq ($(subdir),misc)
sysdep_headers += sys/elf.h
endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac-mod.c b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac-mod.c
new file mode 100644
index 0000000000000000..d34c8beda9b1986d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac-mod.c
@@ -0,0 +1,27 @@
+/* AArch64 tests for unwinding TLSDESC (BZ 32612)
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+_Thread_local int foo;
+/* Make the TLS segment large enough to trigger _dl_tlsdesc_dynamic. */
+_Thread_local int foobar[1000];
+
+void
+bar (void)
+{
+ foo = 1;
+}
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac.c b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac.c
new file mode 100644
index 0000000000000000..24d656aafc2784b4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-tlsdesc-pac.c
@@ -0,0 +1,48 @@
+/* AArch64 tests for unwinding TLSDESC (BZ 32612)
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdlib.h>
+#include <unwind.h>
+#include <support/xdlfcn.h>
+
+static _Unwind_Reason_Code
+unwind_callback (struct _Unwind_Context* context, void* closure)
+{
+ return _URC_NO_REASON;
+}
+
+/* Assume that TLS variable from tst-tlsdesc-pac-mod.so will trigger
+ the slow-path that allocates the required memory with malloc. */
+void *
+malloc (size_t s)
+{
+ _Unwind_Backtrace (unwind_callback, NULL);
+ return calloc (1, s);
+}
+
+static int
+do_test (void)
+{
+ void *h = xdlopen ("tst-tlsdesc-pac-mod.so", RTLD_LAZY);
+ void (*func)(void) = xdlsym (h, "bar");
+ func ();
+
+ return 0;
+}
+
+#include <support/test-driver.c>


@ -0,0 +1,76 @@
commit 691edbdf7727466ba87e27a8eeae1c3bc5824ef5
Author: Yury Khrustalev <yury.khrustalev@arm.com>
Date: Thu May 8 13:53:38 2025 +0100
aarch64: fix unwinding in longjmp
Previously, longjmp() on aarch64 was using CFI directives around the
call to __libc_arm_za_disable() after CFA was redefined at the start
of longjmp(). This may result in unwinding issues. Move the call and
surrounding CFI directives to the beginning of longjmp().
Suggested-by: Wilco Dijkstra <wilco.dijkstra@arm.com>
diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 452ba0da6d788ce8..30b36cb25d921795 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -24,28 +24,6 @@
/* __longjmp(jmpbuf, val) */
ENTRY (__longjmp)
- cfi_def_cfa(x0, 0)
- cfi_offset(x19, JB_X19<<3)
- cfi_offset(x20, JB_X20<<3)
- cfi_offset(x21, JB_X21<<3)
- cfi_offset(x22, JB_X22<<3)
- cfi_offset(x23, JB_X23<<3)
- cfi_offset(x24, JB_X24<<3)
- cfi_offset(x25, JB_X25<<3)
- cfi_offset(x26, JB_X26<<3)
- cfi_offset(x27, JB_X27<<3)
- cfi_offset(x28, JB_X28<<3)
- cfi_offset(x29, JB_X29<<3)
- cfi_offset(x30, JB_LR<<3)
-
- cfi_offset( d8, JB_D8<<3)
- cfi_offset( d9, JB_D9<<3)
- cfi_offset(d10, JB_D10<<3)
- cfi_offset(d11, JB_D11<<3)
- cfi_offset(d12, JB_D12<<3)
- cfi_offset(d13, JB_D13<<3)
- cfi_offset(d14, JB_D14<<3)
- cfi_offset(d15, JB_D15<<3)
#if IS_IN(libc)
/* Disable ZA state of SME in libc.a and libc.so, but not in ld.so. */
@@ -69,6 +47,29 @@ ENTRY (__longjmp)
# endif
#endif
+ cfi_def_cfa (x0, 0)
+ cfi_offset (x19, JB_X19<<3)
+ cfi_offset (x20, JB_X20<<3)
+ cfi_offset (x21, JB_X21<<3)
+ cfi_offset (x22, JB_X22<<3)
+ cfi_offset (x23, JB_X23<<3)
+ cfi_offset (x24, JB_X24<<3)
+ cfi_offset (x25, JB_X25<<3)
+ cfi_offset (x26, JB_X26<<3)
+ cfi_offset (x27, JB_X27<<3)
+ cfi_offset (x28, JB_X28<<3)
+ cfi_offset (x29, JB_X29<<3)
+ cfi_offset (x30, JB_LR<<3)
+
+ cfi_offset ( d8, JB_D8<<3)
+ cfi_offset ( d9, JB_D9<<3)
+ cfi_offset (d10, JB_D10<<3)
+ cfi_offset (d11, JB_D11<<3)
+ cfi_offset (d12, JB_D12<<3)
+ cfi_offset (d13, JB_D13<<3)
+ cfi_offset (d14, JB_D14<<3)
+ cfi_offset (d15, JB_D15<<3)
+
ldp x19, x20, [x0, #JB_X19<<3]
ldp x21, x22, [x0, #JB_X21<<3]
ldp x23, x24, [x0, #JB_X23<<3]


@ -0,0 +1,29 @@
commit aa18367c1169700f610565eba8acf3e08429fcf5
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu May 29 15:08:15 2025 +0000
AArch64: Improve enabling of SVE for libmvec
When a -mcpu option is present in CFLAGS, GCC can report errors while building libmvec.
Fix this by overriding both -mcpu and -march with a generic variant that has SVE enabled.
Also tune for a modern SVE core.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index be8541f6496d6688..aa547b21df5f41d9 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -49,8 +49,11 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
v_powf_data
endif
-sve-cflags = -march=armv8-a+sve
+# Enable SVE for building libmvec. Since CFLAGS may contain a -mcpu or -march,
+# add a generic -mcpu and -march with SVE enabled. Also use a tune for a modern
+# SVE core.
+sve-cflags = -mcpu=generic+sve -march=armv8-a+sve -mtune=neoverse-v2
ifeq ($(build-mathvec),yes)
bench-libmvec = $(addprefix float-advsimd-,$(float-advsimd-funcs)) \


@ -0,0 +1,24 @@
commit 09795c5612c630db605886dfd55dbf56f381d128
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Fri Jun 6 13:15:30 2025 +0000
AArch64: Fix builderror with GCC 12.1/12.2
Early versions of GCC 12 didn't support -mtune=neoverse-v2, so use
-mtune=neoverse-v1 instead.
Reported-by: Yury Khrustalev <yury.khrustalev@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index aa547b21df5f41d9..c8a6fb4628d13aec 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -53,7 +53,7 @@ endif
# add a generic -mcpu and -march with SVE enabled. Also use a tune for a modern
# SVE core.
-sve-cflags = -mcpu=generic+sve -march=armv8-a+sve -mtune=neoverse-v2
+sve-cflags = -mcpu=generic+sve -march=armv8-a+sve -mtune=neoverse-v1
ifeq ($(build-mathvec),yes)
bench-libmvec = $(addprefix float-advsimd-,$(float-advsimd-funcs)) \

188
glibc-RHEL-118273-38.patch Normal file

@ -0,0 +1,188 @@
commit 6849c5b791edd216f2ec3fdbe4d138bc69b9b333
Author: Luna Lamb <luna.lamb@arm.com>
Date: Wed Jun 18 16:12:19 2025 +0000
AArch64: Improve codegen SVE log1p helper
Improve codegen by packing coefficients.
4% and 2% improvements in the throughput microbenchmark on Neoverse V1 for acosh
and atanh, respectively.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c
index 3e4faaa5ca686c18..78ebcffbb5737641 100644
--- a/sysdeps/aarch64/fpu/acosh_sve.c
+++ b/sysdeps/aarch64/fpu/acosh_sve.c
@@ -30,10 +30,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
}
/* SVE approximation for double-precision acosh, based on log1p.
- The largest observed error is 3.19 ULP in the region where the
+ The largest observed error is 3.14 ULP in the region where the
argument to log1p falls in the k=0 interval, i.e. x close to 1:
- SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
- want 0x1.ed23399f51373p-2. */
+ SV_NAME_D1 (acosh)(0x1.1e80ed12f0ad1p+0) got 0x1.ef0cee7c33ce1p-2
+ want 0x1.ef0cee7c33ce4p-2. */
svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
{
/* (ix - One) >= (BigBound - One). */
diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c
index 7a52728d70f6d226..a4803e5c1305379e 100644
--- a/sysdeps/aarch64/fpu/atanh_sve.c
+++ b/sysdeps/aarch64/fpu/atanh_sve.c
@@ -30,7 +30,7 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
}
/* SVE approximation for double-precision atanh, based on log1p.
- The greatest observed error is 2.81 ULP:
+ The greatest observed error is 3.3 ULP:
_ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
want 0x1.ffd8ff31b501cp-6. */
svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
@@ -42,7 +42,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
/* It is special if iax >= 1. */
-// svbool_t special = svcmpge (pg, iax, One);
svbool_t special = svacge (pg, x, 1.0);
/* Computation is performed based on the following sequence of equality:
diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h
index da019674f94dbac7..a9ecd75d19e95d39 100644
--- a/sysdeps/aarch64/fpu/sv_log1p_inline.h
+++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h
@@ -21,11 +21,12 @@
#define AARCH64_FPU_SV_LOG1P_INLINE_H
#include "sv_math.h"
-#include "poly_sve_f64.h"
static const struct sv_log1p_data
{
- double poly[19], ln2[2];
+ double c0, c2, c4, c6, c8, c10, c12, c14, c16;
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
+ double ln2_lo, ln2_hi;
uint64_t hf_rt2_top;
uint64_t one_m_hf_rt2_top;
uint32_t bottom_mask;
@@ -33,15 +34,30 @@ static const struct sv_log1p_data
} sv_log1p_data = {
/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1].
*/
- .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
- 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
- -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
- 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
- -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
- 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
- -0x1.cfa7385bdb37ep-6 },
- .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 },
+ .c0 = -0x1.ffffffffffffbp-2,
+ .c1 = 0x1.55555555551a9p-2,
+ .c2 = -0x1.00000000008e3p-2,
+ .c3 = 0x1.9999999a32797p-3,
+ .c4 = -0x1.555555552fecfp-3,
+ .c5 = 0x1.249248e071e5ap-3,
+ .c6 = -0x1.ffffff8bf8482p-4,
+ .c7 = 0x1.c71c8f07da57ap-4,
+ .c8 = -0x1.9999ca4ccb617p-4,
+ .c9 = 0x1.7459ad2e1dfa3p-4,
+ .c10 = -0x1.554d2680a3ff2p-4,
+ .c11 = 0x1.3b4c54d487455p-4,
+ .c12 = -0x1.2548a9ffe80e6p-4,
+ .c13 = 0x1.0f389a24b2e07p-4,
+ .c14 = -0x1.eee4db15db335p-5,
+ .c15 = 0x1.e95b494d4a5ddp-5,
+ .c16 = -0x1.15fdf07cb7c73p-4,
+ .c17 = 0x1.0310b70800fcfp-4,
+ .c18 = -0x1.cfa7385bdb37ep-6,
+ .ln2_lo = 0x1.62e42fefa3800p-1,
+ .ln2_hi = 0x1.ef35793c76730p-45,
+ /* top32(asuint64(sqrt(2)/2)) << 32. */
.hf_rt2_top = 0x3fe6a09e00000000,
+ /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
.one_m_hf_rt2_top = 0x00095f6200000000,
.bottom_mask = 0xffffffff,
.one_top = 0x3ff
@@ -51,14 +67,14 @@ static inline svfloat64_t
sv_log1p_inline (svfloat64_t x, const svbool_t pg)
{
/* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which
- differs from v_log1p_2u5.c by:
+ differs from advsimd/log1p.c by:
- No special-case handling - this should be dealt with by the caller.
- Pairwise Horner polynomial evaluation for improved accuracy.
- Optionally simulate the shortcut for k=0, used in the scalar routine,
using svsel, for improved accuracy when the argument to log1p is close
to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1
in the source of the caller before including this file.
- See sv_log1p_2u1.c for details of the algorithm. */
+ See sve/log1p.c for details of the algorithm. */
const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data);
svfloat64_t m = svadd_x (pg, x, 1);
svuint64_t mi = svreinterpret_u64 (m);
@@ -79,7 +95,7 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
svfloat64_t cm;
#ifndef WANT_SV_LOG1P_K0_SHORTCUT
-#error \
+#error \
"Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
#elif WANT_SV_LOG1P_K0_SHORTCUT
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
@@ -96,14 +112,46 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
#endif
/* Approximate log1p(f) on the reduced input using a polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly);
+ svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f),
+ f4 = svmul_x (svptrue_b64 (), f2, f2),
+ f8 = svmul_x (svptrue_b64 (), f4, f4),
+ f16 = svmul_x (svptrue_b64 (), f8, f8);
+
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
+ svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
+ svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
+ svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17);
+
+ /* Order-18 Estrin scheme. */
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1);
+ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0);
+ svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1);
+
+ svfloat64_t p03 = svmla_x (pg, p01, f2, p23);
+ svfloat64_t p47 = svmla_x (pg, p45, f2, p67);
+ svfloat64_t p07 = svmla_x (pg, p03, f4, p47);
+
+ svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0);
+ svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1);
+ svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0);
+ svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1);
+
+ svfloat64_t p811 = svmla_x (pg, p89, f2, p1011);
+ svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415);
+ svfloat64_t p815 = svmla_x (pg, p811, f4, p1215);
+
+ svfloat64_t p015 = svmla_x (pg, p07, f8, p815);
+ svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0);
+ svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1);
+ svfloat64_t p = svmla_x (pg, p015, f16, p1618);
/* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
- svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]);
- svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]);
+ svfloat64_t ln2_lo_hi = svld1rq (svptrue_b64 (), &d->ln2_lo);
+ svfloat64_t ylo = svmla_lane (cm, k, ln2_lo_hi, 0);
+ svfloat64_t yhi = svmla_lane (f, k, ln2_lo_hi, 1);
- return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
+ return svmad_x (pg, p, f2, svadd_x (pg, ylo, yhi));
}
-
#endif
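
For reference, the Estrin scheme introduced above can be illustrated with a minimal scalar sketch (plain C; estrin_7 and the order-7 polynomial are hypothetical, not code from the patch). Each level combines independent FMAs on x, x^2 and x^4, so the dependency chain is logarithmic in the degree rather than linear as with Horner; in the SVE version the packed coefficient pairs are loaded once with svld1rq and consumed via svmla_lane.

/* Hypothetical order-7 Estrin evaluation of
   c[0] + c[1]*x + ... + c[7]*x^7.  */
static double
estrin_7 (double x, const double c[8])
{
  double x2 = x * x;
  double x4 = x2 * x2;
  double p01 = c[0] + c[1] * x; /* These four products are  */
  double p23 = c[2] + c[3] * x; /* independent of each other.  */
  double p45 = c[4] + c[5] * x;
  double p67 = c[6] + c[7] * x;
  double p03 = p01 + p23 * x2;
  double p47 = p45 + p67 * x2;
  return p03 + p47 * x4;
}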

583
glibc-RHEL-118273-39.patch Normal file

@ -0,0 +1,583 @@
commit dee22d2a81ab59afc165fb6dcb45d723f13582a0
Author: Dylan Fleming <Dylan.Fleming@arm.com>
Date: Wed Jun 18 16:19:22 2025 +0000
AArch64: Optimise SVE FP64 Hyperbolics
Rework the SVE FP64 hyperbolics to use the SVE FEXPA
instruction.
Also update the special-case handling for large
inputs to be entirely vectorised.
Performance improvements on Neoverse V1:
cosh_sve: 19% for |x| < 709, 5x otherwise
sinh_sve: 24% for |x| < 709, 5.9x otherwise
tanh_sve: 12% for |x| < 19, 9x otherwise
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
index e375dd8a3407feb2..3561893ae614e2ea 100644
--- a/sysdeps/aarch64/fpu/cosh_sve.c
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -21,71 +21,99 @@
static const struct data
{
- float64_t poly[3];
- float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ double c0, c2;
+ double c1, c3;
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift;
uint64_t special_bound;
} data = {
- .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
- 0x1.5555576a59599p-5, },
-
- .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
- /* -ln2/N. */
- .ln2_hi = -0x1.62e42fefa39efp-9,
- .ln2_lo = -0x1.abc9e3b39803f3p-64,
- .shift = 0x1.8p+52,
- .thres = 704.0,
-
- /* 0x1.6p9, above which exp overflows. */
- .special_bound = 0x4086000000000000,
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1.fffffffffdbcdp-2,
+ .c1 = 0x1.555555555444cp-3,
+ .c2 = 0x1.555573c6a9f7dp-5,
+ .c3 = 0x1.1111266d28935p-7,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ /* 1/ln2. */
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022. */
+
+ /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows. */
+ .special_bound = 0x40862e37e7d8ba72,
};
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special)
-{
- svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
- svfloat64_t y = svadd_x (pg, half_t, half_over_t);
- return sv_call_f64 (cosh, x, y, special);
-}
-
-/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
- special-case handling or tail. */
+/* Helper for approximating exp(x)/2.
+ Functionally identical to FEXPA exp(x), but an adjustment in
+ the shift value which leads to a reduction in the exponent of scale by 1,
+ thus halving the result at no cost. */
static inline svfloat64_t
-exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
{
/* Calculate exp(x). */
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
svfloat64_t n = svsub_x (pg, z, d->shift);
- svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
- r = svmla_x (pg, r, n, d->ln2_lo);
+ svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
- svuint64_t u = svreinterpret_u64 (z);
- svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
- svuint64_t i = svand_x (svptrue_b64 (), u, 0xff);
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
- svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
- y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
- y = svmla_x (pg, sv_f64 (1.0), r, y);
- y = svmul_x (svptrue_b64 (), r, y);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+ svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
+ svfloat64_t p = svmla_x (pg, r, p04, r2);
- /* s = 2^(n/N). */
- u = svld1_gather_index (pg, __v_exp_tail_data, i);
- svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
+ svfloat64_t scale = svexpa (u);
- return svmla_x (pg, s, s, y);
+ return svmla_x (pg, scale, scale, p);
+}
+
+/* Vectorised special case to handle values past where exp_inline overflows.
+ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
+ the valid range of inputs, and returns inf for anything past that. */
+static svfloat64_t NOINLINE
+special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t,
+ const struct data *d)
+{
+ /* Finish fast path to compute values for non-special cases. */
+ svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25);
+ svfloat64_t y = svadd_x (pg, t, inv_twoexp);
+
+ /* Halves input value, and then check if any cases
+ are still going to overflow. */
+ ax = svmul_x (special, ax, 0.5);
+ svbool_t is_safe
+ = svcmplt (special, svreinterpret_u64 (ax), d->special_bound);
+
+ /* Computes exp(x/2), and sets any overflowing lanes to inf. */
+ svfloat64_t half_exp = exp_over_two_inline (special, ax, d);
+ half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY));
+
+ /* Construct special case cosh(x) = (exp(x/2)^2)/2. */
+ svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2);
+ svfloat64_t special_y = svmul_x (special, exp, half_exp);
+
+ /* Select correct return values for special and non-special cases. */
+ special_y = svsel (special, special_y, y);
+
+ /* Ensure an input of nan is correctly propagated. */
+ svbool_t is_nan
+ = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000));
+ return svsel (is_nan, ax, svsel (special, special_y, y));
}
/* Approximation for SVE double-precision cosh(x) using exp_inline.
cosh(x) = (exp(x) + exp(-x)) / 2.
- The greatest observed error is in the scalar fall-back region, so is the
- same as the scalar routine, 1.93 ULP:
- _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
- want 0x1.fd774e958236fp+1021.
-
- The greatest observed error in the non-special region is 1.54 ULP:
- _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
- want 0x1.f5e2bb8d5c991p+8. */
+ The greatest observed error in special case region is 2.66 + 0.5 ULP:
+ _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023
+ want 0x1.f9b2d3d22399bp+1023
+
+ The greatest observed error in the non-special region is 1.01 + 0.5 ULP:
+ _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3
+ want 0x1.890b225657f82p+3. */
svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
@@ -94,14 +122,13 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
/* Up to the point that exp overflows, we can use it to calculate cosh by
- exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
- svfloat64_t t = exp_inline (ax, pg, d);
+ (exp(|x|)/2 + 1) / (2 * exp(|x|)). */
+ svfloat64_t half_exp = exp_over_two_inline (pg, ax, d);
- /* Fall back to scalar for any special cases. */
+ /* Falls back to entirely standalone vectorized special case. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, pg, t, special);
+ return special_case (pg, special, ax, half_exp, d);
- svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
- svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
- return svadd_x (pg, half_t, half_over_t);
+ svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25);
+ return svadd_x (pg, half_exp, inv_twoexp);
}
diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c
index df5f6c8c06e5b173..ac7b306018bda613 100644
--- a/sysdeps/aarch64/fpu/sinh_sve.c
+++ b/sysdeps/aarch64/fpu/sinh_sve.c
@@ -18,90 +18,153 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
static const struct data
{
- float64_t poly[11];
- float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift;
uint64_t halff;
- int64_t onef;
- uint64_t large_bound;
+ double c2, c4;
+ double inv_ln2;
+ double ln2_hi, ln2_lo;
+ double c0, c1, c3;
+ double shift, special_bound, bound;
+ uint64_t expm1_data[20];
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .inv_ln2 = 0x1.71547652b82fep0,
- .m_ln2_hi = -0x1.62e42fefa39efp-1,
- .m_ln2_lo = -0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
-
+ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
+ .expm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
+ },
+
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
.halff = 0x3fe0000000000000,
- .onef = 0x3ff0000000000000,
- /* 2^9. expm1 helper overflows for large input. */
- .large_bound = 0x4080000000000000,
+ .special_bound = 0x1.62e37e7d8ba72p+9, /* ln(2^(1024 - 1/128)). */
+ .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64. */
};
+/* A specialised FEXPA expm1 that is only valid for positive inputs and
+ has no special cases. Based off the full FEXPA expm1 implementated for
+ _ZGVsMxv_expm1, with a slightly modified file to keep sinh under 3.5ULP. */
static inline svfloat64_t
-expm1_inline (svfloat64_t x, svbool_t pg)
+expm1_inline (svbool_t pg, svfloat64_t x)
{
const struct data *d = ptr_barrier (&data);
- /* Reduce argument:
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where i = round(x / ln2)
- and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
- svfloat64_t j
- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
- svint64_t i = svcvt_s64_x (pg, j);
- svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi);
- f = svmla_x (pg, f, j, d->m_ln2_lo);
- /* Approximate expm1(f) using polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t f8 = svmul_x (pg, f4, f4);
- svfloat64_t p
- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
- /* t = 2^i. */
- svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
- /* expm1(x) ~= p * t + (t - 1). */
- return svmla_x (pg, svsub_x (pg, t, 1.0), p, t);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+
+ /* We want to construct expm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA. */
+ svbool_t is_small = svaclt (pg, x, d->bound);
+
+ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */
+ svuint64_t base_idx = svand_x (pg, u, 0x1f);
+
+ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */
+ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
+ svuint64_t scalem1_lookup
+ = svld1_gather_index (is_small, d->expm1_data, base_idx);
+
+ /* Select the appropriate scale - 1 value based on x. */
+ svfloat64_t scalem1
+ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
+
+ /* return expm1 = scale - 1 + (scale * poly). */
+ return svmla_x (pg, scalem1, scale, p);
}
+/* Vectorised special case to handle values past where exp_inline overflows.
+ Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
+ the valid range of inputs, and returns inf for anything past that. */
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svbool_t pg)
+special_case (svbool_t pg, svbool_t special, svfloat64_t ax,
+ svfloat64_t halfsign, const struct data *d)
{
- return sv_call_f64 (sinh, x, x, pg);
+ /* Halves input value, and then check if any cases
+ are still going to overflow. */
+ ax = svmul_x (special, ax, 0.5);
+ svbool_t is_safe = svaclt (special, ax, d->special_bound);
+
+ svfloat64_t t = expm1_inline (pg, ax);
+
+ /* Finish fastpass to compute values for non-special cases. */
+ svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+ y = svmul_x (pg, y, halfsign);
+
+ /* Computes special lane, and set remaining overflow lanes to inf. */
+ svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign);
+ svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t);
+
+ svuint64_t signed_inf
+ = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign),
+ sv_u64 (0x7ff0000000000000));
+ special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf));
+
+ /* Join resulting vectors together and return. */
+ return svsel (special, special_y, y);
}
-/* Approximation for SVE double-precision sinh(x) using expm1.
- sinh(x) = (exp(x) - exp(-x)) / 2.
- The greatest observed error is 2.57 ULP:
- _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2
- want 0x1.ab929fc64bd63p-2. */
+/* Approximation for SVE double-precision sinh(x) using FEXPA expm1.
+ Uses sinh(x) = e^2x - 1 / 2e^x, rewritten for accuracy.
+ The greatest observed error in the non-special region is 2.63 + 0.5 ULP:
+ _ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2
+ want 0x1.c3587faf97b09p-2
+
+ The greatest observed error in the special region is 2.65 + 0.5 ULP:
+ _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023
+ want 0x1.fffd30eea0063p+1023. */
svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacge (pg, x, d->special_bound);
svfloat64_t ax = svabs_x (pg, x);
svuint64_t sign
= sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
- svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound);
-
/* Fall back to scalar variant for all lanes if any are special. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, pg);
+ return special_case (pg, special, ax, halfsign, d);
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
- svfloat64_t t = expm1_inline (ax, pg);
+ svfloat64_t t = expm1_inline (pg, ax);
t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
return svmul_x (pg, t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c
index d25e011cea305094..805669845d09e098 100644
--- a/sysdeps/aarch64/fpu/tanh_sve.c
+++ b/sysdeps/aarch64/fpu/tanh_sve.c
@@ -18,83 +18,117 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
static const struct data
{
- float64_t poly[11];
- float64_t inv_ln2, ln2_hi, ln2_lo, shift;
- uint64_t thresh, tiny_bound;
+ double ln2_hi, ln2_lo;
+ double c2, c4;
+ double c0, c1, c3;
+ double two_over_ln2, shift;
+ uint64_t tiny_bound;
+ double large_bound, fexpa_bound;
+ uint64_t e2xm1_data[20];
} data = {
- /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
- 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
- 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .inv_ln2 = 0x1.71547652b82fep0,
- .ln2_hi = -0x1.62e42fefa39efp-1,
- .ln2_lo = -0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
-
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .two_over_ln2 = 0x1.71547652b82fep+1,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
.tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */
- /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
- .thresh = 0x01f241bf835f9d5f,
+ .large_bound = 0x1.30fc1931f09cap+4, /* arctanh(1 - 2^-54). */
+ .fexpa_bound = 0x1.a56ef8ec924ccp-4, /* 19/64 * ln2/2. */
+ /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
+ .e2xm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
+ },
};
+/* An expm1 inspired, FEXPA based helper function that returns an
+ accurate estimate for e^2x - 1. With no special case or support for
+ negative inputs of x. */
static inline svfloat64_t
-expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
-{
- /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
- the scalar variant of tanh. */
-
- /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- svfloat64_t j
- = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
- svint64_t i = svcvt_s64_x (pg, j);
- svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
- f = svmla_x (pg, f, j, d->ln2_lo);
-
- /* Approximate expm1(f) using polynomial. */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t p = svmla_x (
- pg, f, f2,
- sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
-
- /* t = 2 ^ i. */
- svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
- /* expm1(x) = p * t + (t - 1). */
- return svmla_x (pg, svsub_x (pg, t, 1), p, t);
-}
-
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+e2xm1_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
{
- return sv_call_f64 (tanh, x, y, special);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->two_over_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ /* r = x - n * ln2/2, r is in [-ln2/(2N), ln2/(2N)]. */
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t r = svadd_x (pg, x, x);
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+
+ /* We want to construct e2xm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA. */
+ svbool_t is_small = svaclt (pg, x, d->fexpa_bound);
+
+ /* Index via the input of FEXPA, but we only care about the lower 5 bits. */
+ svuint64_t base_idx = svand_x (pg, u, 0x1f);
+
+ /* Compute scale - 1 from FEXPA, and lookup values where this fails. */
+ svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
+ svuint64_t scalem1_lookup
+ = svld1_gather_index (is_small, d->e2xm1_data, base_idx);
+
+ /* Select the appropriate scale - 1 value based on x. */
+ svfloat64_t scalem1
+ = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
+ return svmla_x (pg, scalem1, scale, p);
}
-/* SVE approximation for double-precision tanh(x), using a simplified
- version of expm1. The greatest observed error is 2.77 ULP:
- _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
- want -0x1.bd6a21a163624p-3. */
+/* SVE approximation for double-precision tanh(x), using a modified version of
+ FEXPA expm1 to calculate e^2x - 1.
+ The greatest observed error is 2.79 + 0.5 ULP:
+ _ZGVsMxv_tanh (0x1.fff868eb3c223p-9) got 0x1.fff7be486cae6p-9
+ want 0x1.fff7be486cae9p-9. */
svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
+ svbool_t large = svacge (pg, x, d->large_bound);
- /* Trigger special-cases for tiny, boring and infinity/NaN. */
- svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh);
+ /* We can use tanh(x) = (e^2x - 1) / (e^2x + 1) to approximate tanh.
+ As an additional optimisation, we can ensure more accurate values of e^x
+ by only using positive inputs. So we calculate tanh(|x|), and restore the
+ sign of the input before returning. */
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t sign_bit
+ = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
- svfloat64_t u = svadd_x (pg, x, x);
+ svfloat64_t p = e2xm1_inline (pg, ax, d);
+ svfloat64_t q = svadd_x (pg, p, 2);
- /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- svfloat64_t q = expm1_inline (u, pg, d);
- svfloat64_t qp2 = svadd_x (pg, q, 2);
+ /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly
+ rounded, at this point we can return 1 directly, with sign correction.
+ This will also act as a guard against our approximation overflowing. */
+ svfloat64_t y = svsel (large, sv_f64 (1.0), svdiv_x (pg, p, q));
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svdiv_x (pg, q, qp2), special);
- return svdiv_x (pg, q, qp2);
+ return svreinterpret_f64 (svorr_x (pg, sign_bit, svreinterpret_u64 (y)));
}
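
For reference, the identities driving the hyperbolic rework can be illustrated with a minimal scalar sketch (plain C; cosh_sketch and its use of exp/fabs are hypothetical stand-ins — the patched helper bakes the division by two into FEXPA's shift constant, and sinh/tanh use the analogous expm1-based forms, with a small scale-1 table where FEXPA's scale is too close to 1 for the subtraction to be accurate):

#include <math.h>

/* Hypothetical scalar model of the vectorised cosh above.  */
static double
cosh_sketch (double x)
{
  double ax = fabs (x);
  if (ax < 0x1.62e42fefa39efp+9) /* exp (ax) stays finite.  */
    {
      double half_exp = 0.5 * exp (ax);  /* exp(|x|) / 2.  */
      return half_exp + 0.25 / half_exp; /* + 1 / (2 exp(|x|)).  */
    }
  /* Past the bound, halve the input: exp(x) = exp(x/2)^2, which
     doubles the usable range and yields inf beyond it.  */
  double half = exp (0.5 * ax);
  return (0.5 * half) * half; /* exp(|x|) / 2, or inf.  */
}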

673
glibc-RHEL-118273-4.patch Normal file

@ -0,0 +1,673 @@
commit 81406ea3c5b5ad19e307302c13dd642785b47948
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:41 2024 +0000
aarch64/fpu: Add vector variants of asinh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 2e5bbb5a07f4c9b0..d474f2969dd05c26 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,6 +1,7 @@
libmvec-supported-funcs = acos \
acosh \
asin \
+ asinh \
atan \
atan2 \
cos \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 60e1cdeacec3f77e..08ea15efaec959fb 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -84,6 +84,11 @@ libmvec {
_ZGVnN4v_acoshf;
_ZGVsMxv_acosh;
_ZGVsMxv_acoshf;
+ _ZGVnN2v_asinh;
+ _ZGVnN2v_asinhf;
+ _ZGVnN4v_asinhf;
+ _ZGVsMxv_asinh;
+ _ZGVsMxv_asinhf;
_ZGVnN2v_cosh;
_ZGVnN2v_coshf;
_ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 22fec4de77395e60..1e80721c9f73ba12 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -20,6 +20,7 @@
libmvec_hidden_proto (V_NAME_F1(acos));
libmvec_hidden_proto (V_NAME_F1(acosh));
libmvec_hidden_proto (V_NAME_F1(asin));
+libmvec_hidden_proto (V_NAME_F1(asinh));
libmvec_hidden_proto (V_NAME_F1(atan));
libmvec_hidden_proto (V_NAME_F1(cos));
libmvec_hidden_proto (V_NAME_F1(cosh));
diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
new file mode 100644
index 0000000000000000..544a52f6515d3201
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@@ -0,0 +1,171 @@
+/* Double-precision vector (Advanced SIMD) asinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+#define A(i) v_f64 (__v_log_data.poly[i])
+#define N (1 << V_LOG_TABLE_BITS)
+
+const static struct data
+{
+ float64x2_t poly[18];
+ uint64x2_t off, huge_bound, abs_mask;
+ float64x2_t ln2, tiny_bound;
+} data = {
+ .off = V2 (0x3fe6900900000000),
+ .ln2 = V2 (0x1.62e42fefa39efp-1),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .tiny_bound = V2 (0x1p-26),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ /* Even terms of polynomial s.t. asinh(x) is approximated by
+ asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
+ Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
+ .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4),
+ V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6),
+ V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6),
+ V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7),
+ V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7),
+ V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8),
+ V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9),
+ V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12),
+ V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) },
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (asinh, x, y, special);
+}
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ float64x2_t e0 = vld1q_f64 (
+ &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
+ float64x2_t e1 = vld1q_f64 (
+ &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
+ return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) };
+}
+
+static inline float64x2_t
+log_inline (float64x2_t x, const struct data *d)
+{
+ /* Double-precision vector log, copied from ordinary vector log with some
+ cosmetic modification and special-cases removed. */
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t tmp = vsubq_u64 (ix, d->off);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz
+ = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52)));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+ struct entry e = lookup (tmp);
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+ float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_f64 (A (2), A (3), r);
+ float64x2_t p = vfmaq_f64 (A (0), A (1), r);
+ y = vfmaq_f64 (y, A (4), r2);
+ y = vfmaq_f64 (p, y, r2);
+ y = vfmaq_f64 (hi, y, r2);
+ return y;
+}
+
+/* Double-precision implementation of vector asinh(x).
+ asinh is very sensitive around 1, so it is impractical to devise a single
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
+ Instead we use two different algorithms:
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
+ shared with the scalar routine. The greatest observed error 3.29 ULP, in
+ |x| >= 1:
+ __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
+ want 0x1.ffffcfd0e2352p-1. */
+VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
+
+ uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
+ uint64x2_t special = vcgeq_u64 (iax, d->huge_bound);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
+ special = vorrq_u64 (special, tiny);
+#endif
+
+ /* Option 1: |x| >= 1.
+ Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)).
+ If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
+ overflow, by setting special lanes to 1. These will be fixed later. */
+ float64x2_t option_1 = v_f64 (0);
+ if (__glibc_likely (v_any_u64 (gt1)))
+ {
+#if WANT_SIMD_EXCEPT
+ float64x2_t xm = v_zerofy_f64 (ax, special);
+#else
+ float64x2_t xm = ax;
+#endif
+ option_1 = log_inline (
+ vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
+ }
+
+ /* Option 2: |x| < 1.
+ Compute asinh(x) using a polynomial.
+ If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will
+ overflow, and tiny lanes, which will underflow, by setting them to 0. They
+ will be fixed later, either by selecting x or falling back to the scalar
+ special-case. The largest observed error in this region is 1.47 ULPs:
+ __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
+ want 0x1.c1d6bf874019cp-1. */
+ float64x2_t option_2 = v_f64 (0);
+ if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1))))
+ {
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
+#endif
+ float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2),
+ z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2),
+ z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8);
+ float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly);
+ option_2 = vfmaq_f64 (ax, p, x3);
+#if WANT_SIMD_EXCEPT
+ option_2 = vbslq_f64 (tiny, x, option_2);
+#endif
+ }
+
+ /* Choose the right option for each lane. */
+ float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
+ /* Copy sign. */
+ y = vbslq_f64 (d->abs_mask, y, x);
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, y, special);
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c
new file mode 100644
index 0000000000000000..28dc5c458750bac4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinh_sve.c
@@ -0,0 +1,150 @@
+/* Double-precision vector (SVE) asinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+#define SignMask (0x8000000000000000)
+#define One (0x3ff0000000000000)
+#define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). */
+
+static const struct data
+{
+ double poly[18];
+ double ln2, p3, p1, p4, p0, p2;
+ uint64_t n;
+ uint64_t off;
+
+} data = {
+ /* Polynomial generated using Remez on [2^-26, 1]. */
+ .poly
+ = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
+ 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
+ -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
+ 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
+ -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
+ 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 },
+ .ln2 = 0x1.62e42fefa39efp-1,
+ .p0 = -0x1.ffffffffffff7p-2,
+ .p1 = 0x1.55555555170d4p-2,
+ .p2 = -0x1.0000000399c27p-2,
+ .p3 = 0x1.999b2e90e94cap-3,
+ .p4 = -0x1.554e550bd501ep-3,
+ .n = 1 << V_LOG_TABLE_BITS,
+ .off = 0x3fe6900900000000
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (asinh, x, y, special);
+}
+
+static inline svfloat64_t
+__sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
+{
+ /* Double-precision SVE log, copied from SVE log implementation with some
+ cosmetic modification and special-cases removed. See that file for details
+ of the algorithm used. */
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t tmp = svsub_x (pg, ix, d->off);
+ svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)),
+ (d->n - 1) << 1);
+ svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+ svfloat64_t z = svreinterpret_f64 (iz);
+
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+ svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
+
+ svfloat64_t ln2_p3 = svld1rq (svptrue_b64 (), &d->ln2);
+ svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1);
+
+ svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
+ svfloat64_t kd = svcvt_f64_x (pg, k);
+
+ svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0);
+ svfloat64_t r2 = svmul_x (pg, r, r);
+
+ svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1);
+
+ svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0);
+ y = svmla_lane (y, r2, p1_p4, 1);
+ y = svmla_x (pg, p, r2, y);
+ y = svmla_x (pg, hi, r2, y);
+ return y;
+}
+
+/* Double-precision implementation of SVE asinh(x).
+ asinh is very sensitive around 1, so it is impractical to devise a single
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
+ Instead we use two different algorithms:
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
+ shared with the scalar routine. The greatest observed error 2.51 ULP, in
+ |x| >= 1:
+ _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1
+ want 0x1.e3181c43b0f39p-1. */
+svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t iax = svbic_x (pg, ix, SignMask);
+ svuint64_t sign = svand_x (pg, ix, SignMask);
+ svfloat64_t ax = svreinterpret_f64 (iax);
+
+ svbool_t ge1 = svcmpge (pg, iax, One);
+ svbool_t special = svcmpge (pg, iax, Thres);
+
+ /* Option 1: |x| >= 1.
+ Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). */
+ svfloat64_t option_1 = sv_f64 (0);
+ if (__glibc_likely (svptest_any (pg, ge1)))
+ {
+ svfloat64_t x2 = svmul_x (pg, ax, ax);
+ option_1 = __sv_log_inline (
+ svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg);
+ }
+
+ /* Option 2: |x| < 1.
+ Compute asinh(x) using a polynomial.
+ The largest observed error in this region is 1.51 ULPs:
+ _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
+ want 0x1.c1e649ee2681dp-1. */
+ svfloat64_t option_2 = sv_f64 (0);
+ if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1))))
+ {
+ svfloat64_t x2 = svmul_x (pg, ax, ax);
+ svfloat64_t x4 = svmul_x (pg, x2, x2);
+ svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly);
+ option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
+ }
+
+ /* Choose the right option for each lane. */
+ svfloat64_t y = svsel (ge1, option_1, option_2);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)),
+ special);
+ return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+}
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
new file mode 100644
index 0000000000000000..09fd8a614305563d
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
@@ -0,0 +1,80 @@
+/* Single-precision vector (Advanced SIMD) asinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "v_log1pf_inline.h"
+
+#define SignMask v_u32 (0x80000000)
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t big_bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (asinhf, x, y, special);
+}
+
+/* Single-precision implementation of vector asinh(x), using vector log1p.
+ Worst-case error is 2.66 ULP, at roughly +/-0.25:
+ __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
+VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+ uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
+ float32x4_t ax = vreinterpretq_f32_u32 (iax);
+ uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ float32x4_t special_arg = x;
+
+#if WANT_SIMD_EXCEPT
+ /* Sidestep tiny and large values to avoid inadvertently triggering
+ under/overflow. */
+ special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
+ if (__glibc_unlikely (v_any_u32 (special)))
+ {
+ ax = v_zerofy_f32 (ax, special);
+ x = v_zerofy_f32 (x, special);
+ }
+#endif
+
+ /* asinh(x) = log(x + sqrt(x * x + 1)).
+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
+ float32x4_t d
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
+ float32x4_t y = log1pf_inline (
+ vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
+
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
+ return vbslq_f32 (SignMask, x, y);
+}
+libmvec_hidden_def (V_NAME_F1 (asinh))
+HALF_WIDTH_ALIAS_F1 (asinh)
diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c
new file mode 100644
index 0000000000000000..d85c3a685c0b83ff
--- /dev/null
+++ b/sysdeps/aarch64/fpu/asinhf_sve.c
@@ -0,0 +1,56 @@
+/* Single-precision vector (SVE) asinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "sv_log1pf_inline.h"
+
+#define BigBound (0x5f800000) /* asuint(0x1p64). */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (asinhf, x, y, special);
+}
+
+/* Single-precision SVE asinh(x) routine. Implements the same algorithm as
+ vector asinhf and log1p.
+
+ Maximum error is 2.48 ULPs:
+ SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
+ want 0x1.ffbbb8p-4. */
+svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
+{
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+ svbool_t special = svcmpge (pg, iax, BigBound);
+
+ /* asinh(x) = log(x + sqrt(x * x + 1)).
+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
+ svfloat32_t ax2 = svmul_x (pg, ax, ax);
+ svfloat32_t d = svadd_x (pg, svsqrt_x (pg, svadd_x (pg, ax2, 1.0f)), 1.0f);
+ svfloat32_t y
+ = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))),
+ special);
+ return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y)));
+}
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 841330956c102ff1..eb2af35b27757fc6 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -41,6 +41,10 @@
# define __DECL_SIMD_asin __DECL_SIMD_aarch64
# undef __DECL_SIMD_asinf
# define __DECL_SIMD_asinf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_asinh
+# define __DECL_SIMD_asinh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_asinhf
+# define __DECL_SIMD_asinhf __DECL_SIMD_aarch64
# undef __DECL_SIMD_atan
# define __DECL_SIMD_atan __DECL_SIMD_aarch64
# undef __DECL_SIMD_atanf
@@ -131,6 +135,7 @@ __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
@@ -150,6 +155,7 @@ __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
@@ -174,6 +180,7 @@ __sv_f32_t _ZGVsMxvv_atan2f (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_acoshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
@@ -193,6 +200,7 @@ __sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index f4ce1d70096888aa..3d7177c32dcd77a6 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -26,6 +26,7 @@
VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos)
VPCS_VECTOR_WRAPPER (acosh_advsimd, _ZGVnN2v_acosh)
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
+VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 0e973cc9d7ade813..b88a2afe5c1198c0 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -45,6 +45,7 @@
SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos)
SVE_VECTOR_WRAPPER (acosh_sve, _ZGVsMxv_acosh)
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
+SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 0ce026b5ea96a064..533655402d3f3737 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -26,6 +26,7 @@
VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf)
VPCS_VECTOR_WRAPPER (acoshf_advsimd, _ZGVnN4v_acoshf)
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
+VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 398b7373e800cd5b..f7b673e3358e7d82 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -45,6 +45,7 @@
SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf)
SVE_VECTOR_WRAPPER (acoshf_sve, _ZGVsMxv_acoshf)
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
+SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 1646cdbdd22d93d9..b916e422432014c2 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -90,11 +90,19 @@ double: 2
float: 2
ldouble: 4
+Function: "asinh_advsimd":
+double: 1
+float: 2
+
Function: "asinh_downward":
double: 3
float: 3
ldouble: 4
+Function: "asinh_sve":
+double: 1
+float: 2
+
Function: "asinh_towardzero":
double: 2
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index f5aaa519f2c8663e..f288afdfdd9c8757 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -75,15 +75,20 @@ GLIBC_2.39 _ZGVsMxvv_atan2 F
GLIBC_2.39 _ZGVsMxvv_atan2f F
GLIBC_2.40 _ZGVnN2v_acosh F
GLIBC_2.40 _ZGVnN2v_acoshf F
+GLIBC_2.40 _ZGVnN2v_asinh F
+GLIBC_2.40 _ZGVnN2v_asinhf F
GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
GLIBC_2.40 _ZGVnN4v_acoshf F
+GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
GLIBC_2.40 _ZGVsMxv_acosh F
GLIBC_2.40 _ZGVsMxv_acoshf F
+GLIBC_2.40 _ZGVsMxv_asinh F
+GLIBC_2.40 _ZGVsMxv_asinhf F
GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F
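With the declarations and ABI entries above in place, the new symbols can be reached through compiler auto-vectorization or called directly. A minimal direct-call sketch (the prototype follows bits/math-vector.h above; the aarch64_vector_pcs attribute spelling and the -lmvec link step are assumptions about the toolchain setup):

    #include <arm_neon.h>
    #include <stdio.h>

    /* Vector PCS declaration matching the header above.  */
    __attribute__ ((aarch64_vector_pcs))
    float32x4_t _ZGVnN4v_asinhf (float32x4_t);

    int
    main (void)
    {
      float32x4_t x = vdupq_n_f32 (0.5f);
      float32x4_t y = _ZGVnN4v_asinhf (x); /* four asinhf (0.5f) results */
      printf ("%f\n", (double) vgetq_lane_f32 (y, 0));
      return 0;
    }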
521
glibc-RHEL-118273-40.patch Normal file
@@ -0,0 +1,521 @@
commit 1e3d1ddf977ecd653de8d0d10eb083d80ac21cf3
Author: Dylan Fleming <Dylan.Fleming@arm.com>
Date: Wed Jun 18 16:17:12 2025 +0000
AArch64: Optimize SVE exp functions
Improve performance of SVE exps by making better use
of the SVE FEXPA instruction.
Performance improvement on Neoverse V1:
exp2_sve: 21%
exp2f_sve: 24%
exp10f_sve: 23%
expm1_sve: 25%
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c
index 8aa3fa9c4335cfb8..0a4c26450601a1db 100644
--- a/sysdeps/aarch64/fpu/exp10f_sve.c
+++ b/sysdeps/aarch64/fpu/exp10f_sve.c
@@ -19,26 +19,19 @@
#include "sv_math.h"
-/* For x < -Thres, the result is subnormal and not handled correctly by
- FEXPA. */
-#define Thres 37.9
+/* For x < -Thres (-log10(2^126)), the result is subnormal and not handled
+ correctly by FEXPA. */
+#define Thres 0x1.2f702p+5
static const struct data
{
- float log2_10_lo, c0, c2, c4;
- float c1, c3, log10_2;
- float shift, log2_10_hi, thres;
+ float log10_2, log2_10_hi, log2_10_lo, c1;
+ float c0, shift, thres;
} data = {
/* Coefficients generated using Remez algorithm with minimisation of relative
- error.
- rel error: 0x1.89dafa3p-24
- abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
- maxerr: 0.52 +0.5 ulp. */
- .c0 = 0x1.26bb16p+1f,
- .c1 = 0x1.5350d2p+1f,
- .c2 = 0x1.04744ap+1f,
- .c3 = 0x1.2d8176p+0f,
- .c4 = 0x1.12b41ap-1f,
+ error. */
+ .c0 = 0x1.26bb62p1,
+ .c1 = 0x1.53524cp1,
/* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
.shift = 0x1.803f8p17f,
.log10_2 = 0x1.a934fp+1,
@@ -53,28 +46,23 @@ sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
/* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
-
- svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log10_2);
/* n = round(x/(log10(2)/N)). */
svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
- svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);
+ svfloat32_t z = svmla_lane (shift, x, lane_consts, 0);
+ svfloat32_t n = svsub_x (pg, z, shift);
/* r = x - n*log10(2)/N. */
- svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
- r = svmls_lane (r, n, lane_consts, 0);
+ svfloat32_t r = x;
+ r = svmls_lane (r, n, lane_consts, 1);
+ r = svmls_lane (r, n, lane_consts, 2);
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
- svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
+ svfloat32_t poly = svmla_lane (sv_f32 (d->c0), r, lane_consts, 3);
+ poly = svmul_x (pg, poly, r);
return svmla_x (pg, scale, scale, poly);
}
@@ -85,11 +73,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d)
special);
}
-/* Single-precision SVE exp10f routine. Implements the same algorithm
- as AdvSIMD exp10f.
- Worst case error is 1.02 ULPs.
- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
- want 0x1.ba5f9cp-1. */
+/* Single-precision SVE exp10f routine. Based on the FEXPA instruction.
+ Worst case error is 1.10 ULP.
+ _ZGVsMxv_exp10f (0x1.cc76dep+3) got 0x1.be0172p+47
+ want 0x1.be017p+47. */
svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
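The shift constants in these routines (0x1.8p52 for plain FP64 reduction, 0x1.803f8p17f and 0x1.800000000ffc0p+46 for the FEXPA variants, which fold in an extra bias) all rely on the same rounding trick: adding a large power-of-two constant forces round(x) into the low mantissa bits. A scalar sketch of the basic idea, assuming round-to-nearest mode:

    /* z = x + 1.5*2^52 leaves round(x) in the low bits of z's mantissa,
       so z - Shift recovers the nearest integer without a convert.
       Valid for |x| well below 2^51; must not be compiled with
       -ffast-math, which could fold the two operations away.  */
    static double
    round_via_shift (double x)
    {
      const double Shift = 0x1.8p52;
      double z = x + Shift;
      return z - Shift;
    }

The FEXPA-based routines additionally reinterpret z as an integer so its low bits can feed the instruction directly.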
diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c
index 5dfb77cdbc2f6a51..ed11423e45059133 100644
--- a/sysdeps/aarch64/fpu/exp2_sve.c
+++ b/sysdeps/aarch64/fpu/exp2_sve.c
@@ -19,23 +19,21 @@
#include "sv_math.h"
-#define N (1 << V_EXP_TABLE_BITS)
-
#define BigBound 1022
#define UOFlowBound 1280
static const struct data
{
- double c0, c2;
- double c1, c3;
+ double c2, c4;
+ double c0, c1, c3;
double shift, big_bound, uoflow_bound;
} data = {
/* Coefficients are computed using Remez algorithm with
minimisation of the absolute error. */
- .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3,
- .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7,
- .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound,
- .big_bound = BigBound,
+ .c0 = 0x1.62e42fefa39efp-1, .c1 = 0x1.ebfbdff82a31bp-3,
+ .c2 = 0x1.c6b08d706c8a5p-5, .c3 = 0x1.3b2ad2ff7d2f3p-7,
+ .c4 = 0x1.5d8761184beb3p-10, .shift = 0x1.800000000ffc0p+46,
+ .uoflow_bound = UOFlowBound, .big_bound = BigBound,
};
#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
@@ -64,50 +62,52 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
/* |n| > 1280 => 2^(n) overflows. */
- svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
+ svbool_t p_cmp = svacle (pg, n, d->uoflow_bound);
svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
svfloat64_t r2 = svmla_x (pg, s2, s2, y);
svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
- return svsel (p_cmp, r1, r0);
+ return svsel (p_cmp, r0, r1);
}
/* Fast vector implementation of exp2.
- Maximum measured error is 1.65 ulp.
- _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
- want 0x1.f8db0d4df721dp-1. */
+ Maximum measured error is 0.52 + 0.5 ulp.
+ _ZGVsMxv_exp2 (0x1.3b72ad5b701bfp-1) got 0x1.8861641b49e08p+0
+ want 0x1.8861641b49e07p+0. */
svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svbool_t no_big_scale = svacle (pg, x, d->big_bound);
- svbool_t special = svnot_z (pg, no_big_scale);
-
- /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */
- svfloat64_t shift = sv_f64 (d->shift);
- svfloat64_t kd = svadd_x (pg, x, shift);
- svuint64_t ki = svreinterpret_u64 (kd);
- /* kd = k/N. */
- kd = svsub_x (pg, kd, shift);
- svfloat64_t r = svsub_x (pg, x, kd);
-
- /* scale ~= 2^(k/N). */
- svuint64_t idx = svand_x (pg, ki, N - 1);
- svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx);
- /* This is only a valid scale when -1023*N < k < 1024*N. */
- svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
- svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
-
- svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
- /* Approximate exp2(r) using polynomial. */
- /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+ svbool_t special = svacge (pg, x, d->big_bound);
+
+ svfloat64_t z = svadd_x (svptrue_b64 (), x, d->shift);
+ svfloat64_t n = svsub_x (svptrue_b64 (), z, d->shift);
+ svfloat64_t r = svsub_x (svptrue_b64 (), x, n);
+
+ svfloat64_t scale = svexpa (svreinterpret_u64 (z));
+
svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
- svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
- svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
- svfloat64_t p = svmla_x (pg, p01, p23, r2);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ /* Approximate exp2(r) using polynomial. */
+ /* y = exp2(r) - 1 ~= r * (C0 + C1 r + C2 r^2 + C3 r^3 + C4 r^4). */
+ svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ svfloat64_t p = svmla_x (pg, p12, p34, r2);
+ p = svmad_x (pg, p, r, d->c0);
svfloat64_t y = svmul_x (svptrue_b64 (), r, p);
+
/* Assemble exp2(x) = exp2(r) * scale. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (pg, scale, y, kd, d);
+ {
+ /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+ special case function so needs to be copied.
+ e = sign bit of u << 46. */
+ svuint64_t e = svand_x (pg, svlsl_x (pg, svreinterpret_u64 (z), 46),
+ 0x8000000000000000);
+ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+ return special_case (pg, scale, y, n, d);
+ }
+
return svmla_x (pg, scale, scale, y);
}
diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
index c6216bed9e9e7538..cf01820288f1855c 100644
--- a/sysdeps/aarch64/fpu/exp2f_sve.c
+++ b/sysdeps/aarch64/fpu/exp2f_sve.c
@@ -18,21 +18,17 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f32.h"
#define Thres 0x1.5d5e2ap+6f
static const struct data
{
- float c0, c2, c4, c1, c3;
- float shift, thres;
+ float c0, c1, shift, thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant. */
- .c0 = 0x1.62e422p-1f,
- .c1 = 0x1.ebf9bcp-3f,
- .c2 = 0x1.c6bd32p-5f,
- .c3 = 0x1.3ce9e4p-7f,
- .c4 = 0x1.59977ap-10f,
+ /* Coefficients generated using Remez algorithm with minimisation of relative
+ error. */
+ .c0 = 0x1.62e485p-1,
+ .c1 = 0x1.ebfbe0p-3,
/* 1.5*2^17 + 127. */
.shift = 0x1.803f8p17f,
/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
@@ -51,16 +47,8 @@ sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
- /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
- Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for
- coefficients 1 to 4, and apply most significant coefficient directly. */
- svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
- svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
- svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
- svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
- svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
- svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+ svfloat32_t poly = svmla_x (pg, sv_f32 (d->c0), r, sv_f32 (d->c1));
+ poly = svmul_x (svptrue_b32 (), poly, r);
return svmla_x (pg, scale, scale, poly);
}
@@ -72,11 +60,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d)
special);
}
-/* Single-precision SVE exp2f routine. Implements the same algorithm
- as AdvSIMD exp2f.
- Worst case error is 1.04 ULPs.
- _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
- want 0x1.ba6a64p-1. */
+/* Single-precision SVE exp2f routine, based on the FEXPA instruction.
+ Worst case error is 1.09 ULPs.
+ _ZGVsMxv_exp2f (0x1.9a2a94p-1) got 0x1.be1054p+0
+ want 0x1.be1052p+0. */
svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c
index c933cf9c0eb2406b..4c35e0341d34aee0 100644
--- a/sysdeps/aarch64/fpu/expm1_sve.c
+++ b/sysdeps/aarch64/fpu/expm1_sve.c
@@ -18,82 +18,164 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f64.h"
-#define SpecialBound 0x1.62b7d369a5aa9p+9
-#define ExponentBias 0x3ff0000000000000
+#define FexpaBound 0x1.4cb5ecef28adap-3 /* 15*ln2/64. */
+#define SpecialBound 0x1.628c2855bfaddp+9 /* ln(2^(1023 + 1/128)). */
static const struct data
{
- double poly[11];
- double shift, inv_ln2, special_bound;
- /* To be loaded in one quad-word. */
+ double c2, c4;
+ double inv_ln2;
double ln2_hi, ln2_lo;
+ double c0, c1, c3;
+ double shift, thres;
+ uint64_t expm1_data[32];
} data = {
- /* Generated using fpminimax. */
- .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
- 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
- 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
- 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
- .special_bound = SpecialBound,
- .inv_ln2 = 0x1.71547652b82fep0,
- .ln2_hi = 0x1.62e42fefa39efp-1,
- .ln2_lo = 0x1.abc9e3b39803fp-56,
- .shift = 0x1.8p52,
+ /* Table emulating FEXPA - 1, for values of FEXPA close to 1.
+ The table holds values of 2^(i/64) - 1, computed in arbitrary precision.
+ The first half of the table stores values associated to i from 0 to 15.
+ The second half of the table stores values associated to i from 0 to -15. */
+ .expm1_data = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+ 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+ 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+ 0x0000000000000000, 0xbfc331751ec3a814, 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7,
+ 0xbfbf332113d56b1f, 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424,
+ 0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a, 0xbfaafd11874c009e,
+ 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89, 0xbf95f134923757f3, 0xbf860f9f985bc9f4,
+ },
+
+ /* Generated using Remez, in [-log(2)/128, log(2)/128]. */
+ .c0 = 0x1p-1,
+ .c1 = 0x1.55555555548f9p-3,
+ .c2 = 0x1.5555555554c22p-5,
+ .c3 = 0x1.111123aaa2fb2p-7,
+ .c4 = 0x1.6c16d77d98e5bp-10,
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
+ .thres = SpecialBound,
};
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t pg)
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+static NOINLINE svfloat64_t
+special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p,
+ svfloat64_t n)
{
- return sv_call_f64 (expm1, x, y, pg);
+ /* s=2^n may overflow, break it up into s=s1*s2,
+ such that exp = s + s*y can be computed as s1*(s2+s2*y)
+ and s1*s1 overflows only if n>0. */
+
+ /* If n<=0 then set b to 0x6, 0 otherwise. */
+ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+ svuint64_t b
+ = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
+
+ /* Set s1 to generate overflow depending on sign of exponent n,
+ ie. s1 = 0x70...0 - b. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold.
+ ie. s2 = as_u64 (s) - 0x3010...0 + b. */
+ svfloat64_t s2 = svreinterpret_f64 (
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+ /* |n| > 1280 => 2^(n) overflows. */
+ svbool_t p_cmp = svacgt (pg, n, 1280.0);
+
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
+ svfloat64_t r2 = svmla_x (pg, s2, s2, p);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
+
+ svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */
+ return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0));
}
-/* Double-precision vector exp(x) - 1 function.
- The maximum error observed error is 2.18 ULP:
- _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
- want 0x1.a8b9ea8d66e2p-2. */
+/* FEXPA based SVE expm1 algorithm.
+ Maximum measured error is 2.81 + 0.5 ULP:
+ _ZGVsMxv_expm1 (0x1.974060e619bfp-3) got 0x1.c290e5858bb53p-3
+ want 0x1.c290e5858bb5p-3. */
svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- /* Large, Nan/Inf. */
- svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
-
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- svfloat64_t shift = sv_f64 (d->shift);
- svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift);
- svint64_t i = svcvt_s64_x (pg, n);
- svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
- svfloat64_t f = svmls_lane (x, n, ln2, 0);
- f = svmls_lane (f, n, ln2, 1);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- svfloat64_t f2 = svmul_x (pg, f, f);
- svfloat64_t f4 = svmul_x (pg, f2, f2);
- svfloat64_t f8 = svmul_x (pg, f4, f4);
- svfloat64_t p
- = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias);
- svfloat64_t t = svreinterpret_f64 (u);
-
- /* expm1(x) ~= p * t + (t - 1). */
- svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t);
+ svbool_t special = svacgt (pg, x, d->thres);
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, y, special);
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+ /* r = x - n * ln2, r is in [-ln2/128, ln2/128]. */
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+ svfloat64_t p;
+ svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+ svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+ p = svmad_x (pg, c34, r2, c12);
+ p = svmad_x (pg, p, r, sv_f64 (d->c0));
+ p = svmad_x (pg, p, r2, r);
+
+ svfloat64_t scale = svexpa (u);
+ svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0));
+
+ /* We want to construct expm1(x) = (scale - 1) + scale * poly.
+ However, for values of scale close to 1, scale-1 causes large ULP errors
+ due to cancellation.
+
+ This can be circumvented by using a small lookup for scale-1
+ when our input is below a certain bound, otherwise we can use FEXPA.
+
+ This bound is based upon the table size:
+ Bound = (TableSize-1/64) * ln2.
+ The current bound is based upon a table size of 16. */
+ svbool_t is_small = svaclt (pg, x, FexpaBound);
+
+ if (svptest_any (pg, is_small))
+ {
+ /* Index via the input of FEXPA, but we only care about the lower 4 bits.
+ */
+ svuint64_t base_idx = svand_x (pg, u, 0xf);
+
+ /* We can use the sign of x as a fifth bit to account for the asymmetry
+ of e^x around 0. */
+ svuint64_t signBit
+ = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4);
+ svuint64_t idx = svorr_x (pg, base_idx, signBit);
+
+ /* Lookup values for scale - 1 for small x. */
+ svfloat64_t lookup = svreinterpret_f64 (
+ svld1_gather_index (is_small, d->expm1_data, idx));
+
+ /* Select the appropriate scale - 1 value based on x. */
+ scalem1 = svsel (is_small, lookup, scalem1);
+ }
+
+ svfloat64_t y = svmla_x (pg, scalem1, scale, p);
+
+ /* FEXPA returns nan for large inputs so we special case those. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ {
+ /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+ special case function so needs to be copied.
+ e = sign bit of u << 46. */
+ svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+ /* Copy sign to s. */
+ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+ return special_case (pg, y, scale, p, n);
+ }
+
+ /* return expm1 = (scale - 1) + (scale * poly). */
return y;
}
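The lookup table above exists because scale - 1 cancels catastrophically when scale is close to 1. The effect is easy to reproduce in scalar code (an illustration of the problem, not the library algorithm):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = 0x1p-30;
      /* exp(x) is barely above 1.0, so subtracting 1.0 cancels most
         significant bits of the result ... */
      printf ("naive: %a\n", exp (x) - 1.0);
      /* ... while expm1 keeps full precision (~ x + x*x/2).  */
      printf ("expm1: %a\n", expm1 (x));
      return 0;
    }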
@@ -0,0 +1,49 @@
commit aac077645a645bba0d67f3250e82017c539d0f4b
Author: Pierre Blanchard <pierre.blanchard@arm.com>
Date: Wed Aug 20 17:41:50 2025 +0000
AArch64: Fix SVE powf routine [BZ #33299]
Fix a bug in predicate logic introduced in last change.
A slight performance improvement from relying on all true
predicates during conversion from single to double.
This fixes BZ #33299.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c
index 08d7019a1855ff3c..33bba96054cf4cc8 100644
--- a/sysdeps/aarch64/fpu/powf_sve.c
+++ b/sysdeps/aarch64/fpu/powf_sve.c
@@ -223,15 +223,15 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
const svbool_t ptrue = svptrue_b64 ();
/* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
- * in order to perform core computation in double precision. */
+ in order to perform core computation in double precision. */
const svbool_t pg_lo = svunpklo (pg);
const svbool_t pg_hi = svunpkhi (pg);
- svfloat64_t y_lo
- = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
- svfloat64_t y_hi
- = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
- svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
- svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
+ svfloat64_t y_lo = svcvt_f64_x (
+ ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+ svfloat64_t y_hi = svcvt_f64_x (
+ ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+ svfloat64_t z_lo = svcvt_f64_x (ptrue, svreinterpret_f32 (svunpklo (iz)));
+ svfloat64_t z_hi = svcvt_f64_x (ptrue, svreinterpret_f32 (svunpkhi (iz)));
svuint64_t i_lo = svunpklo (i);
svuint64_t i_hi = svunpkhi (i);
svint64_t k_lo = svunpklo (k);
@@ -312,7 +312,7 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
(23 - V_POWF_EXP2_TABLE_BITS));
/* Compute core in extended precision and return intermediate ylogx results
- * to handle cases of underflow and underflow in exp. */
+ to handle cases of underflow and overflow in exp. */
svfloat32_t ylogx;
svfloat32_t ret
= sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
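Schematically, the fixed conversion pattern widens each single-precision vector into two double-precision half-vectors; an all-true predicate can govern the converts because lanes outside pg are masked later by pg_lo/pg_hi. A sketch mirroring the hunk above (requires an SVE-enabled target):

    #include <arm_sve.h>

    static void
    widen_f32 (svfloat32_t v, svfloat64_t *lo, svfloat64_t *hi)
    {
      const svbool_t all = svptrue_b64 ();
      /* svunpklo/svunpkhi spread the 32-bit lanes across 64-bit
         containers; the converts then run unpredicated.  */
      *lo = svcvt_f64_x (all,
                         svreinterpret_f32 (svunpklo (svreinterpret_u32 (v))));
      *hi = svcvt_f64_x (all,
                         svreinterpret_f32 (svunpkhi (svreinterpret_u32 (v))));
    }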
174
glibc-RHEL-118273-42.patch Normal file
@@ -0,0 +1,174 @@
commit e20ca759af46fbb7eae20c52b857e7636eb50e1b
Author: remph <lhr@disroot.org>
Date: Thu Sep 4 12:53:56 2025 +0000
AArch64: add optimised strspn/strcspn
Requires Neon (aka. Advanced SIMD). Looks up 16 characters at a time,
for a 2-3x performance improvement, and a ~30% speedup on the strtok &
strsep benchtests, as tested on Cortex A-{53,72}.
Signed-off-by: remph <lhr@disroot.org>
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/strcspn.S b/sysdeps/aarch64/strcspn.S
new file mode 100644
index 0000000000000000..f2a69e9856cba04c
--- /dev/null
+++ b/sysdeps/aarch64/strcspn.S
@@ -0,0 +1,2 @@
+#define USE_AS_STRCSPN 1
+#include "strspn.S"
diff --git a/sysdeps/aarch64/strspn.S b/sysdeps/aarch64/strspn.S
new file mode 100644
index 0000000000000000..edbb705b15991e39
--- /dev/null
+++ b/sysdeps/aarch64/strspn.S
@@ -0,0 +1,146 @@
+/* Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STRCSPN
+# define STRSPN strcspn
+# define SBT orr /* SBT -- `set bit' */
+#else
+# define STRSPN strspn
+# define SBT bic
+#endif
+
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+# define LS_BK lsr
+#else
+# define LS_FW lsr
+# define LS_BK lsl
+#endif
+
+#define og_s x0
+#define set x1 /* ACCEPT for strspn, REJECT for strcspn */
+
+#define byte_i x3
+#define bits_i x4
+#define one x6
+
+#define syndrome x5
+#define s x6
+
+#define vbyte_i v1.16b
+#define vbits_i v2.16b
+#define table v4.16b-v5.16b
+#define table_a v4
+#define table_b v5
+#define sevens v7.16b
+
+ENTRY(STRSPN)
+ ldrb w2, [set]
+ cbz w2, L(early)
+#ifdef USE_AS_STRCSPN
+ ldrb w3, [set, 1]
+ cbz w3, L(early)
+#endif
+
+ /* Table has ones for bytes to reject and zeros for bytes to accept */
+ mov one, 1
+#ifdef USE_AS_STRCSPN
+ stp one, xzr, [sp, -32]!
+ .cfi_def_cfa_offset 32
+ stp xzr, xzr, [sp, 16]
+#else
+ mvni v0.4s, 0
+ stp q0, q0, [sp, -32]!
+ .cfi_def_cfa_offset 32
+#endif
+
+ .p2align 4
+L(fill_table):
+ lsr byte_i, x2, 6 /* x2 / 64 */
+ lsl bits_i, one, x2 /* x2 % 64 implicitly */
+ ldrb w2, [set, 1]!
+ ldr x5, [sp, byte_i, lsl 3]
+ SBT x5, x5, bits_i
+ str x5, [sp, byte_i, lsl 3]
+ cbnz w2, L(fill_table)
+
+ ld1 {table_a.2d-table_b.2d}, [sp], 32
+ .cfi_def_cfa_offset 0
+ ubfiz syndrome, og_s, 2, 4 /* Bottom 4 bits, times 4 to count nibbles */
+ and s, og_s, -16 /* Round S down to 16-byte boundary */
+ movi sevens, 7
+ /* Bias the syndrome to mask off these nibbles */
+ mov x8, -1
+ LS_BK syndrome, x8, syndrome
+ mvn syndrome, syndrome
+
+L(loop):
+ ldr q0, [s], 16
+ ushr vbyte_i, v0.16b, 3
+ bic vbits_i, sevens, v0.16b
+ tbl v0.16b, {table}, vbyte_i
+ /* Bring the relevant bit to the MSB of each byte */
+ sshl v0.16b, v0.16b, vbits_i
+ /* Set every bit of each byte to its MSB */
+ cmlt v0.16b, v0.16b, 0
+ /* Bytes->nibbles */
+ shrn v0.8b, v0.8h, 4
+ fmov x2, d0
+ bic syndrome, x2, syndrome
+ cbz syndrome, L(loop)
+
+#ifndef __AARCH64EB__
+ rbit syndrome, syndrome
+#endif
+ sub s, s, 16
+ clz syndrome, syndrome
+ sub x0, s, og_s
+ add x0, x0, syndrome, lsr 2
+ ret
+
+ .balign 8 /* For strspn, which has only 2 instructions here */
+L(early):
+#ifdef USE_AS_STRCSPN
+ /* strlen(set) < 2: call strchrnul(s, *set) and get its offset from S */
+ stp fp, lr, [sp, -32]!
+ .cfi_def_cfa_offset 32
+ .cfi_offset fp, -32
+ .cfi_offset lr, -24
+ str x19, [sp, 16]
+ .cfi_offset 19, -16
+ mov w1, w2
+ mov fp, sp
+ mov x19, x0
+ bl __strchrnul
+ sub x0, x0, x19
+ ldr x19, [sp, 16]
+ ldp fp, lr, [sp], 32
+ .cfi_restore lr
+ .cfi_restore fp
+ .cfi_restore 19
+ .cfi_def_cfa_offset 0
+#else
+ mov w0, 0
+#endif
+ ret
+END(STRSPN)
+
+#undef set
+libc_hidden_def(STRSPN)
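The scalar shape of the bitmap technique used above is a 256-bit table with one bit per byte value; the assembly then replaces the byte-at-a-time probe with TBL/SSHL/CMLT over 16 bytes at once. A rough C reference (note it uses the opposite bit convention to the assembly, which marks bytes to reject):

    #include <stddef.h>

    static size_t
    strspn_ref (const char *s, const char *accept)
    {
      unsigned char bitmap[32] = { 0 }; /* one bit per byte value */
      for (; *accept != '\0'; accept++)
        {
          unsigned char c = (unsigned char) *accept;
          bitmap[c >> 3] |= (unsigned char) (1 << (c & 7));
        }
      /* The NUL bit is never set, so the scan stops at the terminator.  */
      size_t n = 0;
      while (bitmap[(unsigned char) s[n] >> 3]
             & (1 << ((unsigned char) s[n] & 7)))
        n++;
      return n;
    }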
@@ -0,0 +1,93 @@
commit aebaeb2c330482171340e966f7f33fac884a27f4
Author: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Thu Sep 18 14:24:47 2025 +0000
AArch64: Update math-vector-fortran.h
Update math-vector-fortran.h with the latest set of math functions
and sort by name.
Reviewed-by: Yury Khrustalev <yury.khrustalev@arm.com>
diff --git a/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h b/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
index 92e15f0d6a758258..161f43d20c51e252 100644
--- a/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
+++ b/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h
@@ -15,33 +15,74 @@
! You should have received a copy of the GNU Lesser General Public
! License along with the GNU C Library; if not, see
! <https://www.gnu.org/licenses/>.
+
!GCC$ builtin (acos) attributes simd (notinbranch)
!GCC$ builtin (acosf) attributes simd (notinbranch)
+!GCC$ builtin (acosh) attributes simd (notinbranch)
+!GCC$ builtin (acoshf) attributes simd (notinbranch)
+!GCC$ builtin (acospi) attributes simd (notinbranch)
+!GCC$ builtin (acospif) attributes simd (notinbranch)
!GCC$ builtin (asin) attributes simd (notinbranch)
!GCC$ builtin (asinf) attributes simd (notinbranch)
+!GCC$ builtin (asinh) attributes simd (notinbranch)
+!GCC$ builtin (asinhf) attributes simd (notinbranch)
+!GCC$ builtin (asinpi) attributes simd (notinbranch)
+!GCC$ builtin (asinpif) attributes simd (notinbranch)
!GCC$ builtin (atan) attributes simd (notinbranch)
-!GCC$ builtin (atanf) attributes simd (notinbranch)
!GCC$ builtin (atan2) attributes simd (notinbranch)
!GCC$ builtin (atan2f) attributes simd (notinbranch)
+!GCC$ builtin (atan2pi) attributes simd (notinbranch)
+!GCC$ builtin (atan2pif) attributes simd (notinbranch)
+!GCC$ builtin (atanf) attributes simd (notinbranch)
+!GCC$ builtin (atanh) attributes simd (notinbranch)
+!GCC$ builtin (atanhf) attributes simd (notinbranch)
+!GCC$ builtin (atanpi) attributes simd (notinbranch)
+!GCC$ builtin (atanpif) attributes simd (notinbranch)
+!GCC$ builtin (cbrt) attributes simd (notinbranch)
+!GCC$ builtin (cbrtf) attributes simd (notinbranch)
!GCC$ builtin (cos) attributes simd (notinbranch)
!GCC$ builtin (cosf) attributes simd (notinbranch)
+!GCC$ builtin (cosh) attributes simd (notinbranch)
+!GCC$ builtin (coshf) attributes simd (notinbranch)
+!GCC$ builtin (cospi) attributes simd (notinbranch)
+!GCC$ builtin (cospif) attributes simd (notinbranch)
+!GCC$ builtin (erf) attributes simd (notinbranch)
+!GCC$ builtin (erfc) attributes simd (notinbranch)
+!GCC$ builtin (erfcf) attributes simd (notinbranch)
+!GCC$ builtin (erff) attributes simd (notinbranch)
!GCC$ builtin (exp) attributes simd (notinbranch)
-!GCC$ builtin (expf) attributes simd (notinbranch)
!GCC$ builtin (exp10) attributes simd (notinbranch)
!GCC$ builtin (exp10f) attributes simd (notinbranch)
+!GCC$ builtin (exp10m1) attributes simd (notinbranch)
+!GCC$ builtin (exp10m1f) attributes simd (notinbranch)
!GCC$ builtin (exp2) attributes simd (notinbranch)
!GCC$ builtin (exp2f) attributes simd (notinbranch)
+!GCC$ builtin (exp2m1) attributes simd (notinbranch)
+!GCC$ builtin (exp2m1f) attributes simd (notinbranch)
+!GCC$ builtin (expf) attributes simd (notinbranch)
!GCC$ builtin (expm1) attributes simd (notinbranch)
!GCC$ builtin (expm1f) attributes simd (notinbranch)
+!GCC$ builtin (hypot) attributes simd (notinbranch)
+!GCC$ builtin (hypotf) attributes simd (notinbranch)
!GCC$ builtin (log) attributes simd (notinbranch)
-!GCC$ builtin (logf) attributes simd (notinbranch)
!GCC$ builtin (log10) attributes simd (notinbranch)
!GCC$ builtin (log10f) attributes simd (notinbranch)
!GCC$ builtin (log1p) attributes simd (notinbranch)
!GCC$ builtin (log1pf) attributes simd (notinbranch)
!GCC$ builtin (log2) attributes simd (notinbranch)
!GCC$ builtin (log2f) attributes simd (notinbranch)
+!GCC$ builtin (logf) attributes simd (notinbranch)
+!GCC$ builtin (pow) attributes simd (notinbranch)
+!GCC$ builtin (powf) attributes simd (notinbranch)
!GCC$ builtin (sin) attributes simd (notinbranch)
!GCC$ builtin (sinf) attributes simd (notinbranch)
+!GCC$ builtin (sinh) attributes simd (notinbranch)
+!GCC$ builtin (sinhf) attributes simd (notinbranch)
+!GCC$ builtin (sinpi) attributes simd (notinbranch)
+!GCC$ builtin (sinpif) attributes simd (notinbranch)
!GCC$ builtin (tan) attributes simd (notinbranch)
!GCC$ builtin (tanf) attributes simd (notinbranch)
+!GCC$ builtin (tanh) attributes simd (notinbranch)
+!GCC$ builtin (tanhf) attributes simd (notinbranch)
+!GCC$ builtin (tanpi) attributes simd (notinbranch)
+!GCC$ builtin (tanpif) attributes simd (notinbranch)
@@ -0,0 +1,97 @@
commit 6c22823da57aa5218f717f569c04c9573c0448c5
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu Nov 6 18:26:54 2025 +0000
AArch64: Fix instability in AdvSIMD tan
Previously, the presence of special cases in one lane could affect the
results in other lanes due to unconditional scalar fallback. The old
WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
been removed from AOR, making it easier to spot and fix this. 4%
improvement in throughput with GCC 14 on Neoverse V1. This bug is
present as far back as 2.39 (where tan was first introduced).
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index d56a102dd17a3463..c6a5a17126674d7d 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -25,9 +25,7 @@ static const struct data
float64x2_t poly[9];
double half_pi[2];
float64x2_t two_over_pi, shift;
-#if !WANT_SIMD_EXCEPT
float64x2_t range_val;
-#endif
} data = {
/* Coefficients generated using FPMinimax. */
.poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
@@ -38,20 +36,17 @@ static const struct data
.half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
.two_over_pi = V2 (0x1.45f306dc9c883p-1),
.shift = V2 (0x1.8p52),
-#if !WANT_SIMD_EXCEPT
.range_val = V2 (0x1p23),
-#endif
};
#define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */
#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */
-#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */
/* Special cases (fall back to scalar calls). */
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t n, float64x2_t d, uint64x2_t special)
{
- return v_call_f64 (tan, x, x, v_u64 (-1));
+ return v_call_f64 (tan, x, vdivq_f64 (n, d), special);
}
/* Vector approximation for double-precision tan.
@@ -65,14 +60,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
very large inputs. Fall back to scalar routine for all lanes if any are
too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
tiny input to avoid underflow. */
-#if WANT_SIMD_EXCEPT
- uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
- /* iax - tiny_bound > range_val - tiny_bound. */
- uint64x2_t special
- = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x);
-#endif
/* q = nearest integer to 2 * x / pi. */
float64x2_t q
@@ -81,9 +68,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
/* Use q to reduce x to r in [-pi/4, pi/4], by:
r = x - q * pi/2, in extended precision. */
- float64x2_t r = x;
float64x2_t half_pi = vld1q_f64 (dat->half_pi);
- r = vfmsq_laneq_f64 (r, q, half_pi, 0);
+ float64x2_t r = vfmsq_laneq_f64 (x, q, half_pi, 0);
r = vfmsq_laneq_f64 (r, q, half_pi, 1);
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
formula. */
@@ -114,12 +100,13 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
-#if !WANT_SIMD_EXCEPT
uint64x2_t special = vcageq_f64 (x, dat->range_val);
+ float64x2_t swap = vbslq_f64 (no_recip, n, vnegq_f64 (d));
+ d = vbslq_f64 (no_recip, d, n);
+ n = swap;
+
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x);
-#endif
+ return special_case (x, n, d, special);
- return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
- vbslq_f64 (no_recip, d, n));
+ return vdivq_f64 (n, d);
}
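The structural fix is visible in special_case: the scalar routine now replaces only the lanes flagged as special, while all other lanes keep the vector result. The lane-merging helper behaves roughly like this sketch (an approximation of what v_call_f64 does, not the exact glibc internals):

    #include <arm_neon.h>
    #include <stdint.h>

    static float64x2_t
    call_lanewise (double (*f) (double), float64x2_t x, float64x2_t y,
                   uint64x2_t special)
    {
      double xs[2], ys[2];
      uint64_t m[2];
      vst1q_f64 (xs, x);
      vst1q_f64 (ys, y);
      vst1q_u64 (m, special);
      for (int i = 0; i < 2; i++)
        if (m[i]) /* only special lanes take the scalar path */
          ys[i] = f (xs[i]);
      return vld1q_f64 (ys);
    }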
@@ -0,0 +1,88 @@
commit e45af510bc816e860c8e2e1d4a652b4fe15c4b34
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu Nov 6 18:29:33 2025 +0000
AArch64: Fix instability in AdvSIMD sinh
Previously, the presence of special cases in one lane could affect the
results in other lanes due to unconditional scalar fallback. The old
WANT_SIMD_EXCEPT option (which has never been enabled in libmvec) has
been removed from AOR, making it easier to spot and fix
this. No measured change in performance. This patch applies cleanly as
far back as 2.41, however there are conflicts with 2.40 where sinh was
first introduced.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
index 7adf771517de2507..66504cdee84ee77e 100644
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -24,36 +24,26 @@ static const struct data
{
struct v_expm1_data d;
uint64x2_t halff;
-#if WANT_SIMD_EXCEPT
- uint64x2_t tiny_bound, thresh;
-#else
float64x2_t large_bound;
-#endif
} data = {
.d = V_EXPM1_DATA,
.halff = V2 (0x3fe0000000000000),
-#if WANT_SIMD_EXCEPT
- /* 2^-26, below which sinh(x) rounds to x. */
- .tiny_bound = V2 (0x3e50000000000000),
- /* asuint(large_bound) - asuint(tiny_bound). */
- .thresh = V2 (0x0230000000000000),
-#else
/* 2^9. expm1 helper overflows for large input. */
.large_bound = V2 (0x1p+9),
-#endif
};
static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t t, float64x2_t halfsign,
+ uint64x2_t special)
{
- return v_call_f64 (sinh, x, x, v_u64 (-1));
+ return v_call_f64 (sinh, x, vmulq_f64 (t, halfsign), special);
}
/* Approximation for vector double-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
The greatest observed error is 2.52 ULP:
- _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2
- want -0x1.ac2f05bb66fc9p-2. */
+ _ZGVnN2v_sinh(0x1.9f6ff2ab6fb19p-2) got 0x1.aaed83a3153ccp-2
+ want 0x1.aaed83a3153c9p-2. */
float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -63,21 +53,16 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
float64x2_t halfsign = vreinterpretq_f64_u64 (
vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff));
-#if WANT_SIMD_EXCEPT
- uint64x2_t special = vcgeq_u64 (
- vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
-#else
uint64x2_t special = vcageq_f64 (x, d->large_bound);
-#endif
-
- /* Fall back to scalar variant for all lanes if any of them are special. */
- if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x);
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
float64x2_t t = expm1_inline (ax, &d->d);
t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, t, halfsign, special);
+
return vmulq_f64 (t, halfsign);
}
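The rearrangement in the code above follows from t = expm1(|x|): since e^|x| = 1 + t and e^-|x| = 1/(1 + t), the difference collapses to t + t/(t + 1). A scalar sketch of the same construction:

    #include <math.h>

    static double
    sinh_ref (double x)
    {
      /* sinh(x) = (e^x - e^-x) / 2 = (t + t/(t + 1)) / 2 with
         t = expm1 (|x|); accurate even for tiny |x|.  */
      double t = expm1 (fabs (x));
      double r = 0.5 * (t + t / (t + 1.0));
      return copysign (r, x);
    }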
475
glibc-RHEL-118273-5.patch Normal file
@@ -0,0 +1,475 @@
commit 8b679205286e7874f0b04187c0bc787632168aa2
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Wed Apr 3 12:13:53 2024 +0100
aarch64/fpu: Add vector variants of atanh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index d474f2969dd05c26..4c878e590681becc 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -3,6 +3,7 @@ libmvec-supported-funcs = acos \
asin \
asinh \
atan \
+ atanh \
atan2 \
cos \
cosh \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 08ea15efaec959fb..092949dc96d55624 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -89,6 +89,11 @@ libmvec {
_ZGVnN4v_asinhf;
_ZGVsMxv_asinh;
_ZGVsMxv_asinhf;
+ _ZGVnN2v_atanh;
+ _ZGVnN2v_atanhf;
+ _ZGVnN4v_atanhf;
+ _ZGVsMxv_atanh;
+ _ZGVsMxv_atanhf;
_ZGVnN2v_cosh;
_ZGVnN2v_coshf;
_ZGVnN4v_coshf;
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 1e80721c9f73ba12..afbb01e191b917a4 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -22,6 +22,7 @@ libmvec_hidden_proto (V_NAME_F1(acosh));
libmvec_hidden_proto (V_NAME_F1(asin));
libmvec_hidden_proto (V_NAME_F1(asinh));
libmvec_hidden_proto (V_NAME_F1(atan));
+libmvec_hidden_proto (V_NAME_F1(atanh));
libmvec_hidden_proto (V_NAME_F1(cos));
libmvec_hidden_proto (V_NAME_F1(cosh));
libmvec_hidden_proto (V_NAME_F1(erf));
diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c
new file mode 100644
index 0000000000000000..3c3d0bd6ad41396d
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanh_advsimd.c
@@ -0,0 +1,64 @@
+/* Double-precision vector (Advanced SIMD) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_V_LOG1P_K0_SHORTCUT 0
+#include "v_log1p_inline.h"
+
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one, half;
+} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .half = V2 (0x3fe0000000000000) };
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (atanh, x, y, special);
+}
+
+/* Approximation for vector double-precision atanh(x) using modified log1p.
+ The greatest observed error is 3.31 ULP:
+ _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+ want 0x1.ffd8ff31b501cp-6. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (ax);
+ uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
+ uint64x2_t special = vcgeq_u64 (ia, d->one);
+ float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
+
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, special);
+#endif
+
+ float64x2_t y;
+ y = vaddq_f64 (ax, ax);
+ y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
+ y = log1p_inline (y, &d->log1p_consts);
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, vmulq_f64 (y, halfsign), special);
+ return vmulq_f64 (y, halfsign);
+}
diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c
new file mode 100644
index 0000000000000000..7a52728d70f6d226
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanh_sve.c
@@ -0,0 +1,59 @@
+/* Double-precision vector (SVE) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WANT_SV_LOG1P_K0_SHORTCUT 0
+#include "sv_log1p_inline.h"
+
+#define One (0x3ff0000000000000)
+#define Half (0x3fe0000000000000)
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (atanh, x, y, special);
+}
+
+/* SVE approximation for double-precision atanh, based on log1p.
+ The greatest observed error is 2.81 ULP:
+ _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+ want 0x1.ffd8ff31b501cp-6. */
+svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
+{
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t iax = svreinterpret_u64 (ax);
+ svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+ svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
+
+ /* It is special if iax >= 1. */
+// svbool_t special = svcmpge (pg, iax, One);
+ svbool_t special = svacge (pg, x, 1.0);
+
+ /* Computation is performed based on the following sequence of equality:
+ (1+x)/(1-x) = 1 + 2x/(1-x). */
+ svfloat64_t y;
+ y = svadd_x (pg, ax, ax);
+ y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1), ax));
+ /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
+ y = sv_log1p_inline (y, pg);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, halfsign, y), special);
+ return svmul_x (pg, halfsign, y);
+}
diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c
new file mode 100644
index 0000000000000000..ae488f7b54ddce26
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c
@@ -0,0 +1,79 @@
+/* Single-precision vector (Advanced SIMD) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "v_log1pf_inline.h"
+
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .one = V4 (0x3f800000),
+#if WANT_SIMD_EXCEPT
+ /* 0x1p-12, below which atanhf(x) rounds to x. */
+ .tiny_bound = V4 (0x39800000),
+#endif
+};
+
+#define AbsMask v_u32 (0x7fffffff)
+#define Half v_u32 (0x3f000000)
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (atanhf, x, y, special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+ The maximum error is 3.08 ULP:
+ __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
+ want 0x1.ffcb82p-5. */
+VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special
+ = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound));
+ /* Side-step special cases by setting those lanes to 0, which will trigger no
+ exceptions. These will be fixed up later. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ ax = v_zerofy_f32 (ax, special);
+#else
+ uint32x4_t special = vcgeq_u32 (iax, d->one);
+#endif
+
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
+ y = log1pf_inline (y, d->log1pf_consts);
+
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, vmulq_f32 (halfsign, y), special);
+ return vmulq_f32 (halfsign, y);
+}
+libmvec_hidden_def (V_NAME_F1 (atanh))
+HALF_WIDTH_ALIAS_F1 (atanh)
diff --git a/sysdeps/aarch64/fpu/atanhf_sve.c b/sysdeps/aarch64/fpu/atanhf_sve.c
new file mode 100644
index 0000000000000000..dae83041ef7157f0
--- /dev/null
+++ b/sysdeps/aarch64/fpu/atanhf_sve.c
@@ -0,0 +1,54 @@
+/* Single-precision vector (SVE) atanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_log1pf_inline.h"
+
+#define One (0x3f800000)
+#define Half (0x3f000000)
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (atanhf, x, y, special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+ The maximum error is 2.28 ULP:
+ _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
+ want 0x1.ffbbb6p-5. */
+svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
+{
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+ svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, Half));
+ svbool_t special = svcmpge (pg, iax, One);
+
+ /* Computation is performed based on the following sequence of equality:
+ * (1+x)/(1-x) = 1 + 2x/(1-x). */
+ svfloat32_t y = svadd_x (pg, ax, ax);
+ y = svdiv_x (pg, y, svsub_x (pg, sv_f32 (1), ax));
+ /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
+ y = sv_log1pf_inline (y, pg);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, halfsign, y), special);
+
+ return svmul_x (pg, halfsign, y);
+}
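Both atanh variants rest on the algebra noted in their comments: (1 + x)/(1 - x) = 1 + 2x/(1 - x), hence atanh(x) = 0.5 * log1p(2x/(1 - x)). A scalar sketch for reference:

    #include <math.h>

    static float
    atanhf_ref (float x)
    {
      /* atanh(x) = 0.5 * ln ((1 + x)/(1 - x))
                  = 0.5 * log1p (2x / (1 - x)),  for |x| < 1.  */
      float ax = fabsf (x);
      float y = log1pf ((ax + ax) / (1.0f - ax));
      return copysignf (0.5f * y, x);
    }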
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index eb2af35b27757fc6..ab7a8f74548854b9 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -49,6 +49,10 @@
# define __DECL_SIMD_atan __DECL_SIMD_aarch64
# undef __DECL_SIMD_atanf
# define __DECL_SIMD_atanf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_atanh
+# define __DECL_SIMD_atanh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_atanhf
+# define __DECL_SIMD_atanhf __DECL_SIMD_aarch64
# undef __DECL_SIMD_atan2
# define __DECL_SIMD_atan2 __DECL_SIMD_aarch64
# undef __DECL_SIMD_atan2f
@@ -137,6 +141,7 @@ __vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
@@ -157,6 +162,7 @@ __vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
@@ -182,6 +188,7 @@ __sv_f32_t _ZGVsMxv_acoshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_atanhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t);
@@ -202,6 +209,7 @@ __sv_f64_t _ZGVsMxv_acosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_atanh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 3d7177c32dcd77a6..a01aa99c16740631 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (acosh_advsimd, _ZGVnN2v_acosh)
VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin)
VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh)
VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan)
+VPCS_VECTOR_WRAPPER (atanh_advsimd, _ZGVnN2v_atanh)
VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2)
VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index b88a2afe5c1198c0..83cb3ad5d0e4d056 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (acosh_sve, _ZGVsMxv_acosh)
SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin)
SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh)
SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan)
+SVE_VECTOR_WRAPPER (atanh_sve, _ZGVsMxv_atanh)
SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2)
SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 533655402d3f3737..831d4d755272d616 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (acoshf_advsimd, _ZGVnN4v_acoshf)
VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf)
VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf)
VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf)
+VPCS_VECTOR_WRAPPER (atanhf_advsimd, _ZGVnN4v_atanhf)
VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f)
VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index f7b673e3358e7d82..96fd612c3e76f6dc 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (acoshf_sve, _ZGVsMxv_acoshf)
SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf)
SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf)
SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf)
+SVE_VECTOR_WRAPPER (atanhf_sve, _ZGVsMxv_atanhf)
SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f)
SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index b916e422432014c2..7c2e43d3dc5bbc13 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -173,11 +173,19 @@ double: 2
float: 2
ldouble: 4
+Function: "atanh_advsimd":
+double: 1
+float: 1
+
Function: "atanh_downward":
double: 3
float: 3
ldouble: 4
+Function: "atanh_sve":
+double: 2
+float: 1
+
Function: "atanh_towardzero":
double: 2
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index f288afdfdd9c8757..ce42372a3a276832 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -77,18 +77,23 @@ GLIBC_2.40 _ZGVnN2v_acosh F
GLIBC_2.40 _ZGVnN2v_acoshf F
GLIBC_2.40 _ZGVnN2v_asinh F
GLIBC_2.40 _ZGVnN2v_asinhf F
+GLIBC_2.40 _ZGVnN2v_atanh F
+GLIBC_2.40 _ZGVnN2v_atanhf F
GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
+GLIBC_2.40 _ZGVnN4v_atanhf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
GLIBC_2.40 _ZGVsMxv_acosh F
GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_asinh F
GLIBC_2.40 _ZGVsMxv_asinhf F
+GLIBC_2.40 _ZGVsMxv_atanh F
+GLIBC_2.40 _ZGVsMxv_atanhf F
GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F

glibc-RHEL-118273-6.patch Normal file

@@ -0,0 +1,758 @@
commit eedbbca0bf3adf3c45aff6c4e128bae3a5562675
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Wed Apr 3 12:15:41 2024 +0100
aarch64/fpu: Add vector variants of sinh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 4c878e590681becc..fb5f3a365b27fdf3 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -17,6 +17,7 @@ libmvec-supported-funcs = acos \
log1p \
log2 \
sin \
+ sinh \
tan
float-advsimd-funcs = $(libmvec-supported-funcs)
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 092949dc96d55624..4774b3efeacf59fb 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -104,5 +104,10 @@ libmvec {
_ZGVnN4v_erff;
_ZGVsMxv_erf;
_ZGVsMxv_erff;
+ _ZGVnN2v_sinh;
+ _ZGVnN2v_sinhf;
+ _ZGVnN4v_sinhf;
+ _ZGVsMxv_sinh;
+ _ZGVsMxv_sinhf;
}
}
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index afbb01e191b917a4..7d9445d5c0c0c2a8 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -35,5 +35,6 @@ libmvec_hidden_proto (V_NAME_F1(log1p));
libmvec_hidden_proto (V_NAME_F1(log2));
libmvec_hidden_proto (V_NAME_F1(log));
libmvec_hidden_proto (V_NAME_F1(sin));
+libmvec_hidden_proto (V_NAME_F1(sinh));
libmvec_hidden_proto (V_NAME_F1(tan));
libmvec_hidden_proto (V_NAME_F2(atan2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index ab7a8f74548854b9..1e9b76cf41916365 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -105,6 +105,10 @@
# define __DECL_SIMD_sin __DECL_SIMD_aarch64
# undef __DECL_SIMD_sinf
# define __DECL_SIMD_sinf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_sinh
+# define __DECL_SIMD_sinh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_sinhf
+# define __DECL_SIMD_sinhf __DECL_SIMD_aarch64
# undef __DECL_SIMD_tan
# define __DECL_SIMD_tan __DECL_SIMD_aarch64
# undef __DECL_SIMD_tanf
@@ -154,6 +158,7 @@ __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
@@ -175,6 +180,7 @@ __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
# undef __ADVSIMD_VEC_MATH_SUPPORTED
@@ -201,6 +207,7 @@ __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
__sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
@@ -222,6 +229,7 @@ __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t);
# undef __SVE_VEC_MATH_SUPPORTED
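
These declarations are what let a vectorizing compiler route scalar math calls to libmvec; a sketch of typical use (the exact flags are toolchain-dependent and given here as an assumption):

#include <math.h>

/* With the __DECL_SIMD declarations above, GCC may replace the scalar
   sinh call in this loop with _ZGVnN2v_sinh (AdvSIMD) or _ZGVsMxv_sinh
   (SVE), e.g. when built with -O2 -ftree-loop-vectorize -fno-math-errno.  */
void
sinh_array (double *restrict y, const double *restrict x, int n)
{
  for (int i = 0; i < n; i++)
    y[i] = sinh (x[i]);
}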
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
new file mode 100644
index 0000000000000000..fa3723b10c15eb29
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -0,0 +1,121 @@
+/* Double-precision vector (Advanced SIMD) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+ float64x2_t poly[11];
+ float64x2_t inv_ln2, m_ln2, shift;
+ uint64x2_t halff;
+ int64x2_t onef;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t tiny_bound, thresh;
+#else
+ uint64x2_t large_bound;
+#endif
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+ V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+ V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+ V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+ V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep0),
+ .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
+ .shift = V2 (0x1.8p52),
+
+ .halff = V2 (0x3fe0000000000000),
+ .onef = V2 (0x3ff0000000000000),
+#if WANT_SIMD_EXCEPT
+ /* 2^-26, below which sinh(x) rounds to x. */
+ .tiny_bound = V2 (0x3e50000000000000),
+ /* asuint(large_bound) - asuint(tiny_bound). */
+ .thresh = V2 (0x0230000000000000),
+#else
+/* 2^9. expm1 helper overflows for large input. */
+ .large_bound = V2 (0x4080000000000000),
+#endif
+};
+
+static inline float64x2_t
+expm1_inline (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Reduce argument:
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where i = round(x / ln2)
+ and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
+ float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
+ int64x2_t i = vcvtq_s64_f64 (j);
+ float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0);
+ f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1);
+ /* Approximate expm1(f) using polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t f8 = vmulq_f64 (f4, f4);
+ float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
+ /* t = 2^i. */
+ float64x2_t t = vreinterpretq_f64_u64 (
+ vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef)));
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
+}
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x)
+{
+ return v_call_f64 (sinh, x, x, v_u64 (-1));
+}
+
+/* Approximation for vector double-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The greatest observed error is 2.57 ULP:
+ _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
+ want 0x1.ab34e59d678d9p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t sign
+ = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
+ float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff));
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t special = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
+#else
+ uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound);
+#endif
+
+ /* Fall back to scalar variant for all lanes if any of them are special. */
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x);
+
+ /* Up to the point that expm1 overflows, we can use it to calculate sinh
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ float64x2_t t = expm1_inline (ax);
+ t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+ return vmulq_f64 (t, halfsign);
+}
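
The "slight rearrangement" is worth spelling out: with t = expm1(|x|), e^|x| = t + 1 and e^-|x| = 1/(t + 1), so sinh(|x|) = (t + t/(t + 1))/2, which keeps accuracy for small |x| where exp(x) - exp(-x) would cancel. A scalar sketch (sinh_via_expm1 is hypothetical):

#include <math.h>

static double
sinh_via_expm1 (double x)
{
  double t = expm1 (fabs (x));
  /* (e^|x| - e^-|x|) / 2 == (t + t / (t + 1)) / 2.  */
  return copysign (0.5 * (t + t / (t + 1.0)), x);
}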
diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c
new file mode 100644
index 0000000000000000..df5f6c8c06e5b173
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinh_sve.c
@@ -0,0 +1,107 @@
+/* Double-precision vector (SVE) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ float64_t poly[11];
+ float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift;
+ uint64_t halff;
+ int64_t onef;
+ uint64_t large_bound;
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
+ 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
+ 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+ .inv_ln2 = 0x1.71547652b82fep0,
+ .m_ln2_hi = -0x1.62e42fefa39efp-1,
+ .m_ln2_lo = -0x1.abc9e3b39803fp-56,
+ .shift = 0x1.8p52,
+
+ .halff = 0x3fe0000000000000,
+ .onef = 0x3ff0000000000000,
+ /* 2^9. expm1 helper overflows for large input. */
+ .large_bound = 0x4080000000000000,
+};
+
+static inline svfloat64_t
+expm1_inline (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Reduce argument:
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where i = round(x / ln2)
+ and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
+ svfloat64_t j
+ = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
+ svint64_t i = svcvt_s64_x (pg, j);
+ svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi);
+ f = svmla_x (pg, f, j, d->m_ln2_lo);
+ /* Approximate expm1(f) using polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t f4 = svmul_x (pg, f2, f2);
+ svfloat64_t f8 = svmul_x (pg, f4, f4);
+ svfloat64_t p
+ = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
+ /* t = 2^i. */
+ svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return svmla_x (pg, svsub_x (pg, t, 1.0), p, t);
+}
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svbool_t pg)
+{
+ return sv_call_f64 (sinh, x, x, pg);
+}
+
+/* Approximation for SVE double-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The greatest observed error is 2.57 ULP:
+ _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2
+ want 0x1.ab929fc64bd63p-2. */
+svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t sign
+ = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
+ svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
+
+ svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound);
+
+ /* Fall back to scalar variant for all lanes if any are special. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, pg);
+
+ /* Up to the point that expm1 overflows, we can use it to calculate sinh
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ svfloat64_t t = expm1_inline (ax, pg);
+ t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+ return svmul_x (pg, t, halfsign);
+}
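
Note the one real divergence between the two variants: the scaling t = 2^i. SVE has svscale (the FSCALE instruction) for this, while the AdvSIMD version builds the value by shifting i into the exponent field and adding the bit pattern of 1.0. A scalar sketch of that bit trick (valid only while i + 1023 stays a normal exponent):

#include <stdint.h>
#include <string.h>

static double
pow2i (int64_t i)
{
  /* (i << 52) + bits(1.0) adds i to the biased exponent of 1.0.  */
  uint64_t bits = ((uint64_t) i << 52) + 0x3ff0000000000000ULL;
  double t;
  memcpy (&t, &bits, sizeof t);
  return t;
}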
diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c
new file mode 100644
index 0000000000000000..6bb7482dc28795c1
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
@@ -0,0 +1,88 @@
+/* Single-precision vector (Advanced SIMD) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+ uint32x4_t halff;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound, thresh;
+#else
+ uint32x4_t oflow_bound;
+#endif
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+ .halff = V4 (0x3f000000),
+#if WANT_SIMD_EXCEPT
+ /* 0x1.6a09e8p-32, below which expm1f underflows. */
+ .tiny_bound = V4 (0x2fb504f4),
+ /* asuint(oflow_bound) - asuint(tiny_bound). */
+ .thresh = V4 (0x12fbbbb3),
+#else
+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
+ .oflow_bound = V4 (0x42b0c0a7),
+#endif
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (sinhf, x, y, special);
+}
+
+/* Approximation for vector single-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The maximum error is 2.26 ULP:
+ _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+ want 0x1.e469e4p-4. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t sign = veorq_u32 (ix, iax);
+ float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+ ax = v_zerofy_f32 (ax, special);
+#else
+ uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+#endif
+
+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+ using a slight rearrangement of the definition of sinh. This allows us
+ to retain acceptable accuracy for very small inputs. */
+ float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
+ t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+
+ /* Fall back to the scalar variant for any lanes that should trigger an
+ exception. */
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (x, vmulq_f32 (t, halfsign), special);
+
+ return vmulq_f32 (t, halfsign);
+}
+libmvec_hidden_def (V_NAME_F1 (sinh))
+HALF_WIDTH_ALIAS_F1 (sinh)
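
The WANT_SIMD_EXCEPT path above folds both range checks into one compare. Since thresh = asuint(oflow_bound) - asuint(tiny_bound), the unsigned subtraction wraps tiny inputs around to huge values, so a single comparison flags tiny, huge and NaN lanes together. Scalar sketch (is_special is hypothetical):

#include <stdint.h>

/* True when ix < tiny or ix >= tiny + thresh (wraparound compare).  */
static int
is_special (uint32_t ix, uint32_t tiny, uint32_t thresh)
{
  return (uint32_t) (ix - tiny) >= thresh;
}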
diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c
new file mode 100644
index 0000000000000000..6c204b57a2aa18d3
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinhf_sve.c
@@ -0,0 +1,67 @@
+/* Single-precision vector (SVE) sinh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_expm1f_inline.h"
+#include "sv_math.h"
+
+static const struct data
+{
+ struct sv_expm1f_data expm1f_consts;
+ uint32_t halff, large_bound;
+} data = {
+ .expm1f_consts = SV_EXPM1F_DATA,
+ .halff = 0x3f000000,
+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
+ .large_bound = 0x42b0c0a7,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+{
+ return sv_call_f32 (sinhf, x, y, pg);
+}
+
+/* Approximation for SVE single-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The maximum error is 2.26 ULP:
+ _ZGVsMxv_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+ want 0x1.e469e4p-4. */
+svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t sign
+ = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+ svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
+
+ svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->large_bound);
+
+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ svfloat32_t t = expm1f_inline (ax, pg, &d->expm1f_consts);
+ t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+
+ /* Fall back to the scalar variant for any lanes which would cause
+ expm1f to overflow. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, t, halfsign), special);
+
+ return svmul_x (pg, t, halfsign);
+}
diff --git a/sysdeps/aarch64/fpu/sv_expm1f_inline.h b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
new file mode 100644
index 0000000000000000..5b7245122294e1b4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sv_expm1f_inline.h
@@ -0,0 +1,84 @@
+/* Single-precision inline helper for vector (SVE) expm1 function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_SV_EXPM1F_INLINE_H
+#define AARCH64_FPU_SV_EXPM1F_INLINE_H
+
+#include "sv_math.h"
+
+struct sv_expm1f_data
+{
+ /* These 4 are grouped together so they can be loaded as one quadword, then
+ used with _lane forms of svmla/svmls. */
+ float32_t c2, c4, ln2_hi, ln2_lo;
+ float32_t c0, c1, c3, inv_ln2, shift;
+};
+
+/* Coefficients generated using fpminimax. */
+#define SV_EXPM1F_DATA \
+ { \
+ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ \
+ .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
+ .ln2_lo = 0x1.7f7d1cp-20f, \
+ }
+
+#define C(i) sv_f32 (d->c##i)
+
+static inline svfloat32_t
+expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
+{
+ /* This vector is reliant on layout of data - it contains constants
+ that can be used with _lane forms of svmla/svmls. Values are:
+ [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */
+ svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
+ j = svsub_x (pg, j, d->shift);
+ svint32_t i = svcvt_s32_x (pg, j);
+
+ svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
+ f = svmls_lane (f, j, lane_constants, 3);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p = svmla_x (pg, p12, f2, p34);
+ p = svmla_x (pg, C (0), f, p);
+ p = svmla_x (pg, f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ svfloat32_t t = svscale_x (pg, sv_f32 (1), i);
+ return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
+
+#endif
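
The layout comment above is the whole point of the struct: svld1rq broadcasts one 128-bit quadword across the (possibly longer) SVE vector, so every 128-bit segment holds { c2, c4, ln2_hi, ln2_lo } and the _lane forms can pick any of the four without further loads. A minimal sketch of the pattern (fma_with_lane is hypothetical):

#include <arm_sve.h>

static svfloat32_t
fma_with_lane (svfloat32_t acc, svfloat32_t x, const float *four_consts)
{
  /* Replicate four packed constants into every 128-bit segment.  */
  svfloat32_t lanes = svld1rq (svptrue_b32 (), four_consts);
  /* acc + x * four_consts[0], with no extra scalar loads in the loop.  */
  return svmla_lane (acc, x, lanes, 0);
}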
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index a01aa99c16740631..1a57b22c3a92f1e1 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -42,4 +42,5 @@ VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
+VPCS_VECTOR_WRAPPER (sinh_advsimd, _ZGVnN2v_sinh)
VPCS_VECTOR_WRAPPER (tan_advsimd, _ZGVnN2v_tan)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 83cb3ad5d0e4d056..0c9858f6b74aaef6 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -61,4 +61,5 @@ SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
+SVE_VECTOR_WRAPPER (sinh_sve, _ZGVsMxv_sinh)
SVE_VECTOR_WRAPPER (tan_sve, _ZGVsMxv_tan)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 831d4d755272d616..4758490c6fc40fda 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -42,4 +42,5 @@ VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
+VPCS_VECTOR_WRAPPER (sinhf_advsimd, _ZGVnN4v_sinhf)
VPCS_VECTOR_WRAPPER (tanf_advsimd, _ZGVnN4v_tanf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 96fd612c3e76f6dc..7c04f07bbee84777 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -61,4 +61,5 @@ SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
+SVE_VECTOR_WRAPPER (sinhf_sve, _ZGVsMxv_sinhf)
SVE_VECTOR_WRAPPER (tanf_sve, _ZGVsMxv_tanf)
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
new file mode 100644
index 0000000000000000..337ccfbfab555c97
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -0,0 +1,73 @@
+/* Single-precision inline helper for vector (Advanced SIMD) expm1 function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_V_EXPM1F_INLINE_H
+#define AARCH64_FPU_V_EXPM1F_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+
+struct v_expm1f_data
+{
+ float32x4_t poly[5];
+ float32x4_t invln2_and_ln2, shift;
+ int32x4_t exponent_bias;
+};
+
+/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+ log(2)/2]. Exponent bias is asuint(1.0f).
+ invln2_and_ln2 stores constants: invln2, ln2_hi, ln2_lo, 0. */
+#define V_EXPM1F_DATA \
+ { \
+ .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
+ V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
+ .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
+ .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ }
+
+static inline float32x4_t
+expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
+{
+ /* Helper routine for calculating exp(x) - 1.
+ Copied from v_expm1f_1u6.c, with all special-case handling removed - the
+ calling routine should handle special values if required. */
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ float32x4_t j = vsubq_f32 (
+ vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+ int32x4_t i = vcvtq_s32_f32 (j);
+ float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
+ f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
+
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
+ Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
+ Horner. */
+ float32x4_t f2 = vmulq_f32 (f, f);
+ float32x4_t f4 = vmulq_f32 (f2, f2);
+ float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
+ p = vfmaq_f32 (f, f2, p);
+
+ /* t = 2^i. */
+ int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+ float32x4_t t = vreinterpretq_f32_s32 (u);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+}
+
+#endif
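
The comment in expm1f_inline mentions Estrin versus Horner: Horner chains every FMA serially, while Estrin pairs terms so independent FMAs can issue in parallel, at the cost of computing f^2 (and f^4) up front. A scalar sketch of the same degree-4 polynomial both ways (names hypothetical):

static float
horner4 (float f, const float *p)
{
  return p[0] + f * (p[1] + f * (p[2] + f * (p[3] + f * p[4])));
}

static float
estrin4 (float f, const float *p)
{
  float f2 = f * f;
  float p01 = p[0] + f * p[1];   /* independent of p23 below */
  float p23 = p[2] + f * p[3];
  return p01 + f2 * p23 + (f2 * f2) * p[4];
}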
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 7c2e43d3dc5bbc13..fec0972081af734a 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1441,11 +1441,19 @@ double: 2
float: 2
ldouble: 2
+Function: "sinh_advsimd":
+double: 2
+float: 1
+
Function: "sinh_downward":
double: 3
float: 3
ldouble: 3
+Function: "sinh_sve":
+double: 2
+float: 1
+
Function: "sinh_towardzero":
double: 3
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index ce42372a3a276832..1db5ba61d64067a2 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -83,11 +83,14 @@ GLIBC_2.40 _ZGVnN2v_cosh F
GLIBC_2.40 _ZGVnN2v_coshf F
GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
+GLIBC_2.40 _ZGVnN2v_sinh F
+GLIBC_2.40 _ZGVnN2v_sinhf F
GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_atanhf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
+GLIBC_2.40 _ZGVnN4v_sinhf F
GLIBC_2.40 _ZGVsMxv_acosh F
GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_asinh F
@@ -98,3 +101,5 @@ GLIBC_2.40 _ZGVsMxv_cosh F
GLIBC_2.40 _ZGVsMxv_coshf F
GLIBC_2.40 _ZGVsMxv_erf F
GLIBC_2.40 _ZGVsMxv_erff F
+GLIBC_2.40 _ZGVsMxv_sinh F
+GLIBC_2.40 _ZGVsMxv_sinhf F

glibc-RHEL-118273-7.patch Normal file

@@ -0,0 +1,624 @@
commit 3d3a4fb8e4fe854a0bbb3df9c26ba482c10a7e22
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Feb 20 16:59:44 2024 +0000
aarch64/fpu: Add vector variants of tanh
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in
index 5a690023e9a675cb..4584c5e498ab7194 100644
--- a/math/auto-libm-test-in
+++ b/math/auto-libm-test-in
@@ -7747,7 +7747,7 @@ tan min_subnorm
tan -min_subnorm
tanh 0
-tanh -0
+tanh -0 no-mathvec
tanh 0.75
tanh -0.75
tanh 1.0
diff --git a/math/auto-libm-test-out-tanh b/math/auto-libm-test-out-tanh
index 8b9427c917f3b388..19ce2e7b9355963d 100644
--- a/math/auto-libm-test-out-tanh
+++ b/math/auto-libm-test-out-tanh
@@ -23,31 +23,31 @@ tanh 0
= tanh tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok
= tanh towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok
= tanh upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok
-tanh -0
-= tanh downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tanh upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
+tanh -0 no-mathvec
+= tanh downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tanh upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
tanh 0.75
= tanh downward binary32 0xcp-4 : 0xa.2991fp-4 : inexact-ok
= tanh tonearest binary32 0xcp-4 : 0xa.2991fp-4 : inexact-ok
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index fb5f3a365b27fdf3..e5f418ae4274edb2 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -18,7 +18,8 @@ libmvec-supported-funcs = acos \
log2 \
sin \
sinh \
- tan
+ tan \
+ tanh
float-advsimd-funcs = $(libmvec-supported-funcs)
double-advsimd-funcs = $(libmvec-supported-funcs)
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 4774b3efeacf59fb..4dbf3d32441dd43a 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -109,5 +109,10 @@ libmvec {
_ZGVnN4v_sinhf;
_ZGVsMxv_sinh;
_ZGVsMxv_sinhf;
+ _ZGVnN2v_tanh;
+ _ZGVnN2v_tanhf;
+ _ZGVnN4v_tanhf;
+ _ZGVsMxv_tanh;
+ _ZGVsMxv_tanhf;
}
}
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 7d9445d5c0c0c2a8..4ff191c324050b42 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -37,4 +37,5 @@ libmvec_hidden_proto (V_NAME_F1(log));
libmvec_hidden_proto (V_NAME_F1(sin));
libmvec_hidden_proto (V_NAME_F1(sinh));
libmvec_hidden_proto (V_NAME_F1(tan));
+libmvec_hidden_proto (V_NAME_F1(tanh));
libmvec_hidden_proto (V_NAME_F2(atan2));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 1e9b76cf41916365..585e022082d62a5d 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -113,6 +113,10 @@
# define __DECL_SIMD_tan __DECL_SIMD_aarch64
# undef __DECL_SIMD_tanf
# define __DECL_SIMD_tanf __DECL_SIMD_aarch64
+# undef __DECL_SIMD_tanh
+# define __DECL_SIMD_tanh __DECL_SIMD_aarch64
+# undef __DECL_SIMD_tanhf
+# define __DECL_SIMD_tanhf __DECL_SIMD_aarch64
#endif
#if __GNUC_PREREQ(9, 0)
@@ -160,6 +164,7 @@ __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
@@ -182,6 +187,7 @@ __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t);
# undef __ADVSIMD_VEC_MATH_SUPPORTED
#endif /* __ADVSIMD_VEC_MATH_SUPPORTED */
@@ -209,6 +215,7 @@ __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_tanhf (__sv_f32_t, __sv_bool_t);
__sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t);
@@ -231,6 +238,7 @@ __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_tanh (__sv_f64_t, __sv_bool_t);
# undef __SVE_VEC_MATH_SUPPORTED
#endif /* __SVE_VEC_MATH_SUPPORTED */
diff --git a/sysdeps/aarch64/fpu/tanh_advsimd.c b/sysdeps/aarch64/fpu/tanh_advsimd.c
new file mode 100644
index 0000000000000000..1da1dfa5dbe418b6
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanh_advsimd.c
@@ -0,0 +1,109 @@
+/* Double-precision vector (Advanced SIMD) tanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct data
+{
+ float64x2_t poly[11];
+ float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+ uint64x2_t onef;
+ uint64x2_t thresh, tiny_bound;
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+ V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+ V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+ V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+ V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep0),
+ .ln2_hi = V2 (-0x1.62e42fefa39efp-1),
+ .ln2_lo = V2 (-0x1.abc9e3b39803fp-56),
+ .shift = V2 (0x1.8p52),
+
+ .onef = V2 (0x3ff0000000000000),
+ .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
+ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
+ .thresh = V2 (0x01f241bf835f9d5f),
+};
+
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
+ the scalar variant of tanh. */
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
+ int64x2_t i = vcvtq_s64_f64 (j);
+ float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi);
+ f = vfmaq_f64 (f, j, d->ln2_lo);
+
+ /* Approximate expm1(f) using polynomial. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t p = vfmaq_f64 (
+ f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly));
+
+ /* t = 2 ^ i. */
+ float64x2_t t = vreinterpretq_f64_u64 (
+ vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef));
+ /* expm1(x) = p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t);
+}
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (tanh, x, y, special);
+}
+
+/* Vector approximation for double-precision tanh(x), using a simplified
+ version of expm1. The greatest observed error is 2.77 ULP:
+ _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
+ want -0x1.bd6a21a163624p-3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+
+ float64x2_t u = x;
+
+ /* Trigger special-cases for tiny, boring and infinity/NaN. */
+ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
+#if WANT_SIMD_EXCEPT
+ /* To trigger fp exceptions correctly, set special lanes to a neutral value.
+ They will be fixed up later by the special-case handler. */
+ if (__glibc_unlikely (v_any_u64 (special)))
+ u = v_zerofy_f64 (u, special);
+#endif
+
+ u = vaddq_f64 (u, u);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ float64x2_t q = expm1_inline (u, d);
+ float64x2_t qp2 = vaddq_f64 (q, v_f64 (2));
+
+ if (__glibc_unlikely (v_any_u64 (special)))
+ return special_case (x, vdivq_f64 (q, qp2), special);
+ return vdivq_f64 (q, qp2);
+}
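
Again the identity deserves a line of its own: with q = expm1(2x), tanh(x) = (e^2x - 1)/(e^2x + 1) = q/(q + 2). A scalar sketch (tanh_via_expm1 is hypothetical; the special-case path covers the large |x| where 2x would overflow expm1):

#include <math.h>

static double
tanh_via_expm1 (double x)
{
  double q = expm1 (2 * x);
  return q / (q + 2.0);
}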
diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c
new file mode 100644
index 0000000000000000..d25e011cea305094
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanh_sve.c
@@ -0,0 +1,100 @@
+/* Double-precision vector (SVE) tanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ float64_t poly[11];
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift;
+ uint64_t thresh, tiny_bound;
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
+ 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
+ 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+ .inv_ln2 = 0x1.71547652b82fep0,
+ .ln2_hi = -0x1.62e42fefa39efp-1,
+ .ln2_lo = -0x1.abc9e3b39803fp-56,
+ .shift = 0x1.8p52,
+
+ .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */
+ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
+ .thresh = 0x01f241bf835f9d5f,
+};
+
+static inline svfloat64_t
+expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
+ the scalar variant of tanh. */
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ svfloat64_t j
+ = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
+ svint64_t i = svcvt_s64_x (pg, j);
+ svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
+ f = svmla_x (pg, f, j, d->ln2_lo);
+
+ /* Approximate expm1(f) using polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t f4 = svmul_x (pg, f2, f2);
+ svfloat64_t p = svmla_x (
+ pg, f, f2,
+ sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
+
+ /* t = 2 ^ i. */
+ svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
+ /* expm1(x) = p * t + (t - 1). */
+ return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (tanh, x, y, special);
+}
+
+/* SVE approximation for double-precision tanh(x), using a simplified
+ version of expm1. The greatest observed error is 2.77 ULP:
+ _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
+ want -0x1.bd6a21a163624p-3. */
+svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
+
+ /* Trigger special-cases for tiny, boring and infinity/NaN. */
+ svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh);
+
+ svfloat64_t u = svadd_x (pg, x, x);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ svfloat64_t q = expm1_inline (u, pg, d);
+ svfloat64_t qp2 = svadd_x (pg, q, 2);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svdiv_x (pg, q, qp2), special);
+ return svdiv_x (pg, q, qp2);
+}
diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c
new file mode 100644
index 0000000000000000..50defd6ef03926f4
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
@@ -0,0 +1,76 @@
+/* Single-precision vector (Advanced SIMD) tanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_expm1f_inline.h"
+
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+ uint32x4_t boring_bound, large_bound, onef;
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+ /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
+ .boring_bound = V4 (0x41102cb3),
+ .large_bound = V4 (0x7f800000),
+ .onef = V4 (0x3f800000),
+};
+
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision vector tanh(x), using a simplified
+ version of expm1f. The maximum error is 2.58 ULP:
+ _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
+ want 0x1.f9ba08p-5. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t sign = veorq_u32 (ix, iax);
+ uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+
+#if WANT_SIMD_EXCEPT
+ /* If fp exceptions are to be triggered properly, set all special and boring
+ lanes to 0, which will trigger no exceptions, and fix them up later. */
+ uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound),
+ vcltq_u32 (iax, v_u32 (0x34000000)));
+ x = v_zerofy_f32 (x, is_boring);
+ if (__glibc_unlikely (v_any_u32 (special)))
+ x = v_zerofy_f32 (x, special);
+#else
+ uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
+#endif
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+ if (__glibc_unlikely (v_any_u32 (special)))
+ return special_case (vreinterpretq_f32_u32 (ix),
+ vbslq_f32 (is_boring, boring, y), special);
+ return vbslq_f32 (is_boring, boring, y);
+}
+libmvec_hidden_def (V_NAME_F1 (tanh))
+HALF_WIDTH_ALIAS_F1 (tanh)
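
The "boring" selection above encodes the fact that for |x| > 0x1.205966p+3 (about 9.01), tanhf(x) is +/-1 after rounding, so those lanes skip the polynomial entirely. Scalar equivalent of the boring path (sketch):

#include <math.h>

static float
tanhf_boring (float x)
{
  return copysignf (1.0f, x);
}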
diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c
new file mode 100644
index 0000000000000000..0b94523cf5074200
--- /dev/null
+++ b/sysdeps/aarch64/fpu/tanhf_sve.c
@@ -0,0 +1,61 @@
+/* Single-precision vector (SVE) tanh function
+
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_expm1f_inline.h"
+
+static const struct data
+{
+ struct sv_expm1f_data expm1f_consts;
+ uint32_t boring_bound, onef;
+} data = {
+ .expm1f_consts = SV_EXPM1F_DATA,
+ /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
+ .boring_bound = 0x41102cb3,
+ .onef = 0x3f800000,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision SVE tanh(x), using a simplified
+ version of expm1f. The maximum error is 2.57 ULP:
+ _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5
+ want 0x1.fb71aap-5. */
+svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+ svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound);
+ svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
+
+ svbool_t special = svcmpgt (pg, iax, 0x7f800000);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts);
+ svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (x, svsel_f32 (is_boring, boring, y), special);
+ return svsel_f32 (is_boring, boring, y);
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index 1a57b22c3a92f1e1..7aeda880bd885ce5 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -44,3 +44,4 @@ VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
VPCS_VECTOR_WRAPPER (sinh_advsimd, _ZGVnN2v_sinh)
VPCS_VECTOR_WRAPPER (tan_advsimd, _ZGVnN2v_tan)
+VPCS_VECTOR_WRAPPER (tanh_advsimd, _ZGVnN2v_tanh)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index 0c9858f6b74aaef6..95f1ec52221ba626 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -63,3 +63,4 @@ SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
SVE_VECTOR_WRAPPER (sinh_sve, _ZGVsMxv_sinh)
SVE_VECTOR_WRAPPER (tan_sve, _ZGVsMxv_tan)
+SVE_VECTOR_WRAPPER (tanh_sve, _ZGVsMxv_tanh)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index 4758490c6fc40fda..bd6800e91c64136f 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -44,3 +44,4 @@ VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
VPCS_VECTOR_WRAPPER (sinhf_advsimd, _ZGVnN4v_sinhf)
VPCS_VECTOR_WRAPPER (tanf_advsimd, _ZGVnN4v_tanf)
+VPCS_VECTOR_WRAPPER (tanhf_advsimd, _ZGVnN4v_tanhf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 7c04f07bbee84777..35ca305fddb7366c 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -63,3 +63,4 @@ SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
SVE_VECTOR_WRAPPER (sinhf_sve, _ZGVsMxv_sinhf)
SVE_VECTOR_WRAPPER (tanf_sve, _ZGVsMxv_tanf)
+SVE_VECTOR_WRAPPER (tanhf_sve, _ZGVsMxv_tanhf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index fec0972081af734a..8398b7bc7749808d 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1496,11 +1496,19 @@ double: 2
float: 2
ldouble: 2
+Function: "tanh_advsimd":
+double: 2
+float: 2
+
Function: "tanh_downward":
double: 3
float: 3
ldouble: 4
+Function: "tanh_sve":
+double: 2
+float: 2
+
Function: "tanh_towardzero":
double: 2
float: 2
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 1db5ba61d64067a2..396082f6a7981686 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -85,12 +85,15 @@ GLIBC_2.40 _ZGVnN2v_erf F
GLIBC_2.40 _ZGVnN2v_erff F
GLIBC_2.40 _ZGVnN2v_sinh F
GLIBC_2.40 _ZGVnN2v_sinhf F
+GLIBC_2.40 _ZGVnN2v_tanh F
+GLIBC_2.40 _ZGVnN2v_tanhf F
GLIBC_2.40 _ZGVnN4v_acoshf F
GLIBC_2.40 _ZGVnN4v_asinhf F
GLIBC_2.40 _ZGVnN4v_atanhf F
GLIBC_2.40 _ZGVnN4v_coshf F
GLIBC_2.40 _ZGVnN4v_erff F
GLIBC_2.40 _ZGVnN4v_sinhf F
+GLIBC_2.40 _ZGVnN4v_tanhf F
GLIBC_2.40 _ZGVsMxv_acosh F
GLIBC_2.40 _ZGVsMxv_acoshf F
GLIBC_2.40 _ZGVsMxv_asinh F
@@ -103,3 +106,5 @@ GLIBC_2.40 _ZGVsMxv_erf F
GLIBC_2.40 _ZGVsMxv_erff F
GLIBC_2.40 _ZGVsMxv_sinh F
GLIBC_2.40 _ZGVsMxv_sinhf F
+GLIBC_2.40 _ZGVsMxv_tanh F
+GLIBC_2.40 _ZGVsMxv_tanhf F

5115
glibc-RHEL-118273-8.patch Normal file

File diff suppressed because it is too large

348
glibc-RHEL-118273-9.patch Normal file

@@ -0,0 +1,348 @@
commit 90a6ca8b28bf34e361e577e526e1b0f4c39a32a5
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Thu May 2 16:43:13 2024 +0100
aarch64: Fix AdvSIMD libmvec routines for big-endian
Previously many routines used * to load from vector types stored
in the data table. This is emitted as ldr, which byte-swaps the
entire vector register, and causes bugs for big-endian when not
all lanes contain the same value. When a vector is to be used
this way, it has been replaced with an array and the load with an
explicit ld1 intrinsic, which byte-swaps only within lanes.
As well, many routines previously used non-standard GCC syntax
for vector operations such as indexing into vectors types with []
and assembling vectors using {}. This syntax should not be mixed
with ACLE, as the former does not respect endianness whereas the
latter does. Such examples have been replaced with, for instance,
vcombine_* and vgetq_lane* intrinsics. Helpers which only use the
GCC syntax, such as the v_call helpers, do not need changing as
they do not use intrinsics.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
Conflicts:
sysdeps/aarch64/fpu/exp10f_advsimd.c
sysdeps/aarch64/fpu/expm1_advsimd.c
sysdeps/aarch64/fpu/expm1f_advsimd.c
sysdeps/aarch64/fpu/log10_advsimd.c
sysdeps/aarch64/fpu/log2_advsimd.c
sysdeps/aarch64/fpu/log_advsimd.c
sysdeps/aarch64/fpu/tan_advsimd.c
sysdeps/aarch64/fpu/tanf_advsimd.c
(Already backported by glibc-upstream-2.39-151.patch)
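
To make the hazard described in the commit message concrete, here is a hypothetical sketch (tab, bad_load and good_load are illustrative names, not from the patch). Dereferencing a pointer to a vector type is emitted as ldr, which on big-endian byte-swaps the entire register and so exchanges lane contents when the lanes differ; vld1q_f64 swaps bytes only within each 64-bit lane:

#include <arm_neon.h>

static const double tab[2] = { 1.0, 2.0 };

/* Compiled to ldr: on big-endian the whole 128-bit register is
   byte-swapped, so the two lanes come back exchanged.  */
static float64x2_t
bad_load (void)
{
  return *(const float64x2_t *) tab;
}

/* Compiled to ld1: byte-swapping is confined to each 64-bit lane, so
   lane 0 holds tab[0] and lane 1 holds tab[1] on either endianness.  */
static float64x2_t
good_load (void)
{
  return vld1q_f64 (tab);
}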
diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
index 544a52f6515d3201..6207e7da9531f48d 100644
--- a/sysdeps/aarch64/fpu/asinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@@ -22,6 +22,7 @@
#define A(i) v_f64 (__v_log_data.poly[i])
#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
const static struct data
{
@@ -63,11 +64,15 @@ struct entry
static inline struct entry
lookup (uint64x2_t i)
{
- float64x2_t e0 = vld1q_f64 (
- &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
- float64x2_t e1 = vld1q_f64 (
- &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
- return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) };
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
}
static inline float64x2_t
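
The "power of 2" comment in lookup is the usual bit trick: for unsigned n and N a power of two, n % N == (n & (N - 1)), because N - 1 is a mask of the low log2(N) bits. A tiny self-check (N = 128 is an illustrative stand-in for 1 << V_LOG_TABLE_BITS, whose value is outside this hunk):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const uint64_t N = 128;	/* any power of two */
  for (uint64_t n = 0; n < 4096; n++)
    assert (n % N == (n & (N - 1)));
  return 0;
}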
diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c
index ec7b59637e973da9..4bee734f00bd6a9b 100644
--- a/sysdeps/aarch64/fpu/cosh_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosh_advsimd.c
@@ -22,7 +22,9 @@
static const struct data
{
float64x2_t poly[3];
- float64x2_t inv_ln2, ln2, shift, thres;
+ float64x2_t inv_ln2;
+ double ln2[2];
+ float64x2_t shift, thres;
uint64x2_t index_mask, special_bound;
} data = {
.poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
@@ -58,8 +60,9 @@ exp_inline (float64x2_t x)
float64x2_t n = vsubq_f64 (z, d->shift);
/* r = x - n*ln2/N. */
- float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
- r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
+ float64x2_t ln2 = vld1q_f64 (d->ln2);
+ float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0);
+ r = vfmaq_laneq_f64 (r, n, ln2, 1);
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
uint64x2_t i = vandq_u64 (u, d->index_mask);
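
The pair of fused multiply-adds on the loaded ln2 vector is a split-constant (Cody-Waite style) reduction: reading the code, the two lanes hold a high part and a low-order correction of the reduction constant (the stored values, including their signs, are outside this hunk), so that

\[
  r = \bigl(x + n\,c_{\mathrm{hi}}\bigr) + n\,c_{\mathrm{lo}},
  \qquad c_{\mathrm{hi}} + c_{\mathrm{lo}} \approx -\frac{\ln 2}{N},
\]

and the rounding error of the first step is cancelled by the second, keeping the reduced argument r accurate beyond plain double rounding.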
diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c
index 3e70cbc025248a05..19cbb7d0f42eb4e2 100644
--- a/sysdeps/aarch64/fpu/erf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erf_advsimd.c
@@ -56,8 +56,8 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
- float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])),
- e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1]));
+ float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
+ e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
e.erf = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
index 548f21a3d68d68d2..f1b3bfe8304c73b5 100644
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@@ -26,7 +26,7 @@ static const struct data
float64x2_t max, shift;
float64x2_t p20, p40, p41, p42;
float64x2_t p51, p52;
- float64x2_t qr5, qr6, qr7, qr8, qr9;
+ double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
#endif
@@ -68,8 +68,10 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
- float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])),
- e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1]));
+ float64x2_t e1
+ = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
+ float64x2_t e2
+ = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
e.erfc = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
@@ -161,16 +163,19 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
- float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0));
- p6 = vmulq_laneq_f64 (p6, dat->qr5, 1);
- float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0));
- p7 = vmulq_laneq_f64 (p7, dat->qr6, 1);
- float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0));
- p8 = vmulq_laneq_f64 (p8, dat->qr7, 1);
- float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0));
- p9 = vmulq_laneq_f64 (p9, dat->qr8, 1);
- float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0));
- p10 = vmulq_laneq_f64 (p10, dat->qr9, 1);
+ float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6),
+ qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8),
+ qr9 = vld1q_f64 (dat->qr9);
+ float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0));
+ p6 = vmulq_laneq_f64 (p6, qr5, 1);
+ float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0));
+ p7 = vmulq_laneq_f64 (p7, qr6, 1);
+ float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0));
+ p8 = vmulq_laneq_f64 (p8, qr7, 1);
+ float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0));
+ p9 = vmulq_laneq_f64 (p9, qr8, 1);
+ float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0));
+ p10 = vmulq_laneq_f64 (p10, qr9, 1);
/* Compute polynomial in d using pairwise Horner scheme. */
float64x2_t p90 = vfmaq_f64 (p9, d, p10);
float64x2_t p78 = vfmaq_f64 (p7, d, p8);
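
Written out, the recurrence encoded by the qr pairs (lane 0 of each qr vector supplies Q_{i+1} and lane 1 supplies R_{i+1}, as the laneq intrinsics above show):

\[
  p_{i+2} = \bigl(p_i + r\,Q_{i+1}\,p_{i+1}\bigr)\,R_{i+1},
  \qquad i = 4,\dots,8,
\]

which produces p6 through p10 from the seed values p4 and p5.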
diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c
index 30b9e48dd40d80a0..ca5bc3ab33c92f83 100644
--- a/sysdeps/aarch64/fpu/erfcf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c
@@ -23,7 +23,8 @@ static const struct data
{
uint32x4_t offset, table_scale;
float32x4_t max, shift;
- float32x4_t coeffs, third, two_over_five, tenth;
+ float coeffs[4];
+ float32x4_t third, two_over_five, tenth;
#if WANT_SIMD_EXCEPT
float32x4_t uflow_bound;
#endif
@@ -37,7 +38,7 @@ static const struct data
.shift = V4 (0x1p17f),
/* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
fmas. */
- .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
+ .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
.third = V4 (0x1.555556p-2f),
.two_over_five = V4 (-0x1.99999ap-2f),
.tenth = V4 (-0x1.99999ap-4f),
@@ -60,12 +61,16 @@ static inline struct entry
lookup (uint32x4_t i)
{
struct entry e;
- float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
- float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
- float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
- float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
- float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
- float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+ float32x2_t t0
+ = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
+ float32x2_t t1
+ = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
+ float32x2_t t2
+ = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
+ float32x2_t t3
+ = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
+ float32x4_t e1 = vcombine_f32 (t0, t1);
+ float32x4_t e2 = vcombine_f32 (t2, t3);
e.erfc = vuzp1q_f32 (e1, e2);
e.scale = vuzp2q_f32 (e1, e2);
return e;
@@ -140,10 +145,11 @@ float32x4_t NOINLINE V_NAME_F1 (erfc) (float32x4_t x)
float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p1 = r;
- float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
+ float32x4_t coeffs = vld1q_f32 (dat->coeffs);
+ float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1);
float32x4_t p3
- = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
- float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
+ = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0));
+ float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2);
p4 = vfmsq_f32 (dat->tenth, r2, p4);
float32x4_t y = vfmaq_f32 (p3, d, p4);
diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c
index c44644a71cffbb62..f2fe6ff236a6ec07 100644
--- a/sysdeps/aarch64/fpu/erff_advsimd.c
+++ b/sysdeps/aarch64/fpu/erff_advsimd.c
@@ -47,12 +47,12 @@ static inline struct entry
lookup (uint32x4_t i)
{
struct entry e;
- float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
- float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
- float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
- float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
- float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
- float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+ float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
+ float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
+ float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
+ float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
+ float32x4_t e1 = vcombine_f32 (t0, t1);
+ float32x4_t e2 = vcombine_f32 (t2, t3);
e.erf = vuzp1q_f32 (e1, e2);
e.scale = vuzp2q_f32 (e1, e2);
return e;
diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c
index fa3723b10c15eb29..3e3b76c502b01e16 100644
--- a/sysdeps/aarch64/fpu/sinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinh_advsimd.c
@@ -22,8 +22,9 @@
static const struct data
{
- float64x2_t poly[11];
- float64x2_t inv_ln2, m_ln2, shift;
+ float64x2_t poly[11], inv_ln2;
+ double m_ln2[2];
+ float64x2_t shift;
uint64x2_t halff;
int64x2_t onef;
#if WANT_SIMD_EXCEPT
@@ -40,7 +41,7 @@ static const struct data
V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
.inv_ln2 = V2 (0x1.71547652b82fep0),
- .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
+ .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
.shift = V2 (0x1.8p52),
.halff = V2 (0x3fe0000000000000),
@@ -67,8 +68,10 @@ expm1_inline (float64x2_t x)
and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
int64x2_t i = vcvtq_s64_f64 (j);
- float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0);
- f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1);
+
+ float64x2_t m_ln2 = vld1q_f64 (d->m_ln2);
+ float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0);
+ f = vfmaq_laneq_f64 (f, j, m_ln2, 1);
/* Approximate expm1(f) using polynomial. */
float64x2_t f2 = vmulq_f64 (f, f);
float64x2_t f4 = vmulq_f64 (f2, f2);
diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
index a3b0e32f9eb42021..08b06e0a6b34b4f4 100644
--- a/sysdeps/aarch64/fpu/v_expf_inline.h
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@@ -25,7 +25,8 @@
struct v_expf_data
{
float32x4_t poly[5];
- float32x4_t shift, invln2_and_ln2;
+ float32x4_t shift;
+ float invln2_and_ln2[4];
};
/* maxerr: 1.45358 +0.5 ulp. */
@@ -50,10 +51,11 @@ v_expf_inline (float32x4_t x, const struct v_expf_data *d)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
float32x4_t n, r, z;
- z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
+ float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
+ z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0);
n = vsubq_f32 (z, d->shift);
- r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
- r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
+ r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1);
+ r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
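
Stated explicitly, the reduction in v_expf_inline is the standard base-2 split (a reading of the code: lane 0 of invln2_and_ln2 holds 1/ln 2 and lanes 1-2 a high/low split of ln 2; the table values themselves are outside this hunk):

\[
  x = n \ln 2 + r \;\Longrightarrow\; e^{x} = 2^{\,n} e^{r},
  \qquad r \in \Bigl[-\frac{\ln 2}{2}, \frac{\ln 2}{2}\Bigr],
\]

with e^r approximated by the short polynomial and 2^n reconstructed by adding n directly to the exponent field (the vshlq_n_u32 / ExponentBias lines).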
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 337ccfbfab555c97..59b552da6b74785e 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -26,7 +26,8 @@
struct v_expm1f_data
{
float32x4_t poly[5];
- float32x4_t invln2_and_ln2, shift;
+ float invln2_and_ln2[4];
+ float32x4_t shift;
int32x4_t exponent_bias;
};
@@ -49,11 +50,12 @@ expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
calling routine should handle special values if required. */
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float32x4_t j = vsubq_f32 (
- vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+ float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
+ float32x4_t j
+ = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
+ float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
+ f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses