Auto-sync with upstream branch master

Upstream commit: 5d7f1bce7d8eea31f4baeb68bcc3124b35acc751 - Apply glibc-benchtests-aarch64.patch to fix an aarch64 build failure. - Drop glibc-rh2244688.patch revert. Fix applied upstream. - Drop glibc-rh2244992.patch, glibc-rh2248915.patch, glibc-rh2248502-3.patch. All applied upstream. - posix: Revert the removal of the crypt prototype from <unistd.h> - elf: Add comments on how LD_AUDIT and LD_PRELOAD handle __libc_enable_secure - elf: Ignore LD_LIBRARY_PATH and debug env var for setuid for static - elf: Remove any_debug from dl_main_state - elf: Remove LD_PROFILE for static binaries - elf: Ignore LD_PROFILE for setuid binaries - s390: Use dl-symbol-redir-ifunc.h on cpu-tunables - x86: Use dl-symbol-redir-ifunc.h on cpu-tunables - elf: Emit warning if tunable is ill-formatted - elf: Fix _dl_debug_vdprintf to work before self-relocation - elf: Do not parse ill-formatted strings - elf: Do not process invalid tunable format - elf: Add all malloc tunable to unsecvars - elf: Ignore GLIBC_TUNABLES for setuid/setgid binaries - elf: Add GLIBC_TUNABLES to unsecvars - elf: Remove /etc/suid-debug support - stdlib: The qsort implementation needs to use heapsort in more cases - stdlib: Handle various corner cases in the fallback heapsort for qsort - stdlib: Avoid another self-comparison in qsort - hurd: fix restarting reauth_dtable on signal - hurd: Prevent the final file_exec_paths call from signals - manual: Fix termios.c example. (Bug 31078) - aarch64: Add vector implementations of expm1 routines - linux: Use fchmodat2 on fchmod for flags different than 0 (BZ 26401) - intl: Add test case for bug 16621 - resolv: free only initialized items from gai pool - ldconfig: Fixes for skipping temporary files. - nptl: Link tst-execstack-threads-mod.so with -z execstack - nptl: Rename tst-execstack to tst-execstack-threads - localedata: Convert oc_FR locale to UTF-8 - localedata: Add information for Occitan - elf: Fix force_first handling in dlclose (bug 30981) - elf: Handle non-directory name in search path (BZ 31035) - New Zealand locales (en_NZ & mi_NZ) first day of week should be Monday - x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S - posix: Check pidfd_spawn with tst-spawn7-pid - y2038: Fix support for 64-bit time on legacy ABIs - AArch64: Remove Falkor memcpy - AArch64: Add memset_zva64 - AArch64: Cleanup emag memset - test: Run the tst-tls-allocation-failure-static-patched with test-wrapper. - aarch64: Add vector implementations of log1p routines - aarch64: Add vector implementations of atan2 routines - aarch64: Add vector implementations of atan routines - aarch64: Add vector implementations of acos routines - aarch64: Add vector implementations of asin routines
2023-11-22 08:49:54 +01:00 · 2023-11-22 08:49:54 +01:00 · f94f56a23b
commit f94f56a23b
parent 9e361d1258
7 changed files with 308 additions and 1166 deletions
--- a/glibc-benchtests-aarch64.patch
+++ b/glibc-benchtests-aarch64.patch
@ -0,0 +1,250 @@
+Author: Joe Ramsay <Joe.Ramsay@arm.com>
+Date:   Tue Nov 21 14:39:39 2023 +0000
+
+    aarch64: Fix libmvec benchmarks
+    
+    These were broken by the new atan2 functions, as they were only
+    set up for univariate functions. Arity is now detected from the
+    input file - this revealed a mistake that the double-precision
+    inputs were being used for both single- and double-precision
+    routines, which is now remedied.
+
+diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
+index 3e124c781065fea9..3661a24044cc9770 100644
+--- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
+++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
+@@ -22,40 +22,49 @@ TEMPLATE = """
+ #include <math.h>
+ #include <arm_neon.h>
+ 
+-#define STRIDE {stride}
+#define STRIDE {rowlen}
+ 
+-#define CALL_BENCH_FUNC(v, i) (__extension__ ({{                         \\
+-   {rtype} mx0 = {fname}(vld1q_f{prec_short} (variants[v].in[i].arg0));  \\
+#define CALL_BENCH_FUNC_1(v, i) (__extension__ ({{                                 \\
+   {rtype} mx0 = {fname}(vld1q_f{prec_short} (&variants[v].in->arg0[i * STRIDE])); \\
+    mx0; }}))
+ 
+-struct args
+#define CALL_BENCH_FUNC_2(v, i) (__extension__ ({{                                 \\
+   {rtype} mx0 = {fname}(vld1q_f{prec_short} (&variants[v].in->arg0[i * STRIDE]),  \\
+                         vld1q_f{prec_short} (&variants[v].in->arg1[i * STRIDE])); \\
+   mx0; }}))
+
+struct args_1
+{{
+  {stype} arg0[{nelems}];
+}};
+
+struct args_2
+ {{
+-  {stype} arg0[STRIDE];
+-  double timing;
+  {stype} arg0[{nelems}];
+  {stype} arg1[{nelems}];
+ }};
+ 
+ struct _variants
+ {{
+   const char *name;
+-  int count;
+-  const struct args *in;
+  const struct args_{arity} *in;
+ }};
+ 
+-static const struct args in0[{rowcount}] = {{
+static const struct args_{arity} in0 = {{
+ {in_data}
+ }};
+ 
+ static const struct _variants variants[1] = {{
+-  {{"", {rowcount}, in0}},
+  {{"", &in0}},
+ }};
+ 
+ #define NUM_VARIANTS 1
+-#define NUM_SAMPLES(i) (variants[i].count)
+#define NUM_SAMPLES(i) ({nelems} / STRIDE)
+ #define VARIANT(i) (variants[i].name)
+ 
+ static {rtype} volatile ret;
+ 
+-#define BENCH_FUNC(i, j) ({{ ret = CALL_BENCH_FUNC(i, j); }})
+#define BENCH_FUNC(i, j) ({{ ret = CALL_BENCH_FUNC_{arity}(i, j); }})
+ #define FUNCNAME "{fname}"
+ #include <bench-libmvec-skeleton.c>
+ """
+@@ -63,27 +72,34 @@ static {rtype} volatile ret;
+ def main(name):
+     _, prec, _, func = name.split("-")
+     scalar_to_advsimd_type = {"double": "float64x2_t", "float": "float32x4_t"}
+-
+-    stride = {"double": 2, "float": 4}[prec]
+    rowlen = {"double": 2, "float": 4}[prec]
+     rtype = scalar_to_advsimd_type[prec]
+     atype = scalar_to_advsimd_type[prec]
+-    fname = f"_ZGVnN{stride}v_{func}{'f' if prec == 'float' else ''}"
+     prec_short = {"double": 64, "float": 32}[prec]
+-
+-    with open(f"../benchtests/libmvec/{func}-inputs") as f:
+-        in_vals = [l.strip() for l in f.readlines() if l and not l.startswith("#")]
+-    in_vals = [in_vals[i:i+stride] for i in range(0, len(in_vals), stride)]
+-    rowcount= len(in_vals)
+-    in_data = ",\n".join("{{" + ", ".join(row) + "}, 0}" for row in in_vals)
+-
+-    print(TEMPLATE.format(stride=stride,
+    input_filename = {"double": f"{func}-inputs", "float": f"{func}f-inputs"}[prec]
+
+    with open(f"../benchtests/libmvec/{input_filename}") as f:
+        input_file = f.readlines()
+    in_vals = (l.strip() for l in input_file if l and not l.startswith("#"))
+    # Split in case of multivariate signature
+    in_vals = (l.split(", ") for l in in_vals)
+    # Transpose
+    in_vals = list(zip(*in_vals))
+    in_data = ",\n".join("{" + (", ".join(val for val in col) + "}")
+                         for col in in_vals)
+
+    arity = [l for l in input_file if l.startswith("## args: ")][0].count(prec)
+    fname = f"_ZGVnN{rowlen}{'v' * arity}_{func}{'f' if prec == 'float' else ''}"
+
+    print(TEMPLATE.format(rowlen=rowlen,
+                           rtype=rtype,
+                           atype=atype,
+                           fname=fname,
+                           prec_short=prec_short,
+                           in_data=in_data,
+-                          rowcount=rowcount,
+-                          stype=prec))
+                          stype=prec,
+                          arity=arity,
+                          nelems=len(in_vals[0])))
+ 
+ 
+ if __name__ == "__main__":
+diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
+index 66f2c8e0f465f9ce..5d9332be9c5a536a 100755
+--- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
+++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
+@@ -22,46 +22,55 @@ TEMPLATE = """
+ #include <math.h>
+ #include <arm_sve.h>
+ 
+-#define MAX_STRIDE {max_stride}
+ #define STRIDE {stride}
+ #define PTRUE svptrue_b{prec_short}
+ #define SV_LOAD svld1_f{prec_short}
+ #define SV_STORE svst1_f{prec_short}
+ #define REQUIRE_SVE
+ 
+-#define CALL_BENCH_FUNC(v, i) (__extension__ ({{                              \\
+-   {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), variants[v].in[i].arg0), PTRUE()); \\
+#define CALL_BENCH_FUNC_1(v, i) (__extension__ ({{                                       \\
+   {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), &variants[v].in->arg0[i * STRIDE]), PTRUE()); \\
+    mx0; }}))
+ 
+-struct args
+#define CALL_BENCH_FUNC_2(v, i) (__extension__ ({{                              \\
+   {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), &variants[v].in->arg0[i * STRIDE]),  \\
+                         SV_LOAD (PTRUE(), &variants[v].in->arg1[i * STRIDE]),  \\
+                         PTRUE());                                              \\
+   mx0; }}))
+
+struct args_1
+ {{
+-  {stype} arg0[MAX_STRIDE];
+-  double timing;
+  {stype} arg0[{nelems}];
+}};
+
+struct args_2
+{{
+  {stype} arg0[{nelems}];
+  {stype} arg1[{nelems}];
+ }};
+ 
+ struct _variants
+ {{
+   const char *name;
+-  int count;
+-  const struct args *in;
+  const struct args_{arity} *in;
+ }};
+ 
+-static const struct args in0[{rowcount}] = {{
+static const struct args_{arity} in0 = {{
+ {in_data}
+ }};
+ 
+ static const struct _variants variants[1] = {{
+-  {{"", {rowcount}, in0}},
+  {{"", &in0}},
+ }};
+ 
+ #define NUM_VARIANTS 1
+-#define NUM_SAMPLES(i) (variants[i].count)
+#define NUM_SAMPLES(i) ({nelems} / STRIDE)
+ #define VARIANT(i) (variants[i].name)
+ 
+ // Cannot pass volatile pointer to svst1. This still does not appear to get optimised out.
+-static {stype} /*volatile*/ ret[MAX_STRIDE];
+static {stype} /*volatile*/ ret[{rowlen}];
+ 
+-#define BENCH_FUNC(i, j) ({{ SV_STORE(PTRUE(), ret, CALL_BENCH_FUNC(i, j)); }})
+#define BENCH_FUNC(i, j) ({{ SV_STORE(PTRUE(), ret, CALL_BENCH_FUNC_{arity}(i, j)); }})
+ #define FUNCNAME "{fname}"
+ #include <bench-libmvec-skeleton.c>
+ """
+@@ -69,23 +78,29 @@ static {stype} /*volatile*/ ret[MAX_STRIDE];
+ def main(name):
+     _, prec, _, func = name.split("-")
+     scalar_to_sve_type = {"double": "svfloat64_t", "float": "svfloat32_t"}
+-
+     stride = {"double": "svcntd()", "float": "svcntw()"}[prec]
+     rtype = scalar_to_sve_type[prec]
+     atype = scalar_to_sve_type[prec]
+-    fname = f"_ZGVsMxv_{func}{'f' if prec == 'float' else ''}"
+     prec_short = {"double": 64, "float": 32}[prec]
+     # Max SVE vector length is 2048 bits. To ensure benchmarks are
+     # vector-length-agnostic, but still use as wide vectors as
+     # possible on any given target, divide input data into 2048-bit
+     # rows, then load/store as many elements as the target will allow.
+-    max_stride = 2048 // prec_short
+-
+-    with open(f"../benchtests/libmvec/{func}-inputs") as f:
+-        in_vals = [l.strip() for l in f.readlines() if l and not l.startswith("#")]
+-    in_vals = [in_vals[i:i+max_stride] for i in range(0, len(in_vals), max_stride)]
+-    rowcount= len(in_vals)
+-    in_data = ",\n".join("{{" + ", ".join(row) + "}, 0}" for row in in_vals)
+    rowlen = {"double": 32, "float": 64}[prec]
+    input_filename = {"double": f"{func}-inputs", "float": f"{func}f-inputs"}[prec]
+
+    with open(f"../benchtests/libmvec/{input_filename}") as f:
+        input_file = f.readlines()
+    in_vals = (l.strip() for l in input_file if l and not l.startswith("#"))
+    # Split in case of multivariate signature
+    in_vals = (l.split(", ") for l in in_vals)
+    # Transpose
+    in_vals = list(zip(*in_vals))
+    in_data = ",\n".join("{" + (", ".join(val for val in col) + "}")
+                         for col in in_vals)
+
+    arity = [l for l in input_file if l.startswith("## args: ")][0].count(prec)
+    fname = f"_ZGVsMx{'v' * arity}_{func}{'f' if prec == 'float' else ''}"
+ 
+     print(TEMPLATE.format(stride=stride,
+                           rtype=rtype,
+@@ -93,9 +108,10 @@ def main(name):
+                           fname=fname,
+                           prec_short=prec_short,
+                           in_data=in_data,
+-                          rowcount=rowcount,
+                           stype=prec,
+-                          max_stride=max_stride))
+                          rowlen=rowlen,
+                          arity=arity,
+                          nelems=len(in_vals[0])))
+ 
+ 
+ if __name__ == "__main__":
--- a/glibc-rh2244688.patch
+++ b/glibc-rh2244688.patch
@ -1,967 +0,0 @@
-Author: Florian Weimer <fweimer@redhat.com>
-Date:   Wed Oct 18 11:12:29 2023 +0200
-
-    Revert "x86: Prepare `strrchr-evex` and `strrchr-evex512` for AVX10"
-    
-    This reverts commit a3c50bf46a1ca6d9d2b7d879176d345abf95a9de.
-
-diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
-index cd6a0a870a02b9bd..58b2853ab69265e8 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
-+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
-@@ -1,4 +1,4 @@
-/* Implementation for strrchr using evex256 and evex512.
-+/* Placeholder function, not used by any processor at the moment.
-    Copyright (C) 2022-2023 Free Software Foundation, Inc.
-    This file is part of the GNU C Library.
- 
-@@ -16,6 +16,8 @@
-    License along with the GNU C Library; if not, see
-    <https://www.gnu.org/licenses/>.  */
- 
-+/* UNUSED. Exists purely as reference implementation.  */
-+
- #include <isa-level.h>
- 
- #if ISA_SHOULD_BUILD (4)
-@@ -23,351 +25,240 @@
- # include <sysdep.h>
- 
- # ifdef USE_AS_WCSRCHR
-#  if VEC_SIZE == 64
-#   define RCX_M	cx
-#   define KORTEST_M	kortestw
-#  else
-#   define RCX_M	cl
-#   define KORTEST_M	kortestb
-#  endif
-
-#  define SHIFT_REG	VRCX
- #  define CHAR_SIZE	4
-#  define VPCMP		vpcmpd
-#  define VPMIN		vpminud
-#  define VPCOMPRESS	vpcompressd
-#  define VPTESTN	vptestnmd
-#  define VPTEST	vptestmd
-#  define VPBROADCAST	vpbroadcastd
-+#  define VPBROADCAST   vpbroadcastd
- #  define VPCMPEQ	vpcmpeqd
-
-+#  define VPMINU	vpminud
-+#  define VPTESTN	vptestnmd
- # else
-#  define SHIFT_REG	VRDI
- #  define CHAR_SIZE	1
-#  define VPCMP		vpcmpb
-#  define VPMIN		vpminub
-#  define VPCOMPRESS	vpcompressb
-#  define VPTESTN	vptestnmb
-#  define VPTEST	vptestmb
-#  define VPBROADCAST	vpbroadcastb
-+#  define VPBROADCAST   vpbroadcastb
- #  define VPCMPEQ	vpcmpeqb
-
-#  define RCX_M		VRCX
-#  define KORTEST_M	KORTEST
-+#  define VPMINU	vpminub
-+#  define VPTESTN	vptestnmb
- # endif
- 
-# define VMATCH		VMM(0)
-# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
- # define PAGE_SIZE	4096
-+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
- 
- 	.section SECTION(.text), "ax", @progbits
-	/* Aligning entry point to 64 byte, provides better performance for
-	   one vector length string.  */
-ENTRY_P2ALIGN(STRRCHR, 6)
-	movl	%edi, %eax
-	/* Broadcast CHAR to VMATCH.  */
-	VPBROADCAST %esi, %VMATCH
-+/* Aligning entry point to 64 byte, provides better performance for
-+   one vector length string.  */
-+ENTRY_P2ALIGN (STRRCHR, 6)
- 
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(cross_page_boundary)
-+	/* Broadcast CHAR to VMM(0).  */
-+	VPBROADCAST %esi, %VMM(0)
-+	movl	%edi, %eax
-+	sall	$20, %eax
-+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
-+	ja	L(page_cross)
- 
-+L(page_cross_continue):
-+	/* Compare [w]char for null, mask bit will be set for match.  */
- 	VMOVU	(%rdi), %VMM(1)
-	/* k0 has a 1 for each zero CHAR in YMM1.  */
-	VPTESTN	%VMM(1), %VMM(1), %k0
-	KMOV	%k0, %VGPR(rsi)
-	test	%VGPR(rsi), %VGPR(rsi)
-	jz	L(aligned_more)
-	/* fallthrough: zero CHAR in first VEC.  */
-L(page_cross_return):
-	/* K1 has a 1 for each search CHAR match in VEC(1).  */
-	VPCMPEQ	%VMATCH, %VMM(1), %k1
-	KMOV	%k1, %VGPR(rax)
-	/* Build mask up until first zero CHAR (used to mask of
-	   potential search CHAR matches past the end of the string).  */
-	blsmsk	%VGPR(rsi), %VGPR(rsi)
-	/* Use `and` here to remove any out of bounds matches so we can
-	   do a reverse scan on `rax` to find the last match.  */
-	and	%VGPR(rsi), %VGPR(rax)
-	jz	L(ret0)
-	/* Get last match.  */
-	bsr	%VGPR(rax), %VGPR(rax)
-+
-+	VPTESTN	%VMM(1), %VMM(1), %k1
-+	KMOV	%k1, %VRCX
-+	test	%VRCX, %VRCX
-+	jz	L(align_more)
-+
-+	VPCMPEQ	%VMM(1), %VMM(0), %k0
-+	KMOV	%k0, %VRAX
-+	BLSMSK	%VRCX, %VRCX
-+	and	%VRCX, %VRAX
-+	jz	L(ret)
-+
-+	BSR	%VRAX, %VRAX
- # ifdef USE_AS_WCSRCHR
- 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
- # else
-	addq	%rdi, %rax
-+	add	%rdi, %rax
- # endif
-L(ret0):
-+L(ret):
- 	ret
- 
-	/* Returns for first vec x1/x2/x3 have hard coded backward
-	   search path for earlier matches.  */
-	.p2align 4,, 6
-L(first_vec_x1):
-	VPCMPEQ	%VMATCH, %VMM(2), %k1
-	KMOV	%k1, %VGPR(rax)
-	blsmsk	%VGPR(rcx), %VGPR(rcx)
-	/* eax non-zero if search CHAR in range.  */
-	and	%VGPR(rcx), %VGPR(rax)
-	jnz	L(first_vec_x1_return)
-
-	/* fallthrough: no match in YMM2 then need to check for earlier
-	   matches (in YMM1).  */
-	.p2align 4,, 4
-L(first_vec_x0_test):
-	VPCMPEQ	%VMATCH, %VMM(1), %k1
-	KMOV	%k1, %VGPR(rax)
-	test	%VGPR(rax), %VGPR(rax)
-	jz	L(ret1)
-	bsr	%VGPR(rax), %VGPR(rax)
-+L(vector_x2_end):
-+	VPCMPEQ	%VMM(2), %VMM(0), %k2
-+	KMOV	%k2, %VRAX
-+	BLSMSK	%VRCX, %VRCX
-+	and	%VRCX, %VRAX
-+	jz	L(vector_x1_ret)
-+
-+	BSR	%VRAX, %VRAX
-+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
-+	ret
-+
-+	/* Check the first vector at very last to look for match.  */
-+L(vector_x1_ret):
-+	VPCMPEQ %VMM(1), %VMM(0), %k2
-+	KMOV	%k2, %VRAX
-+	test	%VRAX, %VRAX
-+	jz	L(ret)
-+
-+	BSR	%VRAX, %VRAX
- # ifdef USE_AS_WCSRCHR
- 	leaq	(%rsi, %rax, CHAR_SIZE), %rax
- # else
-	addq	%rsi, %rax
-+	add	%rsi, %rax
- # endif
-L(ret1):
- 	ret
- 
-	.p2align 4,, 10
-L(first_vec_x3):
-	VPCMPEQ	%VMATCH, %VMM(4), %k1
-	KMOV	%k1, %VGPR(rax)
-	blsmsk	%VGPR(rcx), %VGPR(rcx)
-	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
-	and	%VGPR(rcx), %VGPR(rax)
-	jz	L(first_vec_x1_or_x2)
-	bsr	%VGPR(rax), %VGPR(rax)
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
-	.p2align 4,, 4
-
-L(first_vec_x2):
-	VPCMPEQ	%VMATCH, %VMM(3), %k1
-	KMOV	%k1, %VGPR(rax)
-	blsmsk	%VGPR(rcx), %VGPR(rcx)
-	/* Check YMM3 for last match first. If no match try YMM2/YMM1.  */
-	and	%VGPR(rcx), %VGPR(rax)
-	jz	L(first_vec_x0_x1_test)
-	bsr	%VGPR(rax), %VGPR(rax)
-	leaq	(VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
-	ret
-
-	.p2align 4,, 6
-L(first_vec_x0_x1_test):
-	VPCMPEQ	%VMATCH, %VMM(2), %k1
-	KMOV	%k1, %VGPR(rax)
-	/* Check YMM2 for last match first. If no match try YMM1.  */
-	test	%VGPR(rax), %VGPR(rax)
-	jz	L(first_vec_x0_test)
-	.p2align 4,, 4
-L(first_vec_x1_return):
-	bsr	%VGPR(rax), %VGPR(rax)
-	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
-	ret
-
-	.p2align 4,, 12
-L(aligned_more):
-L(page_cross_continue):
-	/* Need to keep original pointer incase VEC(1) has last match.  */
-+L(align_more):
-+	/* Zero r8 to store match result.  */
-+	xorl	%r8d, %r8d
-+	/* Save pointer of first vector, in case if no match found.  */
- 	movq	%rdi, %rsi
-+	/* Align pointer to vector size.  */
- 	andq	$-VEC_SIZE, %rdi
-
-	VMOVU	VEC_SIZE(%rdi), %VMM(2)
-+	/* Loop unroll for 2 vector loop.  */
-+	VMOVA	(VEC_SIZE)(%rdi), %VMM(2)
- 	VPTESTN	%VMM(2), %VMM(2), %k0
- 	KMOV	%k0, %VRCX
-	movq	%rdi, %r8
- 	test	%VRCX, %VRCX
-	jnz	L(first_vec_x1)
-
-	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(3)
-	VPTESTN	%VMM(3), %VMM(3), %k0
-	KMOV	%k0, %VRCX
-
-	test	%VRCX, %VRCX
-	jnz	L(first_vec_x2)
-
-	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(4)
-	VPTESTN	%VMM(4), %VMM(4), %k0
-	KMOV	%k0, %VRCX
-
-	/* Intentionally use 64-bit here.  EVEX256 version needs 1-byte
-	   padding for efficient nop before loop alignment.  */
-	test	%rcx, %rcx
-	jnz	L(first_vec_x3)
-+	jnz	L(vector_x2_end)
- 
-+	/* Save pointer of second vector, in case if no match
-+	   found.  */
-+	movq	%rdi, %r9
-+	/* Align address to VEC_SIZE * 2 for loop.  */
- 	andq	$-(VEC_SIZE * 2), %rdi
-	.p2align 4
-L(first_aligned_loop):
-	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
-	   gurantee they don't store a match.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(5)
-	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(6)
-
-	VPCMP	$4, %VMM(5), %VMATCH, %k2
-	VPCMP	$4, %VMM(6), %VMATCH, %k3{%k2}
-
-	VPMIN	%VMM(5), %VMM(6), %VMM(7)
- 
-	VPTEST	%VMM(7), %VMM(7), %k1{%k3}
-	subq	$(VEC_SIZE * -2), %rdi
-	KORTEST_M %k1, %k1
-	jc	L(first_aligned_loop)
-+	.p2align 4,,11
-+L(loop):
-+	/* 2 vector loop, as it provide better performance as compared
-+	   to 4 vector loop.  */
-+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(3)
-+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(4)
-+	VPCMPEQ	%VMM(3), %VMM(0), %k1
-+	VPCMPEQ	%VMM(4), %VMM(0), %k2
-+	VPMINU	%VMM(3), %VMM(4), %VMM(5)
-+	VPTESTN	%VMM(5), %VMM(5), %k0
-+	KOR	%k1, %k2, %k3
-+	subq	$-(VEC_SIZE * 2), %rdi
-+	/* If k0 and k3 zero, match and end of string not found.  */
-+	KORTEST	%k0, %k3
-+	jz	L(loop)
-+
-+	/* If k0 is non zero, end of string found.  */
-+	KORTEST %k0, %k0
-+	jnz	L(endloop)
-+
-+	lea	VEC_SIZE(%rdi), %r8
-+	/* A match found, it need to be stored in r8 before loop
-+	   continue.  */
-+	/* Check second vector first.  */
-+	KMOV	%k2, %VRDX
-+	test	%VRDX, %VRDX
-+	jnz	L(loop_vec_x2_match)
- 
-	VPTESTN	%VMM(7), %VMM(7), %k1
- 	KMOV	%k1, %VRDX
-	test	%VRDX, %VRDX
-	jz	L(second_aligned_loop_prep)
-+	/* Match is in first vector, rdi offset need to be subtracted
-+	  by VEC_SIZE.  */
-+	sub	$VEC_SIZE, %r8
-+
-+	/* If second vector doesn't have match, first vector must
-+	   have match.  */
-+L(loop_vec_x2_match):
-+	BSR	%VRDX, %VRDX
-+# ifdef USE_AS_WCSRCHR
-+	sal	$2, %rdx
-+# endif
-+	add	%rdx, %r8
-+	jmp	L(loop)
- 
-	KORTEST_M %k3, %k3
-	jnc	L(return_first_aligned_loop)
-+L(endloop):
-+	/* Check if string end in first loop vector.  */
-+	VPTESTN	%VMM(3), %VMM(3), %k0
-+	KMOV	%k0, %VRCX
-+	test	%VRCX, %VRCX
-+	jnz	L(loop_vector_x1_end)
- 
-	.p2align 4,, 6
-L(first_vec_x1_or_x2_or_x3):
-	VPCMPEQ	%VMM(4), %VMATCH, %k4
-	KMOV	%k4, %VRAX
-+	/* Check if it has match in first loop vector.  */
-+	KMOV	%k1, %VRAX
- 	test	%VRAX, %VRAX
-	jz	L(first_vec_x1_or_x2)
-	bsr	%VRAX, %VRAX
-	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
-	ret
-+	jz	L(loop_vector_x2_end)
- 
-	.p2align 4,, 8
-L(return_first_aligned_loop):
-	VPTESTN	%VMM(5), %VMM(5), %k0
-+	BSR	%VRAX, %VRAX
-+	leaq	(%rdi, %rax, CHAR_SIZE), %r8
-+
-+	/* String must end in second loop vector.  */
-+L(loop_vector_x2_end):
-+	VPTESTN	%VMM(4), %VMM(4), %k0
- 	KMOV	%k0, %VRCX
-	blsmsk	%VRCX, %VRCX
-	jnc	L(return_first_new_match_first)
-	blsmsk	%VRDX, %VRDX
-	VPCMPEQ	%VMM(6), %VMATCH, %k0
-	KMOV	%k0, %VRAX
-	addq	$VEC_SIZE, %rdi
-	and	%VRDX, %VRAX
-	jnz	L(return_first_new_match_ret)
-	subq	$VEC_SIZE, %rdi
-L(return_first_new_match_first):
- 	KMOV	%k2, %VRAX
-# ifdef USE_AS_WCSRCHR
-	xorl	$((1 << CHAR_PER_VEC)- 1), %VRAX
-+	BLSMSK	%VRCX, %VRCX
-+	/* Check if it has match in second loop vector.  */
- 	and	%VRCX, %VRAX
-# else
-	andn	%VRCX, %VRAX, %VRAX
-# endif
-	jz	L(first_vec_x1_or_x2_or_x3)
-L(return_first_new_match_ret):
-	bsr	%VRAX, %VRAX
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
-+	jz	L(check_last_match)
- 
-	.p2align 4,, 10
-L(first_vec_x1_or_x2):
-	VPCMPEQ	%VMM(3), %VMATCH, %k3
-	KMOV	%k3, %VRAX
-	test	%VRAX, %VRAX
-	jz	L(first_vec_x0_x1_test)
-	bsr	%VRAX, %VRAX
-	leaq	(VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
-+	BSR	%VRAX, %VRAX
-+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
- 	ret
- 
-	.p2align 4
-	/* We can throw away the work done for the first 4x checks here
-	   as we have a later match. This is the 'fast' path persay.  */
-L(second_aligned_loop_prep):
-L(second_aligned_loop_set_furthest_match):
-	movq	%rdi, %rsi
-	VMOVA	%VMM(5), %VMM(7)
-	VMOVA	%VMM(6), %VMM(8)
-	.p2align 4
-L(second_aligned_loop):
-	VMOVU	(VEC_SIZE * 4)(%rdi), %VMM(5)
-	VMOVU	(VEC_SIZE * 5)(%rdi), %VMM(6)
-	VPCMP	$4, %VMM(5), %VMATCH, %k2
-	VPCMP	$4, %VMM(6), %VMATCH, %k3{%k2}
-
-	VPMIN	%VMM(5), %VMM(6), %VMM(4)
-
-	VPTEST	%VMM(4), %VMM(4), %k1{%k3}
-	subq	$(VEC_SIZE * -2), %rdi
-	KMOV	%k1, %VRCX
-	inc	%RCX_M
-	jz	L(second_aligned_loop)
-	VPTESTN	%VMM(4), %VMM(4), %k1
-	KMOV	%k1, %VRDX
-	test	%VRDX, %VRDX
-	jz	L(second_aligned_loop_set_furthest_match)
-
-	KORTEST_M %k3, %k3
-	jnc	L(return_new_match)
-	/* branch here because there is a significant advantage interms
-	   of output dependency chance in using edx.  */
-+	/* String end in first loop vector.  */
-+L(loop_vector_x1_end):
-+	KMOV	%k1, %VRAX
-+	BLSMSK	%VRCX, %VRCX
-+	/* Check if it has match in second loop vector.  */
-+	and	%VRCX, %VRAX
-+	jz	L(check_last_match)
- 
-L(return_old_match):
-	VPCMPEQ	%VMM(8), %VMATCH, %k0
-	KMOV	%k0, %VRCX
-	bsr	%VRCX, %VRCX
-	jnz	L(return_old_match_ret)
-+	BSR	%VRAX, %VRAX
-+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-+	ret
- 
-	VPCMPEQ	%VMM(7), %VMATCH, %k0
-	KMOV	%k0, %VRCX
-	bsr	%VRCX, %VRCX
-	subq	$VEC_SIZE, %rsi
-L(return_old_match_ret):
-	leaq	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
-+	/* No match in first and second loop vector.  */
-+L(check_last_match):
-+	/* Check if any match recorded in r8.  */
-+	test	%r8, %r8
-+	jz	L(vector_x2_ret)
-+	movq	%r8, %rax
- 	ret
- 
-L(return_new_match):
-	VPTESTN	%VMM(5), %VMM(5), %k0
-	KMOV	%k0, %VRCX
-	blsmsk	%VRCX, %VRCX
-	jnc	L(return_new_match_first)
-	dec	%VRDX
-	VPCMPEQ	%VMM(6), %VMATCH, %k0
-	KMOV	%k0, %VRAX
-	addq	$VEC_SIZE, %rdi
-	and	%VRDX, %VRAX
-	jnz	L(return_new_match_ret)
-	subq	$VEC_SIZE, %rdi
-L(return_new_match_first):
-+	/* No match recorded in r8. Check the second saved vector
-+	   in beginning.  */
-+L(vector_x2_ret):
-+	VPCMPEQ %VMM(2), %VMM(0), %k2
- 	KMOV	%k2, %VRAX
-# ifdef USE_AS_WCSRCHR
-	xorl	$((1 << CHAR_PER_VEC)- 1), %VRAX
-	and	%VRCX, %VRAX
-# else
-	andn	%VRCX, %VRAX, %VRAX
-# endif
-	jz	L(return_old_match)
-L(return_new_match_ret):
-	bsr	%VRAX, %VRAX
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-+	test	%VRAX, %VRAX
-+	jz	L(vector_x1_ret)
-+
-+	/* Match found in the second saved vector.  */
-+	BSR	%VRAX, %VRAX
-+	leaq	(VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
- 	ret
- 
-	.p2align 4,, 4
-L(cross_page_boundary):
-	xorq	%rdi, %rax
-	mov	$-1, %VRDX
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
-	VPTESTN	%VMM(6), %VMM(6), %k0
-	KMOV	%k0, %VRSI
-+L(page_cross):
-+	mov	%rdi, %rax
-+	movl	%edi, %ecx
- 
- # ifdef USE_AS_WCSRCHR
-	movl	%edi, %ecx
-	and	$(VEC_SIZE - 1), %ecx
-	shrl	$2, %ecx
-+	/* Calculate number of compare result bits to be skipped for
-+	   wide string alignment adjustment.  */
-+	andl	$(VEC_SIZE - 1), %ecx
-+	sarl	$2, %ecx
- # endif
-	shlx	%SHIFT_REG, %VRDX, %VRDX
-
-+	/* ecx contains number of w[char] to be skipped as a result
-+	   of address alignment.  */
-+	andq    $-VEC_SIZE, %rax
-+	VMOVA	(%rax), %VMM(1)
-+	VPTESTN	%VMM(1), %VMM(1), %k1
-+	KMOV	%k1, %VRAX
-+	SHR     %cl, %VRAX
-+	jz	L(page_cross_continue)
-+	VPCMPEQ	%VMM(1), %VMM(0), %k0
-+	KMOV	%k0, %VRDX
-+	SHR     %cl, %VRDX
-+	BLSMSK	%VRAX, %VRAX
-+	and	%VRDX, %VRAX
-+	jz	L(ret)
-+	BSR	%VRAX, %VRAX
- # ifdef USE_AS_WCSRCHR
-	kmovw	%edx, %k1
-+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
- # else
-	KMOV	%VRDX, %k1
-+	add	%rdi, %rax
- # endif
- 
-	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-	/* We could technically just jmp back after the vpcompress but
-	   it doesn't save any 16-byte blocks.  */
-	shrx	%SHIFT_REG, %VRSI, %VRSI
-	test	%VRSI, %VRSI
-	jnz	L(page_cross_return)
-	jmp	L(page_cross_continue)
-	/* 1-byte from cache line.  */
-END(STRRCHR)
-+	ret
-+END (STRRCHR)
- #endif
-diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
-index 3bf6a5101422e4d1..85e3b0119f5dc923 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
-+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
-@@ -1,8 +1,394 @@
-+/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
-+   Copyright (C) 2021-2023 Free Software Foundation, Inc.
-+   This file is part of the GNU C Library.
-+
-+   The GNU C Library is free software; you can redistribute it and/or
-+   modify it under the terms of the GNU Lesser General Public
-+   License as published by the Free Software Foundation; either
-+   version 2.1 of the License, or (at your option) any later version.
-+
-+   The GNU C Library is distributed in the hope that it will be useful,
-+   but WITHOUT ANY WARRANTY; without even the implied warranty of
-+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+   Lesser General Public License for more details.
-+
-+   You should have received a copy of the GNU Lesser General Public
-+   License along with the GNU C Library; if not, see
-+   <https://www.gnu.org/licenses/>.  */
-+
-+#include <isa-level.h>
-+
-+#if ISA_SHOULD_BUILD (4)
-+
-+# include <sysdep.h>
-+
- # ifndef STRRCHR
- #  define STRRCHR	__strrchr_evex
- # endif
- 
-#include "x86-evex256-vecs.h"
-#include "reg-macros.h"
-+# include "x86-evex256-vecs.h"
-+
-+# ifdef USE_AS_WCSRCHR
-+#  define SHIFT_REG	rsi
-+#  define kunpck_2x	kunpckbw
-+#  define kmov_2x	kmovd
-+#  define maskz_2x	ecx
-+#  define maskm_2x	eax
-+#  define CHAR_SIZE	4
-+#  define VPMIN	vpminud
-+#  define VPTESTN	vptestnmd
-+#  define VPTEST	vptestmd
-+#  define VPBROADCAST	vpbroadcastd
-+#  define VPCMPEQ	vpcmpeqd
-+#  define VPCMP	vpcmpd
-+
-+#  define USE_WIDE_CHAR
-+# else
-+#  define SHIFT_REG	rdi
-+#  define kunpck_2x	kunpckdq
-+#  define kmov_2x	kmovq
-+#  define maskz_2x	rcx
-+#  define maskm_2x	rax
-+
-+#  define CHAR_SIZE	1
-+#  define VPMIN	vpminub
-+#  define VPTESTN	vptestnmb
-+#  define VPTEST	vptestmb
-+#  define VPBROADCAST	vpbroadcastb
-+#  define VPCMPEQ	vpcmpeqb
-+#  define VPCMP	vpcmpb
-+# endif
-+
-+# include "reg-macros.h"
-+
-+# define VMATCH	VMM(0)
-+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
-+# define PAGE_SIZE	4096
-+
-+	.section SECTION(.text), "ax", @progbits
-+ENTRY_P2ALIGN(STRRCHR, 6)
-+	movl	%edi, %eax
-+	/* Broadcast CHAR to VMATCH.  */
-+	VPBROADCAST %esi, %VMATCH
-+
-+	andl	$(PAGE_SIZE - 1), %eax
-+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-+	jg	L(cross_page_boundary)
-+L(page_cross_continue):
-+	VMOVU	(%rdi), %VMM(1)
-+	/* k0 has a 1 for each zero CHAR in VEC(1).  */
-+	VPTESTN	%VMM(1), %VMM(1), %k0
-+	KMOV	%k0, %VRSI
-+	test	%VRSI, %VRSI
-+	jz	L(aligned_more)
-+	/* fallthrough: zero CHAR in first VEC.  */
-+	/* K1 has a 1 for each search CHAR match in VEC(1).  */
-+	VPCMPEQ	%VMATCH, %VMM(1), %k1
-+	KMOV	%k1, %VRAX
-+	/* Build mask up until first zero CHAR (used to mask of
-+	   potential search CHAR matches past the end of the string).
-+	 */
-+	blsmsk	%VRSI, %VRSI
-+	and	%VRSI, %VRAX
-+	jz	L(ret0)
-+	/* Get last match (the `and` removed any out of bounds matches).
-+	 */
-+	bsr	%VRAX, %VRAX
-+# ifdef USE_AS_WCSRCHR
-+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-+# else
-+	addq	%rdi, %rax
-+# endif
-+L(ret0):
-+	ret
-+
-+	/* Returns for first vec x1/x2/x3 have hard coded backward
-+	   search path for earlier matches.  */
-+	.p2align 4,, 6
-+L(first_vec_x1):
-+	VPCMPEQ	%VMATCH, %VMM(2), %k1
-+	KMOV	%k1, %VRAX
-+	blsmsk	%VRCX, %VRCX
-+	/* eax non-zero if search CHAR in range.  */
-+	and	%VRCX, %VRAX
-+	jnz	L(first_vec_x1_return)
-+
-+	/* fallthrough: no match in VEC(2) then need to check for
-+	   earlier matches (in VEC(1)).  */
-+	.p2align 4,, 4
-+L(first_vec_x0_test):
-+	VPCMPEQ	%VMATCH, %VMM(1), %k1
-+	KMOV	%k1, %VRAX
-+	test	%VRAX, %VRAX
-+	jz	L(ret1)
-+	bsr	%VRAX, %VRAX
-+# ifdef USE_AS_WCSRCHR
-+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
-+# else
-+	addq	%rsi, %rax
-+# endif
-+L(ret1):
-+	ret
-+
-+	.p2align 4,, 10
-+L(first_vec_x1_or_x2):
-+	VPCMPEQ	%VMM(3), %VMATCH, %k3
-+	VPCMPEQ	%VMM(2), %VMATCH, %k2
-+	/* K2 and K3 have 1 for any search CHAR match. Test if any
-+	   matches between either of them. Otherwise check VEC(1).  */
-+	KORTEST %k2, %k3
-+	jz	L(first_vec_x0_test)
-+
-+	/* Guaranteed that VEC(2) and VEC(3) are within range so merge
-+	   the two bitmasks then get last result.  */
-+	kunpck_2x %k2, %k3, %k3
-+	kmov_2x	%k3, %maskm_2x
-+	bsr	%maskm_2x, %maskm_2x
-+	leaq	(VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
-+	ret
-+
-+	.p2align 4,, 7
-+L(first_vec_x3):
-+	VPCMPEQ	%VMATCH, %VMM(4), %k1
-+	KMOV	%k1, %VRAX
-+	blsmsk	%VRCX, %VRCX
-+	/* If no search CHAR match in range check VEC(1)/VEC(2)/VEC(3).
-+	 */
-+	and	%VRCX, %VRAX
-+	jz	L(first_vec_x1_or_x2)
-+	bsr	%VRAX, %VRAX
-+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-+	ret
-+
-+
-+	.p2align 4,, 6
-+L(first_vec_x0_x1_test):
-+	VPCMPEQ	%VMATCH, %VMM(2), %k1
-+	KMOV	%k1, %VRAX
-+	/* Check VEC(2) for last match first. If no match try VEC(1).
-+	 */
-+	test	%VRAX, %VRAX
-+	jz	L(first_vec_x0_test)
-+	.p2align 4,, 4
-+L(first_vec_x1_return):
-+	bsr	%VRAX, %VRAX
-+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
-+	ret
-+
-+
-+	.p2align 4,, 10
-+L(first_vec_x2):
-+	VPCMPEQ	%VMATCH, %VMM(3), %k1
-+	KMOV	%k1, %VRAX
-+	blsmsk	%VRCX, %VRCX
-+	/* Check VEC(3) for last match first. If no match try
-+	   VEC(2)/VEC(1).  */
-+	and	%VRCX, %VRAX
-+	jz	L(first_vec_x0_x1_test)
-+	bsr	%VRAX, %VRAX
-+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-+	ret
-+
-+
-+	.p2align 4,, 12
-+L(aligned_more):
-+	/* Need to keep original pointer in case VEC(1) has last match.
-+	 */
-+	movq	%rdi, %rsi
-+	andq	$-VEC_SIZE, %rdi
-+
-+	VMOVU	VEC_SIZE(%rdi), %VMM(2)
-+	VPTESTN	%VMM(2), %VMM(2), %k0
-+	KMOV	%k0, %VRCX
-+
-+	test	%VRCX, %VRCX
-+	jnz	L(first_vec_x1)
-+
-+	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(3)
-+	VPTESTN	%VMM(3), %VMM(3), %k0
-+	KMOV	%k0, %VRCX
-+
-+	test	%VRCX, %VRCX
-+	jnz	L(first_vec_x2)
-+
-+	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(4)
-+	VPTESTN	%VMM(4), %VMM(4), %k0
-+	KMOV	%k0, %VRCX
-+	movq	%rdi, %r8
-+	test	%VRCX, %VRCX
-+	jnz	L(first_vec_x3)
-+
-+	andq	$-(VEC_SIZE * 2), %rdi
-+	.p2align 4,, 10
-+L(first_aligned_loop):
-+	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
-+	   guarantee they don't store a match.  */
-+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(5)
-+	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(6)
-+
-+	VPCMPEQ	%VMM(5), %VMATCH, %k2
-+	vpxord	%VMM(6), %VMATCH, %VMM(7)
-+
-+	VPMIN	%VMM(5), %VMM(6), %VMM(8)
-+	VPMIN	%VMM(8), %VMM(7), %VMM(7)
-+
-+	VPTESTN	%VMM(7), %VMM(7), %k1
-+	subq	$(VEC_SIZE * -2), %rdi
-+	KORTEST %k1, %k2
-+	jz	L(first_aligned_loop)
-+
-+	VPCMPEQ	%VMM(6), %VMATCH, %k3
-+	VPTESTN	%VMM(8), %VMM(8), %k1
-+
-+	/* If k1 is zero, then we found a CHAR match but no null-term.
-+	   We can now safely throw out VEC1-4.  */
-+	KTEST	%k1, %k1
-+	jz	L(second_aligned_loop_prep)
-+
-+	KORTEST %k2, %k3
-+	jnz	L(return_first_aligned_loop)
-+
-+
-+	.p2align 4,, 6
-+L(first_vec_x1_or_x2_or_x3):
-+	VPCMPEQ	%VMM(4), %VMATCH, %k4
-+	KMOV	%k4, %VRAX
-+	bsr	%VRAX, %VRAX
-+	jz	L(first_vec_x1_or_x2)
-+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
-+	ret
-+
-+
-+	.p2align 4,, 8
-+L(return_first_aligned_loop):
-+	VPTESTN	%VMM(5), %VMM(5), %k0
-+
-+	/* Combined results from VEC5/6.  */
-+	kunpck_2x %k0, %k1, %k0
-+	kmov_2x	%k0, %maskz_2x
-+
-+	blsmsk	%maskz_2x, %maskz_2x
-+	kunpck_2x %k2, %k3, %k3
-+	kmov_2x	%k3, %maskm_2x
-+	and	%maskz_2x, %maskm_2x
-+	jz	L(first_vec_x1_or_x2_or_x3)
-+
-+	bsr	%maskm_2x, %maskm_2x
-+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-+	ret
-+
-+	.p2align 4
-+	/* We can throw away the work done for the first 4x checks here
-+	   as we have a later match. This is the 'fast' path persay.
-+	 */
-+L(second_aligned_loop_prep):
-+L(second_aligned_loop_set_furthest_match):
-+	movq	%rdi, %rsi
-+	/* Ideally we would safe k2/k3 but `kmov/kunpck` take uops on
-+	   port0 and have noticeable overhead in the loop.  */
-+	VMOVA	%VMM(5), %VMM(7)
-+	VMOVA	%VMM(6), %VMM(8)
-+	.p2align 4
-+L(second_aligned_loop):
-+	VMOVU	(VEC_SIZE * 4)(%rdi), %VMM(5)
-+	VMOVU	(VEC_SIZE * 5)(%rdi), %VMM(6)
-+	VPCMPEQ	%VMM(5), %VMATCH, %k2
-+	vpxord	%VMM(6), %VMATCH, %VMM(3)
-+
-+	VPMIN	%VMM(5), %VMM(6), %VMM(4)
-+	VPMIN	%VMM(3), %VMM(4), %VMM(3)
-+
-+	VPTESTN	%VMM(3), %VMM(3), %k1
-+	subq	$(VEC_SIZE * -2), %rdi
-+	KORTEST %k1, %k2
-+	jz	L(second_aligned_loop)
-+	VPCMPEQ	%VMM(6), %VMATCH, %k3
-+	VPTESTN	%VMM(4), %VMM(4), %k1
-+	KTEST	%k1, %k1
-+	jz	L(second_aligned_loop_set_furthest_match)
-+
-+	/* branch here because we know we have a match in VEC7/8 but
-+	   might not in VEC5/6 so the latter is expected to be less
-+	   likely.  */
-+	KORTEST %k2, %k3
-+	jnz	L(return_new_match)
-+
-+L(return_old_match):
-+	VPCMPEQ	%VMM(8), %VMATCH, %k0
-+	KMOV	%k0, %VRCX
-+	bsr	%VRCX, %VRCX
-+	jnz	L(return_old_match_ret)
-+
-+	VPCMPEQ	%VMM(7), %VMATCH, %k0
-+	KMOV	%k0, %VRCX
-+	bsr	%VRCX, %VRCX
-+	subq	$VEC_SIZE, %rsi
-+L(return_old_match_ret):
-+	leaq	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
-+	ret
-+
-+	.p2align 4,, 10
-+L(return_new_match):
-+	VPTESTN	%VMM(5), %VMM(5), %k0
-+
-+	/* Combined results from VEC5/6.  */
-+	kunpck_2x %k0, %k1, %k0
-+	kmov_2x	%k0, %maskz_2x
-+
-+	blsmsk	%maskz_2x, %maskz_2x
-+	kunpck_2x %k2, %k3, %k3
-+	kmov_2x	%k3, %maskm_2x
-+
-+	/* Match at end was out-of-bounds so use last known match.  */
-+	and	%maskz_2x, %maskm_2x
-+	jz	L(return_old_match)
-+
-+	bsr	%maskm_2x, %maskm_2x
-+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-+	ret
-+
-+L(cross_page_boundary):
-+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
-+	   rax` sets pointer will all page offset bits cleared so
-+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
-+	   before page cross (guaranteed to be safe to read). Doing this
-+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
-+	   a bit of code size.  */
-+	xorq	%rdi, %rax
-+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
-+	VPTESTN	%VMM(1), %VMM(1), %k0
-+	KMOV	%k0, %VRCX
-+
-+	/* Shift out zero CHAR matches that are before the beginning of
-+	   src (rdi).  */
-+# ifdef USE_AS_WCSRCHR
-+	movl	%edi, %esi
-+	andl	$(VEC_SIZE - 1), %esi
-+	shrl	$2, %esi
-+# endif
-+	shrx	%VGPR(SHIFT_REG), %VRCX, %VRCX
-+
-+	test	%VRCX, %VRCX
-+	jz	L(page_cross_continue)
- 
-#include "strrchr-evex-base.S"
-+	/* Found zero CHAR so need to test for search CHAR.  */
-+	VPCMP	$0, %VMATCH, %VMM(1), %k1
-+	KMOV	%k1, %VRAX
-+	/* Shift out search CHAR matches that are before the beginning of
-+	   src (rdi).  */
-+	shrx	%VGPR(SHIFT_REG), %VRAX, %VRAX
-+
-+	/* Check if any search CHAR match in range.  */
-+	blsmsk	%VRCX, %VRCX
-+	and	%VRCX, %VRAX
-+	jz	L(ret3)
-+	bsr	%VRAX, %VRAX
-+# ifdef USE_AS_WCSRCHR
-+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-+# else
-+	addq	%rdi, %rax
-+# endif
-+L(ret3):
-+	ret
-+END(STRRCHR)
-+#endif
-diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
-index a584cd3f430ba9d5..e5c5fe3bf28a5966 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-evex.S
-+++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
-@@ -4,5 +4,4 @@
- 
- #define STRRCHR	WCSRCHR
- #define USE_AS_WCSRCHR 1
-#define USE_WIDE_CHAR 1
- #include "strrchr-evex.S"
--- a/glibc-rh2244992.patch
+++ b/glibc-rh2244992.patch
@ -1,110 +0,0 @@
-commit ffe333d46c4ef7c5221cccad2743cac3e149e615
-Author: Florian Weimer <fweimer@redhat.com>
-Date:   Tue Nov 7 11:23:15 2023 +0100
-
-    elf: Fix force_first handling in dlclose (bug 30785)
-    
-    The force_first parameter was ineffective because the dlclose'd
-    object was not necessarily the first in the maps array.  Also
-    enable force_first handling unconditionally, regardless of namespace.
-    The initial object in a namespace should be destructed first, too.
-    
-    The _dl_sort_maps_dfs function had early returns for relocation
-    dependency processing which broke force_first handling, too, and
-    this is fixed in this change as well.
-
-diff --git a/elf/dl-close.c b/elf/dl-close.c
-index 1c7a861db140259d..a97a1efa45eb7409 100644
--- a/elf/dl-close.c
-+++ b/elf/dl-close.c
-@@ -153,6 +153,16 @@ _dl_close_worker (struct link_map *map, bool force)
-     }
-   assert (idx == nloaded);
- 
-+  /* Put the dlclose'd map first, so that its destructor runs first.
-+     The map variable is NULL after a retry.  */
-+  if (map != NULL)
-+    {
-+      maps[map->l_idx] = maps[0];
-+      maps[map->l_idx]->l_idx = map->l_idx;
-+      maps[0] = map;
-+      maps[0]->l_idx = 0;
-+    }
-+
-   /* Keep track of the lowest index link map we have covered already.  */
-   int done_index = -1;
-   while (++done_index < nloaded)
-@@ -226,9 +236,10 @@ _dl_close_worker (struct link_map *map, bool force)
- 	  }
-     }
- 
-  /* Sort the entries.  We can skip looking for the binary itself which is
-     at the front of the search list for the main namespace.  */
-  _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
-+  /* Sort the entries.  Unless retrying, the maps[0] object (the
-+     original argument to dlclose) needs to remain first, so that its
-+     destructor runs first.  */
-+  _dl_sort_maps (maps, nloaded, /* force_first */ map != NULL, true);
- 
-   /* Call all termination functions at once.  */
-   bool unload_any = false;
-@@ -732,7 +743,11 @@ _dl_close_worker (struct link_map *map, bool force)
-   /* Recheck if we need to retry, release the lock.  */
-  out:
-   if (dl_close_state == rerun)
-    goto retry;
-+    {
-+      /* The map may have been deallocated.  */
-+      map = NULL;
-+      goto retry;
-+    }
- 
-   dl_close_state = not_pending;
- }
-diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
-index 5616c8a6a33ebb1e..5c846c7c6f9d88bc 100644
--- a/elf/dl-sort-maps.c
-+++ b/elf/dl-sort-maps.c
-@@ -255,13 +255,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
- 	     The below memcpy is not needed in the do_reldeps case here,
- 	     since we wrote back to maps[] during DFS traversal.  */
- 	  if (maps_head == maps)
-	    return;
-+	    break;
- 	}
-       assert (maps_head == maps);
-      return;
-     }
-
-  memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
-+  else
-+    memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
- 
-   /* Skipping the first object at maps[0] is not valid in general,
-      since traversing along object dependency-links may "find" that
-diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
-index 4bf9052db16fb352..cf6453e9eb85ac65 100644
--- a/elf/dso-sort-tests-1.def
-+++ b/elf/dso-sort-tests-1.def
-@@ -56,14 +56,16 @@ output: b>a>{}<a<b
- # relocation(dynamic) dependencies. While this is technically unspecified, the
- # presumed reasonable practical behavior is for the destructor order to respect
- # the static DT_NEEDED links (here this means the a->b->c->d order).
-# The older dynamic_sort=1 algorithm does not achieve this, while the DFS-based
-# dynamic_sort=2 algorithm does, although it is still arguable whether going
-# beyond spec to do this is the right thing to do.
-+# The older dynamic_sort=1 algorithm originally did not achieve this,
-+# but this was a bug in the way _dl_sort_maps was called from _dl_close_worker,
-+# effectively disabling proper force_first handling.
-+# The new dynamic_sort=2 algorithm shows the effect of the simpler force_first
-+# handling: the a object is simply moved to the front.
- # The below expected outputs are what the two algorithms currently produce
- # respectively, for regression testing purposes.
- tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
-output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
-output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
-+output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<b<c<d<g<f<e];}
-+output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<g<f<b<c<d<e];}
- 
- # Test that even in the presence of dependency loops involving dlopen'ed
- # object, that object is initialized last (and not unloaded prematurely).
--- a/glibc-rh2248502-3.patch
+++ b/glibc-rh2248502-3.patch
@ -1,22 +0,0 @@
-Author: Florian Weimer <fweimer@redhat.com>
-Date:   Wed Nov 15 06:31:40 2023 +0100
-
-    stdlib: Avoid another self-comparison in qsort
-    
-    In the insertion phase, we could run off the start of the array if the
-    comparison function never runs zero.  In that case, it never finds the
-    initial element that terminates the iteration.
-
-diff --git a/stdlib/qsort.c b/stdlib/qsort.c
-index ad110e8a892a66e1..1ea31ff4247da7a4 100644
--- a/stdlib/qsort.c
-+++ b/stdlib/qsort.c
-@@ -217,7 +217,7 @@ insertion_sort_qsort_partitions (void *const pbase, size_t total_elems,
-   while ((run_ptr += size) <= end_ptr)
-     {
-       tmp_ptr = run_ptr - size;
-      while (cmp (run_ptr, tmp_ptr, arg) < 0)
-+      while (tmp_ptr != base_ptr && cmp (run_ptr, tmp_ptr, arg) < 0)
-         tmp_ptr -= size;
- 
-       tmp_ptr += size;
--- a/glibc-rh2248915.patch
+++ b/glibc-rh2248915.patch
@ -1,60 +0,0 @@
-Author: Florian Weimer <fweimer@redhat.com>
-Date:   Mon Oct 23 12:53:16 2023 +0200
-
-    ldconfig: Fixes for skipping temporary files.
-    
-    Arguments to a memchr call were swapped, causing incorrect skipping
-    of files.
-    
-    Files related to dpkg have different names: they actually end in
-    .dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed.
-    
-    Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip
-    temporary files created by package managers").
-
-diff --git a/elf/ldconfig.c b/elf/ldconfig.c
-index 02387a169c902f01..bccd386761d8cbb2 100644
--- a/elf/ldconfig.c
-+++ b/elf/ldconfig.c
-@@ -661,6 +661,17 @@ struct dlib_entry
-   struct dlib_entry *next;
- };
- 
-+/* Return true if the N bytes at NAME end with with the characters in
-+   the string SUFFIX.  (NAME[N + 1] does not have to be a null byte.)
-+   Expected to be called with a string literal for SUFFIX.  */
-+static inline bool
-+endswithn (const char *name, size_t n, const char *suffix)
-+{
-+  return (n >= strlen (suffix)
-+	  && memcmp (name + n - strlen (suffix), suffix,
-+		     strlen (suffix)) == 0);
-+}
-+
- /* Skip some temporary DSO files.  These files may be partially written
-    and lead to ldconfig crashes when examined.  */
- static bool
-@@ -670,8 +681,7 @@ skip_dso_based_on_name (const char *name, size_t len)
-      names like these are never really DSOs we want to look at.  */
-   if (len >= sizeof (".#prelink#") - 1)
-     {
-      if (strcmp (name + len - sizeof (".#prelink#") + 1,
-		  ".#prelink#") == 0)
-+      if (endswithn (name, len, ".#prelink#"))
- 	return true;
-       if (len >= sizeof (".#prelink#.XXXXXX") - 1
- 	  && memcmp (name + len - sizeof (".#prelink#.XXXXXX")
-@@ -679,10 +689,11 @@ skip_dso_based_on_name (const char *name, size_t len)
- 	return true;
-     }
-   /* Skip temporary files created by RPM.  */
-  if (memchr (name, len, ';') != NULL)
-+  if (memchr (name, ';', len) != NULL)
-     return true;
-   /* Skip temporary files created by dpkg.  */
-  if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
-+  if (endswithn (name, len, ".dpkg-new")
-+      || endswithn (name, len, ".dpkg-tmp"))
-     return true;
-   return false;
- }
--- a/glibc.spec
+++ b/glibc.spec
@ -1,4 +1,4 @@
-%global glibcsrcdir glibc-2.38.9000-244-gd1dcb565a1
+%global glibcsrcdir glibc-2.38.9000-295-g5d7f1bce7d
 %global glibcversion 2.38.9000
 # Pre-release tarballs are pulled in from git using a command that is
 # effectively:
@ -159,7 +159,7 @@ Version: %{glibcversion}
 # - It allows using the Release number without the %%dist tag in the dependency
 #   generator to make the generated requires interchangeable between Rawhide
 #   and ELN (.elnYY < .fcXX).
-%global baserelease 22
+%global baserelease 23
 Release: %{baserelease}%{?dist}

 # In general, GPLv2+ is used by programs, LGPLv2+ is used for
@ -230,10 +230,7 @@ Patch9: glibc-rh827510.patch
 Patch13: glibc-fedora-localedata-rh61908.patch
 Patch17: glibc-cs-path.patch
 Patch23: glibc-python3.patch
-Patch24: glibc-rh2244688.patch
-Patch25: glibc-rh2244992.patch
-Patch26: glibc-rh2248915.patch
-Patch27: glibc-rh2248502-3.patch
+Patch24: glibc-benchtests-aarch64.patch

 ##############################################################################
 # Continued list of core "glibc" package information:
@ -2204,6 +2201,60 @@ update_gconv_modules_cache ()
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared

 %changelog
+* Wed Nov 22 2023 Florian Weimer <fweimer@redhat.com> - 2.38.9000-23
+- Apply glibc-benchtests-aarch64.patch to fix an aarch64 build failure.
+- Drop glibc-rh2244688.patch revert.  Fix applied upstream.
+- Drop glibc-rh2244992.patch, glibc-rh2248915.patch, glibc-rh2248502-3.patch.
+  All applied upstream.
+- Auto-sync with upstream branch master,
+  commit 5d7f1bce7d8eea31f4baeb68bcc3124b35acc751:
+- posix: Revert the removal of the crypt prototype from <unistd.h>
+- elf: Add comments on how LD_AUDIT and LD_PRELOAD handle __libc_enable_secure
+- elf: Ignore LD_LIBRARY_PATH and debug env var for setuid for static
+- elf: Remove any_debug from dl_main_state
+- elf: Remove LD_PROFILE for static binaries
+- elf: Ignore LD_PROFILE for setuid binaries
+- s390: Use dl-symbol-redir-ifunc.h on cpu-tunables
+- x86: Use dl-symbol-redir-ifunc.h on cpu-tunables
+- elf: Emit warning if tunable is ill-formatted
+- elf: Fix _dl_debug_vdprintf to work before self-relocation
+- elf: Do not parse ill-formatted strings
+- elf: Do not process invalid tunable format
+- elf: Add all malloc tunable to unsecvars
+- elf: Ignore GLIBC_TUNABLES for setuid/setgid binaries
+- elf: Add GLIBC_TUNABLES to unsecvars
+- elf: Remove /etc/suid-debug support
+- stdlib: The qsort implementation needs to use heapsort in more cases
+- stdlib: Handle various corner cases in the fallback heapsort for qsort
+- stdlib: Avoid another self-comparison in qsort
+- hurd: fix restarting reauth_dtable on signal
+- hurd: Prevent the final file_exec_paths call from signals
+- manual: Fix termios.c example. (Bug 31078)
+- aarch64: Add vector implementations of expm1 routines
+- linux: Use fchmodat2 on fchmod for flags different than 0 (BZ 26401)
+- intl: Add test case for bug 16621
+- resolv: free only initialized items from gai pool
+- ldconfig: Fixes for skipping temporary files.
+- nptl: Link tst-execstack-threads-mod.so with -z execstack
+- nptl: Rename tst-execstack to tst-execstack-threads
+- localedata: Convert oc_FR locale to UTF-8
+- localedata: Add information for Occitan
+- elf: Fix force_first handling in dlclose (bug 30981)
+- elf: Handle non-directory name in search path (BZ 31035)
+- New Zealand locales (en_NZ & mi_NZ) first day of week should be Monday
+- x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
+- posix: Check pidfd_spawn with tst-spawn7-pid
+- y2038: Fix support for 64-bit time on legacy ABIs
+- AArch64: Remove Falkor memcpy
+- AArch64: Add memset_zva64
+- AArch64: Cleanup emag memset
+- test: Run the tst-tls-allocation-failure-static-patched with test-wrapper.
+- aarch64: Add vector implementations of log1p routines
+- aarch64: Add vector implementations of atan2 routines
+- aarch64: Add vector implementations of atan routines
+- aarch64: Add vector implementations of acos routines
+- aarch64: Add vector implementations of asin routines
+
 * Wed Nov 15 2023 Florian Weimer <fweimer@redhat.com> - 2.38.9000-22
 - Work around another self-comparison application issue in qsort (#2248502)

--- a/2
+++ b/2
@ -1 +1 @@
-SHA512 (glibc-2.38.9000-244-gd1dcb565a1.tar.xz) = 74d23268abb91447d74906ae477f926baa2f54a95d87fc18d8972a976524987e98f6141f3767406aef6293d2ef7f7bc16cfedaacd2d33bca207253ec7073c915
+SHA512 (glibc-2.38.9000-295-g5d7f1bce7d.tar.xz) = 7e531ed030bef65198d86d9974dc280ea2897d3d89c0ceecdd2667470ce4418c42f5a1633927786c8a9f4d7d967cec2a462e4a6918dfb92fc419e5ec9ecf28e3