Auto-sync with upstream branch master

Upstream commit: 5d7f1bce7d8eea31f4baeb68bcc3124b35acc751

- Apply glibc-benchtests-aarch64.patch to fix an aarch64 build failure.
- Drop glibc-rh2244688.patch revert.  Fix applied upstream.
- Drop glibc-rh2244992.patch, glibc-rh2248915.patch, glibc-rh2248502-3.patch.
  All applied upstream.

- posix: Revert the removal of the crypt prototype from <unistd.h>
- elf: Add comments on how LD_AUDIT and LD_PRELOAD handle __libc_enable_secure
- elf: Ignore LD_LIBRARY_PATH and debug env var for setuid for static
- elf: Remove any_debug from dl_main_state
- elf: Remove LD_PROFILE for static binaries
- elf: Ignore LD_PROFILE for setuid binaries
- s390: Use dl-symbol-redir-ifunc.h on cpu-tunables
- x86: Use dl-symbol-redir-ifunc.h on cpu-tunables
- elf: Emit warning if tunable is ill-formatted
- elf: Fix _dl_debug_vdprintf to work before self-relocation
- elf: Do not parse ill-formatted strings
- elf: Do not process invalid tunable format
- elf: Add all malloc tunable to unsecvars
- elf: Ignore GLIBC_TUNABLES for setuid/setgid binaries
- elf: Add GLIBC_TUNABLES to unsecvars
- elf: Remove /etc/suid-debug support
- stdlib: The qsort implementation needs to use heapsort in more cases
- stdlib: Handle various corner cases in the fallback heapsort for qsort
- stdlib: Avoid another self-comparison in qsort
- hurd: fix restarting reauth_dtable on signal
- hurd: Prevent the final file_exec_paths call from signals
- manual: Fix termios.c example. (Bug 31078)
- aarch64: Add vector implementations of expm1 routines
- linux: Use fchmodat2 on fchmod for flags different than 0 (BZ 26401)
- intl: Add test case for bug 16621
- resolv: free only initialized items from gai pool
- ldconfig: Fixes for skipping temporary files.
- nptl: Link tst-execstack-threads-mod.so with -z execstack
- nptl: Rename tst-execstack to tst-execstack-threads
- localedata: Convert oc_FR locale to UTF-8
- localedata: Add information for Occitan
- elf: Fix force_first handling in dlclose (bug 30981)
- elf: Handle non-directory name in search path (BZ 31035)
- New Zealand locales (en_NZ & mi_NZ) first day of week should be Monday
- x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
- posix: Check pidfd_spawn with tst-spawn7-pid
- y2038: Fix support for 64-bit time on legacy ABIs
- AArch64: Remove Falkor memcpy
- AArch64: Add memset_zva64
- AArch64: Cleanup emag memset
- test: Run the tst-tls-allocation-failure-static-patched with test-wrapper.
- aarch64: Add vector implementations of log1p routines
- aarch64: Add vector implementations of atan2 routines
- aarch64: Add vector implementations of atan routines
- aarch64: Add vector implementations of acos routines
- aarch64: Add vector implementations of asin routines
Florian Weimer 2023-11-22 08:49:54 +01:00
parent 9e361d1258
commit f94f56a23b
7 changed files with 308 additions and 1166 deletions

@@ -0,0 +1,250 @@
Author: Joe Ramsay <Joe.Ramsay@arm.com>
Date: Tue Nov 21 14:39:39 2023 +0000
aarch64: Fix libmvec benchmarks
These were broken by the new atan2 functions, as they were only
set up for univariate functions. Arity is now detected from the
input file - this revealed a mistake that the double-precision
inputs were being used for both single- and double-precision
routines, which is now remedied.
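
For illustration, a minimal standalone C sketch (not part of the patch) of what the regenerated harness does for an arity-2 routine: load STRIDE elements from two flat input arrays and pass both vectors to the libmvec function, the way CALL_BENCH_FUNC_2 does below. The extern declaration of _ZGVnN2vv_atan2 and the sample inputs are assumptions for this example; it needs an aarch64 toolchain and a glibc that ships the new vector atan2 routines, linked with -lmvec -lm.

/* Sketch only: call a bivariate AdvSIMD libmvec routine the way the
   regenerated CALL_BENCH_FUNC_2 macro does, i.e. by loading STRIDE
   elements at a time from two flat input arrays.  */
#include <arm_neon.h>
#include <stdio.h>

/* Hand-written declaration for this sketch; the symbol is assumed to
   exist in libmvec (_ZGVnN2vv_atan2: AdvSIMD, 2 lanes, two vector args).  */
extern float64x2_t _ZGVnN2vv_atan2 (float64x2_t, float64x2_t);

#define STRIDE 2	/* Two doubles per 128-bit Advanced SIMD vector.  */

static const double arg0[] = { 0.5, 1.0, -0.25, 3.0 };
static const double arg1[] = { 1.0, 2.0,  0.75, 4.0 };

int
main (void)
{
  for (int i = 0; i < 4 / STRIDE; i++)
    {
      float64x2_t x = vld1q_f64 (&arg0[i * STRIDE]);
      float64x2_t y = vld1q_f64 (&arg1[i * STRIDE]);
      float64x2_t r = _ZGVnN2vv_atan2 (x, y);
      printf ("atan2 (%g, %g) = %g\n", arg0[i * STRIDE], arg1[i * STRIDE],
	      vgetq_lane_f64 (r, 0));
    }
  return 0;
}
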
diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
index 3e124c781065fea9..3661a24044cc9770 100644
--- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
+++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
@@ -22,40 +22,49 @@ TEMPLATE = """
#include <math.h>
#include <arm_neon.h>
-#define STRIDE {stride}
+#define STRIDE {rowlen}
-#define CALL_BENCH_FUNC(v, i) (__extension__ ({{ \\
- {rtype} mx0 = {fname}(vld1q_f{prec_short} (variants[v].in[i].arg0)); \\
+#define CALL_BENCH_FUNC_1(v, i) (__extension__ ({{ \\
+ {rtype} mx0 = {fname}(vld1q_f{prec_short} (&variants[v].in->arg0[i * STRIDE])); \\
mx0; }}))
-struct args
+#define CALL_BENCH_FUNC_2(v, i) (__extension__ ({{ \\
+ {rtype} mx0 = {fname}(vld1q_f{prec_short} (&variants[v].in->arg0[i * STRIDE]), \\
+ vld1q_f{prec_short} (&variants[v].in->arg1[i * STRIDE])); \\
+ mx0; }}))
+
+struct args_1
+{{
+ {stype} arg0[{nelems}];
+}};
+
+struct args_2
{{
- {stype} arg0[STRIDE];
- double timing;
+ {stype} arg0[{nelems}];
+ {stype} arg1[{nelems}];
}};
struct _variants
{{
const char *name;
- int count;
- const struct args *in;
+ const struct args_{arity} *in;
}};
-static const struct args in0[{rowcount}] = {{
+static const struct args_{arity} in0 = {{
{in_data}
}};
static const struct _variants variants[1] = {{
- {{"", {rowcount}, in0}},
+ {{"", &in0}},
}};
#define NUM_VARIANTS 1
-#define NUM_SAMPLES(i) (variants[i].count)
+#define NUM_SAMPLES(i) ({nelems} / STRIDE)
#define VARIANT(i) (variants[i].name)
static {rtype} volatile ret;
-#define BENCH_FUNC(i, j) ({{ ret = CALL_BENCH_FUNC(i, j); }})
+#define BENCH_FUNC(i, j) ({{ ret = CALL_BENCH_FUNC_{arity}(i, j); }})
#define FUNCNAME "{fname}"
#include <bench-libmvec-skeleton.c>
"""
@@ -63,27 +72,34 @@ static {rtype} volatile ret;
def main(name):
_, prec, _, func = name.split("-")
scalar_to_advsimd_type = {"double": "float64x2_t", "float": "float32x4_t"}
-
- stride = {"double": 2, "float": 4}[prec]
+ rowlen = {"double": 2, "float": 4}[prec]
rtype = scalar_to_advsimd_type[prec]
atype = scalar_to_advsimd_type[prec]
- fname = f"_ZGVnN{stride}v_{func}{'f' if prec == 'float' else ''}"
prec_short = {"double": 64, "float": 32}[prec]
-
- with open(f"../benchtests/libmvec/{func}-inputs") as f:
- in_vals = [l.strip() for l in f.readlines() if l and not l.startswith("#")]
- in_vals = [in_vals[i:i+stride] for i in range(0, len(in_vals), stride)]
- rowcount= len(in_vals)
- in_data = ",\n".join("{{" + ", ".join(row) + "}, 0}" for row in in_vals)
-
- print(TEMPLATE.format(stride=stride,
+ input_filename = {"double": f"{func}-inputs", "float": f"{func}f-inputs"}[prec]
+
+ with open(f"../benchtests/libmvec/{input_filename}") as f:
+ input_file = f.readlines()
+ in_vals = (l.strip() for l in input_file if l and not l.startswith("#"))
+ # Split in case of multivariate signature
+ in_vals = (l.split(", ") for l in in_vals)
+ # Transpose
+ in_vals = list(zip(*in_vals))
+ in_data = ",\n".join("{" + (", ".join(val for val in col) + "}")
+ for col in in_vals)
+
+ arity = [l for l in input_file if l.startswith("## args: ")][0].count(prec)
+ fname = f"_ZGVnN{rowlen}{'v' * arity}_{func}{'f' if prec == 'float' else ''}"
+
+ print(TEMPLATE.format(rowlen=rowlen,
rtype=rtype,
atype=atype,
fname=fname,
prec_short=prec_short,
in_data=in_data,
- rowcount=rowcount,
- stype=prec))
+ stype=prec,
+ arity=arity,
+ nelems=len(in_vals[0])))
if __name__ == "__main__":
diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
index 66f2c8e0f465f9ce..5d9332be9c5a536a 100755
--- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
+++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
@@ -22,46 +22,55 @@ TEMPLATE = """
#include <math.h>
#include <arm_sve.h>
-#define MAX_STRIDE {max_stride}
#define STRIDE {stride}
#define PTRUE svptrue_b{prec_short}
#define SV_LOAD svld1_f{prec_short}
#define SV_STORE svst1_f{prec_short}
#define REQUIRE_SVE
-#define CALL_BENCH_FUNC(v, i) (__extension__ ({{ \\
- {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), variants[v].in[i].arg0), PTRUE()); \\
+#define CALL_BENCH_FUNC_1(v, i) (__extension__ ({{ \\
+ {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), &variants[v].in->arg0[i * STRIDE]), PTRUE()); \\
mx0; }}))
-struct args
+#define CALL_BENCH_FUNC_2(v, i) (__extension__ ({{ \\
+ {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), &variants[v].in->arg0[i * STRIDE]), \\
+ SV_LOAD (PTRUE(), &variants[v].in->arg1[i * STRIDE]), \\
+ PTRUE()); \\
+ mx0; }}))
+
+struct args_1
{{
- {stype} arg0[MAX_STRIDE];
- double timing;
+ {stype} arg0[{nelems}];
+}};
+
+struct args_2
+{{
+ {stype} arg0[{nelems}];
+ {stype} arg1[{nelems}];
}};
struct _variants
{{
const char *name;
- int count;
- const struct args *in;
+ const struct args_{arity} *in;
}};
-static const struct args in0[{rowcount}] = {{
+static const struct args_{arity} in0 = {{
{in_data}
}};
static const struct _variants variants[1] = {{
- {{"", {rowcount}, in0}},
+ {{"", &in0}},
}};
#define NUM_VARIANTS 1
-#define NUM_SAMPLES(i) (variants[i].count)
+#define NUM_SAMPLES(i) ({nelems} / STRIDE)
#define VARIANT(i) (variants[i].name)
// Cannot pass volatile pointer to svst1. This still does not appear to get optimised out.
-static {stype} /*volatile*/ ret[MAX_STRIDE];
+static {stype} /*volatile*/ ret[{rowlen}];
-#define BENCH_FUNC(i, j) ({{ SV_STORE(PTRUE(), ret, CALL_BENCH_FUNC(i, j)); }})
+#define BENCH_FUNC(i, j) ({{ SV_STORE(PTRUE(), ret, CALL_BENCH_FUNC_{arity}(i, j)); }})
#define FUNCNAME "{fname}"
#include <bench-libmvec-skeleton.c>
"""
@@ -69,23 +78,29 @@ static {stype} /*volatile*/ ret[MAX_STRIDE];
def main(name):
_, prec, _, func = name.split("-")
scalar_to_sve_type = {"double": "svfloat64_t", "float": "svfloat32_t"}
-
stride = {"double": "svcntd()", "float": "svcntw()"}[prec]
rtype = scalar_to_sve_type[prec]
atype = scalar_to_sve_type[prec]
- fname = f"_ZGVsMxv_{func}{'f' if prec == 'float' else ''}"
prec_short = {"double": 64, "float": 32}[prec]
# Max SVE vector length is 2048 bits. To ensure benchmarks are
# vector-length-agnostic, but still use as wide vectors as
# possible on any given target, divide input data into 2048-bit
# rows, then load/store as many elements as the target will allow.
- max_stride = 2048 // prec_short
-
- with open(f"../benchtests/libmvec/{func}-inputs") as f:
- in_vals = [l.strip() for l in f.readlines() if l and not l.startswith("#")]
- in_vals = [in_vals[i:i+max_stride] for i in range(0, len(in_vals), max_stride)]
- rowcount= len(in_vals)
- in_data = ",\n".join("{{" + ", ".join(row) + "}, 0}" for row in in_vals)
+ rowlen = {"double": 32, "float": 64}[prec]
+ input_filename = {"double": f"{func}-inputs", "float": f"{func}f-inputs"}[prec]
+
+ with open(f"../benchtests/libmvec/{input_filename}") as f:
+ input_file = f.readlines()
+ in_vals = (l.strip() for l in input_file if l and not l.startswith("#"))
+ # Split in case of multivariate signature
+ in_vals = (l.split(", ") for l in in_vals)
+ # Transpose
+ in_vals = list(zip(*in_vals))
+ in_data = ",\n".join("{" + (", ".join(val for val in col) + "}")
+ for col in in_vals)
+
+ arity = [l for l in input_file if l.startswith("## args: ")][0].count(prec)
+ fname = f"_ZGVsMx{'v' * arity}_{func}{'f' if prec == 'float' else ''}"
print(TEMPLATE.format(stride=stride,
rtype=rtype,
@@ -93,9 +108,10 @@ def main(name):
fname=fname,
prec_short=prec_short,
in_data=in_data,
- rowcount=rowcount,
stype=prec,
- max_stride=max_stride))
+ rowlen=rowlen,
+ arity=arity,
+ nelems=len(in_vals[0])))
if __name__ == "__main__":

@@ -1,967 +0,0 @@
Author: Florian Weimer <fweimer@redhat.com>
Date: Wed Oct 18 11:12:29 2023 +0200
Revert "x86: Prepare `strrchr-evex` and `strrchr-evex512` for AVX10"
This reverts commit a3c50bf46a1ca6d9d2b7d879176d345abf95a9de.
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
index cd6a0a870a02b9bd..58b2853ab69265e8 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -1,4 +1,4 @@
-/* Implementation for strrchr using evex256 and evex512.
+/* Placeholder function, not used by any processor at the moment.
Copyright (C) 2022-2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,6 +16,8 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+/* UNUSED. Exists purely as reference implementation. */
+
#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)
@@ -23,351 +25,240 @@
# include <sysdep.h>
# ifdef USE_AS_WCSRCHR
-# if VEC_SIZE == 64
-# define RCX_M cx
-# define KORTEST_M kortestw
-# else
-# define RCX_M cl
-# define KORTEST_M kortestb
-# endif
-
-# define SHIFT_REG VRCX
# define CHAR_SIZE 4
-# define VPCMP vpcmpd
-# define VPMIN vpminud
-# define VPCOMPRESS vpcompressd
-# define VPTESTN vptestnmd
-# define VPTEST vptestmd
-# define VPBROADCAST vpbroadcastd
+# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
-
+# define VPMINU vpminud
+# define VPTESTN vptestnmd
# else
-# define SHIFT_REG VRDI
# define CHAR_SIZE 1
-# define VPCMP vpcmpb
-# define VPMIN vpminub
-# define VPCOMPRESS vpcompressb
-# define VPTESTN vptestnmb
-# define VPTEST vptestmb
-# define VPBROADCAST vpbroadcastb
+# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
-
-# define RCX_M VRCX
-# define KORTEST_M KORTEST
+# define VPMINU vpminub
+# define VPTESTN vptestnmb
# endif
-# define VMATCH VMM(0)
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text), "ax", @progbits
- /* Aligning entry point to 64 byte, provides better performance for
- one vector length string. */
-ENTRY_P2ALIGN(STRRCHR, 6)
- movl %edi, %eax
- /* Broadcast CHAR to VMATCH. */
- VPBROADCAST %esi, %VMATCH
+/* Aligning entry point to 64 byte, provides better performance for
+ one vector length string. */
+ENTRY_P2ALIGN (STRRCHR, 6)
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jg L(cross_page_boundary)
+ /* Broadcast CHAR to VMM(0). */
+ VPBROADCAST %esi, %VMM(0)
+ movl %edi, %eax
+ sall $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(page_cross)
+L(page_cross_continue):
+ /* Compare [w]char for null, mask bit will be set for match. */
VMOVU (%rdi), %VMM(1)
- /* k0 has a 1 for each zero CHAR in YMM1. */
- VPTESTN %VMM(1), %VMM(1), %k0
- KMOV %k0, %VGPR(rsi)
- test %VGPR(rsi), %VGPR(rsi)
- jz L(aligned_more)
- /* fallthrough: zero CHAR in first VEC. */
-L(page_cross_return):
- /* K1 has a 1 for each search CHAR match in VEC(1). */
- VPCMPEQ %VMATCH, %VMM(1), %k1
- KMOV %k1, %VGPR(rax)
- /* Build mask up until first zero CHAR (used to mask of
- potential search CHAR matches past the end of the string). */
- blsmsk %VGPR(rsi), %VGPR(rsi)
- /* Use `and` here to remove any out of bounds matches so we can
- do a reverse scan on `rax` to find the last match. */
- and %VGPR(rsi), %VGPR(rax)
- jz L(ret0)
- /* Get last match. */
- bsr %VGPR(rax), %VGPR(rax)
+
+ VPTESTN %VMM(1), %VMM(1), %k1
+ KMOV %k1, %VRCX
+ test %VRCX, %VRCX
+ jz L(align_more)
+
+ VPCMPEQ %VMM(1), %VMM(0), %k0
+ KMOV %k0, %VRAX
+ BLSMSK %VRCX, %VRCX
+ and %VRCX, %VRAX
+ jz L(ret)
+
+ BSR %VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- addq %rdi, %rax
+ add %rdi, %rax
# endif
-L(ret0):
+L(ret):
ret
- /* Returns for first vec x1/x2/x3 have hard coded backward
- search path for earlier matches. */
- .p2align 4,, 6
-L(first_vec_x1):
- VPCMPEQ %VMATCH, %VMM(2), %k1
- KMOV %k1, %VGPR(rax)
- blsmsk %VGPR(rcx), %VGPR(rcx)
- /* eax non-zero if search CHAR in range. */
- and %VGPR(rcx), %VGPR(rax)
- jnz L(first_vec_x1_return)
-
- /* fallthrough: no match in YMM2 then need to check for earlier
- matches (in YMM1). */
- .p2align 4,, 4
-L(first_vec_x0_test):
- VPCMPEQ %VMATCH, %VMM(1), %k1
- KMOV %k1, %VGPR(rax)
- test %VGPR(rax), %VGPR(rax)
- jz L(ret1)
- bsr %VGPR(rax), %VGPR(rax)
+L(vector_x2_end):
+ VPCMPEQ %VMM(2), %VMM(0), %k2
+ KMOV %k2, %VRAX
+ BLSMSK %VRCX, %VRCX
+ and %VRCX, %VRAX
+ jz L(vector_x1_ret)
+
+ BSR %VRAX, %VRAX
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ /* Check the first vector at very last to look for match. */
+L(vector_x1_ret):
+ VPCMPEQ %VMM(1), %VMM(0), %k2
+ KMOV %k2, %VRAX
+ test %VRAX, %VRAX
+ jz L(ret)
+
+ BSR %VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
leaq (%rsi, %rax, CHAR_SIZE), %rax
# else
- addq %rsi, %rax
+ add %rsi, %rax
# endif
-L(ret1):
ret
- .p2align 4,, 10
-L(first_vec_x3):
- VPCMPEQ %VMATCH, %VMM(4), %k1
- KMOV %k1, %VGPR(rax)
- blsmsk %VGPR(rcx), %VGPR(rcx)
- /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
- and %VGPR(rcx), %VGPR(rax)
- jz L(first_vec_x1_or_x2)
- bsr %VGPR(rax), %VGPR(rax)
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
- ret
- .p2align 4,, 4
-
-L(first_vec_x2):
- VPCMPEQ %VMATCH, %VMM(3), %k1
- KMOV %k1, %VGPR(rax)
- blsmsk %VGPR(rcx), %VGPR(rcx)
- /* Check YMM3 for last match first. If no match try YMM2/YMM1. */
- and %VGPR(rcx), %VGPR(rax)
- jz L(first_vec_x0_x1_test)
- bsr %VGPR(rax), %VGPR(rax)
- leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
- ret
-
- .p2align 4,, 6
-L(first_vec_x0_x1_test):
- VPCMPEQ %VMATCH, %VMM(2), %k1
- KMOV %k1, %VGPR(rax)
- /* Check YMM2 for last match first. If no match try YMM1. */
- test %VGPR(rax), %VGPR(rax)
- jz L(first_vec_x0_test)
- .p2align 4,, 4
-L(first_vec_x1_return):
- bsr %VGPR(rax), %VGPR(rax)
- leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
- ret
-
- .p2align 4,, 12
-L(aligned_more):
-L(page_cross_continue):
- /* Need to keep original pointer incase VEC(1) has last match. */
+L(align_more):
+ /* Zero r8 to store match result. */
+ xorl %r8d, %r8d
+ /* Save pointer of first vector, in case if no match found. */
movq %rdi, %rsi
+ /* Align pointer to vector size. */
andq $-VEC_SIZE, %rdi
-
- VMOVU VEC_SIZE(%rdi), %VMM(2)
+ /* Loop unroll for 2 vector loop. */
+ VMOVA (VEC_SIZE)(%rdi), %VMM(2)
VPTESTN %VMM(2), %VMM(2), %k0
KMOV %k0, %VRCX
- movq %rdi, %r8
test %VRCX, %VRCX
- jnz L(first_vec_x1)
-
- VMOVU (VEC_SIZE * 2)(%rdi), %VMM(3)
- VPTESTN %VMM(3), %VMM(3), %k0
- KMOV %k0, %VRCX
-
- test %VRCX, %VRCX
- jnz L(first_vec_x2)
-
- VMOVU (VEC_SIZE * 3)(%rdi), %VMM(4)
- VPTESTN %VMM(4), %VMM(4), %k0
- KMOV %k0, %VRCX
-
- /* Intentionally use 64-bit here. EVEX256 version needs 1-byte
- padding for efficient nop before loop alignment. */
- test %rcx, %rcx
- jnz L(first_vec_x3)
+ jnz L(vector_x2_end)
+ /* Save pointer of second vector, in case if no match
+ found. */
+ movq %rdi, %r9
+ /* Align address to VEC_SIZE * 2 for loop. */
andq $-(VEC_SIZE * 2), %rdi
- .p2align 4
-L(first_aligned_loop):
- /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
- gurantee they don't store a match. */
- VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5)
- VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6)
-
- VPCMP $4, %VMM(5), %VMATCH, %k2
- VPCMP $4, %VMM(6), %VMATCH, %k3{%k2}
-
- VPMIN %VMM(5), %VMM(6), %VMM(7)
- VPTEST %VMM(7), %VMM(7), %k1{%k3}
- subq $(VEC_SIZE * -2), %rdi
- KORTEST_M %k1, %k1
- jc L(first_aligned_loop)
+ .p2align 4,,11
+L(loop):
+ /* 2 vector loop, as it provide better performance as compared
+ to 4 vector loop. */
+ VMOVA (VEC_SIZE * 2)(%rdi), %VMM(3)
+ VMOVA (VEC_SIZE * 3)(%rdi), %VMM(4)
+ VPCMPEQ %VMM(3), %VMM(0), %k1
+ VPCMPEQ %VMM(4), %VMM(0), %k2
+ VPMINU %VMM(3), %VMM(4), %VMM(5)
+ VPTESTN %VMM(5), %VMM(5), %k0
+ KOR %k1, %k2, %k3
+ subq $-(VEC_SIZE * 2), %rdi
+ /* If k0 and k3 zero, match and end of string not found. */
+ KORTEST %k0, %k3
+ jz L(loop)
+
+ /* If k0 is non zero, end of string found. */
+ KORTEST %k0, %k0
+ jnz L(endloop)
+
+ lea VEC_SIZE(%rdi), %r8
+ /* A match found, it need to be stored in r8 before loop
+ continue. */
+ /* Check second vector first. */
+ KMOV %k2, %VRDX
+ test %VRDX, %VRDX
+ jnz L(loop_vec_x2_match)
- VPTESTN %VMM(7), %VMM(7), %k1
KMOV %k1, %VRDX
- test %VRDX, %VRDX
- jz L(second_aligned_loop_prep)
+ /* Match is in first vector, rdi offset need to be subtracted
+ by VEC_SIZE. */
+ sub $VEC_SIZE, %r8
+
+ /* If second vector doesn't have match, first vector must
+ have match. */
+L(loop_vec_x2_match):
+ BSR %VRDX, %VRDX
+# ifdef USE_AS_WCSRCHR
+ sal $2, %rdx
+# endif
+ add %rdx, %r8
+ jmp L(loop)
- KORTEST_M %k3, %k3
- jnc L(return_first_aligned_loop)
+L(endloop):
+ /* Check if string end in first loop vector. */
+ VPTESTN %VMM(3), %VMM(3), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(loop_vector_x1_end)
- .p2align 4,, 6
-L(first_vec_x1_or_x2_or_x3):
- VPCMPEQ %VMM(4), %VMATCH, %k4
- KMOV %k4, %VRAX
+ /* Check if it has match in first loop vector. */
+ KMOV %k1, %VRAX
test %VRAX, %VRAX
- jz L(first_vec_x1_or_x2)
- bsr %VRAX, %VRAX
- leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
- ret
+ jz L(loop_vector_x2_end)
- .p2align 4,, 8
-L(return_first_aligned_loop):
- VPTESTN %VMM(5), %VMM(5), %k0
+ BSR %VRAX, %VRAX
+ leaq (%rdi, %rax, CHAR_SIZE), %r8
+
+ /* String must end in second loop vector. */
+L(loop_vector_x2_end):
+ VPTESTN %VMM(4), %VMM(4), %k0
KMOV %k0, %VRCX
- blsmsk %VRCX, %VRCX
- jnc L(return_first_new_match_first)
- blsmsk %VRDX, %VRDX
- VPCMPEQ %VMM(6), %VMATCH, %k0
- KMOV %k0, %VRAX
- addq $VEC_SIZE, %rdi
- and %VRDX, %VRAX
- jnz L(return_first_new_match_ret)
- subq $VEC_SIZE, %rdi
-L(return_first_new_match_first):
KMOV %k2, %VRAX
-# ifdef USE_AS_WCSRCHR
- xorl $((1 << CHAR_PER_VEC)- 1), %VRAX
+ BLSMSK %VRCX, %VRCX
+ /* Check if it has match in second loop vector. */
and %VRCX, %VRAX
-# else
- andn %VRCX, %VRAX, %VRAX
-# endif
- jz L(first_vec_x1_or_x2_or_x3)
-L(return_first_new_match_ret):
- bsr %VRAX, %VRAX
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
- ret
+ jz L(check_last_match)
- .p2align 4,, 10
-L(first_vec_x1_or_x2):
- VPCMPEQ %VMM(3), %VMATCH, %k3
- KMOV %k3, %VRAX
- test %VRAX, %VRAX
- jz L(first_vec_x0_x1_test)
- bsr %VRAX, %VRAX
- leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
+ BSR %VRAX, %VRAX
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
- /* We can throw away the work done for the first 4x checks here
- as we have a later match. This is the 'fast' path persay. */
-L(second_aligned_loop_prep):
-L(second_aligned_loop_set_furthest_match):
- movq %rdi, %rsi
- VMOVA %VMM(5), %VMM(7)
- VMOVA %VMM(6), %VMM(8)
- .p2align 4
-L(second_aligned_loop):
- VMOVU (VEC_SIZE * 4)(%rdi), %VMM(5)
- VMOVU (VEC_SIZE * 5)(%rdi), %VMM(6)
- VPCMP $4, %VMM(5), %VMATCH, %k2
- VPCMP $4, %VMM(6), %VMATCH, %k3{%k2}
-
- VPMIN %VMM(5), %VMM(6), %VMM(4)
-
- VPTEST %VMM(4), %VMM(4), %k1{%k3}
- subq $(VEC_SIZE * -2), %rdi
- KMOV %k1, %VRCX
- inc %RCX_M
- jz L(second_aligned_loop)
- VPTESTN %VMM(4), %VMM(4), %k1
- KMOV %k1, %VRDX
- test %VRDX, %VRDX
- jz L(second_aligned_loop_set_furthest_match)
-
- KORTEST_M %k3, %k3
- jnc L(return_new_match)
- /* branch here because there is a significant advantage interms
- of output dependency chance in using edx. */
+ /* String end in first loop vector. */
+L(loop_vector_x1_end):
+ KMOV %k1, %VRAX
+ BLSMSK %VRCX, %VRCX
+ /* Check if it has match in second loop vector. */
+ and %VRCX, %VRAX
+ jz L(check_last_match)
-L(return_old_match):
- VPCMPEQ %VMM(8), %VMATCH, %k0
- KMOV %k0, %VRCX
- bsr %VRCX, %VRCX
- jnz L(return_old_match_ret)
+ BSR %VRAX, %VRAX
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VPCMPEQ %VMM(7), %VMATCH, %k0
- KMOV %k0, %VRCX
- bsr %VRCX, %VRCX
- subq $VEC_SIZE, %rsi
-L(return_old_match_ret):
- leaq (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
+ /* No match in first and second loop vector. */
+L(check_last_match):
+ /* Check if any match recorded in r8. */
+ test %r8, %r8
+ jz L(vector_x2_ret)
+ movq %r8, %rax
ret
-L(return_new_match):
- VPTESTN %VMM(5), %VMM(5), %k0
- KMOV %k0, %VRCX
- blsmsk %VRCX, %VRCX
- jnc L(return_new_match_first)
- dec %VRDX
- VPCMPEQ %VMM(6), %VMATCH, %k0
- KMOV %k0, %VRAX
- addq $VEC_SIZE, %rdi
- and %VRDX, %VRAX
- jnz L(return_new_match_ret)
- subq $VEC_SIZE, %rdi
-L(return_new_match_first):
+ /* No match recorded in r8. Check the second saved vector
+ in beginning. */
+L(vector_x2_ret):
+ VPCMPEQ %VMM(2), %VMM(0), %k2
KMOV %k2, %VRAX
-# ifdef USE_AS_WCSRCHR
- xorl $((1 << CHAR_PER_VEC)- 1), %VRAX
- and %VRCX, %VRAX
-# else
- andn %VRCX, %VRAX, %VRAX
-# endif
- jz L(return_old_match)
-L(return_new_match_ret):
- bsr %VRAX, %VRAX
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ test %VRAX, %VRAX
+ jz L(vector_x1_ret)
+
+ /* Match found in the second saved vector. */
+ BSR %VRAX, %VRAX
+ leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
ret
- .p2align 4,, 4
-L(cross_page_boundary):
- xorq %rdi, %rax
- mov $-1, %VRDX
- VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
- VPTESTN %VMM(6), %VMM(6), %k0
- KMOV %k0, %VRSI
+L(page_cross):
+ mov %rdi, %rax
+ movl %edi, %ecx
# ifdef USE_AS_WCSRCHR
- movl %edi, %ecx
- and $(VEC_SIZE - 1), %ecx
- shrl $2, %ecx
+ /* Calculate number of compare result bits to be skipped for
+ wide string alignment adjustment. */
+ andl $(VEC_SIZE - 1), %ecx
+ sarl $2, %ecx
# endif
- shlx %SHIFT_REG, %VRDX, %VRDX
-
+ /* ecx contains number of w[char] to be skipped as a result
+ of address alignment. */
+ andq $-VEC_SIZE, %rax
+ VMOVA (%rax), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k1
+ KMOV %k1, %VRAX
+ SHR %cl, %VRAX
+ jz L(page_cross_continue)
+ VPCMPEQ %VMM(1), %VMM(0), %k0
+ KMOV %k0, %VRDX
+ SHR %cl, %VRDX
+ BLSMSK %VRAX, %VRAX
+ and %VRDX, %VRAX
+ jz L(ret)
+ BSR %VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
- kmovw %edx, %k1
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- KMOV %VRDX, %k1
+ add %rdi, %rax
# endif
- VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
- /* We could technically just jmp back after the vpcompress but
- it doesn't save any 16-byte blocks. */
- shrx %SHIFT_REG, %VRSI, %VRSI
- test %VRSI, %VRSI
- jnz L(page_cross_return)
- jmp L(page_cross_continue)
- /* 1-byte from cache line. */
-END(STRRCHR)
+ ret
+END (STRRCHR)
#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index 3bf6a5101422e4d1..85e3b0119f5dc923 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -1,8 +1,394 @@
+/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
# ifndef STRRCHR
# define STRRCHR __strrchr_evex
# endif
-#include "x86-evex256-vecs.h"
-#include "reg-macros.h"
+# include "x86-evex256-vecs.h"
+
+# ifdef USE_AS_WCSRCHR
+# define SHIFT_REG rsi
+# define kunpck_2x kunpckbw
+# define kmov_2x kmovd
+# define maskz_2x ecx
+# define maskm_2x eax
+# define CHAR_SIZE 4
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
+# define VPBROADCAST vpbroadcastd
+# define VPCMPEQ vpcmpeqd
+# define VPCMP vpcmpd
+
+# define USE_WIDE_CHAR
+# else
+# define SHIFT_REG rdi
+# define kunpck_2x kunpckdq
+# define kmov_2x kmovq
+# define maskz_2x rcx
+# define maskm_2x rax
+
+# define CHAR_SIZE 1
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
+# define VPBROADCAST vpbroadcastb
+# define VPCMPEQ vpcmpeqb
+# define VPCMP vpcmpb
+# endif
+
+# include "reg-macros.h"
+
+# define VMATCH VMM(0)
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRRCHR, 6)
+ movl %edi, %eax
+ /* Broadcast CHAR to VMATCH. */
+ VPBROADCAST %esi, %VMATCH
+
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(cross_page_boundary)
+L(page_cross_continue):
+ VMOVU (%rdi), %VMM(1)
+ /* k0 has a 1 for each zero CHAR in VEC(1). */
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRSI
+ test %VRSI, %VRSI
+ jz L(aligned_more)
+ /* fallthrough: zero CHAR in first VEC. */
+ /* K1 has a 1 for each search CHAR match in VEC(1). */
+ VPCMPEQ %VMATCH, %VMM(1), %k1
+ KMOV %k1, %VRAX
+ /* Build mask up until first zero CHAR (used to mask of
+ potential search CHAR matches past the end of the string).
+ */
+ blsmsk %VRSI, %VRSI
+ and %VRSI, %VRAX
+ jz L(ret0)
+ /* Get last match (the `and` removed any out of bounds matches).
+ */
+ bsr %VRAX, %VRAX
+# ifdef USE_AS_WCSRCHR
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
+# endif
+L(ret0):
+ ret
+
+ /* Returns for first vec x1/x2/x3 have hard coded backward
+ search path for earlier matches. */
+ .p2align 4,, 6
+L(first_vec_x1):
+ VPCMPEQ %VMATCH, %VMM(2), %k1
+ KMOV %k1, %VRAX
+ blsmsk %VRCX, %VRCX
+ /* eax non-zero if search CHAR in range. */
+ and %VRCX, %VRAX
+ jnz L(first_vec_x1_return)
+
+ /* fallthrough: no match in VEC(2) then need to check for
+ earlier matches (in VEC(1)). */
+ .p2align 4,, 4
+L(first_vec_x0_test):
+ VPCMPEQ %VMATCH, %VMM(1), %k1
+ KMOV %k1, %VRAX
+ test %VRAX, %VRAX
+ jz L(ret1)
+ bsr %VRAX, %VRAX
+# ifdef USE_AS_WCSRCHR
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rsi, %rax
+# endif
+L(ret1):
+ ret
+
+ .p2align 4,, 10
+L(first_vec_x1_or_x2):
+ VPCMPEQ %VMM(3), %VMATCH, %k3
+ VPCMPEQ %VMM(2), %VMATCH, %k2
+ /* K2 and K3 have 1 for any search CHAR match. Test if any
+ matches between either of them. Otherwise check VEC(1). */
+ KORTEST %k2, %k3
+ jz L(first_vec_x0_test)
+
+ /* Guaranteed that VEC(2) and VEC(3) are within range so merge
+ the two bitmasks then get last result. */
+ kunpck_2x %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4,, 7
+L(first_vec_x3):
+ VPCMPEQ %VMATCH, %VMM(4), %k1
+ KMOV %k1, %VRAX
+ blsmsk %VRCX, %VRCX
+ /* If no search CHAR match in range check VEC(1)/VEC(2)/VEC(3).
+ */
+ and %VRCX, %VRAX
+ jz L(first_vec_x1_or_x2)
+ bsr %VRAX, %VRAX
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+
+ .p2align 4,, 6
+L(first_vec_x0_x1_test):
+ VPCMPEQ %VMATCH, %VMM(2), %k1
+ KMOV %k1, %VRAX
+ /* Check VEC(2) for last match first. If no match try VEC(1).
+ */
+ test %VRAX, %VRAX
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsr %VRAX, %VRAX
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMPEQ %VMATCH, %VMM(3), %k1
+ KMOV %k1, %VRAX
+ blsmsk %VRCX, %VRCX
+ /* Check VEC(3) for last match first. If no match try
+ VEC(2)/VEC(1). */
+ and %VRCX, %VRAX
+ jz L(first_vec_x0_x1_test)
+ bsr %VRAX, %VRAX
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+
+ .p2align 4,, 12
+L(aligned_more):
+ /* Need to keep original pointer in case VEC(1) has last match.
+ */
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+
+ VMOVU VEC_SIZE(%rdi), %VMM(2)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+
+ test %VRCX, %VRCX
+ jnz L(first_vec_x1)
+
+ VMOVU (VEC_SIZE * 2)(%rdi), %VMM(3)
+ VPTESTN %VMM(3), %VMM(3), %k0
+ KMOV %k0, %VRCX
+
+ test %VRCX, %VRCX
+ jnz L(first_vec_x2)
+
+ VMOVU (VEC_SIZE * 3)(%rdi), %VMM(4)
+ VPTESTN %VMM(4), %VMM(4), %k0
+ KMOV %k0, %VRCX
+ movq %rdi, %r8
+ test %VRCX, %VRCX
+ jnz L(first_vec_x3)
+
+ andq $-(VEC_SIZE * 2), %rdi
+ .p2align 4,, 10
+L(first_aligned_loop):
+ /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
+ guarantee they don't store a match. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5)
+ VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6)
+
+ VPCMPEQ %VMM(5), %VMATCH, %k2
+ vpxord %VMM(6), %VMATCH, %VMM(7)
+
+ VPMIN %VMM(5), %VMM(6), %VMM(8)
+ VPMIN %VMM(8), %VMM(7), %VMM(7)
+
+ VPTESTN %VMM(7), %VMM(7), %k1
+ subq $(VEC_SIZE * -2), %rdi
+ KORTEST %k1, %k2
+ jz L(first_aligned_loop)
+
+ VPCMPEQ %VMM(6), %VMATCH, %k3
+ VPTESTN %VMM(8), %VMM(8), %k1
+
+ /* If k1 is zero, then we found a CHAR match but no null-term.
+ We can now safely throw out VEC1-4. */
+ KTEST %k1, %k1
+ jz L(second_aligned_loop_prep)
+
+ KORTEST %k2, %k3
+ jnz L(return_first_aligned_loop)
+
+
+ .p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+ VPCMPEQ %VMM(4), %VMATCH, %k4
+ KMOV %k4, %VRAX
+ bsr %VRAX, %VRAX
+ jz L(first_vec_x1_or_x2)
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
+ ret
+
+
+ .p2align 4,, 8
+L(return_first_aligned_loop):
+ VPTESTN %VMM(5), %VMM(5), %k0
+
+ /* Combined results from VEC5/6. */
+ kunpck_2x %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck_2x %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(first_vec_x1_or_x2_or_x3)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+ /* We can throw away the work done for the first 4x checks here
+ as we have a later match. This is the 'fast' path persay.
+ */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
+ movq %rdi, %rsi
+ /* Ideally we would safe k2/k3 but `kmov/kunpck` take uops on
+ port0 and have noticeable overhead in the loop. */
+ VMOVA %VMM(5), %VMM(7)
+ VMOVA %VMM(6), %VMM(8)
+ .p2align 4
+L(second_aligned_loop):
+ VMOVU (VEC_SIZE * 4)(%rdi), %VMM(5)
+ VMOVU (VEC_SIZE * 5)(%rdi), %VMM(6)
+ VPCMPEQ %VMM(5), %VMATCH, %k2
+ vpxord %VMM(6), %VMATCH, %VMM(3)
+
+ VPMIN %VMM(5), %VMM(6), %VMM(4)
+ VPMIN %VMM(3), %VMM(4), %VMM(3)
+
+ VPTESTN %VMM(3), %VMM(3), %k1
+ subq $(VEC_SIZE * -2), %rdi
+ KORTEST %k1, %k2
+ jz L(second_aligned_loop)
+ VPCMPEQ %VMM(6), %VMATCH, %k3
+ VPTESTN %VMM(4), %VMM(4), %k1
+ KTEST %k1, %k1
+ jz L(second_aligned_loop_set_furthest_match)
+
+ /* branch here because we know we have a match in VEC7/8 but
+ might not in VEC5/6 so the latter is expected to be less
+ likely. */
+ KORTEST %k2, %k3
+ jnz L(return_new_match)
+
+L(return_old_match):
+ VPCMPEQ %VMM(8), %VMATCH, %k0
+ KMOV %k0, %VRCX
+ bsr %VRCX, %VRCX
+ jnz L(return_old_match_ret)
+
+ VPCMPEQ %VMM(7), %VMATCH, %k0
+ KMOV %k0, %VRCX
+ bsr %VRCX, %VRCX
+ subq $VEC_SIZE, %rsi
+L(return_old_match_ret):
+ leaq (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4,, 10
+L(return_new_match):
+ VPTESTN %VMM(5), %VMM(5), %k0
+
+ /* Combined results from VEC5/6. */
+ kunpck_2x %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck_2x %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+
+ /* Match at end was out-of-bounds so use last known match. */
+ and %maskz_2x, %maskm_2x
+ jz L(return_old_match)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(cross_page_boundary):
+ /* eax contains all the page offset bits of src (rdi). `xor rdi,
+ rax` sets pointer will all page offset bits cleared so
+ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+ before page cross (guaranteed to be safe to read). Doing this
+ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+ a bit of code size. */
+ xorq %rdi, %rax
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRCX
+
+ /* Shift out zero CHAR matches that are before the beginning of
+ src (rdi). */
+# ifdef USE_AS_WCSRCHR
+ movl %edi, %esi
+ andl $(VEC_SIZE - 1), %esi
+ shrl $2, %esi
+# endif
+ shrx %VGPR(SHIFT_REG), %VRCX, %VRCX
+
+ test %VRCX, %VRCX
+ jz L(page_cross_continue)
-#include "strrchr-evex-base.S"
+ /* Found zero CHAR so need to test for search CHAR. */
+ VPCMP $0, %VMATCH, %VMM(1), %k1
+ KMOV %k1, %VRAX
+ /* Shift out search CHAR matches that are before the beginning of
+ src (rdi). */
+ shrx %VGPR(SHIFT_REG), %VRAX, %VRAX
+
+ /* Check if any search CHAR match in range. */
+ blsmsk %VRCX, %VRCX
+ and %VRCX, %VRAX
+ jz L(ret3)
+ bsr %VRAX, %VRAX
+# ifdef USE_AS_WCSRCHR
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
+# endif
+L(ret3):
+ ret
+END(STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
index a584cd3f430ba9d5..e5c5fe3bf28a5966 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
@@ -4,5 +4,4 @@
#define STRRCHR WCSRCHR
#define USE_AS_WCSRCHR 1
-#define USE_WIDE_CHAR 1
#include "strrchr-evex.S"

@@ -1,110 +0,0 @@
commit ffe333d46c4ef7c5221cccad2743cac3e149e615
Author: Florian Weimer <fweimer@redhat.com>
Date: Tue Nov 7 11:23:15 2023 +0100
elf: Fix force_first handling in dlclose (bug 30785)
The force_first parameter was ineffective because the dlclose'd
object was not necessarily the first in the maps array. Also
enable force_first handling unconditionally, regardless of namespace.
The initial object in a namespace should be destructed first, too.
The _dl_sort_maps_dfs function had early returns for relocation
dependency processing which broke force_first handling, too, and
this is fixed in this change as well.
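
As a toy illustration (assumed types and names, not glibc's), the swap introduced in the first dl-close.c hunk below can be restated on a plain array of structs: the dlclose'd entry moves to slot 0 so its destructor runs first, and each entry's stored index stays equal to its array position, mirroring the l_idx bookkeeping.

/* Toy restatement of the maps[] swap; the struct and names are
   illustrative only, not glibc's real link_map machinery.  */
#include <assert.h>
#include <stdio.h>

struct toy_map
{
  const char *name;
  int idx;		/* Plays the role of link_map::l_idx.  */
};

static void
move_to_front (struct toy_map **maps, struct toy_map *map)
{
  /* Same shape as the dl-close.c change: swap with slot 0 and fix
     both indices so maps[i]->idx == i still holds.  */
  maps[map->idx] = maps[0];
  maps[map->idx]->idx = map->idx;
  maps[0] = map;
  maps[0]->idx = 0;
}

int
main (void)
{
  struct toy_map a = { "a", 0 }, b = { "b", 1 }, c = { "c", 2 };
  struct toy_map *maps[] = { &a, &b, &c };

  move_to_front (maps, &c);	/* Pretend dlclose ("c") was called.  */

  for (int i = 0; i < 3; i++)
    {
      assert (maps[i]->idx == i);
      printf ("%d: %s\n", i, maps[i]->name);
    }
  return 0;
}
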
diff --git a/elf/dl-close.c b/elf/dl-close.c
index 1c7a861db140259d..a97a1efa45eb7409 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -153,6 +153,16 @@ _dl_close_worker (struct link_map *map, bool force)
}
assert (idx == nloaded);
+ /* Put the dlclose'd map first, so that its destructor runs first.
+ The map variable is NULL after a retry. */
+ if (map != NULL)
+ {
+ maps[map->l_idx] = maps[0];
+ maps[map->l_idx]->l_idx = map->l_idx;
+ maps[0] = map;
+ maps[0]->l_idx = 0;
+ }
+
/* Keep track of the lowest index link map we have covered already. */
int done_index = -1;
while (++done_index < nloaded)
@@ -226,9 +236,10 @@ _dl_close_worker (struct link_map *map, bool force)
}
}
- /* Sort the entries. We can skip looking for the binary itself which is
- at the front of the search list for the main namespace. */
- _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
+ /* Sort the entries. Unless retrying, the maps[0] object (the
+ original argument to dlclose) needs to remain first, so that its
+ destructor runs first. */
+ _dl_sort_maps (maps, nloaded, /* force_first */ map != NULL, true);
/* Call all termination functions at once. */
bool unload_any = false;
@@ -732,7 +743,11 @@ _dl_close_worker (struct link_map *map, bool force)
/* Recheck if we need to retry, release the lock. */
out:
if (dl_close_state == rerun)
- goto retry;
+ {
+ /* The map may have been deallocated. */
+ map = NULL;
+ goto retry;
+ }
dl_close_state = not_pending;
}
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
index 5616c8a6a33ebb1e..5c846c7c6f9d88bc 100644
--- a/elf/dl-sort-maps.c
+++ b/elf/dl-sort-maps.c
@@ -255,13 +255,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
The below memcpy is not needed in the do_reldeps case here,
since we wrote back to maps[] during DFS traversal. */
if (maps_head == maps)
- return;
+ break;
}
assert (maps_head == maps);
- return;
}
-
- memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
+ else
+ memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
/* Skipping the first object at maps[0] is not valid in general,
since traversing along object dependency-links may "find" that
diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
index 4bf9052db16fb352..cf6453e9eb85ac65 100644
--- a/elf/dso-sort-tests-1.def
+++ b/elf/dso-sort-tests-1.def
@@ -56,14 +56,16 @@ output: b>a>{}<a<b
# relocation(dynamic) dependencies. While this is technically unspecified, the
# presumed reasonable practical behavior is for the destructor order to respect
# the static DT_NEEDED links (here this means the a->b->c->d order).
-# The older dynamic_sort=1 algorithm does not achieve this, while the DFS-based
-# dynamic_sort=2 algorithm does, although it is still arguable whether going
-# beyond spec to do this is the right thing to do.
+# The older dynamic_sort=1 algorithm originally did not achieve this,
+# but this was a bug in the way _dl_sort_maps was called from _dl_close_worker,
+# effectively disabling proper force_first handling.
+# The new dynamic_sort=2 algorithm shows the effect of the simpler force_first
+# handling: the a object is simply moved to the front.
# The below expected outputs are what the two algorithms currently produce
# respectively, for regression testing purposes.
tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
-output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
-output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
+output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<b<c<d<g<f<e];}
+output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<g<f<b<c<d<e];}
# Test that even in the presence of dependency loops involving dlopen'ed
# object, that object is initialized last (and not unloaded prematurely).

@@ -1,22 +0,0 @@
Author: Florian Weimer <fweimer@redhat.com>
Date: Wed Nov 15 06:31:40 2023 +0100
stdlib: Avoid another self-comparison in qsort
In the insertion phase, we could run off the start of the array if the
comparison function never returns zero. In that case, it never finds the
initial element that terminates the iteration.
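
A minimal sketch of the failure mode, using a simplified int-only insertion sort rather than glibc's qsort: a comparator that never returns zero claims an element is smaller than itself, so without a lower bound the scan walks off the front of the array. The explicit bound below plays the role of the tmp_ptr != base_ptr check added in the hunk that follows.

#include <stdio.h>

/* Broken by design: violates cmp (x, x) == 0.  */
static int
broken_cmp (int a, int b)
{
  return a <= b ? -1 : 1;
}

static void
insertion_sort (int *base, int n)
{
  for (int i = 1; i < n; i++)
    {
      int v = base[i];
      int j = i - 1;
      /* Without the "j >= 0" bound, broken_cmp (v, base[j]) < 0 can hold
	 for every element and j runs past the start of the array.  */
      while (j >= 0 && broken_cmp (v, base[j]) < 0)
	{
	  base[j + 1] = base[j];
	  j--;
	}
      base[j + 1] = v;
    }
}

int
main (void)
{
  int a[] = { 3, 1, 2, 2 };
  insertion_sort (a, 4);
  for (int i = 0; i < 4; i++)
    printf ("%d ", a[i]);
  puts ("");
  return 0;
}
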
diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index ad110e8a892a66e1..1ea31ff4247da7a4 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -217,7 +217,7 @@ insertion_sort_qsort_partitions (void *const pbase, size_t total_elems,
while ((run_ptr += size) <= end_ptr)
{
tmp_ptr = run_ptr - size;
- while (cmp (run_ptr, tmp_ptr, arg) < 0)
+ while (tmp_ptr != base_ptr && cmp (run_ptr, tmp_ptr, arg) < 0)
tmp_ptr -= size;
tmp_ptr += size;

@@ -1,60 +0,0 @@
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Oct 23 12:53:16 2023 +0200
ldconfig: Fixes for skipping temporary files.
Arguments to a memchr call were swapped, causing incorrect skipping
of files.
Files related to dpkg have different names: they actually end in
.dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed.
Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip
temporary files created by package managers").
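
Both fixes can be sketched in a standalone program; skip_temporary here is a simplified stand-in for ldconfig's skip_dso_based_on_name, and endswithn mirrors the helper added in the diff below.

/* Standalone illustration of the two fixes: memchr's signature is
   memchr (s, c, n), so the RPM check must pass the ';' character second
   and the length third, and dpkg temporaries are matched by their real
   suffixes rather than ".tmp".  */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool
endswithn (const char *name, size_t n, const char *suffix)
{
  return (n >= strlen (suffix)
	  && memcmp (name + n - strlen (suffix), suffix,
		     strlen (suffix)) == 0);
}

static bool
skip_temporary (const char *name)
{
  size_t len = strlen (name);
  /* RPM temporaries contain ';'.  Note the argument order: character
     second, length third (the swapped call compiled, but searched for a
     bogus character over a bogus length).  */
  if (memchr (name, ';', len) != NULL)
    return true;
  /* dpkg temporaries end in .dpkg-new or .dpkg-tmp.  */
  return (endswithn (name, len, ".dpkg-new")
	  || endswithn (name, len, ".dpkg-tmp"));
}

int
main (void)
{
  const char *names[] =
    { "libfoo.so.1", "libfoo.so.1;65e0a3f1", "libbar.so.2.dpkg-new" };
  for (int i = 0; i < 3; i++)
    printf ("%-28s %s\n", names[i],
	    skip_temporary (names[i]) ? "skip" : "keep");
  return 0;
}
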
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
index 02387a169c902f01..bccd386761d8cbb2 100644
--- a/elf/ldconfig.c
+++ b/elf/ldconfig.c
@@ -661,6 +661,17 @@ struct dlib_entry
struct dlib_entry *next;
};
+/* Return true if the N bytes at NAME end with with the characters in
+ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.)
+ Expected to be called with a string literal for SUFFIX. */
+static inline bool
+endswithn (const char *name, size_t n, const char *suffix)
+{
+ return (n >= strlen (suffix)
+ && memcmp (name + n - strlen (suffix), suffix,
+ strlen (suffix)) == 0);
+}
+
/* Skip some temporary DSO files. These files may be partially written
and lead to ldconfig crashes when examined. */
static bool
@@ -670,8 +681,7 @@ skip_dso_based_on_name (const char *name, size_t len)
names like these are never really DSOs we want to look at. */
if (len >= sizeof (".#prelink#") - 1)
{
- if (strcmp (name + len - sizeof (".#prelink#") + 1,
- ".#prelink#") == 0)
+ if (endswithn (name, len, ".#prelink#"))
return true;
if (len >= sizeof (".#prelink#.XXXXXX") - 1
&& memcmp (name + len - sizeof (".#prelink#.XXXXXX")
@@ -679,10 +689,11 @@ skip_dso_based_on_name (const char *name, size_t len)
return true;
}
/* Skip temporary files created by RPM. */
- if (memchr (name, len, ';') != NULL)
+ if (memchr (name, ';', len) != NULL)
return true;
/* Skip temporary files created by dpkg. */
- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
+ if (endswithn (name, len, ".dpkg-new")
+ || endswithn (name, len, ".dpkg-tmp"))
return true;
return false;
}

@@ -1,4 +1,4 @@
-%global glibcsrcdir glibc-2.38.9000-244-gd1dcb565a1
+%global glibcsrcdir glibc-2.38.9000-295-g5d7f1bce7d
%global glibcversion 2.38.9000
# Pre-release tarballs are pulled in from git using a command that is
# effectively:
@@ -159,7 +159,7 @@ Version: %{glibcversion}
# - It allows using the Release number without the %%dist tag in the dependency
# generator to make the generated requires interchangeable between Rawhide
# and ELN (.elnYY < .fcXX).
-%global baserelease 22
+%global baserelease 23
Release: %{baserelease}%{?dist}
# In general, GPLv2+ is used by programs, LGPLv2+ is used for
@@ -230,10 +230,7 @@ Patch9: glibc-rh827510.patch
Patch13: glibc-fedora-localedata-rh61908.patch
Patch17: glibc-cs-path.patch
Patch23: glibc-python3.patch
-Patch24: glibc-rh2244688.patch
+Patch24: glibc-benchtests-aarch64.patch
-Patch25: glibc-rh2244992.patch
-Patch26: glibc-rh2248915.patch
-Patch27: glibc-rh2248502-3.patch
##############################################################################
# Continued list of core "glibc" package information:
@@ -2204,6 +2201,60 @@ update_gconv_modules_cache ()
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
%changelog
* Wed Nov 22 2023 Florian Weimer <fweimer@redhat.com> - 2.38.9000-23
- Apply glibc-benchtests-aarch64.patch to fix an aarch64 build failure.
- Drop glibc-rh2244688.patch revert. Fix applied upstream.
- Drop glibc-rh2244992.patch, glibc-rh2248915.patch, glibc-rh2248502-3.patch.
All applied upstream.
- Auto-sync with upstream branch master,
commit 5d7f1bce7d8eea31f4baeb68bcc3124b35acc751:
- posix: Revert the removal of the crypt prototype from <unistd.h>
- elf: Add comments on how LD_AUDIT and LD_PRELOAD handle __libc_enable_secure
- elf: Ignore LD_LIBRARY_PATH and debug env var for setuid for static
- elf: Remove any_debug from dl_main_state
- elf: Remove LD_PROFILE for static binaries
- elf: Ignore LD_PROFILE for setuid binaries
- s390: Use dl-symbol-redir-ifunc.h on cpu-tunables
- x86: Use dl-symbol-redir-ifunc.h on cpu-tunables
- elf: Emit warning if tunable is ill-formatted
- elf: Fix _dl_debug_vdprintf to work before self-relocation
- elf: Do not parse ill-formatted strings
- elf: Do not process invalid tunable format
- elf: Add all malloc tunable to unsecvars
- elf: Ignore GLIBC_TUNABLES for setuid/setgid binaries
- elf: Add GLIBC_TUNABLES to unsecvars
- elf: Remove /etc/suid-debug support
- stdlib: The qsort implementation needs to use heapsort in more cases
- stdlib: Handle various corner cases in the fallback heapsort for qsort
- stdlib: Avoid another self-comparison in qsort
- hurd: fix restarting reauth_dtable on signal
- hurd: Prevent the final file_exec_paths call from signals
- manual: Fix termios.c example. (Bug 31078)
- aarch64: Add vector implementations of expm1 routines
- linux: Use fchmodat2 on fchmod for flags different than 0 (BZ 26401)
- intl: Add test case for bug 16621
- resolv: free only initialized items from gai pool
- ldconfig: Fixes for skipping temporary files.
- nptl: Link tst-execstack-threads-mod.so with -z execstack
- nptl: Rename tst-execstack to tst-execstack-threads
- localedata: Convert oc_FR locale to UTF-8
- localedata: Add information for Occitan
- elf: Fix force_first handling in dlclose (bug 30981)
- elf: Handle non-directory name in search path (BZ 31035)
- New Zealand locales (en_NZ & mi_NZ) first day of week should be Monday
- x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
- posix: Check pidfd_spawn with tst-spawn7-pid
- y2038: Fix support for 64-bit time on legacy ABIs
- AArch64: Remove Falkor memcpy
- AArch64: Add memset_zva64
- AArch64: Cleanup emag memset
- test: Run the tst-tls-allocation-failure-static-patched with test-wrapper.
- aarch64: Add vector implementations of log1p routines
- aarch64: Add vector implementations of atan2 routines
- aarch64: Add vector implementations of atan routines
- aarch64: Add vector implementations of acos routines
- aarch64: Add vector implementations of asin routines
* Wed Nov 15 2023 Florian Weimer <fweimer@redhat.com> - 2.38.9000-22
- Work around another self-comparison application issue in qsort (#2248502)

@@ -1 +1 @@
-SHA512 (glibc-2.38.9000-244-gd1dcb565a1.tar.xz) = 74d23268abb91447d74906ae477f926baa2f54a95d87fc18d8972a976524987e98f6141f3767406aef6293d2ef7f7bc16cfedaacd2d33bca207253ec7073c915
+SHA512 (glibc-2.38.9000-295-g5d7f1bce7d.tar.xz) = 7e531ed030bef65198d86d9974dc280ea2897d3d89c0ceecdd2667470ce4418c42f5a1633927786c8a9f4d7d967cec2a462e4a6918dfb92fc419e5ec9ecf28e3