Import Intel hyperscale improvements (RHEL-15696)

Resolves: RHEL-15696 Includes two additional (well, 1.5) upstream patches to resolve roundeven redirects.
2023-12-14 17:33:45 -05:00 · 2023-12-14 17:33:45 -05:00 · 02cfe04e36
commit 02cfe04e36
parent b76a8ebe01
111 changed files with 41462 additions and 1 deletions
--- a/glibc-RHEL-15696-1.patch
+++ b/glibc-RHEL-15696-1.patch
@ -0,0 +1,259 @@
+From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:23:59 -0800
+Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memchr/wmemchr for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/memchr.S: Use RDX_LP for length.  Clear the
+	upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
+	tst-size_t-wmemchr.
+	* sysdeps/x86_64/x32/test-size_t.h: New file.
+	* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
+	* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
+---
+ sysdeps/x86_64/memchr.S                 | 10 ++--
+ sysdeps/x86_64/multiarch/memchr-avx2.S  |  8 ++-
+ sysdeps/x86_64/x32/Makefile             |  8 +++
+ sysdeps/x86_64/x32/test-size_t.h        | 35 ++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-memchr.c  | 72 +++++++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
+ 6 files changed, 148 insertions(+), 5 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/test-size_t.h
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+	NEWS
+	(removed)
+
+diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
+index feef5d4f..cb320257 100644
+--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
+@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
+ 	mov	%edi, %ecx
+ 
+ #ifdef USE_AS_WMEMCHR
+-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+ #else
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+ 	punpcklbw %xmm1, %xmm1
+-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+ 	punpcklbw %xmm1, %xmm1
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index 5f5e7725..c81da19b 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -40,16 +40,20 @@
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+-	testq	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jz	L(null)
+ # endif
+ 	movl	%edi, %ecx
+ 	/* Broadcast CHAR to YMM0.  */
+ 	vmovd	%esi, %xmm0
+ # ifdef USE_AS_WMEMCHR
+-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+ 	vpbroadcastd %xmm0, %ymm0
+ # else
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#  endif
+ 	vpbroadcastb %xmm0, %ymm0
+ # endif
+ 	/* Check if we may cross page boundary with one vector load.  */
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index f2ebc24f..7d528889 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
+ # 64-bit llround.  Add -fno-builtin-lround to silence the compiler.
+ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+
+ifeq ($(subdir),string)
+tests += tst-size_t-memchr
+endif
+
+ifeq ($(subdir),wcsmbs)
+tests += tst-size_t-wmemchr
+endif
+diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
+new file mode 100644
+index 00000000..78a94086
+--- /dev/null
+++ b/sysdeps/x86_64/x32/test-size_t.h
+@@ -0,0 +1,35 @@
+/* Test string/memory functions with size_t in the lower 32 bits of
+   64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#include <string/test-string.h>
+
+/* On x32, parameter_t may be passed in a 64-bit register with the LEN
+   field in the lower 32 bits.  When the LEN field of 64-bit register
+   is passed to string/memory function as the size_t parameter, only
+   the lower 32 bits can be used.  */
+typedef struct
+{
+  union
+    {
+      size_t len;
+      void (*fn) (void);
+    };
+  void *p;
+} parameter_t;
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
+new file mode 100644
+index 00000000..29a3daf1
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
+@@ -0,0 +1,72 @@
+/* Test memchr with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef WIDE
+# define TEST_NAME "memchr"
+#else
+# define TEST_NAME "wmemchr"
+#endif /* WIDE */
+#include "test-size_t.h"
+
+#ifndef WIDE
+# define MEMCHR memchr
+# define CHAR char
+# define UCHAR unsigned char
+#else
+# include <wchar.h>
+# define MEMCHR wmemchr
+# define CHAR wchar_t
+# define UCHAR wchar_t
+#endif /* WIDE */
+
+IMPL (MEMCHR, 1)
+
+typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
+
+static CHAR *
+__attribute__ ((noinline, noclone))
+do_memchr (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      CHAR *res = do_memchr (src, c);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %p != NULL",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+new file mode 100644
+index 00000000..877801d6
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+@@ -0,0 +1,20 @@
+/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-memchr.c"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-10.patch
+++ b/glibc-RHEL-15696-10.patch
@ -0,0 +1,41 @@
+From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 9 Jan 2022 16:02:21 -0600
+Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
+Content-type: text/plain; charset=UTF-8
+
+Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
+__wcscmp_avx2. For x86_64 this covers the entire address range so any
+length larger could not possibly be used to bound `s1` or `s2`.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 156c1949..8fb8eedc 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -83,6 +83,16 @@ ENTRY (STRCMP)
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
+#  ifndef __ILP32__
+	movq	%rdx, %rcx
+	/* Check if length could overflow when multiplied by
+	   sizeof(wchar_t). Checking top 8 bits will cover all potential
+	   overflow cases as well as redirect cases where it is impossible for
+	   length to bound a valid memory region. In these cases just use
+	   'wcscmp'.  */
+	shrq	$56, %rcx
+	jnz	__wcscmp_avx2
+#  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+ #  endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-100.patch
+++ b/glibc-RHEL-15696-100.patch
@ -0,0 +1,257 @@
+From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 25 Mar 2022 17:13:33 -0500
+Subject: [PATCH] x86: Small improvements for wcslen
+Content-type: text/plain; charset=UTF-8
+
+Just a few QOL changes.
+    1. Prefer `add` over `lea` as it has more execution units it can run
+       on.
+    2. Don't break macro-fusion between `test` and `jcc`
+    3. Reduce code size by removing gratuitous padding bytes (-90
+       bytes).
+
+geometric_mean(N=20) of all benchmarks New / Original: 0.959
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
+ 1 file changed, 41 insertions(+), 45 deletions(-)
+
+diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
+index 9f5f7232..254bb030 100644
+--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
+@@ -41,82 +41,82 @@ ENTRY (__wcslen)
+ 	pxor	%xmm0, %xmm0
+ 
+ 	lea	32(%rdi), %rax
+-	lea	16(%rdi), %rcx
+	addq	$16, %rdi
+ 	and	$-16, %rax
+ 
+ 	pcmpeqd	(%rax), %xmm0
+ 	pmovmskb %xmm0, %edx
+ 	pxor	%xmm1, %xmm1
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm1
+ 	pmovmskb %xmm1, %edx
+ 	pxor	%xmm2, %xmm2
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm2
+ 	pmovmskb %xmm2, %edx
+ 	pxor	%xmm3, %xmm3
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm3
+ 	pmovmskb %xmm3, %edx
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm0
+ 	pmovmskb %xmm0, %edx
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm1
+ 	pmovmskb %xmm1, %edx
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm2
+ 	pmovmskb %xmm2, %edx
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm3
+ 	pmovmskb %xmm3, %edx
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm0
+ 	pmovmskb %xmm0, %edx
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm1
+ 	pmovmskb %xmm1, %edx
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm2
+ 	pmovmskb %xmm2, %edx
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm3
+ 	pmovmskb %xmm3, %edx
+	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	and	$-0x40, %rax
+@@ -133,104 +133,100 @@ L(aligned_64_loop):
+ 	pminub	%xmm0, %xmm2
+ 	pcmpeqd	%xmm3, %xmm2
+ 	pmovmskb %xmm2, %edx
+	addq	$64, %rax
+ 	test	%edx, %edx
+-	lea	64(%rax), %rax
+ 	jz	L(aligned_64_loop)
+ 
+ 	pcmpeqd	-64(%rax), %xmm3
+ 	pmovmskb %xmm3, %edx
+    addq	$48, %rdi
+ 	test	%edx, %edx
+-	lea	48(%rcx), %rcx
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	%xmm1, %xmm3
+ 	pmovmskb %xmm3, %edx
+    addq	$-16, %rdi
+ 	test	%edx, %edx
+-	lea	-16(%rcx), %rcx
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	-32(%rax), %xmm3
+ 	pmovmskb %xmm3, %edx
+    addq	$-16, %rdi
+ 	test	%edx, %edx
+-	lea	-16(%rcx), %rcx
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	%xmm6, %xmm3
+ 	pmovmskb %xmm3, %edx
+    addq	$-16, %rdi
+ 	test	%edx, %edx
+-	lea	-16(%rcx), %rcx
+-	jnz	L(exit)
+-
+-	jmp	L(aligned_64_loop)
+	jz	L(aligned_64_loop)
+ 
+ 	.p2align 4
+ L(exit):
+-	sub	%rcx, %rax
+	sub	%rdi, %rax
+ 	shr	$2, %rax
+ 	test	%dl, %dl
+ 	jz	L(exit_high)
+ 
+-	mov	%dl, %cl
+-	and	$15, %cl
+	andl	$15, %edx
+ 	jz	L(exit_1)
+ 	ret
+ 
+-	.p2align 4
+	/* No align here. Naturally aligned % 16 == 1.  */
+ L(exit_high):
+-	mov	%dh, %ch
+-	and	$15, %ch
+	andl	$(15 << 8), %edx
+ 	jz	L(exit_3)
+ 	add	$2, %rax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+ L(exit_1):
+ 	add	$1, %rax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+ L(exit_3):
+ 	add	$3, %rax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+ L(exit_tail0):
+-	xor	%rax, %rax
+	xorl	%eax, %eax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+ L(exit_tail1):
+-	mov	$1, %rax
+	movl	$1, %eax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+ L(exit_tail2):
+-	mov	$2, %rax
+	movl	$2, %eax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+ L(exit_tail3):
+-	mov	$3, %rax
+	movl	$3, %eax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+ L(exit_tail4):
+-	mov	$4, %rax
+	movl	$4, %eax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+ L(exit_tail5):
+-	mov	$5, %rax
+	movl	$5, %eax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+ L(exit_tail6):
+-	mov	$6, %rax
+	movl	$6, %eax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 3
+ L(exit_tail7):
+-	mov	$7, %rax
+	movl	$7, %eax
+ 	ret
+ 
+ END (__wcslen)
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-101.patch
+++ b/glibc-RHEL-15696-101.patch
@ -0,0 +1,964 @@
+From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 15 Apr 2022 12:28:00 -0500
+Subject: [PATCH] x86: Remove memcmp-sse4.S
+Content-type: text/plain; charset=UTF-8
+
+Code didn't actually use any sse4 instructions since `ptest` was
+removed in:
+
+commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Wed Nov 10 16:18:56 2021 -0600
+
+    x86: Shrink memcmp-sse4.S code size
+
+The new memcmp-sse2 implementation is also faster.
+
+geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
+
+Note there are two regressions preferring SSE2 for Size = 1 and Size =
+65.
+
+Size = 1:
+size, align0, align1, ret, New Time/Old Time
+   1,      1,      1,   0,               1.2
+   1,      1,      1,   1,             1.197
+   1,      1,      1,  -1,               1.2
+
+This is intentional. Size == 1 is significantly less hot based on
+profiles of GCC11 and Python3 than sizes [4, 8] (which is made
+hotter).
+
+Python3 Size = 1        -> 13.64%
+Python3 Size = [4, 8]   -> 60.92%
+
+GCC11   Size = 1        ->  1.29%
+GCC11   Size = [4, 8]   -> 33.86%
+
+size, align0, align1, ret, New Time/Old Time
+   4,      4,      4,   0,             0.622
+   4,      4,      4,   1,             0.797
+   4,      4,      4,  -1,             0.805
+   5,      5,      5,   0,             0.623
+   5,      5,      5,   1,             0.777
+   5,      5,      5,  -1,             0.802
+   6,      6,      6,   0,             0.625
+   6,      6,      6,   1,             0.813
+   6,      6,      6,  -1,             0.788
+   7,      7,      7,   0,             0.625
+   7,      7,      7,   1,             0.799
+   7,      7,      7,  -1,             0.795
+   8,      8,      8,   0,             0.625
+   8,      8,      8,   1,             0.848
+   8,      8,      8,  -1,             0.914
+   9,      9,      9,   0,             0.625
+
+Size = 65:
+size, align0, align1, ret, New Time/Old Time
+  65,      0,      0,   0,             1.103
+  65,      0,      0,   1,             1.216
+  65,      0,      0,  -1,             1.227
+  65,     65,      0,   0,             1.091
+  65,      0,     65,   1,              1.19
+  65,     65,     65,  -1,             1.215
+
+This is because A) the checks in range [65, 96] are now unrolled 2x
+and B) because smaller values <= 16 are now given a hotter path. By
+contrast the SSE4 version has a branch for Size = 80. The unrolled
+version gets better performance for returns which need both
+comparisons.
+
+size, align0, align1, ret, New Time/Old Time
+ 128,      4,      8,   0,             0.858
+ 128,      4,      8,   1,             0.879
+ 128,      4,      8,  -1,             0.888
+
+As well, outside of microbenchmark environments, where branches are not
+fully predictable, the branch will have a real cost.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile          |   2 -
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |   4 -
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h    |   4 -
+ sysdeps/x86_64/multiarch/memcmp-sse4.S     | 804 ---------------------
+ 4 files changed, 814 deletions(-)
+ delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index bca82e38..b503e4b8 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -11,7 +11,6 @@ sysdep_routines += \
+   memcmp-avx2-movbe-rtm \
+   memcmp-evex-movbe \
+   memcmp-sse2 \
+-  memcmp-sse4 \
+   memcmp-ssse3 \
+   memcpy-ssse3 \
+   memcpy-ssse3-back \
+@@ -174,7 +173,6 @@ sysdep_routines += \
+   wmemcmp-avx2-movbe-rtm \
+   wmemcmp-c \
+   wmemcmp-evex-movbe \
+-  wmemcmp-sse4 \
+   wmemcmp-ssse3 \
+ # sysdep_routines
+ endif
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 14314367..450a2917 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_evex_movbe)
+-	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
+-			      __memcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
+ 			      __memcmp_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
+@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_evex_movbe)
+-	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
+-			      __wmemcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
+ 			      __wmemcmp_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 690dffe8..0bc47a7f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -21,7 +21,6 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
+ 	return OPTIMIZE (avx2_movbe);
+     }
+ 
+-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+-    return OPTIMIZE (sse4_1);
+-
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
+     return OPTIMIZE (ssse3);
+ 
+diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+deleted file mode 100644
+index 50060006..00000000
+--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
+@@ -1,804 +0,0 @@
+-/* memcmp with SSE4.1, wmemcmp with SSE4.1
+-   Copyright (C) 2010-2018 Free Software Foundation, Inc.
+-   Contributed by Intel Corporation.
+-   This file is part of the GNU C Library.
+-
+-   The GNU C Library is free software; you can redistribute it and/or
+-   modify it under the terms of the GNU Lesser General Public
+-   License as published by the Free Software Foundation; either
+-   version 2.1 of the License, or (at your option) any later version.
+-
+-   The GNU C Library is distributed in the hope that it will be useful,
+-   but WITHOUT ANY WARRANTY; without even the implied warranty of
+-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+-   Lesser General Public License for more details.
+-
+-   You should have received a copy of the GNU Lesser General Public
+-   License along with the GNU C Library; if not, see
+-   <http://www.gnu.org/licenses/>.  */
+-
+-#if IS_IN (libc)
+-
+-# include <sysdep.h>
+-
+-# ifndef MEMCMP
+-#  define MEMCMP	__memcmp_sse4_1
+-# endif
+-
+-#ifdef USE_AS_WMEMCMP
+-# define CMPEQ	pcmpeqd
+-# define CHAR_SIZE	4
+-#else
+-# define CMPEQ	pcmpeqb
+-# define CHAR_SIZE	1
+-#endif
+-
+-
+-/* Warning!
+-           wmemcmp has to use SIGNED comparison for elements.
+-           memcmp has to use UNSIGNED comparison for elemnts.
+-*/
+-
+-	.section .text.sse4.1,"ax",@progbits
+-ENTRY (MEMCMP)
+-# ifdef USE_AS_WMEMCMP
+-	shl	$2, %RDX_LP
+-# elif defined __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	mov	%edx, %edx
+-# endif
+-	cmp	$79, %RDX_LP
+-	ja	L(79bytesormore)
+-
+-	cmp	$CHAR_SIZE, %RDX_LP
+-	jbe	L(firstbyte)
+-
+-	/* N in (CHAR_SIZE, 79) bytes.  */
+-	cmpl	$32, %edx
+-	ja	L(more_32_bytes)
+-
+-	cmpl	$16, %edx
+-	jae	L(16_to_32_bytes)
+-
+-# ifndef USE_AS_WMEMCMP
+-	cmpl	$8, %edx
+-	jae	L(8_to_16_bytes)
+-
+-	cmpl	$4, %edx
+-	jb	L(2_to_3_bytes)
+-
+-	movl	(%rdi), %eax
+-	movl	(%rsi), %ecx
+-
+-	bswap	%eax
+-	bswap	%ecx
+-
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-
+-	movl	-4(%rdi, %rdx), %edi
+-	movl	-4(%rsi, %rdx), %esi
+-
+-	bswap	%edi
+-	bswap	%esi
+-
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	cmovne	%edx, %eax
+-	sbbl	%ecx, %ecx
+-	orl	%ecx, %eax
+-	ret
+-
+-	.p2align 4,, 8
+-L(2_to_3_bytes):
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	subl	%ecx, %eax
+-	ret
+-
+-	.p2align 4,, 8
+-L(8_to_16_bytes):
+-	movq	(%rdi), %rax
+-	movq	(%rsi), %rcx
+-
+-	bswap	%rax
+-	bswap	%rcx
+-
+-	subq	%rcx, %rax
+-	jne	L(8_to_16_bytes_done)
+-
+-	movq	-8(%rdi, %rdx), %rax
+-	movq	-8(%rsi, %rdx), %rcx
+-
+-	bswap	%rax
+-	bswap	%rcx
+-
+-	subq	%rcx, %rax
+-
+-L(8_to_16_bytes_done):
+-	cmovne	%edx, %eax
+-	sbbl	%ecx, %ecx
+-	orl	%ecx, %eax
+-	ret
+-# else
+-	xorl	%eax, %eax
+-	movl	(%rdi), %ecx
+-	cmpl	(%rsi), %ecx
+-	jne	L(8_to_16_bytes_done)
+-	movl	4(%rdi), %ecx
+-	cmpl	4(%rsi), %ecx
+-	jne	L(8_to_16_bytes_done)
+-	movl	-4(%rdi, %rdx), %ecx
+-	cmpl	-4(%rsi, %rdx), %ecx
+-	jne	L(8_to_16_bytes_done)
+-	ret
+-# endif
+-
+-	.p2align 4,, 3
+-L(ret_zero):
+-	xorl	%eax, %eax
+-L(zero):
+-	ret
+-
+-	.p2align 4,, 8
+-L(firstbyte):
+-	jb	L(ret_zero)
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi), %ecx
+-	cmpl	(%rsi), %ecx
+-	je	L(zero)
+-L(8_to_16_bytes_done):
+-	setg	%al
+-	leal	-1(%rax, %rax), %eax
+-# else
+-	movzbl	(%rdi), %eax
+-	movzbl	(%rsi), %ecx
+-	sub	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4
+-L(vec_return_begin_48):
+-	addq	$16, %rdi
+-	addq	$16, %rsi
+-L(vec_return_begin_32):
+-	bsfl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	32(%rdi, %rax), %ecx
+-	xorl	%edx, %edx
+-	cmpl	32(%rsi, %rax), %ecx
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	32(%rsi, %rax), %ecx
+-	movzbl	32(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4
+-L(vec_return_begin_16):
+-	addq	$16, %rdi
+-	addq	$16, %rsi
+-L(vec_return_begin):
+-	bsfl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rax), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rax), %ecx
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rsi, %rax), %ecx
+-	movzbl	(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4
+-L(vec_return_end_16):
+-	subl	$16, %edx
+-L(vec_return_end):
+-	bsfl	%eax, %eax
+-	addl	%edx, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	-16(%rdi, %rax), %ecx
+-	xorl	%edx, %edx
+-	cmpl	-16(%rsi, %rax), %ecx
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	-16(%rsi, %rax), %ecx
+-	movzbl	-16(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4,, 8
+-L(more_32_bytes):
+-	movdqu	(%rdi), %xmm0
+-	movdqu	(%rsi), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	16(%rdi), %xmm0
+-	movdqu	16(%rsi), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	cmpl	$64, %edx
+-	jbe	L(32_to_64_bytes)
+-	movdqu	32(%rdi), %xmm0
+-	movdqu	32(%rsi), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	.p2align 4,, 6
+-L(32_to_64_bytes):
+-	movdqu	-32(%rdi, %rdx), %xmm0
+-	movdqu	-32(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end_16)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-	.p2align 4
+-L(16_to_32_bytes):
+-	movdqu	(%rdi), %xmm0
+-	movdqu	(%rsi), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-
+-	.p2align 4
+-L(79bytesormore):
+-	movdqu	(%rdi), %xmm0
+-	movdqu	(%rsi), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-
+-	mov	%rsi, %rcx
+-	and	$-16, %rsi
+-	add	$16, %rsi
+-	sub	%rsi, %rcx
+-
+-	sub	%rcx, %rdi
+-	add	%rcx, %rdx
+-	test	$0xf, %rdi
+-	jz	L(2aligned)
+-
+-	cmp	$128, %rdx
+-	ja	L(128bytesormore)
+-
+-	.p2align 4,, 6
+-L(less128bytes):
+-	movdqu	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqu	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqu	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	cmp	$96, %rdx
+-	jb	L(32_to_64_bytes)
+-
+-	addq	$64, %rdi
+-	addq	$64, %rsi
+-	subq	$64, %rdx
+-
+-	.p2align 4,, 6
+-L(last_64_bytes):
+-	movdqu	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqu	-32(%rdi, %rdx), %xmm0
+-	movdqu	-32(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end_16)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-	.p2align 4
+-L(128bytesormore):
+-	cmp	$256, %rdx
+-	ja	L(unaligned_loop)
+-L(less256bytes):
+-	movdqu	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqu	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqu	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	addq	$64, %rdi
+-	addq	$64, %rsi
+-
+-	movdqu	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqu	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqu	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	addq	$-128, %rdx
+-	subq	$-64, %rsi
+-	subq	$-64, %rdi
+-
+-	cmp	$64, %rdx
+-	ja	L(less128bytes)
+-
+-	cmp	$32, %rdx
+-	ja	L(last_64_bytes)
+-
+-	movdqu	-32(%rdi, %rdx), %xmm0
+-	movdqu	-32(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end_16)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-	.p2align 4
+-L(unaligned_loop):
+-# ifdef DATA_CACHE_SIZE_HALF
+-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
+-# else
+-	mov	__x86_data_cache_size_half(%rip), %R8_LP
+-# endif
+-	movq	%r8, %r9
+-	addq	%r8, %r8
+-	addq	%r9, %r8
+-	cmpq	%r8, %rdx
+-	ja	L(L2_L3_cache_unaligned)
+-	sub	$64, %rdx
+-	.p2align 4
+-L(64bytesormore_loop):
+-	movdqu	(%rdi), %xmm0
+-	movdqu	16(%rdi), %xmm1
+-	movdqu	32(%rdi), %xmm2
+-	movdqu	48(%rdi), %xmm3
+-
+-	CMPEQ	(%rsi), %xmm0
+-	CMPEQ	16(%rsi), %xmm1
+-	CMPEQ	32(%rsi), %xmm2
+-	CMPEQ	48(%rsi), %xmm3
+-
+-	pand	%xmm0, %xmm1
+-	pand	%xmm2, %xmm3
+-	pand	%xmm1, %xmm3
+-
+-	pmovmskb %xmm3, %eax
+-	incw	%ax
+-	jnz	L(64bytesormore_loop_end)
+-
+-	add	$64, %rsi
+-	add	$64, %rdi
+-	sub	$64, %rdx
+-	ja	L(64bytesormore_loop)
+-
+-	.p2align 4,, 6
+-L(loop_tail):
+-	addq	%rdx, %rdi
+-	movdqu	(%rdi), %xmm0
+-	movdqu	16(%rdi), %xmm1
+-	movdqu	32(%rdi), %xmm2
+-	movdqu	48(%rdi), %xmm3
+-
+-	addq	%rdx, %rsi
+-	movdqu	(%rsi), %xmm4
+-	movdqu	16(%rsi), %xmm5
+-	movdqu	32(%rsi), %xmm6
+-	movdqu	48(%rsi), %xmm7
+-
+-	CMPEQ	%xmm4, %xmm0
+-	CMPEQ	%xmm5, %xmm1
+-	CMPEQ	%xmm6, %xmm2
+-	CMPEQ	%xmm7, %xmm3
+-
+-	pand	%xmm0, %xmm1
+-	pand	%xmm2, %xmm3
+-	pand	%xmm1, %xmm3
+-
+-	pmovmskb %xmm3, %eax
+-	incw	%ax
+-	jnz	L(64bytesormore_loop_end)
+-	ret
+-
+-L(L2_L3_cache_unaligned):
+-	subq	$64, %rdx
+-	.p2align 4
+-L(L2_L3_unaligned_128bytes_loop):
+-	prefetchnta 0x1c0(%rdi)
+-	prefetchnta 0x1c0(%rsi)
+-
+-	movdqu	(%rdi), %xmm0
+-	movdqu	16(%rdi), %xmm1
+-	movdqu	32(%rdi), %xmm2
+-	movdqu	48(%rdi), %xmm3
+-
+-	CMPEQ	(%rsi), %xmm0
+-	CMPEQ	16(%rsi), %xmm1
+-	CMPEQ	32(%rsi), %xmm2
+-	CMPEQ	48(%rsi), %xmm3
+-
+-	pand	%xmm0, %xmm1
+-	pand	%xmm2, %xmm3
+-	pand	%xmm1, %xmm3
+-
+-	pmovmskb %xmm3, %eax
+-	incw	%ax
+-	jnz	L(64bytesormore_loop_end)
+-
+-	add	$64, %rsi
+-	add	$64, %rdi
+-	sub	$64, %rdx
+-	ja	L(L2_L3_unaligned_128bytes_loop)
+-	jmp	L(loop_tail)
+-
+-
+-	/* This case is for machines which are sensitive for unaligned
+-	 * instructions.  */
+-	.p2align 4
+-L(2aligned):
+-	cmp	$128, %rdx
+-	ja	L(128bytesormorein2aligned)
+-L(less128bytesin2aligned):
+-	movdqa	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqa	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqa	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqa	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	cmp	$96, %rdx
+-	jb	L(32_to_64_bytes)
+-
+-	addq	$64, %rdi
+-	addq	$64, %rsi
+-	subq	$64, %rdx
+-
+-	.p2align 4,, 6
+-L(aligned_last_64_bytes):
+-	movdqa	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqa	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqu	-32(%rdi, %rdx), %xmm0
+-	movdqu	-32(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end_16)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-	.p2align 4
+-L(128bytesormorein2aligned):
+-	cmp	$256, %rdx
+-	ja	L(aligned_loop)
+-L(less256bytesin2alinged):
+-	movdqa	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqa	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqa	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqa	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	addq	$64, %rdi
+-	addq	$64, %rsi
+-
+-	movdqa	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqa	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqa	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqa	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	addq	$-128, %rdx
+-	subq	$-64, %rsi
+-	subq	$-64, %rdi
+-
+-	cmp	$64, %rdx
+-	ja	L(less128bytesin2aligned)
+-
+-	cmp	$32, %rdx
+-	ja	L(aligned_last_64_bytes)
+-
+-	movdqu	-32(%rdi, %rdx), %xmm0
+-	movdqu	-32(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end_16)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-	.p2align 4
+-L(aligned_loop):
+-# ifdef DATA_CACHE_SIZE_HALF
+-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
+-# else
+-	mov	__x86_data_cache_size_half(%rip), %R8_LP
+-# endif
+-	movq	%r8, %r9
+-	addq	%r8, %r8
+-	addq	%r9, %r8
+-	cmpq	%r8, %rdx
+-	ja	L(L2_L3_cache_aligned)
+-
+-	sub	$64, %rdx
+-	.p2align 4
+-L(64bytesormore_loopin2aligned):
+-	movdqa	(%rdi), %xmm0
+-	movdqa	16(%rdi), %xmm1
+-	movdqa	32(%rdi), %xmm2
+-	movdqa	48(%rdi), %xmm3
+-
+-	CMPEQ	(%rsi), %xmm0
+-	CMPEQ	16(%rsi), %xmm1
+-	CMPEQ	32(%rsi), %xmm2
+-	CMPEQ	48(%rsi), %xmm3
+-
+-	pand	%xmm0, %xmm1
+-	pand	%xmm2, %xmm3
+-	pand	%xmm1, %xmm3
+-
+-	pmovmskb %xmm3, %eax
+-	incw	%ax
+-	jnz	L(64bytesormore_loop_end)
+-	add	$64, %rsi
+-	add	$64, %rdi
+-	sub	$64, %rdx
+-	ja	L(64bytesormore_loopin2aligned)
+-	jmp	L(loop_tail)
+-
+-L(L2_L3_cache_aligned):
+-	subq	$64, %rdx
+-	.p2align 4
+-L(L2_L3_aligned_128bytes_loop):
+-	prefetchnta 0x1c0(%rdi)
+-	prefetchnta 0x1c0(%rsi)
+-	movdqa	(%rdi), %xmm0
+-	movdqa	16(%rdi), %xmm1
+-	movdqa	32(%rdi), %xmm2
+-	movdqa	48(%rdi), %xmm3
+-
+-	CMPEQ	(%rsi), %xmm0
+-	CMPEQ	16(%rsi), %xmm1
+-	CMPEQ	32(%rsi), %xmm2
+-	CMPEQ	48(%rsi), %xmm3
+-
+-	pand	%xmm0, %xmm1
+-	pand	%xmm2, %xmm3
+-	pand	%xmm1, %xmm3
+-
+-	pmovmskb %xmm3, %eax
+-	incw	%ax
+-	jnz	L(64bytesormore_loop_end)
+-
+-	addq	$64, %rsi
+-	addq	$64, %rdi
+-	subq	$64, %rdx
+-	ja	L(L2_L3_aligned_128bytes_loop)
+-	jmp	L(loop_tail)
+-
+-	.p2align 4
+-L(64bytesormore_loop_end):
+-	pmovmskb %xmm0, %ecx
+-	incw	%cx
+-	jnz	L(loop_end_ret)
+-
+-	pmovmskb %xmm1, %ecx
+-	notw	%cx
+-	sall	$16, %ecx
+-	jnz	L(loop_end_ret)
+-
+-	pmovmskb %xmm2, %ecx
+-	notw	%cx
+-	shlq	$32, %rcx
+-	jnz	L(loop_end_ret)
+-
+-	addq	$48, %rdi
+-	addq	$48, %rsi
+-	movq	%rax, %rcx
+-
+-	.p2align 4,, 6
+-L(loop_end_ret):
+-	bsfq	%rcx, %rcx
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rcx), %eax
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rcx), %eax
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rdi, %rcx), %eax
+-	movzbl	(%rsi, %rcx), %ecx
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-END (MEMCMP)
+-#endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-102.patch
+++ b/glibc-RHEL-15696-102.patch
@ -0,0 +1,263 @@
+From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 15 Apr 2022 12:28:01 -0500
+Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+Old code was both inefficient and wasted code size. New code is smaller
+(-62 bytes) and has comparable or better performance in the page cross case.
+
+geometric_mean(N=20) of page cross cases New / Original: 0.960
+
+size, align0, align1, ret, New Time/Old Time
+   1,   4095,      0,   0,             1.001
+   1,   4095,      0,   1,             0.999
+   1,   4095,      0,  -1,               1.0
+   2,   4094,      0,   0,               1.0
+   2,   4094,      0,   1,               1.0
+   2,   4094,      0,  -1,               1.0
+   3,   4093,      0,   0,               1.0
+   3,   4093,      0,   1,               1.0
+   3,   4093,      0,  -1,               1.0
+   4,   4092,      0,   0,             0.987
+   4,   4092,      0,   1,               1.0
+   4,   4092,      0,  -1,               1.0
+   5,   4091,      0,   0,             0.984
+   5,   4091,      0,   1,             1.002
+   5,   4091,      0,  -1,             1.005
+   6,   4090,      0,   0,             0.993
+   6,   4090,      0,   1,             1.001
+   6,   4090,      0,  -1,             1.003
+   7,   4089,      0,   0,             0.991
+   7,   4089,      0,   1,               1.0
+   7,   4089,      0,  -1,             1.001
+   8,   4088,      0,   0,             0.875
+   8,   4088,      0,   1,             0.881
+   8,   4088,      0,  -1,             0.888
+   9,   4087,      0,   0,             0.872
+   9,   4087,      0,   1,             0.879
+   9,   4087,      0,  -1,             0.883
+  10,   4086,      0,   0,             0.878
+  10,   4086,      0,   1,             0.886
+  10,   4086,      0,  -1,             0.873
+  11,   4085,      0,   0,             0.878
+  11,   4085,      0,   1,             0.881
+  11,   4085,      0,  -1,             0.879
+  12,   4084,      0,   0,             0.873
+  12,   4084,      0,   1,             0.889
+  12,   4084,      0,  -1,             0.875
+  13,   4083,      0,   0,             0.873
+  13,   4083,      0,   1,             0.863
+  13,   4083,      0,  -1,             0.863
+  14,   4082,      0,   0,             0.838
+  14,   4082,      0,   1,             0.869
+  14,   4082,      0,  -1,             0.877
+  15,   4081,      0,   0,             0.841
+  15,   4081,      0,   1,             0.869
+  15,   4081,      0,  -1,             0.876
+  16,   4080,      0,   0,             0.988
+  16,   4080,      0,   1,              0.99
+  16,   4080,      0,  -1,             0.989
+  17,   4079,      0,   0,             0.978
+  17,   4079,      0,   1,             0.981
+  17,   4079,      0,  -1,              0.98
+  18,   4078,      0,   0,             0.981
+  18,   4078,      0,   1,              0.98
+  18,   4078,      0,  -1,             0.985
+  19,   4077,      0,   0,             0.977
+  19,   4077,      0,   1,             0.979
+  19,   4077,      0,  -1,             0.986
+  20,   4076,      0,   0,             0.977
+  20,   4076,      0,   1,             0.986
+  20,   4076,      0,  -1,             0.984
+  21,   4075,      0,   0,             0.977
+  21,   4075,      0,   1,             0.983
+  21,   4075,      0,  -1,             0.988
+  22,   4074,      0,   0,             0.983
+  22,   4074,      0,   1,             0.994
+  22,   4074,      0,  -1,             0.993
+  23,   4073,      0,   0,              0.98
+  23,   4073,      0,   1,             0.992
+  23,   4073,      0,  -1,             0.995
+  24,   4072,      0,   0,             0.989
+  24,   4072,      0,   1,             0.989
+  24,   4072,      0,  -1,             0.991
+  25,   4071,      0,   0,              0.99
+  25,   4071,      0,   1,             0.999
+  25,   4071,      0,  -1,             0.996
+  26,   4070,      0,   0,             0.993
+  26,   4070,      0,   1,             0.995
+  26,   4070,      0,  -1,             0.998
+  27,   4069,      0,   0,             0.993
+  27,   4069,      0,   1,             0.999
+  27,   4069,      0,  -1,               1.0
+  28,   4068,      0,   0,             0.997
+  28,   4068,      0,   1,               1.0
+  28,   4068,      0,  -1,             0.999
+  29,   4067,      0,   0,             0.996
+  29,   4067,      0,   1,             0.999
+  29,   4067,      0,  -1,             0.999
+  30,   4066,      0,   0,             0.991
+  30,   4066,      0,   1,             1.001
+  30,   4066,      0,  -1,             0.999
+  31,   4065,      0,   0,             0.988
+  31,   4065,      0,   1,             0.998
+  31,   4065,      0,  -1,             0.998
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
+ 1 file changed, 61 insertions(+), 37 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 16fc673e..99258cf5 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -429,22 +429,21 @@ L(page_cross_less_vec):
+ # ifndef USE_AS_WMEMCMP
+ 	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+	/* Fall through for [4, 7].  */
+ 	cmpl	$4, %edx
+-	jae	L(between_4_7)
+	jb	L(between_2_3)
+ 
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
+ 	/* No ymm register was touched.  */
+ 	ret
+ 
+@@ -457,9 +456,33 @@ L(one_or_less):
+ 	/* No ymm register was touched.  */
+ 	ret
+ 
+	.p2align 4,, 5
+L(ret_nonzero):
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	/* No ymm register was touched.  */
+	ret
+
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	/* No ymm register was touched.  */
+	ret
+
+ 	.p2align 4
+ L(between_8_15):
+-# endif
+	movbe	(%rdi), %rax
+	movbe	(%rsi), %rcx
+	subq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	movbe	-8(%rdi, %rdx), %rax
+	movbe	-8(%rsi, %rdx), %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
+	/* No ymm register was touched.  */
+	ret
+# else
+ 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+@@ -475,16 +498,13 @@ L(between_8_15):
+ 	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+ 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
+ 	jnz	L(return_vec_0)
+ 	/* No ymm register was touched.  */
+ 	ret
+# endif
+ 
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-
+-	.p2align 4
+	.p2align 4,, 10
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 	vmovdqu	(%rsi), %xmm2
+@@ -501,11 +521,17 @@ L(between_16_31):
+ 	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+ 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
+ 	jnz	L(return_vec_0)
+ 	/* No ymm register was touched.  */
+ 	ret
+ 
+ # ifdef USE_AS_WMEMCMP
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+ 	.p2align 4
+ L(one_or_less):
+ 	jb	L(zero)
+@@ -520,22 +546,20 @@ L(one_or_less):
+ # else
+ 
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	jz	L(zero_4_7)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-L(zero_4_7):
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	bswap	%eax
+	bswap	%ecx
+	shrl	%eax
+	shrl	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper bit is zero.  */
+	subl	%ecx, %eax
+ 	/* No ymm register was touched.  */
+ 	ret
+ # endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-103.patch
+++ b/glibc-RHEL-15696-103.patch
@ -0,0 +1,876 @@
+From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 21 Apr 2022 20:52:28 -0500
+Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2
+Content-type: text/plain; charset=UTF-8
+
+The new code unrolls the main loop slightly without adding too much
+overhead and minimizes the comparisons for the search CHAR.
+
+Geometric Mean of all benchmarks New / Old: 0.741
+See email for all results.
+
+Full xcheck passes on x86_64 with and without multiarch enabled.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
+ sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
+ sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++---------
+ sysdeps/x86_64/wcsrchr.S                | 266 +-----------
+ 4 files changed, 338 insertions(+), 443 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/wcsrchr.S
+	(copyright header)
+
+diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
+index 0ec76fe9..6bb1284b 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
+@@ -17,7 +17,7 @@
+    <http://www.gnu.org/licenses/>.  */
+ 
+ #if IS_IN (libc)
+-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
+ 
+ # undef weak_alias
+ # define weak_alias(strrchr, rindex)
+diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+index d015e953..f26d53b5 100644
+--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+@@ -17,7 +17,6 @@
+    <http://www.gnu.org/licenses/>.  */
+ 
+ #if IS_IN (libc)
+-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR	__wcsrchr_sse2
+ #endif
+-
+ #include "../wcsrchr.S"
+diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
+index aca98e7e..a58cc220 100644
+--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
+@@ -19,210 +19,360 @@
+ 
+ #include <sysdep.h>
+ 
+#ifndef STRRCHR
+# define STRRCHR	strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
+ 	.text
+-ENTRY (strrchr)
+-	movd	%esi, %xmm1
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
+ 	movq	%rdi, %rax
+-	andl	$4095, %eax
+-	punpcklbw	%xmm1, %xmm1
+-	cmpq	$4032, %rax
+-	punpcklwd	%xmm1, %xmm1
+-	pshufd	$0, %xmm1, %xmm1
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+ 	ja	L(cross_page)
+-	movdqu	(%rdi), %xmm0
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
+ 	pxor	%xmm2, %xmm2
+-	movdqa	%xmm0, %xmm3
+-	pcmpeqb	%xmm1, %xmm0
+-	pcmpeqb	%xmm2, %xmm3
+-	pmovmskb	%xmm0, %ecx
+-	pmovmskb	%xmm3, %edx
+-	testq	%rdx, %rdx
+-	je	L(next_48_bytes)
+-	leaq	-1(%rdx), %rax
+-	xorq	%rdx, %rax
+-	andq	%rcx, %rax
+-	je	L(exit)
+-	bsrq	%rax, %rax
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
+ 	ret
+ 
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+ 	.p2align 4
+-L(next_48_bytes):
+-	movdqu	16(%rdi), %xmm4
+-	movdqa	%xmm4, %xmm5
+-	movdqu	32(%rdi), %xmm3
+-	pcmpeqb	%xmm1, %xmm4
+-	pcmpeqb	%xmm2, %xmm5
+-	movdqu	48(%rdi), %xmm0
+-	pmovmskb	%xmm5, %edx
+-	movdqa	%xmm3, %xmm5
+-	pcmpeqb	%xmm1, %xmm3
+-	pcmpeqb	%xmm2, %xmm5
+-	pcmpeqb	%xmm0, %xmm2
+-	salq	$16, %rdx
+-	pmovmskb	%xmm3, %r8d
+-	pmovmskb	%xmm5, %eax
+-	pmovmskb	%xmm2, %esi
+-	salq	$32, %r8
+-	salq	$32, %rax
+-	pcmpeqb	%xmm1, %xmm0
+-	orq	%rdx, %rax
+-	movq	%rsi, %rdx
+-	pmovmskb	%xmm4, %esi
+-	salq	$48, %rdx
+-	salq	$16, %rsi
+-	orq	%r8, %rsi
+-	orq	%rcx, %rsi
+-	pmovmskb	%xmm0, %ecx
+-	salq	$48, %rcx
+-	orq	%rcx, %rsi
+-	orq	%rdx, %rax
+-	je	L(loop_header2)
+-	leaq	-1(%rax), %rcx
+-	xorq	%rax, %rcx
+-	andq	%rcx, %rsi
+-	je	L(exit)
+-	bsrq	%rsi, %rsi
+-	leaq	(%rdi,%rsi), %rax
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+ 	ret
+ 
+ 	.p2align 4
+-L(loop_header2):
+-	testq	%rsi, %rsi
+-	movq	%rdi, %rcx
+-	je	L(no_c_found)
+-L(loop_header):
+-	addq	$64, %rdi
+-	pxor	%xmm7, %xmm7
+-	andq	$-64, %rdi
+-	jmp	L(loop_entry)
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+ 
+ 	.p2align 4
+-L(loop64):
+-	testq	%rdx, %rdx
+-	cmovne	%rdx, %rsi
+-	cmovne	%rdi, %rcx
+-	addq	$64, %rdi
+-L(loop_entry):
+-	movdqa	32(%rdi), %xmm3
+-	pxor	%xmm6, %xmm6
+-	movdqa	48(%rdi), %xmm2
+-	movdqa	%xmm3, %xmm0
+-	movdqa	16(%rdi), %xmm4
+-	pminub	%xmm2, %xmm0
+-	movdqa	(%rdi), %xmm5
+-	pminub	%xmm4, %xmm0
+-	pminub	%xmm5, %xmm0
+-	pcmpeqb	%xmm7, %xmm0
+-	pmovmskb	%xmm0, %eax
+-	movdqa	%xmm5, %xmm0
+-	pcmpeqb	%xmm1, %xmm0
+-	pmovmskb	%xmm0, %r9d
+-	movdqa	%xmm4, %xmm0
+-	pcmpeqb	%xmm1, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	movdqa	%xmm3, %xmm0
+-	pcmpeqb	%xmm1, %xmm0
+-	salq	$16, %rdx
+-	pmovmskb	%xmm0, %r10d
+-	movdqa	%xmm2, %xmm0
+-	pcmpeqb	%xmm1, %xmm0
+-	salq	$32, %r10
+-	orq	%r10, %rdx
+-	pmovmskb	%xmm0, %r8d
+-	orq	%r9, %rdx
+-	salq	$48, %r8
+-	orq	%r8, %rdx
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+ 	testl	%eax, %eax
+-	je	L(loop64)
+-	pcmpeqb	%xmm6, %xmm4
+-	pcmpeqb	%xmm6, %xmm3
+-	pcmpeqb	%xmm6, %xmm5
+-	pmovmskb	%xmm4, %eax
+-	pmovmskb	%xmm3, %r10d
+-	pcmpeqb	%xmm6, %xmm2
+-	pmovmskb	%xmm5, %r9d
+-	salq	$32, %r10
+-	salq	$16, %rax
+-	pmovmskb	%xmm2, %r8d
+-	orq	%r10, %rax
+-	orq	%r9, %rax
+-	salq	$48, %r8
+-	orq	%r8, %rax
+-	leaq	-1(%rax), %r8
+-	xorq	%rax, %r8
+-	andq	%r8, %rdx
+-	cmovne	%rdi, %rcx
+-	cmovne	%rdx, %rsi
+-	bsrq	%rsi, %rsi
+-	leaq	(%rcx,%rsi), %rax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* SSE2 has no pminud, so wcsrchr needs separate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is zero match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons: since we mask
+	   off zeros after first zero (instead of using the full
+	   comparison) we can't guarantee no interference between match
+	   after end of string and valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+ 	ret
+ 
+	/* Save minimum state for getting most recent match. We can
+	   throw out all previous work.  */
+ 	.p2align 4
+-L(no_c_found):
+-	movl	$1, %esi
+-	xorl	%ecx, %ecx
+-	jmp	L(loop_header)
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
+ 
+ 	.p2align 4
+-L(exit):
+-	xorl	%eax, %eax
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* SSE2 has no pminud, so wcsrchr needs separate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term so must be new occurrence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+ 	ret
+ 
+ 	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons: since we mask
+	   off zeros after first zero (instead of using the full
+	   comparison) we can't guarantee no interference between match
+	   after end of string and valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
+ L(cross_page):
+-	movq	%rdi, %rax
+-	pxor	%xmm0, %xmm0
+-	andq	$-64, %rax
+-	movdqu	(%rax), %xmm5
+-	movdqa	%xmm5, %xmm6
+-	movdqu	16(%rax), %xmm4
+-	pcmpeqb	%xmm1, %xmm5
+-	pcmpeqb	%xmm0, %xmm6
+-	movdqu	32(%rax), %xmm3
+-	pmovmskb	%xmm6, %esi
+-	movdqa	%xmm4, %xmm6
+-	movdqu	48(%rax), %xmm2
+-	pcmpeqb	%xmm1, %xmm4
+-	pcmpeqb	%xmm0, %xmm6
+-	pmovmskb	%xmm6, %edx
+-	movdqa	%xmm3, %xmm6
+-	pcmpeqb	%xmm1, %xmm3
+-	pcmpeqb	%xmm0, %xmm6
+-	pcmpeqb	%xmm2, %xmm0
+-	salq	$16, %rdx
+-	pmovmskb	%xmm3, %r9d
+-	pmovmskb	%xmm6, %r8d
+-	pmovmskb	%xmm0, %ecx
+-	salq	$32, %r9
+-	salq	$32, %r8
+-	pcmpeqb	%xmm1, %xmm2
+-	orq	%r8, %rdx
+-	salq	$48, %rcx
+-	pmovmskb	%xmm5, %r8d
+-	orq	%rsi, %rdx
+-	pmovmskb	%xmm4, %esi
+-	orq	%rcx, %rdx
+-	pmovmskb	%xmm2, %ecx
+-	salq	$16, %rsi
+-	salq	$48, %rcx
+-	orq	%r9, %rsi
+-	orq	%r8, %rsi
+-	orq	%rcx, %rsi
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+ 	movl	%edi, %ecx
+-	subl	%eax, %ecx
+-	shrq	%cl, %rdx
+-	shrq	%cl, %rsi
+-	testq	%rdx, %rdx
+-	je	L(loop_header2)
+-	leaq	-1(%rdx), %rax
+-	xorq	%rdx, %rax
+-	andq	%rax, %rsi
+-	je	L(exit)
+-	bsrq	%rsi, %rax
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
+ 	ret
+-END (strrchr)
+END(STRRCHR)
+ 
+-weak_alias (strrchr, rindex)
+-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+	weak_alias (STRRCHR, rindex)
+	libc_hidden_builtin_def (STRRCHR)
+#endif
+diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
+index 2f388537..ae3cfa7d 100644
+--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
+@@ -17,266 +17,12 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+-#include <sysdep.h>
+ 
+-	.text
+-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
+ 
+-	movd	%rsi, %xmm1
+-	mov	%rdi, %rcx
+-	punpckldq %xmm1, %xmm1
+-	pxor	%xmm2, %xmm2
+-	punpckldq %xmm1, %xmm1
+-	and	$63, %rcx
+-	cmp	$48, %rcx
+-	ja	L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR	wcsrchr
+#endif
+ 
+-	movdqu	(%rdi), %xmm0
+-	pcmpeqd	%xmm0, %xmm2
+-	pcmpeqd	%xmm1, %xmm0
+-	pmovmskb %xmm2, %rcx
+-	pmovmskb %xmm0, %rax
+-	add	$16, %rdi
+-
+-	test	%rax, %rax
+-	jnz	L(unaligned_match1)
+-
+-	test	%rcx, %rcx
+-	jnz	L(return_null)
+-
+-	and	$-16, %rdi
+-	xor	%r8, %r8
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(unaligned_match1):
+-	test	%rcx, %rcx
+-	jnz	L(prolog_find_zero_1)
+-
+-	mov	%rax, %r8
+-	mov	%rdi, %rsi
+-	and	$-16, %rdi
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(crosscache):
+-	and	$15, %rcx
+-	and	$-16, %rdi
+-	pxor	%xmm3, %xmm3
+-	movdqa	(%rdi), %xmm0
+-	pcmpeqd	%xmm0, %xmm3
+-	pcmpeqd	%xmm1, %xmm0
+-	pmovmskb %xmm3, %rdx
+-	pmovmskb %xmm0, %rax
+-	shr	%cl, %rdx
+-	shr	%cl, %rax
+-	add	$16, %rdi
+-
+-	test	%rax, %rax
+-	jnz	L(unaligned_match)
+-
+-	test	%rdx, %rdx
+-	jnz	L(return_null)
+-
+-	xor	%r8, %r8
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(unaligned_match):
+-	test	%rdx, %rdx
+-	jnz	L(prolog_find_zero)
+-
+-	mov	%rax, %r8
+-	lea	(%rdi, %rcx), %rsi
+-
+-/* Loop start on aligned string.  */
+-	.p2align 4
+-L(loop):
+-	movdqa	(%rdi), %xmm0
+-	pcmpeqd	%xmm0, %xmm2
+-	add	$16, %rdi
+-	pcmpeqd	%xmm1, %xmm0
+-	pmovmskb %xmm2, %rcx
+-	pmovmskb %xmm0, %rax
+-	or	%rax, %rcx
+-	jnz	L(matches)
+-
+-	movdqa	(%rdi), %xmm3
+-	pcmpeqd	%xmm3, %xmm2
+-	add	$16, %rdi
+-	pcmpeqd	%xmm1, %xmm3
+-	pmovmskb %xmm2, %rcx
+-	pmovmskb %xmm3, %rax
+-	or	%rax, %rcx
+-	jnz	L(matches)
+-
+-	movdqa	(%rdi), %xmm4
+-	pcmpeqd	%xmm4, %xmm2
+-	add	$16, %rdi
+-	pcmpeqd	%xmm1, %xmm4
+-	pmovmskb %xmm2, %rcx
+-	pmovmskb %xmm4, %rax
+-	or	%rax, %rcx
+-	jnz	L(matches)
+-
+-	movdqa	(%rdi), %xmm5
+-	pcmpeqd	%xmm5, %xmm2
+-	add	$16, %rdi
+-	pcmpeqd	%xmm1, %xmm5
+-	pmovmskb %xmm2, %rcx
+-	pmovmskb %xmm5, %rax
+-	or	%rax, %rcx
+-	jz	L(loop)
+-
+-	.p2align 4
+-L(matches):
+-	test	%rax, %rax
+-	jnz	L(match)
+-L(return_value):
+-	test	%r8, %r8
+-	jz	L(return_null)
+-	mov	%r8, %rax
+-	mov	%rsi, %rdi
+-
+-	test	$15 << 4, %ah
+-	jnz	L(match_fourth_wchar)
+-	test	%ah, %ah
+-	jnz	L(match_third_wchar)
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(match):
+-	pmovmskb %xmm2, %rcx
+-	test	%rcx, %rcx
+-	jnz	L(find_zero)
+-	mov	%rax, %r8
+-	mov	%rdi, %rsi
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(find_zero):
+-	test	$15, %cl
+-	jnz	L(find_zero_in_first_wchar)
+-	test	%cl, %cl
+-	jnz	L(find_zero_in_second_wchar)
+-	test	$15, %ch
+-	jnz	L(find_zero_in_third_wchar)
+-
+-	and	$1 << 13 - 1, %rax
+-	jz	L(return_value)
+-
+-	test	$15 << 4, %ah
+-	jnz	L(match_fourth_wchar)
+-	test	%ah, %ah
+-	jnz	L(match_third_wchar)
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(find_zero_in_first_wchar):
+-	test	$1, %rax
+-	jz	L(return_value)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(find_zero_in_second_wchar):
+-	and	$1 << 5 - 1, %rax
+-	jz	L(return_value)
+-
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(find_zero_in_third_wchar):
+-	and	$1 << 9 - 1, %rax
+-	jz	L(return_value)
+-
+-	test	%ah, %ah
+-	jnz	L(match_third_wchar)
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(prolog_find_zero):
+-	add	%rcx, %rdi
+-	mov     %rdx, %rcx
+-L(prolog_find_zero_1):
+-	test	$15, %cl
+-	jnz	L(prolog_find_zero_in_first_wchar)
+-	test	%cl, %cl
+-	jnz	L(prolog_find_zero_in_second_wchar)
+-	test	$15, %ch
+-	jnz	L(prolog_find_zero_in_third_wchar)
+-
+-	and	$1 << 13 - 1, %rax
+-	jz	L(return_null)
+-
+-	test	$15 << 4, %ah
+-	jnz	L(match_fourth_wchar)
+-	test	%ah, %ah
+-	jnz	L(match_third_wchar)
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(prolog_find_zero_in_first_wchar):
+-	test	$1, %rax
+-	jz	L(return_null)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(prolog_find_zero_in_second_wchar):
+-	and	$1 << 5 - 1, %rax
+-	jz	L(return_null)
+-
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(prolog_find_zero_in_third_wchar):
+-	and	$1 << 9 - 1, %rax
+-	jz	L(return_null)
+-
+-	test	%ah, %ah
+-	jnz	L(match_third_wchar)
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(match_second_wchar):
+-	lea	-12(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(match_third_wchar):
+-	lea	-8(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(match_fourth_wchar):
+-	lea	-4(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(return_null):
+-	xor	%rax, %rax
+-	ret
+-
+-END (wcsrchr)
+#include "../strrchr.S"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-104.patch
+++ b/glibc-RHEL-15696-104.patch
@ -0,0 +1,501 @@
+From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 21 Apr 2022 20:52:29 -0500
+Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2
+Content-type: text/plain; charset=UTF-8
+
+The new code unrolls the main loop slightly without adding too much
+overhead and minimizes the comparisons for the search CHAR.
+
+Geometric Mean of all benchmarks New / Old: 0.832
+See email for all results.
+
+Full xcheck passes on x86_64 with and without multiarch enabled.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
+ 1 file changed, 269 insertions(+), 157 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+index c949410b..3d26fad4 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+@@ -27,9 +27,13 @@
+ # ifdef USE_AS_WCSRCHR
+ #  define VPBROADCAST	vpbroadcastd
+ #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+ # endif
+ 
+ # ifndef VZEROUPPER
+@@ -41,196 +45,304 @@
+ # endif
+ 
+ # define VEC_SIZE	32
+# define PAGE_SIZE	4096
+ 
+-	.section SECTION(.text),"ax",@progbits
+-ENTRY (STRRCHR)
+-	movd	%esi, %xmm4
+-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
+ 	/* Broadcast CHAR to YMM4.  */
+-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
+ 	vpxor	%xmm0, %xmm0, %xmm0
+ 
+-	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
+ 
+L(page_cross_continue):
+ 	vmovdqu	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %ecx
+-	vpmovmskb %ymm3, %eax
+-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
+ 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	/* Check ymm2 for search CHAR match. If no match then check ymm1
+	   before returning.  */
+ 	testl	%eax, %eax
+-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+ 
+-	testl	%ecx, %ecx
+-	jnz	L(return_null)
+ 
+-	andq	$-VEC_SIZE, %rdi
+-	xorl	%edx, %edx
+-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	/* If no in-range search CHAR match in ymm3 then need to check
+	   ymm1/ymm2 for an earlier match (we delay checking search
+	   CHAR matches until needed).  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
+ 
+ 	.p2align 4
+-L(first_vec):
+-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+ 	testl	%ecx, %ecx
+-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
+ 
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
+-	movq	%rdi, %rsi
+-	andq	$-VEC_SIZE, %rdi
+-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+ 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-	vmovdqa	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %edx
+-	vpmovmskb %ymm3, %eax
+-	shrl	%cl, %edx
+-	shrl	%cl, %eax
+-	addq	$VEC_SIZE, %rdi
+-
+-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweights loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
+ 	testl	%eax, %eax
+-	jnz	L(found_char)
+-
+-	testl	%edx, %edx
+-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
+ 
+-	jmp	L(aligned_loop)
+-
+-	.p2align 4
+-L(found_char):
+-	testl	%edx, %edx
+-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
+ 
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
+-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
+ 
+-	.p2align 4
+-L(aligned_loop):
+-	vmovdqa	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	addq	$VEC_SIZE, %rdi
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %ecx
+-	vpmovmskb %ymm3, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
+-
+-	vmovdqa	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	add	$VEC_SIZE, %rdi
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
+ 	vpmovmskb %ymm3, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
+-
+-	vmovdqa	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	addq	$VEC_SIZE, %rdi
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %ecx
+-	vpmovmskb %ymm3, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
+-
+-	vmovdqa	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	addq	$VEC_SIZE, %rdi
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %ecx
+-	vpmovmskb %ymm3, %eax
+-	orl	%eax, %ecx
+-	jz	L(aligned_loop)
+-
+-	.p2align 4
+-L(char_nor_null):
+-	/* Find a CHAR or a nul CHAR in a loop.  */
+-	testl	%eax, %eax
+-	jnz	L(match)
+-L(return_value):
+-	testl	%edx, %edx
+-	jz	L(return_null)
+-	movl	%edx, %eax
+-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+ 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
+ # ifdef USE_AS_WCSRCHR
+-	/* Keep the first bit for each matching CHAR for bsr.  */
+-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
+ # endif
+-	bsrl	%eax, %eax
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
+ 
+	/* Search char cannot be zero.  */
+ 	.p2align 4
+-L(match):
+-	/* Find a CHAR.  Check if there is a nul CHAR.  */
+-	vpmovmskb %ymm2, %ecx
+-	testl	%ecx, %ecx
+-	jnz	L(find_nul)
+-
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
+ 	movq	%rdi, %rsi
+-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
+ 
+ 	.p2align 4
+-L(find_nul):
+-# ifdef USE_AS_WCSRCHR
+-	/* Keep the first bit for each matching CHAR for bsr.  */
+-	andl	$0x11111111, %ecx
+-	andl	$0x11111111, %eax
+-# endif
+-	/* Mask out any matching bits after the nul CHAR.  */
+-	movl	%ecx, %r8d
+-	subl	$1, %r8d
+-	xorl	%ecx, %r8d
+-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at at time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+ 	testl	%eax, %eax
+-	/* If there is no CHAR here, return the remembered one.  */
+-	jz	L(return_value)
+-	bsrl	%eax, %eax
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-	VZEROUPPER_RETURN
+-
+-	.p2align 4
+-L(char_and_nul):
+-	/* Find both a CHAR and a nul CHAR.  */
+-	addq	%rcx, %rdi
+-	movl	%edx, %ecx
+-L(char_and_nul_in_first_vec):
+-# ifdef USE_AS_WCSRCHR
+-	/* Keep the first bit for each matching CHAR for bsr.  */
+-	andl	$0x11111111, %ecx
+-	andl	$0x11111111, %eax
+-# endif
+-	/* Mask out any matching bits after the nul CHAR.  */
+-	movl	%ecx, %r8d
+-	subl	$1, %r8d
+-	xorl	%ecx, %r8d
+-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
+ 	testl	%eax, %eax
+-	/* Return null pointer if the nul CHAR comes first.  */
+-	jz	L(return_null)
+-	bsrl	%eax, %eax
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot patch. We know CHAR is inbounds and that
+	   ymm3/ymm2 have latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(return_null):
+-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
+ 	VZEROUPPER_RETURN
+ 
+-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	/* Check if any search CHAR match in range.  */
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-105.patch
+++ b/glibc-RHEL-15696-105.patch
@ -0,0 +1,558 @@
+From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 21 Apr 2022 20:52:30 -0500
+Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex
+Content-type: text/plain; charset=UTF-8
+
+The new code unrolls the main loop slightly without adding too much
+overhead and minimizes the comparisons for the search CHAR.
+
+Geometric Mean of all benchmarks New / Old: 0.755
+See email for all results.
+
+Full xcheck passes on x86_64 with and without multiarch enabled.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
+ 1 file changed, 290 insertions(+), 181 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
+index f920b5a5..f5b6d755 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
+@@ -24,242 +24,351 @@
+ #  define STRRCHR	__strrchr_evex
+ # endif
+ 
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
+ 
+ # ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	esi
+
+#  define kunpck	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+ #  define VPBROADCAST	vpbroadcastd
+-#  define VPCMP		vpcmpd
+-#  define SHIFT_REG	r8d
+#  define VPCMP	vpcmpd
+ # else
+#  define SHIFT_REG	edi
+
+#  define kunpck	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+ #  define VPBROADCAST	vpbroadcastb
+-#  define VPCMP		vpcmpb
+-#  define SHIFT_REG	ecx
+#  define VPCMP	vpcmpb
+ # endif
+ 
+ # define XMMZERO	xmm16
+ # define YMMZERO	ymm16
+ # define YMMMATCH	ymm17
+-# define YMM1		ymm18
+# define YMMSAVE	ymm18
+
+# define YMM1	ymm19
+# define YMM2	ymm20
+# define YMM3	ymm21
+# define YMM4	ymm22
+# define YMM5	ymm23
+# define YMM6	ymm24
+# define YMM7	ymm25
+# define YMM8	ymm26
+ 
+-# define VEC_SIZE	32
+ 
+-	.section .text.evex,"ax",@progbits
+-ENTRY (STRRCHR)
+-	movl	%edi, %ecx
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+	.section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+	movl	%edi, %eax
+ 	/* Broadcast CHAR to YMMMATCH.  */
+ 	VPBROADCAST %esi, %YMMMATCH
+ 
+-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+-
+-	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
+ 
+L(page_cross_continue):
+ 	VMOVU	(%rdi), %YMM1
+-
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	/* k0 has a 1 for each zero CHAR in YMM1.  */
+	VPTESTN	%YMM1, %YMM1, %k0
+ 	kmovd	%k0, %ecx
+-	kmovd	%k1, %eax
+-
+-	addq	$VEC_SIZE, %rdi
+-
+-	testl	%eax, %eax
+-	jnz	L(first_vec)
+-
+ 	testl	%ecx, %ecx
+-	jnz	L(return_null)
+-
+-	andq	$-VEC_SIZE, %rdi
+-	xorl	%edx, %edx
+-	jmp	L(aligned_loop)
+-
+-	.p2align 4
+-L(first_vec):
+-	/* Check if there is a null byte.  */
+-	testl	%ecx, %ecx
+-	jnz	L(char_and_nul_in_first_vec)
+-
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
+-	movq	%rdi, %rsi
+-	andq	$-VEC_SIZE, %rdi
+-	jmp	L(aligned_loop)
+-
+-	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+	jz	L(aligned_more)
+	/* fallthrough: zero CHAR in first VEC.  */
+ 
+	/* K1 has a 1 for each search CHAR match in YMM1.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Build mask up until first zero CHAR (used to mask of
+	   potential search CHAR matches past the end of the string).
+	 */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	/* Get last match (the `andl` removed any out of bounds
+	   matches).  */
+	bsrl	%eax, %eax
+ # ifdef USE_AS_WCSRCHR
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+-	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl	$2, %SHIFT_REG
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+ # endif
+L(ret0):
+	ret
+ 
+-	VMOVA	(%rdi), %YMM1
+-
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+	/* Returns for first vec x1/x2/x3 have hard coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	/* eax non-zero if search CHAR in range.  */
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	/* fallthrough: no match in YMM2 then need to check for earlier
+	   matches (in YMM1).  */
+	.p2align 4,, 4
+L(first_vec_x0_test):
+ 	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+-	kmovd	%k0, %edx
+ 	kmovd	%k1, %eax
+-
+-	shrxl	%SHIFT_REG, %edx, %edx
+-	shrxl	%SHIFT_REG, %eax, %eax
+-	addq	$VEC_SIZE, %rdi
+-
+-	/* Check if there is a CHAR.  */
+ 	testl	%eax, %eax
+-	jnz	L(found_char)
+-
+-	testl	%edx, %edx
+-	jnz	L(return_null)
+-
+-	jmp	L(aligned_loop)
+-
+-	.p2align 4
+-L(found_char):
+-	testl	%edx, %edx
+-	jnz	L(char_and_nul)
+-
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
+-	leaq	(%rdi, %rcx), %rsi
+	jz	L(ret1)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
+ 
+-	.p2align 4
+-L(aligned_loop):
+-	VMOVA	(%rdi), %YMM1
+-	addq	$VEC_SIZE, %rdi
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMP	$0, %YMM3, %YMMMATCH, %k3
+	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	/* K2 and K3 have 1 for any search CHAR match. Test if any
+	   matches between either of them. Otherwise check YMM1.  */
+	kortestd %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	/* Guranteed that YMM2 and YMM3 are within range so merge the
+	   two bitmasks then get last result.  */
+	kunpck	%k2, %k3, %k3
+	kmovq	%k3, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+-	kmovd	%k0, %ecx
+	.p2align 4,, 6
+L(first_vec_x3):
+	VPCMP	$0, %YMMMATCH, %YMM4, %k1
+ 	kmovd	%k1, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-	VMOVA	(%rdi), %YMM1
+-	add	$VEC_SIZE, %rdi
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check YMM2 for last match first. If no match try YMM1.  */
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+-	kmovd	%k0, %ecx
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMP	$0, %YMMMATCH, %YMM3, %k1
+ 	kmovd	%k1, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
+	 */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-	VMOVA	(%rdi), %YMM1
+-	addq	$VEC_SIZE, %rdi
+ 
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	.p2align 4
+L(aligned_more):
+	/* Need to keep original pointer incase YMM1 has last match.  */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	VMOVU	VEC_SIZE(%rdi), %YMM2
+	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %ecx
+-	kmovd	%k1, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+ 
+-	VMOVA	(%rdi), %YMM1
+-	addq	$VEC_SIZE, %rdi
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+ 
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
+	VPTESTN	%YMM4, %YMM4, %k0
+ 	kmovd	%k0, %ecx
+-	kmovd	%k1, %eax
+-	orl	%eax, %ecx
+-	jz	L(aligned_loop)
+	movq	%rdi, %r8
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x3)
+ 
+	andq	$-(VEC_SIZE * 2), %rdi
+ 	.p2align 4
+-L(char_nor_null):
+-	/* Find a CHAR or a null byte in a loop.  */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
+	   they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+
+	VPCMP	$0, %YMM5, %YMMMATCH, %k2
+	vpxord	%YMM6, %YMMMATCH, %YMM7
+
+	VPMIN	%YMM5, %YMM6, %YMM8
+	VPMIN	%YMM8, %YMM7, %YMM7
+
+	VPTESTN	%YMM7, %YMM7, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMP	$0, %YMM6, %YMMMATCH, %k3
+	VPTESTN	%YMM8, %YMM8, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	kortestd %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	kmovd	%k4, %eax
+ 	testl	%eax, %eax
+-	jnz	L(match)
+-L(return_value):
+-	testl	%edx, %edx
+-	jz	L(return_null)
+-	movl	%edx, %eax
+-	movq	%rsi, %rdi
+	jz	L(first_vec_x1_or_x2)
+ 	bsrl	%eax, %eax
+-# ifdef USE_AS_WCSRCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-# endif
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+-	.p2align 4
+-L(match):
+-	/* Find a CHAR.  Check if there is a null byte.  */
+-	kmovd	%k0, %ecx
+-	testl	%ecx, %ecx
+-	jnz	L(find_nul)
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%YMM5, %YMM5, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
+ 
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match. This is the 'fast' path persay.
+	 */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
+ 	movq	%rdi, %rsi
+-	jmp	L(aligned_loop)
+	kunpck	%k2, %k3, %k4
+ 
+ 	.p2align 4
+-L(find_nul):
+-	/* Mask out any matching bits after the null byte.  */
+-	movl	%ecx, %r8d
+-	subl	$1, %r8d
+-	xorl	%ecx, %r8d
+-	andl	%r8d, %eax
+-	testl	%eax, %eax
+-	/* If there is no CHAR here, return the remembered one.  */
+-	jz	L(return_value)
+-	bsrl	%eax, %eax
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+
+	VPCMP	$0, %YMM1, %YMMMATCH, %k2
+	vpxord	%YMM2, %YMMMATCH, %YMM3
+
+	VPMIN	%YMM1, %YMM2, %YMM4
+	VPMIN	%YMM3, %YMM4, %YMM3
+
+	VPTESTN	%YMM3, %YMM3, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(second_aligned_loop)
+
+	VPCMP	$0, %YMM2, %YMMMATCH, %k3
+	VPTESTN	%YMM4, %YMM4, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	kortestd %k2, %k3
+	/* branch here because there is a significant advantage interms
+	   of output dependency chance in using edx.  */
+	jnz	L(return_new_match)
+L(return_old_match):
+	kmovq	%k4, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(return_new_match):
+	VPTESTN	%YMM1, %YMM1, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
+
+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi).  */
+ # ifdef USE_AS_WCSRCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
+ # endif
+-	ret
+	shrxl	%SHIFT_REG, %ecx, %ecx
+ 
+-	.p2align 4
+-L(char_and_nul):
+-	/* Find both a CHAR and a null byte.  */
+-	addq	%rcx, %rdi
+-	movl	%edx, %ecx
+-L(char_and_nul_in_first_vec):
+-	/* Mask out any matching bits after the null byte.  */
+-	movl	%ecx, %r8d
+-	subl	$1, %r8d
+-	xorl	%ecx, %r8d
+-	andl	%r8d, %eax
+-	testl	%eax, %eax
+-	/* Return null pointer if the null byte comes first.  */
+-	jz	L(return_null)
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%SHIFT_REG, %eax, %eax
+
+	/* Check if any search CHAR match in range.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret3)
+ 	bsrl	%eax, %eax
+ # ifdef USE_AS_WCSRCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	addq	%rdi, %rax
+ # endif
+L(ret3):
+ 	ret
+ 
+-	.p2align 4
+-L(return_null):
+-	xorl	%eax, %eax
+-	ret
+-
+-END (STRRCHR)
+END(STRRCHR)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-106.patch
+++ b/glibc-RHEL-15696-106.patch
@ -0,0 +1,73 @@
+From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 27 Apr 2022 15:13:02 -0500
+Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interface in fast-jitter.h
+Content-type: text/plain; charset=UTF-8
+
+'get_fast_jitter' is meant to be used purely for performance
+purposes. Wherever it is used, it should be acceptable to get no
+randomness (see default case). An example use case is in setting
+jitter for retries between threads at a lock. There is a
+performance benefit to having jitter, but only if the jitter can
+be generated very quickly and ultimately there is no serious issue
+if no jitter is generated.
+
+The implementation generally uses 'HP_TIMING_NOW' iff it is
+inlined (to avoid any potential syscall paths).
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++
+ 1 file changed, 42 insertions(+)
+ create mode 100644 sysdeps/generic/fast-jitter.h
+
+diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h
+new file mode 100644
+index 00000000..4dd53e34
+--- /dev/null
+++ b/sysdeps/generic/fast-jitter.h
+@@ -0,0 +1,42 @@
+/* Fallback for fast jitter just return 0.
+   Copyright (C) 2019-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _FAST_JITTER_H
+# define _FAST_JITTER_H
+
+# include <stdint.h>
+# include <hp-timing.h>
+
+/* Baseline just return 0.  We could create jitter using a clock or
+   'random_bits' but that may imply a syscall and the goal of
+   'get_fast_jitter' is minimal overhead "randomness" when such
+   randomness helps performance.  Adding high overhead the function
+   defeats the purpose.  */
+static inline uint32_t
+get_fast_jitter (void)
+{
+# if HP_TIMING_INLINE
+  hp_timing_t jitter;
+  HP_TIMING_NOW (jitter);
+  return (uint32_t) jitter;
+# else
+  return 0;
+# endif
+}
+
+#endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-107.patch
+++ b/glibc-RHEL-15696-107.patch
@ -0,0 +1,226 @@
+From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
+From: Wangyang Guo <wangyang.guo@intel.com>
+Date: Fri, 6 May 2022 01:50:10 +0000
+Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
+Content-type: text/plain; charset=UTF-8
+
+When multiple threads are waiting for a lock at the same time, once the
+lock owner releases it, the waiters all see the lock available and try
+to take it, which may cause an expensive CAS storm.
+
+Binary exponential backoff with random jitter is introduced. As the number
+of try-lock attempts increases, it is more likely that a larger number of
+threads are competing for the adaptive mutex, so the wait time is increased
+exponentially. A random jitter is also added to avoid synchronized try-lock
+attempts from other threads.
+
+v2: Remove read-check before try-lock for performance.
+
+v3:
+1. Restore read-check since it works well on some platforms.
+2. Make backoff arch dependent, and enable it for x86_64.
+3. Limit max backoff to reduce latency in large critical section.
+
+v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h
+
+v5: Commit log updated for regression in large critical section.
+
+Result of pthread-mutex-locks bench
+
+Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
+First Row: thread number
+First Col: critical section length
+Values: backoff vs upstream, time based, low is better
+
+non-critical-length: 1
+	1	2	4	8	16	32	64	112	140
+0	0.99	0.58	0.52	0.49	0.43	0.44	0.46	0.52	0.54
+1	0.98	0.43	0.56	0.50	0.44	0.45	0.50	0.56	0.57
+2	0.99	0.41	0.57	0.51	0.45	0.47	0.48	0.60	0.61
+4	0.99	0.45	0.59	0.53	0.48	0.49	0.52	0.64	0.65
+8	1.00	0.66	0.71	0.63	0.56	0.59	0.66	0.72	0.71
+16	0.97	0.78	0.91	0.73	0.67	0.70	0.79	0.80	0.80
+32	0.95	1.17	0.98	0.87	0.82	0.86	0.89	0.90	0.90
+64	0.96	0.95	1.01	1.01	0.98	1.00	1.03	0.99	0.99
+128	0.99	1.01	1.01	1.17	1.08	1.12	1.02	0.97	1.02
+
+non-critical-length: 32
+	1	2	4	8	16	32	64	112	140
+0	1.03	0.97	0.75	0.65	0.58	0.58	0.56	0.70	0.70
+1	0.94	0.95	0.76	0.65	0.58	0.58	0.61	0.71	0.72
+2	0.97	0.96	0.77	0.66	0.58	0.59	0.62	0.74	0.74
+4	0.99	0.96	0.78	0.66	0.60	0.61	0.66	0.76	0.77
+8	0.99	0.99	0.84	0.70	0.64	0.66	0.71	0.80	0.80
+16	0.98	0.97	0.95	0.76	0.70	0.73	0.81	0.85	0.84
+32	1.04	1.12	1.04	0.89	0.82	0.86	0.93	0.91	0.91
+64	0.99	1.15	1.07	1.00	0.99	1.01	1.05	0.99	0.99
+128	1.00	1.21	1.20	1.22	1.25	1.31	1.12	1.10	0.99
+
+non-critical-length: 128
+	1	2	4	8	16	32	64	112	140
+0	1.02	1.00	0.99	0.67	0.61	0.61	0.61	0.74	0.73
+1	0.95	0.99	1.00	0.68	0.61	0.60	0.60	0.74	0.74
+2	1.00	1.04	1.00	0.68	0.59	0.61	0.65	0.76	0.76
+4	1.00	0.96	0.98	0.70	0.63	0.63	0.67	0.78	0.77
+8	1.01	1.02	0.89	0.73	0.65	0.67	0.71	0.81	0.80
+16	0.99	0.96	0.96	0.79	0.71	0.73	0.80	0.84	0.84
+32	0.99	0.95	1.05	0.89	0.84	0.85	0.94	0.92	0.91
+64	1.00	0.99	1.16	1.04	1.00	1.02	1.06	0.99	0.99
+128	1.00	1.06	0.98	1.14	1.39	1.26	1.08	1.02	0.98
+
+There is a regression for large critical sections, but adaptive mutexes
+are aimed at "quick" locks. Small critical sections are more common when
+users choose to use an adaptive pthread_mutex.
+
+Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+
+Conflicts:
+	pthreadP.h
+	(had been moved)
+	nptl/pthread_mutex_lock.c
+	(max_adaptive_count renamed)
+
+---
+ nptl/pthreadP.h                             |  1 +
+ nptl/pthread_mutex_lock.c                   | 16 +++++++--
+ sysdeps/nptl/pthread_mutex_backoff.h        | 35 ++++++++++++++++++
+ sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
+ 4 files changed, 89 insertions(+), 2 deletions(-)
+ create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
+ create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h
+
+diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
+index 7ddc166c..1550e3b6 100644
+--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
+@@ -33,6 +33,7 @@
+ #include <kernel-features.h>
+ #include <errno.h>
+ #include <internal-signals.h>
+#include <pthread_mutex_backoff.h>
+ 
+ 
+ /* Atomic operations on TLS memory.  */
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index d96a9933..c7770fc9 100644
+--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
+@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 	  int cnt = 0;
+ 	  int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
+ 			     mutex->__data.__spins * 2 + 10);
+	  int spin_count, exp_backoff = 1;
+	  unsigned int jitter = get_jitter ();
+ 	  do
+ 	    {
+-	      if (cnt++ >= max_cnt)
+	      /* In each loop, spin count is exponential backoff plus
+		 random jitter, random range is [0, exp_backoff-1].  */
+	      spin_count = exp_backoff + (jitter & (exp_backoff - 1));
+	      cnt += spin_count;
+	      if (cnt >= max_cnt)
+ 		{
+		  /* If cnt exceeds max spin count, just go to wait
+		     queue.  */
+ 		  LLL_MUTEX_LOCK (mutex);
+ 		  break;
+ 		}
+-	      atomic_spin_nop ();
+	      do
+		atomic_spin_nop ();
+	      while (--spin_count > 0);
+	      /* Prepare for next loop.  */
+	      exp_backoff = get_next_backoff (exp_backoff);
+ 	    }
+ 	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
+ 		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
+diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
+new file mode 100644
+index 00000000..5b26c22a
+--- /dev/null
+++ b/sysdeps/nptl/pthread_mutex_backoff.h
+@@ -0,0 +1,35 @@
+/* Pthread mutex backoff configuration.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+static inline unsigned int
+get_jitter (void)
+{
+  /* Arch dependent random jitter, return 0 disables random.  */
+  return 0;
+}
+
+static inline int
+get_next_backoff (int backoff)
+{
+  /* Next backoff, return 1 disables mutex backoff.  */
+  return 1;
+}
+
+#endif
+diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
+new file mode 100644
+index 00000000..ec74c3d9
+--- /dev/null
+++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
+@@ -0,0 +1,39 @@
+/* Pthread mutex backoff configuration.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+#include <fast-jitter.h>
+
+static inline unsigned int
+get_jitter (void)
+{
+  return get_fast_jitter ();
+}
+
+#define MAX_BACKOFF 16
+
+static inline int
+get_next_backoff (int backoff)
+{
+  /* Binary exponential backoff.  Limiting max backoff
+     can reduce latency in large critical sections.  */
+  return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
+}
+
+#endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-108.patch
+++ b/glibc-RHEL-15696-108.patch
@ -0,0 +1,55 @@
+From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 15 Feb 2022 08:18:15 -0600
+Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
+ #28896]
+Content-type: text/plain; charset=UTF-8
+
+In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
+call strcmp-avx2 and wcscmp-avx2 respectively. These would have
+no checks around vzeroupper and would trigger spurious
+aborts. This commit fixes that.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
+AVX2 machines with and without RTM.
+
+Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
+ 1 file changed, 2 insertions(+), 6 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strcmp-avx2.S
+	(split into two patches due to upstream bug differences)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 28cc98b6..e267c6cb 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -345,10 +345,10 @@ L(one_or_less):
+ 	movq	%LOCALE_REG, %rdx
+ #  endif
+ 	jb	L(ret_zero)
+-#  ifdef USE_AS_WCSCMP
+ 	/* 'nbe' covers the case where length is negative (large
+ 	   unsigned).  */
+-	jnbe	__wcscmp_avx2
+	jnbe	OVERFLOW_STRCMP
+#  ifdef USE_AS_WCSCMP
+ 	movl	(%rdi), %edx
+ 	xorl	%eax, %eax
+ 	cmpl	(%rsi), %edx
+@@ -357,10 +357,6 @@ L(one_or_less):
+ 	negl	%eax
+ 	orl	$1, %eax
+ #  else
+-	/* 'nbe' covers the case where length is negative (large
+-	   unsigned).  */
+-
+-	jnbe	__strcmp_avx2
+ 	movzbl	(%rdi), %eax
+ 	movzbl	(%rsi), %ecx
+ 	TOLOWER_gpr (%rax, %eax)
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-109.patch
+++ b/glibc-RHEL-15696-109.patch
@ -0,0 +1,60 @@
+From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
+From: Stefan Liebler <stli@linux.ibm.com>
+Date: Mon, 28 Jun 2021 13:01:07 +0200
+Subject: s390x: Update math: redirect roundeven function
+
+After recent commit
+447954a206837b5f153869cfeeeab44631c3fac9
+"math: redirect roundeven function", building on
+s390x fails with:
+Error: symbol `__roundevenl' is already defined
+
+Similar to aarch64/riscv fix, this patch redirects target
+specific functions for s390x:
+commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
+"Update math: redirect roundeven function"
+
+diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
+index 40b07e054b..0773adfed0 100644
+--- a/sysdeps/s390/fpu/s_roundeven.c
+++ b/sysdeps/s390/fpu/s_roundeven.c
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
+ # include <math.h>
+ # include <libm-alias-double.h>
+ 
+@@ -31,7 +32,6 @@ __roundeven (double x)
+   __asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
+   return y;
+ }
+-hidden_def (__roundeven)
+ libm_alias_double (__roundeven, roundeven)
+ 
+ #else
+diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
+index d2fbf3d2b6..289785bc4a 100644
+--- a/sysdeps/s390/fpu/s_roundevenf.c
+++ b/sysdeps/s390/fpu/s_roundevenf.c
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
+ # include <math.h>
+ # include <libm-alias-float.h>
+ 
+diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
+index 29ab7a8616..94b6459ab4 100644
+--- a/sysdeps/s390/fpu/s_roundevenl.c
+++ b/sysdeps/s390/fpu/s_roundevenl.c
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
+ # include <math.h>
+ # include <math_private.h>
+ # include <libm-alias-ldouble.h>
--- a/glibc-RHEL-15696-11.patch
+++ b/glibc-RHEL-15696-11.patch
@ -0,0 +1,74 @@
+From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 26 Feb 2021 05:36:59 -0800
+Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
+Content-type: text/plain; charset=UTF-8
+
+1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
+by VZEROUPPER inside a transactionally executing RTM region.
+2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
+loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
+1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp.  Add
+Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
+---
+ sysdeps/x86/cpu-features.c                    | 20 +++++++++++++++++--
+ sysdeps/x86/cpu-tunables.c                    |  2 ++
+ ...cpu-features-preferred_feature_index_1.def |  1 +
+ 3 files changed, 21 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 91042505..3610ee5c 100644
+--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
+@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
+ 	cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+ 	  |= bit_arch_Prefer_No_VZEROUPPER;
+       else
+-	cpu_features->preferred[index_arch_Prefer_No_AVX512]
+-	  |= bit_arch_Prefer_No_AVX512;
+	{
+	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
+	    |= bit_arch_Prefer_No_AVX512;
+
+	  /* Avoid RTM abort triggered by VZEROUPPER inside a
+	     transactionally executing RTM region.  */
+	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+	      |= bit_arch_Prefer_No_VZEROUPPER;
+
+	  /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
+	     requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
+	     requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
+	     AVX2 strcmp is faster than EVEX strcmp.  */
+	  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
+	      |= bit_arch_Prefer_AVX2_STRCMP;
+	}
+     }
+   /* This spells out "AuthenticAMD".  */
+   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
+index 3173b2b9..73adbaba 100644
+--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
+@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
+ 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ 						Fast_Copy_Backward,
+ 						disable, 18);
+	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
+		(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
+ 	    }
+ 	  break;
+ 	case 19:
+diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+index 17a5cc42..4ca70b40 100644
+--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
+ BIT (Prefer_FSRM)
+ BIT (Prefer_No_AVX512)
+ BIT (MathVec_Prefer_No_AVX512)
+BIT (Prefer_AVX2_STRCMP)
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-110.patch
+++ b/glibc-RHEL-15696-110.patch
@ -0,0 +1,26 @@
+From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 23 Jun 2021 13:29:41 -0700
+Subject: Update math: redirect roundeven function
+
+Redirect target specific roundeven functions for aarch64, ldbl-128ibm
+and riscv.
+
+Conflicts:
+	sysdeps/aarch64/*
+	(not needed)
+	sysdeps/riscv/*
+	(not supported)
+
+diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+index 6701970f4a..90eecf496b 100644
+--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ 
--- a/glibc-RHEL-15696-12.patch
+++ b/glibc-RHEL-15696-12.patch
--- a/glibc-RHEL-15696-13.patch
+++ b/glibc-RHEL-15696-13.patch
--- a/glibc-RHEL-15696-14.patch
+++ b/glibc-RHEL-15696-14.patch
@ -0,0 +1,242 @@
+From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 06:46:08 -0800
+Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
+instructions using YMM16-YMM31 registers to avoid RTM abort with usable
+AVX512VL since VZEROUPPER isn't needed at function exit.
+---
+ sysdeps/x86_64/multiarch/Makefile             |  1 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 36 +++++++++++++++++++
+ sysdeps/x86_64/multiarch/ifunc-memmove.h      | 21 +++++++++--
+ .../multiarch/memmove-evex-unaligned-erms.S   | 33 +++++++++++++++++
+ .../multiarch/memmove-vec-unaligned-erms.S    | 24 ++++++++-----
+ 5 files changed, 104 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 46783cd1..4563fc56 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memset-avx2-unaligned-erms \
+ 		   memset-avx512-unaligned-erms \
+ 		   memchr-evex \
+		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
+ 		   rawmemchr-evex \
+ 		   stpcpy-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 082e4da3..6bd3abfc 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __memmove_chk_ssse3_back)
+@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_avx512_no_vzeroupper)
+@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __memcpy_chk_ssse3_back)
+@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __memcpy_ssse3_back)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
+@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __mempcpy_chk_ssse3_back)
+@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __mempcpy_ssse3_back)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+index 5e5f0299..6f8bce5f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
+   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx_unaligned_erms);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (evex_unaligned_erms);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx_unaligned_erms);
+ 
+-      return OPTIMIZE (avx_unaligned);
+	  return OPTIMIZE (avx_unaligned);
+	}
+     }
+ 
+   if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+new file mode 100644
+index 00000000..0cbce8f9
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+@@ -0,0 +1,33 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define YMM0		ymm16
+# define YMM1		ymm17
+# define VEC0		ymm16
+# define VEC1		ymm17
+# define VEC2		ymm18
+# define VEC3		ymm19
+# define VEC4		ymm20
+# define VEC5		ymm21
+# define VEC6		ymm22
+# define VEC7		ymm23
+# define VEC8		ymm24
+# define VEC9		ymm25
+# define VEC10		ymm26
+# define VEC11		ymm27
+# define VEC12		ymm28
+# define VEC13		ymm29
+# define VEC14		ymm30
+# define VEC15		ymm31
+# define VEC(i)		VEC##i
+# define VMOVNT		vmovntdq
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+# define VZEROUPPER
+
+# define SECTION(p)		p##.evex
+# define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 274aa1c7..08e21692 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -48,6 +48,14 @@
+ # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
+ #endif
+ 
+#ifndef XMM0
+# define XMM0				xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0				ymm0
+#endif
+
+ #ifndef VZEROUPPER
+ # if VEC_SIZE > 16
+ #  define VZEROUPPER vzeroupper
+@@ -277,20 +285,20 @@ L(less_vec):
+ #if VEC_SIZE > 32
+ L(between_32_63):
+ 	/* From 32 to 63.  No branch when size == 32.  */
+-	vmovdqu	(%rsi), %ymm0
+-	vmovdqu	-32(%rsi,%rdx), %ymm1
+-	vmovdqu	%ymm0, (%rdi)
+-	vmovdqu	%ymm1, -32(%rdi,%rdx)
+	VMOVU	(%rsi), %YMM0
+	VMOVU	-32(%rsi,%rdx), %YMM1
+	VMOVU	%YMM0, (%rdi)
+	VMOVU	%YMM1, -32(%rdi,%rdx)
+ 	VZEROUPPER
+ 	ret
+ #endif
+ #if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	vmovdqu	(%rsi), %xmm0
+-	vmovdqu	-16(%rsi,%rdx), %xmm1
+-	vmovdqu	%xmm0, (%rdi)
+-	vmovdqu	%xmm1, -16(%rdi,%rdx)
+	VMOVU	(%rsi), %XMM0
+	VMOVU	-16(%rsi,%rdx), %XMM1
+	VMOVU	%XMM0, (%rdi)
+	VMOVU	%XMM1, -16(%rdi,%rdx)
+ 	ret
+ #endif
+ L(between_8_15):
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-15.patch
+++ b/glibc-RHEL-15696-15.patch
@ -0,0 +1,254 @@
+From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 07:15:03 -0800
+Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
+with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
+abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
+function exit.
+---
+ sysdeps/x86_64/multiarch/Makefile             |  1 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 22 +++++++++++++++++
+ sysdeps/x86_64/multiarch/ifunc-memset.h       | 24 +++++++++++++++----
+ sysdeps/x86_64/multiarch/ifunc-wmemset.h      | 13 ++++++----
+ .../multiarch/memset-evex-unaligned-erms.S    | 24 +++++++++++++++++++
+ .../multiarch/memset-vec-unaligned-erms.S     | 20 +++++++++++-----
+ 6 files changed, 90 insertions(+), 14 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 4563fc56..1cc0a10e 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memchr-evex \
+ 		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
+		   memset-evex-unaligned-erms \
+ 		   rawmemchr-evex \
+ 		   stpcpy-evex \
+ 		   stpncpy-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 6bd3abfc..7cf83485 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_chk_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memset_chk_avx512_unaligned_erms)
+@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memset_avx512_unaligned_erms)
+@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __wmemset_avx512_unaligned))
+@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __wmemset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __wmemset_chk_avx512_unaligned))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 708bd72e..6f31f4dc 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
+   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx2_unaligned_erms);
+-      else
+-	return OPTIMIZE (avx2_unaligned);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (evex_unaligned_erms);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx2_unaligned_erms);
+
+	  return OPTIMIZE (avx2_unaligned);
+	}
+     }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+index eb242210..9290c4bf 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+@@ -20,6 +20,7 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
+ 
+ static inline void *
+@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx512_unaligned);
+-      else
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	return OPTIMIZE (evex_unaligned);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2_unaligned);
+     }
+ 
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+new file mode 100644
+index 00000000..ae0a4d6e
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -0,0 +1,24 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define XMM0		xmm16
+# define YMM0		ymm16
+# define VEC0		ymm16
+# define VEC(i)		VEC##i
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+# define VZEROUPPER
+
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movq r, %rax; \
+  vpbroadcastb d, %VEC0
+
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movq r, %rax; \
+  vpbroadcastd d, %VEC0
+
+# define SECTION(p)		p##.evex
+# define MEMSET_SYMBOL(p,s)	p##_evex_##s
+# define WMEMSET_SYMBOL(p,s)	p##_evex_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 9a0fd818..71e91a8f 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -34,6 +34,14 @@
+ # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
+ #endif
+ 
+#ifndef XMM0
+# define XMM0				xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0				ymm0
+#endif
+
+ #ifndef VZEROUPPER
+ # if VEC_SIZE > 16
+ #  define VZEROUPPER			vzeroupper
+@@ -67,7 +75,7 @@
+ ENTRY (__bzero)
+ 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+ 	mov	%RSI_LP, %RDX_LP /* Set n.  */
+-	pxor	%xmm0, %xmm0
+	pxor	%XMM0, %XMM0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+ weak_alias (__bzero, bzero)
+@@ -223,7 +231,7 @@ L(less_vec):
+ 	cmpb	$16, %dl
+ 	jae	L(between_16_31)
+ # endif
+-	MOVQ	%xmm0, %rcx
+	MOVQ	%XMM0, %rcx
+ 	cmpb	$8, %dl
+ 	jae	L(between_8_15)
+ 	cmpb	$4, %dl
+@@ -238,16 +246,16 @@ L(less_vec):
+ # if VEC_SIZE > 32
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	vmovdqu	%ymm0, -32(%rdi,%rdx)
+-	vmovdqu	%ymm0, (%rdi)
+	VMOVU	%YMM0, -32(%rdi,%rdx)
+	VMOVU	%YMM0, (%rdi)
+ 	VZEROUPPER
+ 	ret
+ # endif
+ # if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	vmovdqu	%xmm0, -16(%rdi,%rdx)
+-	vmovdqu	%xmm0, (%rdi)
+	VMOVU	%XMM0, -16(%rdi,%rdx)
+	VMOVU	%XMM0, (%rdi)
+ 	VZEROUPPER
+ 	ret
+ # endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-16.patch
+++ b/glibc-RHEL-15696-16.patch
@ -0,0 +1,561 @@
+From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 07:20:28 -0800
+Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
+instructions using YMM16-YMM31 registers to avoid RTM abort with usable
+AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
+exit.
+---
+ sysdeps/x86_64/multiarch/Makefile             |   4 +-
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  10 +
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h       |  13 +-
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S  | 440 ++++++++++++++++++
+ sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S |   4 +
+ 5 files changed, 467 insertions(+), 4 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 1cc0a10e..9d79b138 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memset-avx2-unaligned-erms \
+ 		   memset-avx512-unaligned-erms \
+ 		   memchr-evex \
+		   memcmp-evex-movbe \
+ 		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
+ 		   memset-evex-unaligned-erms \
+@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcsncmp-evex \
+ 		   wcsnlen-evex \
+ 		   wcsrchr-evex \
+-		   wmemchr-evex
+		   wmemchr-evex \
+		   wmemcmp-evex-movbe
+ endif
+ 
+ ifeq ($(subdir),debug)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 7cf83485..c8da910e 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_avx2_movbe)
+	      IFUNC_IMPL_ADD (array, i, memcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (MOVBE)),
+			      __memcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
+ 			      __memcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
+@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_avx2_movbe)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (MOVBE)),
+			      __wmemcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
+ 			      __wmemcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 6c1f3153..3ca1f0a6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2_movbe);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex_movbe);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2_movbe);
+    }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+     return OPTIMIZE (sse4_1);
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+new file mode 100644
+index 00000000..9c093972
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -0,0 +1,440 @@
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+      to avoid branches.
+   2. Use overlapping compare to avoid branch.
+   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+      bytes for wmemcmp.
+   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+      area.
+   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+# include <sysdep.h>
+
+/* Default symbol name; a wrapper may define MEMCMP before including this
+   file (wmemcmp-evex-movbe.S does).  */
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_evex_movbe
+# endif
+
+# define VMOVU		vmovdqu64
+
+/* Equality compare that writes a mask register: 4-byte elements for
+   wmemcmp, 1-byte elements for memcmp.  */
+# ifdef USE_AS_WMEMCMP
+#  define VPCMPEQ	vpcmpeqd
+# else
+#  define VPCMPEQ	vpcmpeqb
+# endif
+
+/* All vector registers used here are xmm17-xmm18 / ymm17-ymm22, i.e. in
+   the 16-31 range that is only reachable with EVEX encoding.  */
+# define XMM1		xmm17
+# define XMM2		xmm18
+# define YMM1		ymm17
+# define YMM2		ymm18
+# define YMM3		ymm19
+# define YMM4		ymm20
+# define YMM5		ymm21
+# define YMM6		ymm22
+
+/* VEC_MASK and XMM_MASK are the "every element equal" mask values for a
+   32-byte and a 16-byte compare: one bit per 4-byte element for wmemcmp,
+   one bit per byte for memcmp.  */
+# define VEC_SIZE 32
+# ifdef USE_AS_WMEMCMP
+#  define VEC_MASK 0xff
+#  define XMM_MASK 0xf
+# else
+#  define VEC_MASK 0xffffffff
+#  define XMM_MASK 0xffff
+# endif
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.evex,"ax",@progbits
+ENTRY (MEMCMP)
+	/* The code below reads the two memory areas through %rdi and %rsi,
+	   takes the length from %rdx and leaves the result in %eax.  */
+# ifdef USE_AS_WMEMCMP
+	/* Scale the element count by 4 to get a length in bytes.  */
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec)
+
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* %eax becomes zero iff every element compared equal.  */
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_vec)
+
+	/* More than 2 * VEC.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+
+	/* From 4 * VEC to 8 * VEC, inclusively. */
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ (%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+
+	/* AND the four masks together so one test covers all 4 vectors.  */
+	kandd	%k1, %k2, %k5
+	kandd	%k3, %k4, %k6
+	kandd	%k5, %k6, %k6
+
+	kmovd	%k6, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+
+	/* Compare the last 4 * VEC of both areas; this may overlap the
+	   4 * VEC already compared above.  */
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ (%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	kandd	%k1, %k2, %k5
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	kandd	%k3, %k5, %k5
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	kandd	%k4, %k5, %k5
+
+	kmovd	%k5, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+L(last_vec):
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	/* %eax is zero here, which is the "equal" return value.  */
+	ret
+
+	.p2align 4
+L(first_vec):
+	/* A byte or int32 is different within 16 or 32 bytes.  */
+	/* %eax holds (compare mask - all-ones mask), so its lowest set bit
+	   is the index of the first element that differs.  */
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(%rdi, %rcx, 4), %edx
+	cmpl	(%rsi, %rcx, 4), %edx
+L(wmemcmp_return):
+	/* Expects %eax == 0 and flags from a cmpl of the two differing
+	   elements: produce -1 on signed "less", 1 otherwise.  */
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+
+# ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(4):
+	/* Exactly one 4-byte element.  */
+	xorl	%eax, %eax
+	movl	(%rdi), %edx
+	cmpl	(%rsi), %edx
+	jne	L(wmemcmp_return)
+	ret
+# else
+	.p2align 4
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.  */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	je	L(exit)
+	/* The borrow from subq is set iff the first value is smaller:
+	   sbbl gives -1 or 0, and orl turns the 0 into 1.  */
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	ret
+
+	.p2align 4
+L(exit):
+	ret
+
+	.p2align 4
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movb	-1(%rdi, %rdx), %al
+	movb	-1(%rsi, %rdx), %cl
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
+	ret
+
+	.p2align 4
+L(1):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	subl	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+	cmpb	$4, %dl
+	je	L(4)
+	jb	L(zero)
+# else
+	cmpb	$1, %dl
+	je	L(1)
+	jb	L(zero)
+	cmpb	$4, %dl
+	jb	L(between_2_3)
+	cmpb	$8, %dl
+	jb	L(between_4_7)
+# endif
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+	/* It is between 8 and 15 bytes.  */
+	vmovq	(%rdi), %XMM1
+	vmovq	(%rsi), %XMM2
+	VPCMPEQ %XMM1, %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-8(%rdi, %rdx), %rdi
+	leaq	-8(%rsi, %rdx), %rsi
+	vmovq	(%rdi), %XMM1
+	vmovq	(%rsi), %XMM2
+	VPCMPEQ %XMM1, %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	VMOVU	(%rsi), %XMM2
+	VPCMPEQ (%rdi), %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-16(%rdi, %rdx), %rdi
+	leaq	-16(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %XMM2
+	VPCMPEQ (%rdi), %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(more_8x_vec):
+	/* More than 8 * VEC.  Check the first VEC.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Align the first memory area for aligned loads in the loop.
+	   Compute how much the first memory area is misaligned.  */
+	movq	%rdi, %rcx
+	andl	$(VEC_SIZE - 1), %ecx
+	/* Get the negative of offset for alignment.  */
+	subq	$VEC_SIZE, %rcx
+	/* Adjust the second memory area.  */
+	subq	%rcx, %rsi
+	/* Adjust the first memory area which should be aligned now.  */
+	subq	%rcx, %rdi
+	/* Adjust length.  */
+	addq	%rcx, %rdx
+
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ (%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	kandd	%k2, %k1, %k5
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	kandd	%k3, %k5, %k5
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	kandd	%k4, %k5, %k5
+
+	kmovd	%k5, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rsi
+
+	subq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jae	L(loop_4x_vec)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(last_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_2x_vec)
+
+L(last_4x_vec):
+	/* From 2 * VEC to 4 * VEC. */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	/* %k1-%k4 still hold the per-vector compare masks.  Test them in
+	   order to find the first vector that contains a difference.  */
+	kmovd	%k1, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x1)
+	kmovd	%k3, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x2)
+	/* The first three vectors matched, so the difference is in the
+	   fourth one; no branch is needed.  */
+	kmovd	%k4, %eax
+	subl	$VEC_MASK, %eax
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	/* Same as L(first_vec), for the vector at offset VEC_SIZE.  */
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
+	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	/* Same as L(first_vec), for the vector at offset 2 * VEC_SIZE.  */
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	ret
+END (MEMCMP)
+#endif
+diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+new file mode 100644
+index 00000000..4726d74a
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+@@ -0,0 +1,4 @@
+/* Build memcmp-evex-movbe.S under the name __wmemcmp_evex_movbe with its
+   USE_AS_WMEMCMP code paths selected.  */
+#define MEMCMP __wmemcmp_evex_movbe
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-evex-movbe.S"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-17.patch
+++ b/glibc-RHEL-15696-17.patch
--- a/glibc-RHEL-15696-18.patch
+++ b/glibc-RHEL-15696-18.patch
@ -0,0 +1,735 @@
+From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 23 Feb 2021 06:33:10 -0800
+Subject: [PATCH] x86: Add string/memory function tests in RTM region
+Content-type: text/plain; charset=UTF-8
+
+At function exit, AVX optimized string/memory functions have VZEROUPPER
+which triggers RTM abort.   When such functions are called inside a
+transactionally executing RTM region, RTM abort causes severe performance
+degradation.  Add tests to verify that string/memory functions won't
+cause RTM abort in RTM region.
+---
+ sysdeps/x86/Makefile          | 23 +++++++++++
+ sysdeps/x86/tst-memchr-rtm.c  | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memcmp-rtm.c  | 52 +++++++++++++++++++++++++
+ sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memset-rtm.c  | 45 ++++++++++++++++++++++
+ sysdeps/x86/tst-strchr-rtm.c  | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-strcpy-rtm.c  | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-string-rtm.h  | 72 +++++++++++++++++++++++++++++++++++
+ sysdeps/x86/tst-strlen-rtm.c  | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
+ sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
+ 12 files changed, 618 insertions(+)
+ create mode 100644 sysdeps/x86/tst-memchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
+ create mode 100644 sysdeps/x86/tst-memmove-rtm.c
+ create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-memset-rtm.c
+ create mode 100644 sysdeps/x86/tst-strchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
+ create mode 100644 sysdeps/x86/tst-string-rtm.h
+ create mode 100644 sysdeps/x86/tst-strlen-rtm.c
+ create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
+ create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
+
+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+index 59e928e9..5be71ada 100644
+--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
+@@ -17,6 +17,29 @@ endif
+ 
+ ifeq ($(subdir),string)
+ sysdep_routines += cacheinfo
+
+tests += \
+  tst-memchr-rtm \
+  tst-memcmp-rtm \
+  tst-memmove-rtm \
+  tst-memrchr-rtm \
+  tst-memset-rtm \
+  tst-strchr-rtm \
+  tst-strcpy-rtm \
+  tst-strlen-rtm \
+  tst-strncmp-rtm \
+  tst-strrchr-rtm
+
+CFLAGS-tst-memchr-rtm.c += -mrtm
+CFLAGS-tst-memcmp-rtm.c += -mrtm
+CFLAGS-tst-memmove-rtm.c += -mrtm
+CFLAGS-tst-memrchr-rtm.c += -mrtm
+CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcpy-rtm.c += -mrtm
+CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strrchr-rtm.c += -mrtm
+ endif
+ 
+ ifneq ($(enable-cet),no)
+diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
+new file mode 100644
+index 00000000..e4749401
+--- /dev/null
+++ b/sysdeps/x86/tst-memchr-rtm.c
+@@ -0,0 +1,54 @@
+/* Test case for memchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+/* Fill string1 with 'a', mark offsets 100 and STRING_SIZE - 100 with 'c',
+   then confirm that memchr reports the first marker.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[STRING_SIZE - 100] = 'c';
+  string1[100] = 'c';
+  char *hit = memchr (string1, 'c', STRING_SIZE);
+  return hit == &string1[100] ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Return 0 if memchr finds the 'c' at offset 100 of string1, 1 otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *hit = memchr (string1, 'c', STRING_SIZE);
+  return hit != &string1[100];
+}
+
+/* Run the memchr check: LOOP iterations of function via do_test_1.  */
+static int
+do_test (void)
+{
+  return do_test_1 ("memchr", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
+new file mode 100644
+index 00000000..e4c8a623
+--- /dev/null
+++ b/sysdeps/x86/tst-memcmp-rtm.c
+@@ -0,0 +1,52 @@
+/* Test case for memcmp inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+/* Make string1 and string2 identical runs of 'a' and verify that memcmp
+   reports them equal.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string2, 'a', STRING_SIZE);
+  memset (string1, 'a', STRING_SIZE);
+  int diff = memcmp (string1, string2, STRING_SIZE);
+  return diff != 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+/* Return 0 when string1 and string2 compare equal, 1 otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  return memcmp (string1, string2, STRING_SIZE) != 0;
+}
+
+/* Run the memcmp check: LOOP iterations of function via do_test_1.  */
+static int
+do_test (void)
+{
+  return do_test_1 ("memcmp", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
+new file mode 100644
+index 00000000..4bf97ef1
+--- /dev/null
+++ b/sysdeps/x86/tst-memmove-rtm.c
+@@ -0,0 +1,53 @@
+/* Test case for memmove inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+/* Fill string1 with 'a', copy it to string2 with memmove and verify both
+   the returned pointer and the copied contents.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  if (memmove (string2, string1, STRING_SIZE) != string2)
+    return EXIT_FAILURE;
+  if (memcmp (string2, string1, STRING_SIZE) != 0)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
+
+/* Copy string1 to string2 with memmove.  Return 0 if memmove returned
+   string2 and the two buffers then compare equal, 1 otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (memmove (string2, string1, STRING_SIZE) != string2)
+    return 1;
+  return memcmp (string2, string1, STRING_SIZE) != 0;
+}
+
+/* Run the memmove check: LOOP iterations of function via do_test_1.  */
+static int
+do_test (void)
+{
+  return do_test_1 ("memmove", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
+new file mode 100644
+index 00000000..a57a5a8e
+--- /dev/null
+++ b/sysdeps/x86/tst-memrchr-rtm.c
+@@ -0,0 +1,54 @@
+/* Test case for memrchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+/* Fill string1 with 'a', mark offsets 100 and STRING_SIZE - 100 with 'c',
+   then confirm that memrchr reports the last marker.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[STRING_SIZE - 100] = 'c';
+  string1[100] = 'c';
+  char *hit = memrchr (string1, 'c', STRING_SIZE);
+  return hit == &string1[STRING_SIZE - 100] ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Return 0 if memrchr finds the 'c' at offset STRING_SIZE - 100 of
+   string1, 1 otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *hit = memrchr (string1, 'c', STRING_SIZE);
+  return hit != &string1[STRING_SIZE - 100];
+}
+
+/* Run the memrchr check: LOOP iterations of function via do_test_1.  */
+static int
+do_test (void)
+{
+  return do_test_1 ("memrchr", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
+new file mode 100644
+index 00000000..bf343a4d
+--- /dev/null
+++ b/sysdeps/x86/tst-memset-rtm.c
+@@ -0,0 +1,45 @@
+/* Test case for memset inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+/* Fill string1 with 'a'.  There is nothing to verify here, so this always
+   returns EXIT_SUCCESS.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  return EXIT_SUCCESS;
+}
+
+/* Repeat the memset of string1.  The result is not inspected, so this
+   always returns 0.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  return 0;
+}
+
+/* Run the memset check: LOOP iterations of function via do_test_1.  */
+static int
+do_test (void)
+{
+  return do_test_1 ("memset", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
+new file mode 100644
+index 00000000..a82e29c0
+--- /dev/null
+++ b/sysdeps/x86/tst-strchr-rtm.c
+@@ -0,0 +1,54 @@
+/* Test case for strchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+/* Fill all but the last byte of string1 with 'a', mark offsets 100 and
+   STRING_SIZE - 100 with 'c', then confirm that strchr reports the first
+   marker.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = 'c';
+  string1[100] = 'c';
+  char *hit = strchr (string1, 'c');
+  return hit == &string1[100] ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Return 0 if strchr finds the 'c' at offset 100 of string1, 1 otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *hit = strchr (string1, 'c');
+  return hit != &string1[100];
+}
+
+/* Run the strchr check: LOOP iterations of function via do_test_1.  */
+static int
+do_test (void)
+{
+  return do_test_1 ("strchr", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
+new file mode 100644
+index 00000000..2b2a583f
+--- /dev/null
+++ b/sysdeps/x86/tst-strcpy-rtm.c
+@@ -0,0 +1,53 @@
+/* Test case for strcpy inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+/* Fill all but the last byte of string1 with 'a', copy it to string2 with
+   strcpy and verify both the returned pointer and the copied string.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  if (strcpy (string2, string1) != string2)
+    return EXIT_FAILURE;
+  if (strcmp (string2, string1) != 0)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
+
+/* Copy string1 to string2 with strcpy.  Return 0 if strcpy returned
+   string2 and the two strings then compare equal, 1 otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (strcpy (string2, string1) != string2)
+    return 1;
+  return strcmp (string2, string1) != 0;
+}
+
+/* Run the strcpy check: LOOP iterations of function via do_test_1.  */
+static int
+do_test (void)
+{
+  return do_test_1 ("strcpy", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
+new file mode 100644
+index 00000000..d2470afa
+--- /dev/null
+++ b/sysdeps/x86/tst-string-rtm.h
+@@ -0,0 +1,72 @@
+/* Test string function in a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <x86intrin.h>
+#include <sys/platform/x86.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+
+/* Run FUNCTION LOOP times inside an RTM transaction and check the abort
+   rate.  NAME is only used in failure messages.  PREPARE runs once up
+   front; if it does not return EXIT_SUCCESS, its status is returned
+   unchanged.  Returns EXIT_UNSUPPORTED when RTM is not usable and
+   EXIT_SUCCESS otherwise.  FAIL_EXIT1 is invoked if any FUNCTION call
+   returns nonzero, or if more than 5% of the iterations failed to run
+   transactionally.  */
+static int
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
+	   int (*function) (void))
+{
+  if (!CPU_FEATURE_USABLE (RTM))
+    return EXIT_UNSUPPORTED;
+
+  int status = prepare ();
+  if (status != EXIT_SUCCESS)
+    return status;
+
+  unsigned int naborts = 0;
+  unsigned int failed = 0;
+  for (unsigned int i = 0; i < loop; i++)
+    {
+      /* Call FUNCTION once outside of any transaction first.  */
+      failed |= function ();
+      if (_xbegin() == _XBEGIN_STARTED)
+	{
+	  /* Transactional execution.  */
+	  failed |= function ();
+	  _xend();
+	}
+      else
+	{
+	  /* _xbegin did not report a started transaction: still run
+	     FUNCTION, and count this iteration as an abort.  */
+	  failed |= function ();
+	  ++naborts;
+	}
+    }
+
+  if (failed)
+    FAIL_EXIT1 ("%s() failed", name);
+
+  if (naborts)
+    {
+      /* NB: Low single digit (<= 5%) noise-level aborts are normal for
+	 TSX.  */
+      double rate = 100 * ((double) naborts) / ((double) loop);
+      if (rate > 5)
+	/* naborts and loop are unsigned int, so print them with %u.  */
+	FAIL_EXIT1 ("TSX abort rate: %.2f%% (%u out of %u)",
+		    rate, naborts, loop);
+    }
+
+  return EXIT_SUCCESS;
+}
+
+static int do_test (void);
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
+new file mode 100644
+index 00000000..0dcf14db
+--- /dev/null
+++ b/sysdeps/x86/tst-strlen-rtm.c
+@@ -0,0 +1,53 @@
+/* Test case for strlen inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = '\0';
+  size_t len = strlen (string1);
+  if (len == STRING_SIZE - 100)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  size_t len = strlen (string1);
+  if (len == STRING_SIZE - 100)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strlen", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+new file mode 100644
+index 00000000..236ad951
+--- /dev/null
+++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -0,0 +1,52 @@
+/* Test case for strncmp inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  memset (string2, 'a', STRING_SIZE - 1);
+  if (strncmp (string1, string2, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (strncmp (string1, string2, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strncmp", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
+new file mode 100644
+index 00000000..e32bfaf5
+--- /dev/null
+++ b/sysdeps/x86/tst-strrchr-rtm.c
+@@ -0,0 +1,53 @@
+/* Test case for strrchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = strrchr (string1, 'c');
+  if (p == &string1[STRING_SIZE - 100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = strrchr (string1, 'c');
+  if (p == &string1[STRING_SIZE - 100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strrchr", LOOP, prepare, function);
+}
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-19.patch
+++ b/glibc-RHEL-15696-19.patch
@ -0,0 +1,148 @@
+From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sun, 7 Mar 2021 09:44:18 -0800
+Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
+with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
+with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
+function exit.
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c       | 14 +++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-memset.h          | 13 ++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-wmemset.h         | 12 ++++++------
+ .../multiarch/memset-avx512-unaligned-erms.S     | 16 ++++++++--------
+ 4 files changed, 31 insertions(+), 24 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index c1efeec0..d969a156 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __wmemset_avx512_unaligned))
+ 
+ #ifdef SHARED
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 6f3375cc..19795938 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_no_vzeroupper);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx512_unaligned_erms);
+ 
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx512_unaligned_erms);
+	  return OPTIMIZE (avx512_unaligned);
+	}
+ 
+-      return OPTIMIZE (avx512_unaligned);
+      return OPTIMIZE (avx512_no_vzeroupper);
+     }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+index bdc94c6c..98c5d406 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_unaligned);
+-
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+-	return OPTIMIZE (evex_unaligned);
+	{
+	  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	    return OPTIMIZE (avx512_unaligned);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ 	return OPTIMIZE (avx2_unaligned_rtm);
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 0783979c..22e7b187 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -1,22 +1,22 @@
+ #if IS_IN (libc)
+ # define VEC_SIZE	64
+-# define VEC(i)		zmm##i
+# define XMM0		xmm16
+# define YMM0		ymm16
+# define VEC0		zmm16
+# define VEC(i)		VEC##i
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
+# define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+   movq r, %rax; \
+-  vpbroadcastb %xmm0, %xmm0; \
+-  vpbroadcastq %xmm0, %zmm0
+  vpbroadcastb d, %VEC0
+ 
+ # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+   movq r, %rax; \
+-  vpbroadcastd %xmm0, %xmm0; \
+-  vpbroadcastq %xmm0, %zmm0
+  vpbroadcastd d, %VEC0
+ 
+-# define SECTION(p)		p##.avx512
+# define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
+ 
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-2.patch
+++ b/glibc-RHEL-15696-2.patch
@ -0,0 +1,230 @@
+From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:25:56 -0800
+Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memcmp/wmemcmp for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
+	length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
+	* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
+	tst-size_t-wmemcmp.
+	* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
+---
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S |  7 +-
+ sysdeps/x86_64/multiarch/memcmp-sse4.S       |  9 ++-
+ sysdeps/x86_64/multiarch/memcmp-ssse3.S      |  7 +-
+ sysdeps/x86_64/x32/Makefile                  |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-memcmp.c       | 76 ++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemcmp.c      | 20 ++++++
+ 6 files changed, 114 insertions(+), 9 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 30f764c3..e3a35b89 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -58,9 +58,12 @@
+ 	.section .text.avx,"ax",@progbits
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+ # endif
+-	cmpq	$VEC_SIZE, %rdx
+	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+index 8e164f2c..302900f5 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+@@ -42,13 +42,16 @@
+ 	.section .text.sse4.1,"ax",@progbits
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+ # endif
+ 	pxor	%xmm0, %xmm0
+-	cmp	$79, %rdx
+	cmp	$79, %RDX_LP
+ 	ja	L(79bytesormore)
+ # ifndef USE_AS_WMEMCMP
+-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
+ 	je	L(firstbyte)
+ # endif
+ 	add	%rdx, %rsi
+diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+index 6f76c641..69d030fc 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+@@ -33,9 +33,12 @@
+ 	atom_text_section
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
+-	test	%rdx, %rdx
+	shl	$2, %RDX_LP
+	test	%RDX_LP, %RDX_LP
+ 	jz	L(equal)
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+ # endif
+ 	mov	%rdx, %rcx
+ 	mov	%rdi, %rdx
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 7d528889..ddec7f04 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr
+tests += tst-size_t-memchr tst-size_t-memcmp
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
+new file mode 100644
+index 00000000..9bd6fdb4
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
+@@ -0,0 +1,76 @@
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifdef WIDE
+# define TEST_NAME "wmemcmp"
+#else
+# define TEST_NAME "memcmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <inttypes.h>
+# include <wchar.h>
+
+# define MEMCMP wmemcmp
+# define CHAR wchar_t
+#else
+# define MEMCMP memcmp
+# define CHAR char
+#endif
+
+IMPL (MEMCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_memcmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  memcpy (buf1, buf2, page_size);
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_memcmp (dest, src);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+new file mode 100644
+index 00000000..e8b5ffd0
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+@@ -0,0 +1,20 @@
+/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-memcmp.c"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-20.patch
+++ b/glibc-RHEL-15696-20.patch
@ -0,0 +1,164 @@
+From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sun, 7 Mar 2021 09:45:23 -0800
+Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memmove.h to select the function optimized with AVX512
+instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
+AVX512VL since VZEROUPPER isn't needed at function exit.
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 24 +++++++++---------
+ sysdeps/x86_64/multiarch/ifunc-memmove.h      | 12 +++++----
+ .../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
+ 3 files changed, 42 insertions(+), 19 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index d969a156..fec384f6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
+ 			      __memmove_ssse3_back)
+@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memcpy_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memcpy_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __mempcpy_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __mempcpy_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+index fa09b9fb..014e95c7 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_no_vzeroupper);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx512_unaligned_erms);
+ 
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx512_unaligned_erms);
+	  return OPTIMIZE (avx512_unaligned);
+	}
+ 
+-      return OPTIMIZE (avx512_unaligned);
+      return OPTIMIZE (avx512_no_vzeroupper);
+     }
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+index aac1515c..848848ab 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+@@ -1,11 +1,32 @@
+ #if IS_IN (libc)
+ # define VEC_SIZE	64
+-# define VEC(i)		zmm##i
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define YMM0		ymm16
+# define YMM1		ymm17
+# define VEC0		zmm16
+# define VEC1		zmm17
+# define VEC2		zmm18
+# define VEC3		zmm19
+# define VEC4		zmm20
+# define VEC5		zmm21
+# define VEC6		zmm22
+# define VEC7		zmm23
+# define VEC8		zmm24
+# define VEC9		zmm25
+# define VEC10		zmm26
+# define VEC11		zmm27
+# define VEC12		zmm28
+# define VEC13		zmm29
+# define VEC14		zmm30
+# define VEC15		zmm31
+# define VEC(i)		VEC##i
+ # define VMOVNT		vmovntdq
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
+# define VZEROUPPER
+ 
+-# define SECTION(p)		p##.avx512
+# define SECTION(p)		p##.evex512
+ # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
+ 
+ # include "memmove-vec-unaligned-erms.S"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-21.patch
+++ b/glibc-RHEL-15696-21.patch
@ -0,0 +1,71 @@
+From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
+From: Sunil K Pandey <skpgkp2@gmail.com>
+Date: Thu, 1 Apr 2021 15:47:04 -0700
+Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
+Content-type: text/plain; charset=UTF-8
+
+Fix some indentations of ifdef in file strlen-evex.S which are off by 1
+and confusing to read.
+---
+ sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
+index cd022509..05838190 100644
+--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
+@@ -276,10 +276,10 @@ L(last_2x_vec):
+ 	.p2align 4
+ L(first_vec_x0_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
+#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -293,10 +293,10 @@ L(first_vec_x0_check):
+ 	.p2align 4
+ L(first_vec_x1_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
+#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -311,10 +311,10 @@ L(first_vec_x1_check):
+ 	.p2align 4
+ L(first_vec_x2_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
+#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -329,10 +329,10 @@ L(first_vec_x2_check):
+ 	.p2align 4
+ L(first_vec_x3_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
+#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-22.patch
+++ b/glibc-RHEL-15696-22.patch
@ -0,0 +1,51 @@
+From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 19 Apr 2021 07:07:21 -0700
+Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
+Content-type: text/plain; charset=UTF-8
+
+Since __strlen_evex and __strnlen_evex added by
+
+commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Fri Mar 5 06:24:52 2021 -0800
+
+    x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
+
+use sarx:
+
+c4 e2 6a f7 c0       	sarx   %edx,%eax,%eax
+
+require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
+ifunc-avx2.h already requires BMI2 for EVEX implementation.
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index fec384f6..cbfc1a5d 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __strlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+ 
+@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __strnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strnlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+ 
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-23.patch
+++ b/glibc-RHEL-15696-23.patch
@ -0,0 +1,584 @@
+From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 3 May 2021 03:01:58 -0400
+Subject: [PATCH] x86: Optimize memchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memchr-avx2.S. The optimizations include
+replacing some branches with cmovcc, avoiding some branches entirely
+in the less_4x_vec case, making the page cross logic less strict, and
+saving a few instructions in the loop return path. test-memchr,
+test-rawmemchr, and test-wmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
+ 1 file changed, 247 insertions(+), 178 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index cf893e77..b377f22e 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -26,8 +26,22 @@
+ 
+ # ifdef USE_AS_WMEMCHR
+ #  define VPCMPEQ	vpcmpeqd
+#  define VPBROADCAST	vpbroadcastd
+#  define CHAR_SIZE	4
+ # else
+ #  define VPCMPEQ	vpcmpeqb
+#  define VPBROADCAST	vpbroadcastb
+#  define CHAR_SIZE	1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+#  define ERAW_PTR_REG	ecx
+#  define RRAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define ERAW_PTR_REG	edi
+#  define RRAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+ # endif
+ 
+ # ifndef VZEROUPPER
+@@ -39,6 +53,7 @@
+ # endif
+ 
+ # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(null)
+ # endif
+-	movl	%edi, %ecx
+-	/* Broadcast CHAR to YMM0.  */
+-	vmovd	%esi, %xmm0
+ # ifdef USE_AS_WMEMCHR
+ 	shl	$2, %RDX_LP
+-	vpbroadcastd %xmm0, %ymm0
+ # else
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+ #  endif
+-	vpbroadcastb %xmm0, %ymm0
+ # endif
+	/* Broadcast CHAR to YMMMATCH.  */
+	vmovd	%esi, %xmm0
+	VPBROADCAST %xmm0, %ymm0
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
+ # ifndef USE_AS_RAWMEMCHR
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rdx
+-	jbe	L(zero)
+-# else
+-	jnz	L(first_vec_x0)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax), %rax
+	cmovle	%rcx, %rax
+	VZEROUPPER_RETURN
+ 
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+L(null):
+	xorl	%eax, %eax
+	ret
+ # endif
+-	jmp	L(more_4x_vec)
+-
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is necessary
+	   for computer return address if byte is found or adjusting length
+	   if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+	   rdi for rawmemchr.  */
+	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Calculate length until end of page (length checked for a
+	   match).  */
+	leaq	1(%ALGN_PTR_REG), %rsi
+	subq	%RRAW_PTR_REG, %rsi
+# endif
+ 	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	tzcntl	%eax, %eax
+	sarxl	%ERAW_PTR_REG, %eax, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+	addq	%RRAW_PTR_REG, %rax
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+-L(aligned_more):
+-# ifndef USE_AS_RAWMEMCHR
+-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+-	   overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	incq	%rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+-	/* Check the end of data.  */
+-	subq	%rcx, %rdx
+-	jbe	L(zero)
+-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+-	addq	$VEC_SIZE, %rdi
+ 
+-# ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+-L(more_4x_vec):
+	.p2align 4
+L(aligned_more):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+	/* Align data to VEC_SIZE - 1.  */
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	orq	$(VEC_SIZE - 1), %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+ 
+ # ifndef USE_AS_RAWMEMCHR
+	/* Check if at last VEC_SIZE * 4 length.  */
+ 	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+-
+-# ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+	   length.  */
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+ 	addq	%rcx, %rdx
+# else
+	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+ # endif
+ 
+	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+-
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
+ 	vpor	%ymm1, %ymm2, %ymm5
+ 	vpor	%ymm3, %ymm4, %ymm6
+ 	vpor	%ymm5, %ymm6, %ymm5
+ 
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+	vpmovmskb %ymm5, %ecx
+ # ifdef USE_AS_RAWMEMCHR
+-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
+ # else
+-	subq	$(VEC_SIZE * 4), %rdx
+-	ja	L(loop_4x_vec)
+	testl	%ecx, %ecx
+	jnz	L(loop_4x_vec_end)
+ 
+-L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %edx
+-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+ 
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	/* Fall through into less than 4 remaining vectors of length case.
+	 */
+	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+	/* If remaining length > VEC_SIZE * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jg	L(last_4x_vec)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < VEC_SIZE.  */
+	addl	$VEC_SIZE, %edx
+	jle	L(zero_end)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
+-	jnz	L(first_vec_x3_check)
+-	xorl	%eax, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+L(zero_end):
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %edx
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+	jnz	L(last_vec_x1_return)
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
+-
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	xorl	%eax, %eax
+-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x2_return)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
+-	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+	vpmovmskb %ymm3, %eax
+	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
+# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
+ 
+ 	.p2align 4
+ L(first_vec_x1_check):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$VEC_SIZE, %rax
+	/* Adjust length.  */
+	subl	$-(VEC_SIZE * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	incq	%rdi
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+	.p2align 4
+L(set_zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
+L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4 - 1), %rdi
+# else
+	incq	%rdi
+# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
+L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE + 1), %rdi
+# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+# ifndef USE_AS_RAWMEMCHR
+ 	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	jmp     L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+ 
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jle	L(last_2x_vec)
+ 	.p2align 4
+-L(null):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+L(last_4x_vec):
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
+ 
+-	.p2align 4
+-L(first_vec_x0):
+-	tzcntl	%eax, %eax
+-	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+ 
+-	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %eax
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
+	/* Create mask for possible matches within remaining length.  */
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+ 
+-	.p2align 4
+-L(first_vec_x2):
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* if remaining length <= VEC_SIZE * 3 (Note this is after
+	   remaining length was found to be > VEC_SIZE * 2.  */
+	subl	$VEC_SIZE, %edx
+	jbe	L(zero_end2)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Shift remaining length mask for last VEC.  */
+	shrq	$32, %rcx
+	andl	%ecx, %eax
+	jz	L(zero_end2)
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 2), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+ 	addq	%rdi, %rax
+L(zero_end2):
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(4x_vec_end):
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	vpmovmskb %ymm2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	vpmovmskb %ymm4, %eax
+-	testl	%eax, %eax
+-L(first_vec_x3):
+L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 3), %rax
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+# endif
+ 
+ END (MEMCHR)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-24.patch
+++ b/glibc-RHEL-15696-24.patch
@ -0,0 +1,388 @@
+From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 9 Jun 2021 16:25:32 -0400
+Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
+ #27974]
+Content-type: text/plain; charset=UTF-8
+
+This commit fixes the bug mentioned in the previous commit.
+
+The previous implementations of wmemchr in these files relied on
+n * sizeof(wchar_t) not overflowing, which the standard does not guarantee.
+
+The new overflow tests added in the previous commit now
+pass (as well as all the other tests).
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
+ sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
+ 2 files changed, 98 insertions(+), 37 deletions(-)
+
+diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
+index cb320257..24f9a0c5 100644
+--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
+@@ -21,9 +21,11 @@
+ #ifdef USE_AS_WMEMCHR
+ # define MEMCHR		wmemchr
+ # define PCMPEQ		pcmpeqd
+# define CHAR_PER_VEC	4
+ #else
+ # define MEMCHR		memchr
+ # define PCMPEQ		pcmpeqb
+# define CHAR_PER_VEC	16
+ #endif
+ 
+ /* fast SSE2 version with using pmaxub and 64 byte loop */
+@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
+ 	movd	%esi, %xmm1
+ 	mov	%edi, %ecx
+ 
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#endif
+ #ifdef USE_AS_WMEMCHR
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+-	shl	$2, %RDX_LP
+ #else
+-# ifdef __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%edx, %edx
+-# endif
+ 	punpcklbw %xmm1, %xmm1
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
+ 	test	%eax, %eax
+ 
+ 	jnz	L(matches_1)
+-	sub	$16, %rdx
+	sub	$CHAR_PER_VEC, %rdx
+ 	jbe	L(return_null)
+ 	add	$16, %rdi
+ 	and	$15, %ecx
+ 	and	$-16, %rdi
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
+ 	add	%rcx, %rdx
+-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 	jmp	L(loop_prolog)
+ 
+@@ -77,16 +81,21 @@ L(crosscache):
+ 	movdqa	(%rdi), %xmm0
+ 
+ 	PCMPEQ	%xmm1, %xmm0
+-/* Check if there is a match.  */
+	/* Check if there is a match.  */
+ 	pmovmskb %xmm0, %eax
+-/* Remove the leading bytes.  */
+	/* Remove the leading bytes.  */
+ 	sar	%cl, %eax
+ 	test	%eax, %eax
+ 	je	L(unaligned_no_match)
+-/* Check which byte is a match.  */
+	/* Check which byte is a match.  */
+ 	bsf	%eax, %eax
+-
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
+ 	sub	%rax, %rdx
+#endif
+ 	jbe	L(return_null)
+ 	add	%rdi, %rax
+ 	add	%rcx, %rax
+@@ -94,15 +103,18 @@ L(crosscache):
+ 
+ 	.p2align 4
+ L(unaligned_no_match):
+-        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+ 	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
+ 	   possible addition overflow.  */
+ 	neg	%rcx
+ 	add	$16, %rcx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
+ 	sub	%rcx, %rdx
+ 	jbe	L(return_null)
+ 	add	$16, %rdi
+-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 
+ 	.p2align 4
+@@ -135,7 +147,7 @@ L(loop_prolog):
+ 	test	$0x3f, %rdi
+ 	jz	L(align64_loop)
+ 
+-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 
+ 	movdqa	(%rdi), %xmm0
+@@ -167,11 +179,14 @@ L(loop_prolog):
+ 	mov	%rdi, %rcx
+ 	and	$-64, %rdi
+ 	and	$63, %ecx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
+ 	add	%rcx, %rdx
+ 
+ 	.p2align 4
+ L(align64_loop):
+-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 	movdqa	(%rdi), %xmm0
+ 	movdqa	16(%rdi), %xmm2
+@@ -218,7 +233,7 @@ L(align64_loop):
+ 
+ 	.p2align 4
+ L(exit_loop):
+-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
+ 	jle	L(exit_loop_32)
+ 
+ 	movdqa	(%rdi), %xmm0
+@@ -238,7 +253,7 @@ L(exit_loop):
+ 	pmovmskb %xmm3, %eax
+ 	test	%eax, %eax
+ 	jnz	L(matches32_1)
+-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
+ 	jle	L(return_null)
+ 
+ 	PCMPEQ	48(%rdi), %xmm1
+@@ -250,13 +265,13 @@ L(exit_loop):
+ 
+ 	.p2align 4
+ L(exit_loop_32):
+-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
+ 	movdqa	(%rdi), %xmm0
+ 	PCMPEQ	%xmm1, %xmm0
+ 	pmovmskb %xmm0, %eax
+ 	test	%eax, %eax
+ 	jnz	L(matches_1)
+-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
+ 	jbe	L(return_null)
+ 
+ 	PCMPEQ	16(%rdi), %xmm1
+@@ -293,7 +308,13 @@ L(matches32):
+ 	.p2align 4
+ L(matches_1):
+ 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
+ 	sub	%rax, %rdx
+#endif
+ 	jbe	L(return_null)
+ 	add	%rdi, %rax
+ 	ret
+@@ -301,7 +322,13 @@ L(matches_1):
+ 	.p2align 4
+ L(matches16_1):
+ 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
+ 	sub	%rax, %rdx
+#endif
+ 	jbe	L(return_null)
+ 	lea	16(%rdi, %rax), %rax
+ 	ret
+@@ -309,7 +336,13 @@ L(matches16_1):
+ 	.p2align 4
+ L(matches32_1):
+ 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
+ 	sub	%rax, %rdx
+#endif
+ 	jbe	L(return_null)
+ 	lea	32(%rdi, %rax), %rax
+ 	ret
+@@ -317,7 +350,13 @@ L(matches32_1):
+ 	.p2align 4
+ L(matches48_1):
+ 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
+ 	sub	%rax, %rdx
+#endif
+ 	jbe	L(return_null)
+ 	lea	48(%rdi, %rax), %rax
+ 	ret
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index b377f22e..16027abb 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -54,21 +54,19 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+-	test	%RDX_LP, %RDX_LP
+-	jz	L(null)
+-# endif
+-# ifdef USE_AS_WMEMCHR
+-	shl	$2, %RDX_LP
+-# else
+ #  ifdef __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%edx, %edx
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+#  else
+	test	%RDX_LP, %RDX_LP
+ #  endif
+	jz	L(null)
+ # endif
+ 	/* Broadcast CHAR to YMMMATCH.  */
+ 	vmovd	%esi, %xmm0
+@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
+ 	vpmovmskb %ymm1, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* If length < CHAR_PER_VEC handle special.  */
+-	cmpq	$VEC_SIZE, %rdx
+	cmpq	$CHAR_PER_VEC, %rdx
+ 	jbe	L(first_vec_x0)
+ # endif
+ 	testl	%eax, %eax
+@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
+ L(first_vec_x0):
+ 	/* Check if first match was before length.  */
+ 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
+ 	xorl	%ecx, %ecx
+ 	cmpl	%eax, %edx
+ 	leaq	(%rdi, %rax), %rax
+@@ -110,12 +112,12 @@ L(null):
+ # endif
+ 	.p2align 4
+ L(cross_page_boundary):
+-	/* Save pointer before aligning as its original value is necessary
+-	   for computer return address if byte is found or adjusting length
+-	   if it is not and this is memchr.  */
+	/* Save pointer before aligning as its original value is
+	   necessary for computer return address if byte is found or
+	   adjusting length if it is not and this is memchr.  */
+ 	movq	%rdi, %rcx
+-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+-	   rdi for rawmemchr.  */
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+	   and rdi for rawmemchr.  */
+ 	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+ 	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+@@ -124,6 +126,10 @@ L(cross_page_boundary):
+ 	   match).  */
+ 	leaq	1(%ALGN_PTR_REG), %rsi
+ 	subq	%RRAW_PTR_REG, %rsi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %esi
+#  endif
+ # endif
+ 	/* Remove the leading bytes.  */
+ 	sarxl	%ERAW_PTR_REG, %eax, %eax
+@@ -181,6 +187,10 @@ L(cross_page_continue):
+ 	orq	$(VEC_SIZE - 1), %rdi
+ 	/* esi is for adjusting length to see if near the end.  */
+ 	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
+ # else
+ 	orq	$(VEC_SIZE - 1), %rdi
+ L(cross_page_continue):
+@@ -213,7 +223,7 @@ L(cross_page_continue):
+ 
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check if at last VEC_SIZE * 4 length.  */
+-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(last_4x_vec_or_less_cmpeq)
+ 	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ 	   length.  */
+@@ -221,6 +231,10 @@ L(cross_page_continue):
+ 	movl	%edi, %ecx
+ 	orq	$(VEC_SIZE * 4 - 1), %rdi
+ 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+ 	addq	%rcx, %rdx
+ # else
+ 	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+@@ -250,15 +264,19 @@ L(loop_4x_vec):
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 
+-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	ja	L(loop_4x_vec)
+ 
+-	/* Fall through into less than 4 remaining vectors of length case.
+-	 */
+	/* Fall through into less than 4 remaining vectors of length
+	   case.  */
+ 	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	.p2align 4
+ L(last_4x_vec_or_less):
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
+ 	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1_check)
+@@ -355,6 +373,10 @@ L(last_vec_x2_return):
+ L(last_4x_vec_or_less_cmpeq):
+ 	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	/* Check first VEC regardless.  */
+ 	testl	%eax, %eax
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-25.patch
+++ b/glibc-RHEL-15696-25.patch
@ -0,0 +1,767 @@
+From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 19:36:07 -0400
+Subject: [PATCH] x86: Optimize strlen-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strlen-avx2.S. The optimizations are
+mostly small things but they add up to roughly 10-30% performance
+improvement for strlen. The results for strnlen are a bit more
+ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
+are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
+ sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
+ 2 files changed, 334 insertions(+), 214 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index cbfc1a5d..f1a6460a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+   IFUNC_IMPL (i, name, strlen,
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
+   IFUNC_IMPL (i, name, strnlen,
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strnlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
+   IFUNC_IMPL (i, name, wcslen,
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcslen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+   IFUNC_IMPL (i, name, wcsnlen,
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcsnlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcsnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index 82826e10..be8a5db5 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -27,9 +27,11 @@
+ # ifdef USE_AS_WCSLEN
+ #  define VPCMPEQ	vpcmpeqd
+ #  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+ # else
+ #  define VPCMPEQ	vpcmpeqb
+ #  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+ # endif
+ 
+ # ifndef VZEROUPPER
+@@ -41,349 +43,459 @@
+ # endif
+ 
+ # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+-	/* Check for zero length.  */
+	/* Check zero length.  */
+ 	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
+	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
+	mov	%RSI_LP, %R8_LP
+ #  ifdef USE_AS_WCSLEN
+ 	shl	$2, %RSI_LP
+ #  elif defined __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%esi, %esi
+ #  endif
+-	mov	%RSI_LP, %R8_LP
+ # endif
+-	movl	%edi, %ecx
+	movl	%edi, %eax
+ 	movq	%rdi, %rdx
+ 	vpxor	%xmm0, %xmm0, %xmm0
+-
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check.  */
+	andl	$(PAGE_SIZE - 1), %eax
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+ # ifdef USE_AS_STRNLEN
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rsi
+-	jbe	L(max)
+-# else
+-	jnz	L(first_vec_x0)
+	/* If length < VEC_SIZE handle special.  */
+	cmpq	$VEC_SIZE, %rsi
+	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+	/* If empty continue to aligned_more. Otherwise return bit
+	   position of first match.  */
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+-	addq	%rcx, %rsi
+L(zero):
+	xorl	%eax, %eax
+	ret
+ 
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+	.p2align 4
+L(first_vec_x0):
+	/* Set bit for max len so that tzcnt will return min of max len
+	   and position of first match.  */
+	btsq	%rsi, %rax
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
+	VZEROUPPER_RETURN
+ # endif
+-	jmp	L(more_4x_vec)
+ 
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 4 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	incl	%edi
+	addl	%edi, %eax
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+-	subq	%rdx, %rax
+ # ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	shrl	$2, %eax
+ # endif
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(aligned_more):
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+-	    to void possible addition overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+-
+-	/* Check the end of data.  */
+-	subq	%rcx, %rsi
+-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 3 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE + 1), %edi
+	addl	%edi, %eax
+ # endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
+ 
+-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 2 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 2 + 1), %edi
+	addl	%edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
+ 
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 3 + 1), %edi
+	addl	%edi, %eax
+ # endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
+ 
+-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
+	   code on the x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+# ifdef USE_AS_STRNLEN
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+	   it simplies the logic in last_4x_vec_or_less.  */
+	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+	subq	%rdx, %rcx
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+	/* Adjust length. If near end handle specially.  */
+	subq	%rcx, %rsi
+	jb	L(last_4x_vec_or_less)
+# endif
+	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+ 
+	/* Align data to VEC_SIZE * 4 - 1.  */
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+	/* Before adjusting length check if at last VEC_SIZE * 4.  */
+	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	jbe	L(last_4x_vec_or_less_load)
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* Readjust length.  */
+ 	addq	%rcx, %rsi
+# else
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+ # endif
+-
+	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa (%rdi), %ymm1
+-	vmovdqa	VEC_SIZE(%rdi), %ymm2
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
+-	VPMINU	%ymm1, %ymm2, %ymm5
+-	VPMINU	%ymm3, %ymm4, %ymm6
+-	VPMINU	%ymm5, %ymm6, %ymm5
+-
+-	VPCMPEQ	%ymm5, %ymm0, %ymm5
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifndef USE_AS_STRNLEN
+-	jmp	L(loop_4x_vec)
+-# else
+# ifdef USE_AS_STRNLEN
+	/* Break if at end of length.  */
+ 	subq	$(VEC_SIZE * 4), %rsi
+-	ja	L(loop_4x_vec)
+-
+-L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %esi
+-	jle	L(last_2x_vec)
+	jb	L(last_4x_vec_or_less_cmpeq)
+# endif
+	/* Save some code size by microfusing VPMINU with the load. Since
+	   the matches in ymm2/ymm4 can only be returned if there where no
+	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+	 */
+	vmovdqa	1(%rdi), %ymm1
+	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
+	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+	VPMINU	%ymm2, %ymm4, %ymm5
+	VPCMPEQ	%ymm5, %ymm0, %ymm5
+	vpmovmskb	%ymm5, %ecx
+ 
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	subq	%rdx, %rdi
+ 	testl	%eax, %eax
+	jnz	L(last_vec_return_x0)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
+-
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm2, %ymm0, %ymm2
+	vpmovmskb	%ymm2, %eax
+ 	testl	%eax, %eax
+-
+-	jnz	L(first_vec_x3_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+	jnz	L(last_vec_return_x1)
+
+	/* Combine last 2 VEC.  */
+	VPCMPEQ	%ymm3, %ymm0, %ymm3
+	vpmovmskb	%ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used if
+	   the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
+# endif
+ 	VZEROUPPER_RETURN
+ 
+
+# ifdef USE_AS_STRNLEN
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %esi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+L(last_4x_vec_or_less_load):
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
+	vpmovmskb	%ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+	   VEC_SIZE * 4.  */
+	testl	$(VEC_SIZE * 2), %esi
+	jnz	L(last_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	/* length may have been negative or positive by an offset of
+	   VEC_SIZE * 4 depending on where this was called from. This fixes
+	   that.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x1_check)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
+	subl	$VEC_SIZE, %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+# endif
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
+L(last_vec_return_x0):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$VEC_SIZE, %rax
+	subq	$(VEC_SIZE * 4 - 1), %rdi
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
+# endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
+L(last_vec_return_x1):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 2), %rax
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
+# endif
+ 	VZEROUPPER_RETURN
+ 
+# ifdef USE_AS_STRNLEN
+ 	.p2align 4
+-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 3), %rax
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	incl	%eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+ L(max):
+ 	movq	%r8, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(last_4x_vec):
+	/* Test first 2x VEC normally.  */
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	/* Normalize length.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	subl	$(VEC_SIZE * 3), %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 3 + 1), %eax
+	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+ 
+ 	.p2align 4
+-L(first_vec_x0):
+L(last_vec_x1):
+	/* essentially duplicates of first_vec_x1 but use 64 bit
+	   instructions.  */
+ 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+	incl	%eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
+#  endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x1):
+L(last_vec_x2):
+	/* essentially duplicates of first_vec_x1 but use 64 bit
+	   instructions.  */
+ 	tzcntl	%eax, %eax
+-	addq	$VEC_SIZE, %rax
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
+#  endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 2), %rax
+	subl	$(VEC_SIZE * 2), %esi
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max_end)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 2 + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
+#  endif
+	VZEROUPPER_RETURN
+L(max_end):
+	movq	%r8, %rax
+ 	VZEROUPPER_RETURN
+# endif
+ 
+	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+-L(4x_vec_end):
+-	VPCMPEQ	%ymm1, %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	VPCMPEQ %ymm2, %ymm0, %ymm2
+-	vpmovmskb %ymm2, %eax
+L(cross_page_boundary):
+	/* Align data to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod rdx.  */
+	sarxl	%edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	VPCMPEQ %ymm3, %ymm0, %ymm3
+-	vpmovmskb %ymm3, %eax
+	jnz	L(cross_page_less_vec)
+	leaq	1(%rdi), %rcx
+	subq	%rdx, %rcx
+	/* Check length.  */
+	cmpq	%rsi, %rcx
+	jb	L(cross_page_continue)
+	movq	%r8, %rax
+# else
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	VPCMPEQ %ymm4, %ymm0, %ymm4
+-	vpmovmskb %ymm4, %eax
+-L(first_vec_x3):
+	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
+ # endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+	.p2align 4
+L(cross_page_less_vec):
+	tzcntl	%eax, %eax
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
+ 	VZEROUPPER_RETURN
+# endif
+ 
+ END (STRLEN)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-26.patch
+++ b/glibc-RHEL-15696-26.patch
@ -0,0 +1,701 @@
+From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 3 May 2021 03:03:19 -0400
+Subject: [PATCH] x86: Optimize memchr-evex.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memchr-evex.S. The optimizations include
+replacing some branches with cmovcc, avoiding some branches entirely
+in the less_4x_vec case, making the page cross logic less strict,
+saving some ALU in the alignment process, and most importantly
+increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
+test-wmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
+ 1 file changed, 322 insertions(+), 225 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index 6dd5d67b..81d5cd64 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -26,14 +26,28 @@
+ 
+ # ifdef USE_AS_WMEMCHR
+ #  define VPBROADCAST	vpbroadcastd
+-#  define VPCMP		vpcmpd
+-#  define SHIFT_REG	r8d
+#  define VPMINU	vpminud
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+-#  define VPCMP		vpcmpb
+-#  define SHIFT_REG	ecx
+#  define VPMINU	vpminub
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+ # endif
+ 
+# ifdef USE_AS_RAWMEMCHR
+#  define RAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define RAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+# endif
+
+# define XMMZERO	xmm23
+# define YMMZERO	ymm23
+ # define XMMMATCH	xmm16
+ # define YMMMATCH	ymm16
+ # define YMM1		ymm17
+@@ -44,6 +58,8 @@
+ # define YMM6		ymm22
+ 
+ # define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (MEMCHR)
+@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
+ 	/* Check for zero length.  */
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(zero)
+-# endif
+-	movl	%edi, %ecx
+-# ifdef USE_AS_WMEMCHR
+-	shl	$2, %RDX_LP
+-# else
+
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
+ 	/* Broadcast CHAR to YMMMATCH.  */
+ 	VPBROADCAST %esi, %YMMMATCH
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-
+	VPCMP	$0, (%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+ # ifndef USE_AS_RAWMEMCHR
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rdx
+-	jbe	L(zero)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	jnz	L(first_vec_x0)
+	addq	%rdi, %rax
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+	ret
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-	jmp	L(more_4x_vec)
+L(zero):
+	xorl	%eax, %eax
+	ret
+ 
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	cmovle	%rcx, %rax
+	ret
+# else
+	/* NB: first_vec_x0 is 17 bytes which will leave
+	   cross_page_boundary (which is relatively cold) close enough
+	   to ideal alignment. So only realign L(cross_page_boundary) if
+	   rawmemchr.  */
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is
+	   necessary for computer return address if byte is found or
+	   adjusting length if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+	   for rawmemchr.  */
+	andq	$-VEC_SIZE, %ALGN_PTR_REG
+	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+	kmovd	%k0, %r8d
+ # ifdef USE_AS_WMEMCHR
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+ 	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl	$2, %SHIFT_REG
+	sarl	$2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
+	subl	%eax, %esi
+ # endif
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	/* Remove the leading bytes.  */
+-	sarxl	%SHIFT_REG, %eax, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	tzcntl	%eax, %eax
+ # ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+ # endif
+	/* Remove the leading bytes.  */
+	sarxl	%eax, %r8d, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+	addq	%RAW_PTR_REG, %rax
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(aligned_more):
+-# ifndef USE_AS_RAWMEMCHR
+-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+-	   overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-	/* Check the end of data.  */
+-	subq	%rcx, %rdx
+-	jbe	L(zero)
+-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-# ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Align data to VEC_SIZE.  */
+L(cross_page_continue):
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	andq	$-VEC_SIZE, %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
+# else
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+	/* Check if at last CHAR_PER_VEC * 4 length.  */
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	addq	$VEC_SIZE, %rdi
+ 
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
+	 */
+#  ifdef USE_AS_WMEMCHR
+	movl	%edi, %ecx
+ 	andq	$-(4 * VEC_SIZE), %rdi
+-
+-# ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+ 	addq	%rcx, %rdx
+#  else
+	addq	%rdi, %rdx
+	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rdx
+#  endif
+# else
+	addq	$VEC_SIZE, %rdi
+	andq	$-(4 * VEC_SIZE), %rdi
+ # endif
+ 
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+
+	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
+-	kord	%k1, %k2, %k5
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
+-
+-	kord	%k3, %k4, %k6
+-	kortestd %k5, %k6
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+	/* It would be possible to save some instructions using 4x VPCMP
+	   but bottleneck on port 5 makes it not woth it.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+	/* xor will set bytes match esi to zero.  */
+	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
+	VPCMP	$0, %YMM3, %YMMZERO, %k2
+ # ifdef USE_AS_RAWMEMCHR
+-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd %k2, %k3
+	jz	L(loop_4x_vec)
+ # else
+-	subq	$(VEC_SIZE * 4), %rdx
+	kortestd %k2, %k3
+	jnz	L(loop_4x_vec_end)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+
+	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	ja	L(loop_4x_vec)
+ 
+	/* Fall through into less than 4 remaining vectors of length case.
+	 */
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	addq	$(VEC_SIZE * 3), %rdi
+	.p2align 4
+ L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %edx
+-	jle	L(last_2x_vec)
+-
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	jnz	L(first_vec_x1_check)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+	/* If remaining length > CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jg	L(last_4x_vec)
+ 
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+L(last_2x_vec):
+	/* If remaining length < CHAR_PER_VEC.  */
+	addl	$CHAR_PER_VEC, %edx
+	jle	L(zero_end)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+	ret
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+ 
+-	jnz	L(first_vec_x3_check)
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Adjust length.  */
+	subl	$-(CHAR_PER_VEC * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+L(set_zero_end):
+ 	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %edx
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
+	/* k1 has not of matches with VEC1.  */
+ 	kmovd	%k1, %eax
+-	testl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+# else
+	incl	%eax
+# endif
+	jnz	L(last_vec_x1_return)
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
+	VPCMP	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+	kmovd	%k2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	xorl	%eax, %eax
+-	ret
+	jnz	L(last_vec_x3_return)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
+	kmovd	%k3, %eax
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
+L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+ 	addq	%rdi, %rax
+-	ret
+-
+-	.p2align 4
+-L(first_vec_x2_check):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+#  endif
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
+L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(first_vec_x0):
+L(last_vec_x3_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	addq	%rdi, %rax
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+ 	ret
+ 
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jle	L(last_2x_vec)
+
+ 	.p2align 4
+-L(first_vec_x1):
+L(last_4x_vec):
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Create mask for possible matches within remaining length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+	bzhil	%edx, %ecx, %ecx
+#  else
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+#  endif
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
+	   remaining length was found to be > CHAR_PER_VEC * 2.  */
+	subl	$CHAR_PER_VEC, %edx
+	jbe	L(zero_end2)
+
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Shift remaining length mask for last VEC.  */
+#  ifdef USE_AS_WMEMCHR
+	shrl	$CHAR_PER_VEC, %ecx
+#  else
+	shrq	$CHAR_PER_VEC, %rcx
+#  endif
+	andl	%ecx, %eax
+	jz	L(zero_end2)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec_x2):
+L(last_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+-# else
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-# endif
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(4x_vec_end):
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	kmovd	%k2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	kmovd	%k3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	kmovd	%k4, %eax
+-	testl	%eax, %eax
+-L(first_vec_x3):
+L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+-# else
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-# endif
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+# endif
+ 
+ END (MEMCHR)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-27.patch
+++ b/glibc-RHEL-15696-27.patch
@ -0,0 +1,30 @@
+From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
+From: Alice Xu <alice.d.xu@gmail.com>
+Date: Fri, 7 May 2021 19:03:21 -0700
+Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
+Content-type: text/plain; charset=UTF-8
+
+Commit 2a76821c308 introduced an unknown vector operation.  Fix it by
+writing "ymm{k1}{z}" instead of "ymm {k1} {z}".
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index 81d5cd64..f3fdad4f 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -271,7 +271,7 @@ L(loop_4x_vec):
+ 	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ 	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+-	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
+	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
+ 	VPCMP	$0, %YMM3, %YMMZERO, %k2
+ # ifdef USE_AS_RAWMEMCHR
+ 	subq	$-(VEC_SIZE * 4), %rdi
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-28.patch
+++ b/glibc-RHEL-15696-28.patch
@ -0,0 +1,566 @@
+From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 22 Jun 2021 20:42:10 -0700
+Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
+Content-type: text/plain; charset=UTF-8
+
+Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
+version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
+and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
+This also removes the unused symbols, __GI___strlen_sse2 and
+__GI___wcsnlen_sse4_1.
+---
+ sysdeps/x86_64/multiarch/strlen-sse2.S    |   2 +-
+ sysdeps/x86_64/multiarch/strlen-vec.S     | 257 ++++++++++++++++++++++
+ sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S |   2 +-
+ sysdeps/x86_64/strlen.S                   | 243 +-------------------
+ 4 files changed, 262 insertions(+), 242 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
+
+Conflicts:
+	sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
+	(Copyright dates, URL)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
+index 7bc57b8d..449c8a7f 100644
+--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
+@@ -20,4 +20,4 @@
+ # define strlen __strlen_sse2
+ #endif
+ 
+-#include "../strlen.S"
+#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+new file mode 100644
+index 00000000..8f660bb9
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -0,0 +1,257 @@
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
+   Copyright (C) 2012-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef AS_WCSLEN
+# define PMINU		pminud
+# define PCMPEQ		pcmpeqd
+# define SHIFT_RETURN	shrq $2, %rax
+#else
+# define PMINU		pminub
+# define PCMPEQ		pcmpeqb
+# define SHIFT_RETURN
+#endif
+
+/* Long lived register in strlen(s), strnlen(s, n) are:
+
+	%xmm3 - zero
+	%rdi   - s
+	%r10  (s+n) & (~(64-1))
+	%r11   s+n
+*/
+
+
+.text
+ENTRY(strlen)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
+#define FIND_ZERO	\
+	PCMPEQ	(%rax), %xmm0;	\
+	PCMPEQ	16(%rax), %xmm1;	\
+	PCMPEQ	32(%rax), %xmm2;	\
+	PCMPEQ	48(%rax), %xmm3;	\
+	pmovmskb	%xmm0, %esi;	\
+	pmovmskb	%xmm1, %edx;	\
+	pmovmskb	%xmm2, %r8d;	\
+	pmovmskb	%xmm3, %ecx;	\
+	salq	$16, %rdx;	\
+	salq	$16, %rcx;	\
+	orq	%rsi, %rdx;	\
+	orq	%r8, %rcx;	\
+	salq	$32, %rcx;	\
+	orq	%rcx, %rdx;
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0.  */
+	test	%RSI_LP, %RSI_LP
+	jne	L(n_nonzero)
+	xor	%rax, %rax
+	ret
+L(n_nonzero):
+# ifdef AS_WCSLEN
+	shl	$2, %RSI_LP
+# endif
+
+/* Initialize long lived registers.  */
+
+	add	%RDI_LP, %RSI_LP
+	mov	%RSI_LP, %R10_LP
+	and	$-64, %R10_LP
+	mov	%RSI_LP, %R11_LP
+#endif
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	movq	%rdi, %rax
+	movq	%rdi, %rcx
+	andq	$4095, %rcx
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
+	cmpq	$4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower.  */
+	ja	L(cross_page)
+
+#ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes.  */
+# define STRNLEN_PROLOG	\
+	mov	%r11, %rsi;	\
+	subq	%rax, %rsi;	\
+	andq	$-64, %rax;	\
+	testq	$-64, %rsi;	\
+	je	L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG  andq $-64, %rax;
+#endif
+
+/* Ignore bits in mask that come before start of string.  */
+#define PROLOG(lab)	\
+	movq	%rdi, %rcx;	\
+	xorq	%rax, %rcx;	\
+	STRNLEN_PROLOG;	\
+	sarq	%cl, %rdx;	\
+	test	%rdx, %rdx;	\
+	je	L(lab);	\
+	bsfq	%rdx, %rax;	\
+	SHIFT_RETURN;		\
+	ret
+
+#ifdef AS_STRNLEN
+	andq	$-16, %rax
+	FIND_ZERO
+#else
+	/* Test first 16 bytes unaligned.  */
+	movdqu	(%rax), %xmm4
+	PCMPEQ	%xmm0, %xmm4
+	pmovmskb	%xmm4, %edx
+	test	%edx, %edx
+	je 	L(next48_bytes)
+	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+	SHIFT_RETURN
+	ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes.  */
+	andq	$-16, %rax
+	PCMPEQ 16(%rax), %xmm1
+	PCMPEQ 32(%rax), %xmm2
+	PCMPEQ 48(%rax), %xmm3
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r8d
+	pmovmskb	%xmm3, %ecx
+	salq	$16, %rdx
+	salq	$16, %rcx
+	orq	%r8, %rcx
+	salq	$32, %rcx
+	orq	%rcx, %rdx
+#endif
+
+	/* When no zero byte is found xmm1-3 are zero so we do not have to
+	   zero them.  */
+	PROLOG(loop)
+
+	.p2align 4
+L(cross_page):
+	andq	$-64, %rax
+	FIND_ZERO
+	PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1).  */
+L(strnlen_ret):
+	bts	%rsi, %rdx
+	sarq	%cl, %rdx
+	test	%rdx, %rdx
+	je	L(loop_init)
+	bsfq	%rdx, %rax
+	SHIFT_RETURN
+	ret
+#endif
+	.p2align 4
+L(loop_init):
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+#ifdef AS_STRNLEN
+	.p2align 4
+L(loop):
+
+	addq	$64, %rax
+	cmpq	%rax, %r10
+	je	L(exit_end)
+
+	movdqa	(%rax), %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit)
+	jmp	L(loop)
+
+	.p2align 4
+L(exit_end):
+	cmp	%rax, %r11
+	je	L(first) /* Do not read when end is at page boundary.  */
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+L(first):
+	bts	%r11, %rdx
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+	.p2align 4
+L(exit):
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+#else
+
+	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
+	.p2align 4
+L(loop):
+
+	movdqa	64(%rax), %xmm0
+	PMINU	80(%rax), %xmm0
+	PMINU	96(%rax), %xmm0
+	PMINU	112(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit64)
+
+	subq	$-128, %rax
+
+	movdqa	(%rax), %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit0)
+	jmp	L(loop)
+
+	.p2align 4
+L(exit64):
+	addq	$64, %rax
+L(exit0):
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+#endif
+
+END(strlen)
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+index a8cab0cb..5fa51fe0 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+@@ -2,4 +2,4 @@
+ #define AS_STRNLEN
+ #define strlen	__wcsnlen_sse4_1
+ 
+-#include "../strlen.S"
+#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
+index f845f3d4..ad047d84 100644
+--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
+@@ -1,5 +1,5 @@
+-/* SSE2 version of strlen/wcslen.
+-   Copyright (C) 2012-2018 Free Software Foundation, Inc.
+/* SSE2 version of strlen.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+@@ -16,243 +16,6 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+-#include <sysdep.h>
+#include "multiarch/strlen-vec.S"
+ 
+-#ifdef AS_WCSLEN
+-# define PMINU		pminud
+-# define PCMPEQ		pcmpeqd
+-# define SHIFT_RETURN	shrq $2, %rax
+-#else
+-# define PMINU		pminub
+-# define PCMPEQ		pcmpeqb
+-# define SHIFT_RETURN
+-#endif
+-
+-/* Long lived register in strlen(s), strnlen(s, n) are:
+-
+-	%xmm3 - zero
+-	%rdi   - s
+-	%r10  (s+n) & (~(64-1))
+-	%r11   s+n
+-*/
+-
+-
+-.text
+-ENTRY(strlen)
+-
+-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
+-#define FIND_ZERO	\
+-	PCMPEQ	(%rax), %xmm0;	\
+-	PCMPEQ	16(%rax), %xmm1;	\
+-	PCMPEQ	32(%rax), %xmm2;	\
+-	PCMPEQ	48(%rax), %xmm3;	\
+-	pmovmskb	%xmm0, %esi;	\
+-	pmovmskb	%xmm1, %edx;	\
+-	pmovmskb	%xmm2, %r8d;	\
+-	pmovmskb	%xmm3, %ecx;	\
+-	salq	$16, %rdx;	\
+-	salq	$16, %rcx;	\
+-	orq	%rsi, %rdx;	\
+-	orq	%r8, %rcx;	\
+-	salq	$32, %rcx;	\
+-	orq	%rcx, %rdx;
+-
+-#ifdef AS_STRNLEN
+-/* Do not read anything when n==0.  */
+-	test	%RSI_LP, %RSI_LP
+-	jne	L(n_nonzero)
+-	xor	%rax, %rax
+-	ret
+-L(n_nonzero):
+-# ifdef AS_WCSLEN
+-	shl	$2, %RSI_LP
+-# endif
+-
+-/* Initialize long lived registers.  */
+-
+-	add	%RDI_LP, %RSI_LP
+-	mov	%RSI_LP, %R10_LP
+-	and	$-64, %R10_LP
+-	mov	%RSI_LP, %R11_LP
+-#endif
+-
+-	pxor	%xmm0, %xmm0
+-	pxor	%xmm1, %xmm1
+-	pxor	%xmm2, %xmm2
+-	pxor	%xmm3, %xmm3
+-	movq	%rdi, %rax
+-	movq	%rdi, %rcx
+-	andq	$4095, %rcx
+-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
+-	cmpq	$4047, %rcx
+-/* We cannot unify this branching as it would be ~6 cycles slower.  */
+-	ja	L(cross_page)
+-
+-#ifdef AS_STRNLEN
+-/* Test if end is among first 64 bytes.  */
+-# define STRNLEN_PROLOG	\
+-	mov	%r11, %rsi;	\
+-	subq	%rax, %rsi;	\
+-	andq	$-64, %rax;	\
+-	testq	$-64, %rsi;	\
+-	je	L(strnlen_ret)
+-#else
+-# define STRNLEN_PROLOG  andq $-64, %rax;
+-#endif
+-
+-/* Ignore bits in mask that come before start of string.  */
+-#define PROLOG(lab)	\
+-	movq	%rdi, %rcx;	\
+-	xorq	%rax, %rcx;	\
+-	STRNLEN_PROLOG;	\
+-	sarq	%cl, %rdx;	\
+-	test	%rdx, %rdx;	\
+-	je	L(lab);	\
+-	bsfq	%rdx, %rax;	\
+-	SHIFT_RETURN;		\
+-	ret
+-
+-#ifdef AS_STRNLEN
+-	andq	$-16, %rax
+-	FIND_ZERO
+-#else
+-	/* Test first 16 bytes unaligned.  */
+-	movdqu	(%rax), %xmm4
+-	PCMPEQ	%xmm0, %xmm4
+-	pmovmskb	%xmm4, %edx
+-	test	%edx, %edx
+-	je 	L(next48_bytes)
+-	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+-	SHIFT_RETURN
+-	ret
+-
+-L(next48_bytes):
+-/* Same as FIND_ZERO except we do not check first 16 bytes.  */
+-	andq	$-16, %rax
+-	PCMPEQ 16(%rax), %xmm1
+-	PCMPEQ 32(%rax), %xmm2
+-	PCMPEQ 48(%rax), %xmm3
+-	pmovmskb	%xmm1, %edx
+-	pmovmskb	%xmm2, %r8d
+-	pmovmskb	%xmm3, %ecx
+-	salq	$16, %rdx
+-	salq	$16, %rcx
+-	orq	%r8, %rcx
+-	salq	$32, %rcx
+-	orq	%rcx, %rdx
+-#endif
+-
+-	/* When no zero byte is found xmm1-3 are zero so we do not have to
+-	   zero them.  */
+-	PROLOG(loop)
+-
+-	.p2align 4
+-L(cross_page):
+-	andq	$-64, %rax
+-	FIND_ZERO
+-	PROLOG(loop_init)
+-
+-#ifdef AS_STRNLEN
+-/* We must do this check to correctly handle strnlen (s, -1).  */
+-L(strnlen_ret):
+-	bts	%rsi, %rdx
+-	sarq	%cl, %rdx
+-	test	%rdx, %rdx
+-	je	L(loop_init)
+-	bsfq	%rdx, %rax
+-	SHIFT_RETURN
+-	ret
+-#endif
+-	.p2align 4
+-L(loop_init):
+-	pxor	%xmm1, %xmm1
+-	pxor	%xmm2, %xmm2
+-	pxor	%xmm3, %xmm3
+-#ifdef AS_STRNLEN
+-	.p2align 4
+-L(loop):
+-
+-	addq	$64, %rax
+-	cmpq	%rax, %r10
+-	je	L(exit_end)
+-
+-	movdqa	(%rax), %xmm0
+-	PMINU	16(%rax), %xmm0
+-	PMINU	32(%rax), %xmm0
+-	PMINU	48(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit)
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(exit_end):
+-	cmp	%rax, %r11
+-	je	L(first) /* Do not read when end is at page boundary.  */
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-L(first):
+-	bts	%r11, %rdx
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-	.p2align 4
+-L(exit):
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-#else
+-
+-	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
+-	.p2align 4
+-L(loop):
+-
+-	movdqa	64(%rax), %xmm0
+-	PMINU	80(%rax), %xmm0
+-	PMINU	96(%rax), %xmm0
+-	PMINU	112(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit64)
+-
+-	subq	$-128, %rax
+-
+-	movdqa	(%rax), %xmm0
+-	PMINU	16(%rax), %xmm0
+-	PMINU	32(%rax), %xmm0
+-	PMINU	48(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit0)
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(exit64):
+-	addq	$64, %rax
+-L(exit0):
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-#endif
+-
+-END(strlen)
+ libc_hidden_builtin_def (strlen)
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-29.patch
+++ b/glibc-RHEL-15696-29.patch
@ -0,0 +1,181 @@
+From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 01:19:34 -0400
+Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit adds the ifunc / build infrastructure
+necessary for wcslen to prefer the sse4.1 implementation
+in strlen-vec.S. test-wcslen.c is passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile          |  4 +-
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |  3 ++
+ sysdeps/x86_64/multiarch/ifunc-wcslen.h    | 52 ++++++++++++++++++++++
+ sysdeps/x86_64/multiarch/wcslen-sse4_1.S   |  4 ++
+ sysdeps/x86_64/multiarch/wcslen.c          |  2 +-
+ sysdeps/x86_64/multiarch/wcsnlen.c         | 34 +-------------
+ 6 files changed, 63 insertions(+), 36 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
+ create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 491c7698..65fde4eb 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcscpy-ssse3 wcscpy-c \
+ 		   wcschr-sse2 wcschr-avx2 \
+ 		   wcsrchr-sse2 wcsrchr-avx2 \
+-		   wcsnlen-sse4_1 wcsnlen-c \
+-		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
+		   wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
+		   wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
+ 		   wcschr-avx2-rtm \
+ 		   wcscmp-avx2-rtm \
+ 		   wcslen-avx2-rtm \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index f1a6460a..580913ca 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      CPU_FEATURE_USABLE (SSE4_1),
+			      __wcsnlen_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+new file mode 100644
+index 00000000..39e33473
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+@@ -0,0 +1,52 @@
+/* Common definition for ifunc selections for wcslen and wcsnlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+    return OPTIMIZE (sse4_1);
+
+  return OPTIMIZE (sse2);
+}
+diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+new file mode 100644
+index 00000000..7e62621a
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+@@ -0,0 +1,4 @@
+#define AS_WCSLEN
+#define strlen	__wcslen_sse4_1
+
+#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
+index 6d06e47c..3b04b75b 100644
+--- a/sysdeps/x86_64/multiarch/wcslen.c
+++ b/sysdeps/x86_64/multiarch/wcslen.c
+@@ -24,7 +24,7 @@
+ # undef __wcslen
+ 
+ # define SYMBOL_NAME wcslen
+-# include "ifunc-avx2.h"
+# include "ifunc-wcslen.h"
+ 
+ libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
+ weak_alias (__wcslen, wcslen);
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
+index 20b731ae..06736410 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
+@@ -24,39 +24,7 @@
+ # undef __wcsnlen
+ 
+ # define SYMBOL_NAME wcsnlen
+-# include <init-arch.h>
+-
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+-
+-static inline void *
+-IFUNC_SELECTOR (void)
+-{
+-  const struct cpu_features* cpu_features = __get_cpu_features ();
+-
+-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+-      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+-	return OPTIMIZE (evex);
+-
+-      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+-	return OPTIMIZE (avx2_rtm);
+-
+-      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx2);
+-    }
+-
+-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+-    return OPTIMIZE (sse4_1);
+-
+-  return OPTIMIZE (sse2);
+-}
+# include "ifunc-wcslen.h"
+ 
+ libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
+ weak_alias (__wcsnlen, wcsnlen);
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-3.patch
+++ b/glibc-RHEL-15696-3.patch
@ -0,0 +1,396 @@
+From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:27:25 -0800
+Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
+ 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memcpy for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
+	length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
+	* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
+	Likewise.
+	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
+	Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
+	tst-size_t-wmemchr.
+	* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
+---
+ sysdeps/x86_64/multiarch/memcpy-ssse3-back.S  | 17 ++++--
+ sysdeps/x86_64/multiarch/memcpy-ssse3.S       | 17 ++++--
+ .../multiarch/memmove-avx512-no-vzeroupper.S  | 16 +++--
+ .../multiarch/memmove-vec-unaligned-erms.S    | 54 +++++++++--------
+ sysdeps/x86_64/x32/Makefile                   |  2 +-
+ sysdeps/x86_64/x32/tst-size_t-memcpy.c        | 58 +++++++++++++++++++
+ 6 files changed, 122 insertions(+), 42 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+index 3cd11233..568eebd3 100644
+--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+@@ -45,28 +45,33 @@
+ 	.section .text.ssse3,"ax",@progbits
+ #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ ENTRY (MEMPCPY_CHK)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMPCPY_CHK)
+ 
+ ENTRY (MEMPCPY)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY)
+ #endif
+ 
+ #if !defined USE_AS_BCOPY
+ ENTRY (MEMCPY_CHK)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMCPY_CHK)
+ #endif
+ 
+ ENTRY (MEMCPY)
+-	mov	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
+ #ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+ #endif
+ 
+ #ifdef USE_AS_MEMMOVE
+diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+index 0240bfa3..0bd5ee99 100644
+--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+@@ -45,28 +45,33 @@
+ 	.section .text.ssse3,"ax",@progbits
+ #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ ENTRY (MEMPCPY_CHK)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMPCPY_CHK)
+ 
+ ENTRY (MEMPCPY)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY)
+ #endif
+ 
+ #if !defined USE_AS_BCOPY
+ ENTRY (MEMCPY_CHK)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMCPY_CHK)
+ #endif
+ 
+ ENTRY (MEMCPY)
+-	mov	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
+ #ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+ #endif
+ 
+ #ifdef USE_AS_MEMMOVE
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+index effc3ac2..6ca2bbc9 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+@@ -24,27 +24,31 @@
+ 
+ 	.section .text.avx512,"ax",@progbits
+ ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__mempcpy_chk_avx512_no_vzeroupper)
+ 
+ ENTRY (__mempcpy_avx512_no_vzeroupper)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (__mempcpy_avx512_no_vzeroupper)
+ 
+ ENTRY (__memmove_chk_avx512_no_vzeroupper)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memmove_chk_avx512_no_vzeroupper)
+ 
+ ENTRY (__memmove_avx512_no_vzeroupper)
+-	mov	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
+ # ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
+ # endif
+ L(start):
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
+ 	lea	(%rsi, %rdx), %rcx
+ 	lea	(%rdi, %rdx), %r9
+ 	cmp	$512, %rdx
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index c952576c..274aa1c7 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -95,20 +95,20 @@
+ 	.section SECTION(.text),"ax",@progbits
+ #if defined SHARED && IS_IN (libc)
+ ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
+ 
+ #if defined SHARED && IS_IN (libc)
+ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+ #endif
+@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
+ 	movq	%rdi, %rax
+ L(start):
+-	cmpq	$VEC_SIZE, %rdx
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(more_2x_vec)
+ #if !defined USE_MULTIARCH || !IS_IN (libc)
+ L(last_2x_vec):
+@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
+ 
+ # if VEC_SIZE == 16
+ ENTRY (__mempcpy_chk_erms)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__mempcpy_chk_erms)
+ 
+ /* Only used to measure performance of REP MOVSB.  */
+ ENTRY (__mempcpy_erms)
+-	movq	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jz	2f
+-	addq	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start_movsb)
+ END (__mempcpy_erms)
+ 
+ ENTRY (__memmove_chk_erms)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memmove_chk_erms)
+ 
+ ENTRY (__memmove_erms)
+ 	movq	%rdi, %rax
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jz	2f
+ L(start_movsb):
+-	movq	%rdx, %rcx
+-	cmpq	%rsi, %rdi
+	mov	%RDX_LP, %RCX_LP
+	cmp	%RSI_LP, %RDI_LP
+ 	jb	1f
+ 	/* Source == destination is less common.  */
+ 	je	2f
+-	leaq	(%rsi,%rcx), %rdx
+-	cmpq	%rdx, %rdi
+	lea	(%rsi,%rcx), %RDX_LP
+	cmp	%RDX_LP, %RDI_LP
+ 	jb	L(movsb_backward)
+ 1:
+ 	rep movsb
+@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
+ 
+ # ifdef SHARED
+ ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start_erms)
+ END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+ 
+ # ifdef SHARED
+ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ # endif
+@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ 	movq	%rdi, %rax
+ L(start_erms):
+-	cmpq	$VEC_SIZE, %rdx
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(movsb_more_2x_vec)
+ L(last_2x_vec):
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+@@ -236,7 +244,7 @@ L(movsb):
+ 	/* Avoid slow backward REP MOVSB.  */
+ 	jb	L(more_8x_vec_backward)
+ 1:
+-	movq	%rdx, %rcx
+	mov	%RDX_LP, %RCX_LP
+ 	rep movsb
+ L(nop):
+ 	ret
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index ddec7f04..2fe1e5ac 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr tst-size_t-memcmp
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
+new file mode 100644
+index 00000000..66b71e17
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
+@@ -0,0 +1,58 @@
+/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "memcpy"
+#include "test-size_t.h"
+
+IMPL (memcpy, 1)
+
+typedef void *(*proto_t) (void *, const void *, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memcpy (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      do_memcpy (dest, src);
+      int res = memcmp (dest.p, src.p, dest.len);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-30.patch
+++ b/glibc-RHEL-15696-30.patch
@ -0,0 +1,497 @@
+From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 01:56:29 -0400
+Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
+ #27974]
+Content-type: text/plain; charset=UTF-8
+
+This commit fixes the bug mentioned in the previous commit.
+
+The previous implementations of wcsnlen in these files relied on maxlen *
+sizeof(wchar_t) not overflowing, which is not guaranteed by the standard.
+
+The new overflow tests added in the previous commit now
+pass (as well as all the other tests).
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
+ sysdeps/x86_64/multiarch/strlen-vec.S  |  15 ++-
+ 2 files changed, 107 insertions(+), 38 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index be8a5db5..37688966 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -44,21 +44,21 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+ 	/* Check zero length.  */
+#  ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RSI_LP, %RSI_LP
+#  else
+ 	test	%RSI_LP, %RSI_LP
+#  endif
+ 	jz	L(zero)
+ 	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
+ 	mov	%RSI_LP, %R8_LP
+-#  ifdef USE_AS_WCSLEN
+-	shl	$2, %RSI_LP
+-#  elif defined __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%esi, %esi
+-#  endif
+ # endif
+ 	movl	%edi, %eax
+ 	movq	%rdi, %rdx
+@@ -72,10 +72,10 @@ ENTRY (STRLEN)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+ 	VPCMPEQ	(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ # ifdef USE_AS_STRNLEN
+ 	/* If length < VEC_SIZE handle special.  */
+-	cmpq	$VEC_SIZE, %rsi
+	cmpq	$CHAR_PER_VEC, %rsi
+ 	jbe	L(first_vec_x0)
+ # endif
+ 	/* If empty continue to aligned_more. Otherwise return bit
+@@ -84,6 +84,7 @@ ENTRY (STRLEN)
+ 	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -97,9 +98,14 @@ L(zero):
+ L(first_vec_x0):
+ 	/* Set bit for max len so that tzcnt will return min of max len
+ 	   and position of first match.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+ 	btsq	%rsi, %rax
+ 	tzcntl	%eax, %eax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -113,14 +119,19 @@ L(first_vec_x1):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+#  else
+ 	subl	$(VEC_SIZE * 4 + 1), %ecx
+ 	addl	%ecx, %eax
+#  endif
+ # else
+ 	subl	%edx, %edi
+ 	incl	%edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -133,14 +144,19 @@ L(first_vec_x2):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+#  else
+ 	subl	$(VEC_SIZE * 3 + 1), %ecx
+ 	addl	%ecx, %eax
+#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -153,14 +169,19 @@ L(first_vec_x3):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+#  else
+ 	subl	$(VEC_SIZE * 2 + 1), %ecx
+ 	addl	%ecx, %eax
+#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE * 2 + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -173,14 +194,19 @@ L(first_vec_x4):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+#  else
+ 	subl	$(VEC_SIZE + 1), %ecx
+ 	addl	%ecx, %eax
+#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE * 3 + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -195,10 +221,14 @@ L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+ # ifdef USE_AS_STRNLEN
+-	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+-	   it simplies the logic in last_4x_vec_or_less.  */
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+	   because it simplies the logic in last_4x_vec_or_less.  */
+ 	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+ 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+ # endif
+ 	/* Load first VEC regardless.  */
+ 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+@@ -207,34 +237,38 @@ L(cross_page_continue):
+ 	subq	%rcx, %rsi
+ 	jb	L(last_4x_vec_or_less)
+ # endif
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x4)
+ 
+ 	/* Align data to VEC_SIZE * 4 - 1.  */
+ # ifdef USE_AS_STRNLEN
+ 	/* Before adjusting length check if at last VEC_SIZE * 4.  */
+-	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
+ 	jbe	L(last_4x_vec_or_less_load)
+ 	incq	%rdi
+ 	movl	%edi, %ecx
+ 	orq	$(VEC_SIZE * 4 - 1), %rdi
+ 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+ 	/* Readjust length.  */
+ 	addq	%rcx, %rsi
+ # else
+@@ -246,13 +280,13 @@ L(cross_page_continue):
+ L(loop_4x_vec):
+ # ifdef USE_AS_STRNLEN
+ 	/* Break if at end of length.  */
+-	subq	$(VEC_SIZE * 4), %rsi
+	subq	$(CHAR_PER_VEC * 4), %rsi
+ 	jb	L(last_4x_vec_or_less_cmpeq)
+ # endif
+-	/* Save some code size by microfusing VPMINU with the load. Since
+-	   the matches in ymm2/ymm4 can only be returned if there where no
+-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+-	 */
+	/* Save some code size by microfusing VPMINU with the load.
+	   Since the matches in ymm2/ymm4 can only be returned if there
+	   where no matches in ymm1/ymm3 respectively there is no issue
+	   with overlap.  */
+ 	vmovdqa	1(%rdi), %ymm1
+ 	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+ 	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
+@@ -260,7 +294,7 @@ L(loop_4x_vec):
+ 
+ 	VPMINU	%ymm2, %ymm4, %ymm5
+ 	VPCMPEQ	%ymm5, %ymm0, %ymm5
+-	vpmovmskb	%ymm5, %ecx
+	vpmovmskb %ymm5, %ecx
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	testl	%ecx, %ecx
+@@ -268,27 +302,28 @@ L(loop_4x_vec):
+ 
+ 
+ 	VPCMPEQ	%ymm1, %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	subq	%rdx, %rdi
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_return_x0)
+ 
+ 	VPCMPEQ	%ymm2, %ymm0, %ymm2
+-	vpmovmskb	%ymm2, %eax
+	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_return_x1)
+ 
+ 	/* Combine last 2 VEC.  */
+ 	VPCMPEQ	%ymm3, %ymm0, %ymm3
+-	vpmovmskb	%ymm3, %eax
+-	/* rcx has combined result from all 4 VEC. It will only be used if
+-	   the first 3 other VEC all did not contain a match.  */
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+ 	salq	$32, %rcx
+ 	orq	%rcx, %rax
+ 	tzcntq	%rax, %rax
+ 	subq	$(VEC_SIZE * 2 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -297,15 +332,19 @@ L(loop_4x_vec):
+ # ifdef USE_AS_STRNLEN
+ 	.p2align 4
+ L(last_4x_vec_or_less_load):
+-	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
+	 */
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ L(last_4x_vec_or_less_cmpeq):
+ 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+ L(last_4x_vec_or_less):
+-
+-	vpmovmskb	%ymm1, %eax
+-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+-	   VEC_SIZE * 4.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+	vpmovmskb %ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off
+	   by VEC_SIZE * 4.  */
+ 	testl	$(VEC_SIZE * 2), %esi
+ 	jnz	L(last_4x_vec)
+ 
+@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
+ 	jb	L(max)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+ 	cmpl	%eax, %esi
+@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
+ 	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -340,6 +380,7 @@ L(last_vec_return_x0):
+ 	subq	$(VEC_SIZE * 4 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -350,6 +391,7 @@ L(last_vec_return_x1):
+ 	subq	$(VEC_SIZE * 3 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -366,6 +408,7 @@ L(last_vec_x1_check):
+ 	incl	%eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -381,14 +424,14 @@ L(last_4x_vec):
+ 	jnz	L(last_vec_x1)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x2)
+ 
+ 	/* Normalize length.  */
+ 	andl	$(VEC_SIZE * 4 - 1), %esi
+ 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x3)
+ 
+@@ -396,7 +439,7 @@ L(last_4x_vec):
+ 	jb	L(max)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+ 	cmpl	%eax, %esi
+@@ -405,6 +448,7 @@ L(last_4x_vec):
+ 	addl	$(VEC_SIZE * 3 + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -419,6 +463,7 @@ L(last_vec_x1):
+ 	incl	%eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -432,6 +477,7 @@ L(last_vec_x2):
+ 	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -447,6 +493,7 @@ L(last_vec_x3):
+ 	addl	$(VEC_SIZE * 2 + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -455,13 +502,13 @@ L(max_end):
+ 	VZEROUPPER_RETURN
+ # endif
+ 
+-	/* Cold case for crossing page with first load.	 */
+	/* Cold case for crossing page with first load.  */
+ 	.p2align 4
+ L(cross_page_boundary):
+ 	/* Align data to VEC_SIZE - 1.  */
+ 	orq	$(VEC_SIZE - 1), %rdi
+ 	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ 	   so no need to manually mod rdx.  */
+ 	sarxl	%edx, %eax, %eax
+@@ -470,6 +517,10 @@ L(cross_page_boundary):
+ 	jnz	L(cross_page_less_vec)
+ 	leaq	1(%rdi), %rcx
+ 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %ecx
+#  endif
+ 	/* Check length.  */
+ 	cmpq	%rsi, %rcx
+ 	jb	L(cross_page_continue)
+@@ -479,6 +530,7 @@ L(cross_page_boundary):
+ 	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide length by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ #  endif
+ # endif
+@@ -489,6 +541,10 @@ L(return_vzeroupper):
+ 	.p2align 4
+ L(cross_page_less_vec):
+ 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+ 	cmpq	%rax, %rsi
+ 	cmovb	%esi, %eax
+ #  ifdef USE_AS_WCSLEN
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+index 8f660bb9..439e486a 100644
+--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -65,12 +65,25 @@ ENTRY(strlen)
+ 	ret
+ L(n_nonzero):
+ # ifdef AS_WCSLEN
+-	shl	$2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+   overflow the only way this program doesn't have undefined behavior 
+   is if there is a null terminator in valid memory so wcslen will 
+   suffice.  */
+	mov	%RSI_LP, %R10_LP
+	sar	$62, %R10_LP
+	test	%R10_LP, %R10_LP
+	jnz	__wcslen_sse4_1
+	sal	$2, %RSI_LP
+ # endif
+ 
+
+ /* Initialize long lived registers.  */
+ 
+ 	add	%RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+	jbe	__wcslen_sse4_1
+# endif
+ 	mov	%RSI_LP, %R10_LP
+ 	and	$-64, %R10_LP
+ 	mov	%RSI_LP, %R11_LP
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-31.patch
+++ b/glibc-RHEL-15696-31.patch
@ -0,0 +1,745 @@
+From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 19:36:06 -0400
+Subject: [PATCH] x86: Optimize strlen-evex.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strlen-evex.S. The
+optimizations are mostly small things but they add up to roughly
+10-30% performance improvement for strlen. The results for strnlen are
+a bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
+test-wcsnlen are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
+ 1 file changed, 317 insertions(+), 264 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
+index 05838190..4bf6874b 100644
+--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
+@@ -29,11 +29,13 @@
+ # ifdef USE_AS_WCSLEN
+ #  define VPCMP		vpcmpd
+ #  define VPMINU	vpminud
+-#  define SHIFT_REG	r9d
+#  define SHIFT_REG ecx
+#  define CHAR_SIZE	4
+ # else
+ #  define VPCMP		vpcmpb
+ #  define VPMINU	vpminub
+-#  define SHIFT_REG	ecx
+#  define SHIFT_REG edx
+#  define CHAR_SIZE	1
+ # endif
+ 
+ # define XMMZERO	xmm16
+@@ -46,132 +48,165 @@
+ # define YMM6		ymm22
+ 
+ # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+-	/* Check for zero length.  */
+	/* Check zero length.  */
+ 	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
+-#  ifdef USE_AS_WCSLEN
+-	shl	$2, %RSI_LP
+-#  elif defined __ILP32__
+#  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%esi, %esi
+ #  endif
+ 	mov	%RSI_LP, %R8_LP
+ # endif
+-	movl	%edi, %ecx
+-	movq	%rdi, %rdx
+	movl	%edi, %eax
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+-
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check.  */
+	andl	$(PAGE_SIZE - 1), %eax
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+ 	   null byte.  */
+ 	VPCMP	$0, (%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-
+ # ifdef USE_AS_STRNLEN
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rsi
+-	jbe	L(max)
+-# else
+-	jnz	L(first_vec_x0)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rsi
+	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	ret
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+-	addq	%rcx, %rsi
+L(zero):
+	xorl	%eax, %eax
+	ret
+ 
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+	.p2align 4
+L(first_vec_x0):
+	/* Set bit for max len so that tzcnt will return min of max len
+	   and position of first match.  */
+	btsq	%rsi, %rax
+	tzcntl	%eax, %eax
+	ret
+ # endif
+-	jmp	L(more_4x_vec)
+ 
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+-	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl	$2, %SHIFT_REG
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	CHAR_PER_VEC(%rdi, %rax), %eax
+ # endif
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+	ret
+ 
+-	/* Remove the leading bytes.  */
+-	sarxl	%SHIFT_REG, %eax, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+	.p2align 4
+L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-# endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
+ # endif
+ 	ret
+ 
+ 	.p2align 4
+-L(aligned_more):
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+-	    to void possible addition overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+-
+-	/* Check the end of data.  */
+-	subq	%rcx, %rsi
+-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
+ # endif
+	ret
+ 
+-	addq	$VEC_SIZE, %rdi
+-
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
+ # endif
+	ret
+ 
+-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+	movq	%rdi, %rdx
+	/* Align data to VEC_SIZE.  */
+	andq	$-(VEC_SIZE), %rdi
+L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+# ifdef USE_AS_STRNLEN
+	/* + CHAR_SIZE because it simplies the logic in
+	   last_4x_vec_or_less.  */
+	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
+	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+# endif
+	/* Load first VEC regardless.  */
+ 	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+# ifdef USE_AS_STRNLEN
+	/* Adjust length. If near end handle specially.  */
+	subq	%rcx, %rsi
+	jb	L(last_4x_vec_or_less)
+# endif
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+	test	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+@@ -179,258 +214,276 @@ L(more_4x_vec):
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+ 
+	addq	$VEC_SIZE, %rdi
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+	/* Check if at last VEC_SIZE * 4 length.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
+	jbe	L(last_4x_vec_or_less_load)
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+	/* Readjust length.  */
+ 	addq	%rcx, %rsi
+ # endif
+	/* Align data to VEC_SIZE * 4.  */
+	andq	$-(VEC_SIZE * 4), %rdi
+ 
+	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VMOVA	(%rdi), %YMM1
+-	VMOVA	VEC_SIZE(%rdi), %YMM2
+-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
+-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
+-
+-	VPMINU	%YMM1, %YMM2, %YMM5
+-	VPMINU	%YMM3, %YMM4, %YMM6
+	/* Load first VEC regardless.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+# ifdef USE_AS_STRNLEN
+	/* Break if at end of length.  */
+	subq	$(CHAR_PER_VEC * 4), %rsi
+	jb	L(last_4x_vec_or_less_cmpeq)
+# endif
+	/* Save some code size by microfusing VPMINU with the load. Since
+	   the matches in ymm2/ymm4 can only be returned if there where no
+	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+	 */
+	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
+	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
+	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+
+	VPCMP	$0, %YMM2, %YMMZERO, %k0
+	VPCMP	$0, %YMM4, %YMMZERO, %k1
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd	%k0, %k1
+	jz	L(loop_4x_vec)
+
+	/* Check if end was in first half.  */
+	kmovd	%k0, %eax
+	subq	%rdx, %rdi
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rdi
+# endif
+	testl	%eax, %eax
+	jz	L(second_vec_return)
+ 
+-	VPMINU	%YMM5, %YMM6, %YMM5
+-	VPCMP	$0, %YMM5, %YMMZERO, %k0
+-	ktestd	%k0, %k0
+-	jnz	L(4x_vec_end)
+	VPCMP	$0, %YMM1, %YMMZERO, %k2
+	kmovd	%k2, %edx
+	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
+# ifdef USE_AS_WCSLEN
+	sall	$CHAR_PER_VEC, %eax
+	orl	%edx, %eax
+	tzcntl	%eax, %eax
+# else
+	salq	$CHAR_PER_VEC, %rax
+	orq	%rdx, %rax
+	tzcntq	%rax, %rax
+# endif
+	addq	%rdi, %rax
+	ret
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+ 
+-# ifndef USE_AS_STRNLEN
+-	jmp	L(loop_4x_vec)
+-# else
+-	subq	$(VEC_SIZE * 4), %rsi
+-	ja	L(loop_4x_vec)
+# ifdef USE_AS_STRNLEN
+ 
+L(last_4x_vec_or_less_load):
+	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, %YMM1, %YMMZERO, %k0
+	addq	$(VEC_SIZE * 3), %rdi
+ L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %esi
+-	jle	L(last_2x_vec)
+-
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+	   VEC_SIZE * 4.  */
+	testl	$(CHAR_PER_VEC * 2), %esi
+	jnz	L(last_4x_vec)
+
+	/* length may have been negative or positive by an offset of
+	   CHAR_PER_VEC * 4 depending on where this was called from. This
+	   fixes that.  */
+	andl	$(CHAR_PER_VEC * 4 - 1), %esi
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x1_check)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+	/* Check the end of data.  */
+	subl	$CHAR_PER_VEC, %esi
+	jb	L(max)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x3_check)
+	subq	%rdx, %rdi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+#  endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+	ret
+L(max):
+ 	movq	%r8, %rax
+	ret
+# endif
+
+	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
+	   in the 4x VEC loop can use 2 byte encoding.  */
+	.p2align 4
+L(second_vec_return):
+	VPCMP	$0, %YMM3, %YMMZERO, %k0
+	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
+# ifdef USE_AS_WCSLEN
+	kunpckbw	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+# else
+	kunpckdq	%k0, %k1, %k0
+	kmovq	%k0, %rax
+	tzcntq	%rax, %rax
+# endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+	ret
+
+
+# ifdef USE_AS_STRNLEN
+L(last_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+ #  endif
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %esi
+L(last_4x_vec):
+	/* Test first 2x VEC normally.  */
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+ 
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
+	jnz	L(last_vec_x2)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+	/* Normalize length.  */
+	andl	$(CHAR_PER_VEC * 4 - 1), %esi
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	ret
+	jnz	L(last_vec_x3)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
+	/* Check the end of data.  */
+	subl	$(CHAR_PER_VEC * 3), %esi
+	jb	L(max)
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-#  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+	cmpl	%eax, %esi
+	jb	L(max_end)
+
+	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+ #  endif
+	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
+L(last_vec_x1):
+ 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+ #  endif
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
+L(last_vec_x2):
+ 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+ #  endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
+L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-#  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+	subl	$(CHAR_PER_VEC * 2), %esi
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+	cmpl	%eax, %esi
+	jb	L(max_end)
+	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+ #  endif
+	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
+ 	ret
+-
+-	.p2align 4
+-L(max):
+L(max_end):
+ 	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	ret
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+ 	ret
+ # endif
+ 
+	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+-L(first_vec_x0):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	/* Remove the leading bytes.  */
+ # ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+	   bytes.  */
+	movl	%edx, %ecx
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+ # endif
+-	ret
+-
+-	.p2align 4
+-L(first_vec_x1):
+	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
+	sarxl	%SHIFT_REG, %eax, %eax
+	testl	%eax, %eax
+# ifndef USE_AS_STRNLEN
+	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
+ 	ret
+-
+-	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
+# else
+	jnz	L(cross_page_less_vec)
+#  ifndef USE_AS_WCSLEN
+	movl	%edx, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+#  endif
+	movl	$CHAR_PER_VEC, %eax
+	subl	%ecx, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	ja	L(cross_page_continue)
+	movl	%esi, %eax
+ 	ret
+-
+-	.p2align 4
+-L(4x_vec_end):
+-	VPCMP	$0, %YMM1, %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	VPCMP	$0, %YMM2, %YMMZERO, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	VPCMP	$0, %YMM3, %YMMZERO, %k2
+-	kmovd	%k2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	VPCMP	$0, %YMM4, %YMMZERO, %k3
+-	kmovd	%k3, %eax
+-L(first_vec_x3):
+L(cross_page_less_vec):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
+	/* Select min of length and position of first null.  */
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+ 	ret
+# endif
+ 
+ END (STRLEN)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-32.patch
+++ b/glibc-RHEL-15696-32.patch
@ -0,0 +1,158 @@
+From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 30 Jun 2021 10:47:06 -0700
+Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033]
+Content-type: text/plain; charset=UTF-8
+
+From
+
+https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+
+* Intel TSX will be disabled by default.
+* The processor will force abort all Restricted Transactional Memory (RTM)
+  transactions by default.
+* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated,
+  which is set to indicate to updated software that the loaded microcode is
+  forcing RTM abort.
+* On processors that enumerate support for RTM, the CPUID enumeration bits
+  for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to
+  be set by default after microcode update.
+* Workloads that were benefited from Intel TSX might experience a change
+  in performance.
+* System software may use a new bit in Model-Specific Register (MSR) 0x10F
+  TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock
+  Elision (HLE) and RTM bits to indicate to software that Intel TSX is
+  disabled.
+
+1. Add RTM_ALWAYS_ABORT to CPUID features.
+2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set.  This skips the
+string/tst-memchr-rtm etc. testcases on the affected processors, which
+always fail after a microcode update.
+3. Check RTM feature, instead of usability, against /proc/cpuinfo.
+
+This fixes BZ #28033.
+---
+ manual/platform.texi                    | 3 +++
+ sysdeps/x86/cpu-features.c              | 5 ++++-
+ sysdeps/x86/sys/platform/x86.h          | 6 +++---
+ sysdeps/x86/tst-cpu-features-supports.c | 2 +-
+ sysdeps/x86/tst-get-cpu-features.c      | 2 ++
+ 5 files changed, 13 insertions(+), 5 deletions(-)
+
+Conflicts:
+	sysdeps/x86/bits/platform/x86.h
+	(doesn't exist)
+	sysdeps/x86/bits/platform/x86.h
+	(account for lack of upstream renames)
+
+diff --git a/manual/platform.texi b/manual/platform.texi
+index 8fec2933..b7e8aef7 100644
+--- a/manual/platform.texi
+++ b/manual/platform.texi
+@@ -510,6 +510,9 @@ capability.
+ @item
+ @code{RTM} -- RTM instruction extensions.
+ 
+@item
+@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable.
+
+ @item
+ @code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug.
+ 
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 3610ee5c..4889f062 100644
+--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
+@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
+   CPU_FEATURE_SET_USABLE (cpu_features, HLE);
+   CPU_FEATURE_SET_USABLE (cpu_features, BMI2);
+   CPU_FEATURE_SET_USABLE (cpu_features, ERMS);
+-  CPU_FEATURE_SET_USABLE (cpu_features, RTM);
+   CPU_FEATURE_SET_USABLE (cpu_features, RDSEED);
+   CPU_FEATURE_SET_USABLE (cpu_features, ADX);
+   CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT);
+@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features)
+   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI);
+   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B);
+   CPU_FEATURE_SET_USABLE (cpu_features, FSRM);
+  CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
+   CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE);
+   CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK);
+   CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64);
+@@ -779,6 +779,9 @@ no_cpuid:
+     GLRO(dl_platform) = "i586";
+ #endif
+ 
+  if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
+    CPU_FEATURE_SET_USABLE (cpu_features, RTM);
+
+ #if CET_ENABLED
+ # if HAVE_TUNABLES
+   TUNABLE_GET (x86_ibt, tunable_val_t *,
+diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
+index e5cc7c68..7a434926 100644
+--- a/sysdeps/x86/sys/platform/x86.h
+++ b/sysdeps/x86/sys/platform/x86.h
+@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
+ #define bit_cpu_INDEX_7_EDX_9	(1u << 9)
+ #define bit_cpu_MD_CLEAR	(1u << 10)
+-#define bit_cpu_INDEX_7_EDX_11	(1u << 11)
+#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
+ #define bit_cpu_INDEX_7_EDX_12	(1u << 12)
+ #define bit_cpu_INDEX_7_EDX_13	(1u << 13)
+ #define bit_cpu_SERIALIZE	(1u << 14)
+@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_9	COMMON_CPUID_INDEX_7
+ #define index_cpu_MD_CLEAR	COMMON_CPUID_INDEX_7
+-#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
+#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7
+ #define index_cpu_SERIALIZE	COMMON_CPUID_INDEX_7
+@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define reg_AVX512_VP2INTERSECT	edx
+ #define reg_INDEX_7_EDX_9	edx
+ #define reg_MD_CLEAR		edx
+-#define reg_INDEX_7_EDX_11	edx
+#define reg_RTM_ALWAYS_ABORT	edx
+ #define reg_INDEX_7_EDX_12	edx
+ #define reg_INDEX_7_EDX_13	edx
+ #define reg_SERIALIZE		edx
+diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
+index 287cf01f..8100a319 100644
+--- a/sysdeps/x86/tst-cpu-features-supports.c
+++ b/sysdeps/x86/tst-cpu-features-supports.c
+@@ -152,7 +152,7 @@ do_test (int argc, char **argv)
+   fails += CHECK_SUPPORTS (rdpid, RDPID);
+   fails += CHECK_SUPPORTS (rdrnd, RDRAND);
+   fails += CHECK_SUPPORTS (rdseed, RDSEED);
+-  fails += CHECK_SUPPORTS (rtm, RTM);
+  fails += CHECK_CPU_SUPPORTS (rtm, RTM);
+   fails += CHECK_SUPPORTS (serialize, SERIALIZE);
+   fails += CHECK_SUPPORTS (sha, SHA);
+   fails += CHECK_CPU_SUPPORTS (shstk, SHSTK);
+diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
+index 2763deb6..0717e5d8 100644
+--- a/sysdeps/x86/tst-get-cpu-features.c
+++ b/sysdeps/x86/tst-get-cpu-features.c
+@@ -183,6 +183,7 @@ do_test (void)
+   CHECK_CPU_FEATURE (UINTR);
+   CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
+   CHECK_CPU_FEATURE (MD_CLEAR);
+  CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
+   CHECK_CPU_FEATURE (SERIALIZE);
+   CHECK_CPU_FEATURE (HYBRID);
+   CHECK_CPU_FEATURE (TSXLDTRK);
+@@ -344,6 +345,7 @@ do_test (void)
+   CHECK_CPU_FEATURE_USABLE (FSRM);
+   CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
+   CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
+  CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
+   CHECK_CPU_FEATURE_USABLE (SERIALIZE);
+   CHECK_CPU_FEATURE_USABLE (HYBRID);
+   CHECK_CPU_FEATURE_USABLE (TSXLDTRK);
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-33.patch
+++ b/glibc-RHEL-15696-33.patch
@ -0,0 +1,51 @@
+From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 8 Jul 2021 16:13:19 -0400
+Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ
+ #28064]
+Content-type: text/plain; charset=UTF-8
+
+The following commit
+
+commit 6f573a27b6c8b4236445810a44660612323f5a73
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Wed Jun 23 01:19:34 2021 -0400
+
+    x86-64: Add wcslen optimize for sse4.1
+
+Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did
+not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit
+fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc
+implementation list and adding wcslen-sse4.1 to the ifunc
+implementation list.
+
+Testing:
+test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as
+well as all other tests in wcsmbs and string.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 580913ca..695cdba6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_evex)
+-	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      CPU_FEATURE_USABLE (SSE4_1),
+-			      __wcsnlen_sse4_1)
+			      __wcslen_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-34.patch
+++ b/glibc-RHEL-15696-34.patch
@ -0,0 +1,135 @@
+From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 15 Feb 2022 08:18:15 -0600
+Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
+ #28896]
+Content-type: text/plain; charset=UTF-8
+
+In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
+call strcmp-avx2 and wcscmp-avx2 respectively. This would have
+no checks around vzeroupper and would trigger spurious
+aborts. This commit fixes that.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
+AVX2 machines with and without RTM.
+
+Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/Makefile                        |  2 +-
+ sysdeps/x86/tst-strncmp-rtm.c               | 17 ++++++++++++++++-
+ sysdeps/x86_64/multiarch/strcmp-avx2.S      |  2 +-
+ sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S |  1 +
+ sysdeps/x86_64/multiarch/strncmp-avx2.S     |  1 +
+ sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S |  2 +-
+ sysdeps/x86_64/multiarch/wcsncmp-avx2.S     |  2 +-
+ 7 files changed, 22 insertions(+), 5 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strcmp-avx2.S
+	(split into two patches due to upstream bug differences)
+
+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+index 5be71ada..2d814915 100644
+--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
+@@ -38,7 +38,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm
+ CFLAGS-tst-strchr-rtm.c += -mrtm
+ CFLAGS-tst-strcpy-rtm.c += -mrtm
+ CFLAGS-tst-strlen-rtm.c += -mrtm
+-CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
+ CFLAGS-tst-strrchr-rtm.c += -mrtm
+ endif
+ 
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index 236ad951..4d0004b5 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -16,6 +16,7 @@
+    License along with the GNU C Library; if not, see
+    <https://www.gnu.org/licenses/>.  */
+ 
+#include <stdint.h>
+ #include <tst-string-rtm.h>
+ 
+ #define LOOP 3000
+@@ -45,8 +46,22 @@ function (void)
+     return 1;
+ }
+ 
+__attribute__ ((noinline, noclone))
+static int
+function_overflow (void)
+{
+  if (strncmp (string1, string2, SIZE_MAX) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+ static int
+ do_test (void)
+ {
+-  return do_test_1 ("strncmp", LOOP, prepare, function);
+  int status = do_test_1 ("strncmp", LOOP, prepare, function);
+  if (status != EXIT_SUCCESS)
+    return status;
+  status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
+  return status;
+ }
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 5d1c9d90..433ae047 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -95,7 +95,7 @@ ENTRY (STRCMP)
+ 	   length to bound a valid memory region. In these cases just use
+ 	   'wcscmp'.  */
+ 	shrq	$56, %rcx
+-	jnz	__wcscmp_avx2
+	jnz	OVERFLOW_STRCMP
+ #  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+index 37d1224b..68bad365 100644
+--- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+@@ -1,3 +1,4 @@
+ #define STRCMP	__strncmp_avx2_rtm
+ #define USE_AS_STRNCMP 1
+#define OVERFLOW_STRCMP	__strcmp_avx2_rtm
+ #include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
+index 1678bcc2..f138e9f1 100644
+--- a/sysdeps/x86_64/multiarch/strncmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
+@@ -1,3 +1,4 @@
+ #define STRCMP	__strncmp_avx2
+ #define USE_AS_STRNCMP 1
+#define OVERFLOW_STRCMP __strcmp_avx2
+ #include "strcmp-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+index 4e88c70c..f467582c 100644
+--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+@@ -1,5 +1,5 @@
+ #define STRCMP __wcsncmp_avx2_rtm
+ #define USE_AS_STRNCMP 1
+ #define USE_AS_WCSCMP 1
+-
+#define OVERFLOW_STRCMP	__wcscmp_avx2_rtm
+ #include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
+index 4fa1de4d..e9ede522 100644
+--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
+@@ -1,5 +1,5 @@
+ #define STRCMP __wcsncmp_avx2
+ #define USE_AS_STRNCMP 1
+ #define USE_AS_WCSCMP 1
+-
+#define OVERFLOW_STRCMP	__wcscmp_avx2
+ #include "strcmp-avx2.S"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-35.patch
+++ b/glibc-RHEL-15696-35.patch
@ -0,0 +1,51 @@
+From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 9 May 2020 12:04:23 -0700
+Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ
+ #25966]
+Content-type: text/plain; charset=UTF-8
+
+Since __x86_shared_non_temporal_threshold is defined as
+
+long int __x86_shared_non_temporal_threshold;
+
+and long int is 4 bytes for x32, use RDX_LP to compare against
+__x86_shared_non_temporal_threshold in assembly code.
+---
+ sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 71f5954d..673b73aa 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -245,7 +245,7 @@ L(return):
+ #endif
+ 
+ L(movsb):
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	jae	L(more_8x_vec)
+ 	cmpq	%rsi, %rdi
+ 	jb	1f
+@@ -397,7 +397,7 @@ L(more_8x_vec):
+ 	addq	%r8, %rdx
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ 	/* Check non-temporal store threshold.  */
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	ja	L(large_forward)
+ #endif
+ L(loop_4x_vec_forward):
+@@ -448,7 +448,7 @@ L(more_8x_vec_backward):
+ 	subq	%r8, %rdx
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ 	/* Check non-temporal store threshold.  */
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	ja	L(large_backward)
+ #endif
+ L(loop_4x_vec_backward):
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-36.patch
+++ b/glibc-RHEL-15696-36.patch
@ -0,0 +1,44 @@
+From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Jun 2020 12:41:18 -0700
+Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register
+Content-type: text/plain; charset=UTF-8
+
+Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use
+%xmmN, instead of %ymmN, with vpxor to clear a vector register.
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S  | 4 ++--
+ sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 433ae047..70d8499b 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -105,8 +105,8 @@ ENTRY (STRCMP)
+ # endif
+ 	movl	%edi, %eax
+ 	xorl	%edx, %edx
+-	/* Make %ymm7 all zeros in this function.  */
+-	vpxor	%ymm7, %ymm7, %ymm7
+	/* Make %xmm7 (%ymm7) all zeros in this function.  */
+	vpxor	%xmm7, %xmm7, %xmm7
+ 	orl	%esi, %eax
+ 	andl	$(PAGE_SIZE - 1), %eax
+ 	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
+diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+index 9f22a15e..c949410b 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+@@ -48,7 +48,7 @@ ENTRY (STRRCHR)
+ 	movl	%edi, %ecx
+ 	/* Broadcast CHAR to YMM4.  */
+ 	VPBROADCAST %xmm4, %ymm4
+-	vpxor	%ymm0, %ymm0, %ymm0
+	vpxor	%xmm0, %xmm0, %xmm0
+ 
+ 	/* Check if we may cross page boundary with one vector load.  */
+ 	andl	$(2 * VEC_SIZE - 1), %ecx
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-37.patch
+++ b/glibc-RHEL-15696-37.patch
@ -0,0 +1,359 @@
+From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001
+From: noah <goldstein.w.n@gmail.com>
+Date: Wed, 3 Feb 2021 00:38:59 -0500
+Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. Just seemed the performance could be improved a bit. Observed
+and expected behavior are unchanged. Optimized body of main
+loop. Updated page cross logic and optimized accordingly. Made a few
+minor instruction selection modifications. No regressions in test
+suite. Both test-strchrnul and test-strchr passed.
+---
+ sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
+ sysdeps/x86_64/multiarch/strchr.c      |   4 +-
+ 2 files changed, 114 insertions(+), 115 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strchr.c
+	(account for missing upstream macros)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index da7d2620..919d256c 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -27,10 +27,12 @@
+ # ifdef USE_AS_WCSCHR
+ #  define VPBROADCAST	vpbroadcastd
+ #  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
+ #  define CHAR_REG	esi
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
+ #  define CHAR_REG	sil
+ # endif
+ 
+@@ -43,71 +45,54 @@
+ # endif
+ 
+ # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCHR)
+ 	movl	%edi, %ecx
+-	/* Broadcast CHAR to YMM0.  */
+# ifndef USE_AS_STRCHRNUL
+	xorl	%edx, %edx
+# endif
+
+	/* Broadcast CHAR to YMM0.	*/
+ 	vmovd	%esi, %xmm0
+ 	vpxor	%xmm9, %xmm9, %xmm9
+ 	VPBROADCAST %xmm0, %ymm0
+-	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+ 
+-	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
+-	   null byte.  */
+-	vmovdqu	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	/* Check if we cross page boundary with one vector load.  */
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja  L(cross_page_boundary)
+ 
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
+-	jmp	L(more_4x_vec)
+-
+-	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
+	   null byte.  */
+ 	vmovdqu	(%rdi), %ymm8
+ 	VPCMPEQ %ymm8, %ymm0, %ymm1
+ 	VPCMPEQ %ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+ 	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	/* Found CHAR or the null byte.  */
+	jz	L(more_vecs)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rax
+-# ifdef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+L(more_vecs):
+	/* Align data for aligned loads in the loop.  */
+	andq	$-VEC_SIZE, %rdi
+ L(aligned_more):
+-	addq	$VEC_SIZE, %rdi
+ 
+-L(more_4x_vec):
+-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.  */
+-	vmovdqa	(%rdi), %ymm8
+	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.	*/
+	vmovdqa	VEC_SIZE(%rdi), %ymm8
+	addq	$VEC_SIZE, %rdi
+ 	VPCMPEQ %ymm8, %ymm0, %ymm1
+ 	VPCMPEQ %ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+@@ -137,61 +122,24 @@ L(more_4x_vec):
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x3)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+-
+-	.p2align 4
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa	(%rdi), %ymm5
+-	vmovdqa	VEC_SIZE(%rdi), %ymm6
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+-
+-	VPCMPEQ %ymm5, %ymm0, %ymm1
+-	VPCMPEQ %ymm6, %ymm0, %ymm2
+-	VPCMPEQ %ymm7, %ymm0, %ymm3
+-	VPCMPEQ %ymm8, %ymm0, %ymm4
+-
+-	VPCMPEQ %ymm5, %ymm9, %ymm5
+-	VPCMPEQ %ymm6, %ymm9, %ymm6
+-	VPCMPEQ %ymm7, %ymm9, %ymm7
+-	VPCMPEQ %ymm8, %ymm9, %ymm8
+-
+-	vpor	%ymm1, %ymm5, %ymm1
+-	vpor	%ymm2, %ymm6, %ymm2
+-	vpor	%ymm3, %ymm7, %ymm3
+-	vpor	%ymm4, %ymm8, %ymm4
+-
+-	vpor	%ymm1, %ymm2, %ymm5
+-	vpor	%ymm3, %ymm4, %ymm6
+-
+-	vpor	%ymm5, %ymm6, %ymm5
+-
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+	jz	L(prep_loop_4x)
+ 
+-	jmp	L(loop_4x_vec)
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+ 
+ 	.p2align 4
+ L(first_vec_x0):
+-	/* Found CHAR or the null byte.  */
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -199,13 +147,9 @@ L(first_vec_x0):
+ 	.p2align 4
+ L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+ 	leaq	VEC_SIZE(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -213,42 +157,97 @@ L(first_vec_x1):
+ 	.p2align 4
+ L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+	/* Found CHAR or the null byte.	 */
+ 	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
+L(prep_loop_4x):
+	/* Align data to 4 * VEC_SIZE.	*/
+	andq	$-(VEC_SIZE * 4), %rdi
+
+ 	.p2align 4
+-L(4x_vec_end):
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+
+	/* Leaves only CHARS matching esi as 0.	 */
+	vpxor	%ymm5, %ymm0, %ymm1
+	vpxor	%ymm6, %ymm0, %ymm2
+	vpxor	%ymm7, %ymm0, %ymm3
+	vpxor	%ymm8, %ymm0, %ymm4
+
+	VPMINU	%ymm1, %ymm5, %ymm1
+	VPMINU	%ymm2, %ymm6, %ymm2
+	VPMINU	%ymm3, %ymm7, %ymm3
+	VPMINU	%ymm4, %ymm8, %ymm4
+
+	VPMINU	%ymm1, %ymm2, %ymm5
+	VPMINU	%ymm3, %ymm4, %ymm6
+
+	VPMINU	%ymm5, %ymm6, %ymm5
+
+	VPCMPEQ %ymm5, %ymm9, %ymm5
+	vpmovmskb %ymm5, %eax
+
+	addq	$(VEC_SIZE * 4), %rdi
+	testl	%eax, %eax
+	jz  L(loop_4x_vec)
+
+	VPCMPEQ %ymm1, %ymm9, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x0)
+
+	VPCMPEQ %ymm2, %ymm9, %ymm2
+ 	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+
+	VPCMPEQ %ymm3, %ymm9, %ymm3
+	VPCMPEQ %ymm4, %ymm9, %ymm4
+	vpmovmskb %ymm3, %ecx
+ 	vpmovmskb %ymm4, %eax
+	salq	$32, %rax
+	orq %rcx, %rax
+	tzcntq  %rax, %rax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	/* Cold case for crossing page with first load.	 */
+	.p2align 4
+L(cross_page_boundary):
+	andq	$-VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+
+	vmovdqa	(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bits.	 */
+	sarxl	%ecx, %eax, %eax
+ 	testl	%eax, %eax
+-L(first_vec_x3):
+	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$(VEC_SIZE * 3), %rax
+	addq	%rcx, %rdi
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
+ END (STRCHR)
+-#endif
+# endif
+diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
+index 7e582f02..5225bd4f 100644
+--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
+@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-38.patch
+++ b/glibc-RHEL-15696-38.patch
@ -0,0 +1,67 @@
+From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 25 Jan 2020 14:19:40 -0800
+Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130]
+Content-type: text/plain; charset=UTF-8
+
+When copying with "rep movsb", if the distance between source and
+destination is N*4GB + [1..63] with N >= 0, performance may be very
+slow.  This patch updates memmove-vec-unaligned-erms.S for AVX and
+AVX512 versions with the distance in RCX:
+
+	cmpl	$63, %ecx
+	// Don't use "rep movsb" if ECX <= 63
+	jbe	L(Don't use "rep movsb")
+	Use "rep movsb"
+
+Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random
+and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its
+performance impact is within noise range as "rep movsb" is only used for
+data size >= 4KB.
+---
+ .../multiarch/memmove-vec-unaligned-erms.S    | 21 +++++++++++++++++++
+ 1 file changed, 21 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 673b73aa..c475fed4 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -64,6 +64,13 @@
+ # endif
+ #endif
+ 
+/* Avoid short distance rep movsb only with non-SSE vector.  */
+#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+#else
+# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
+#endif
+
+ #ifndef PREFETCH
+ # define PREFETCH(addr) prefetcht0 addr
+ #endif
+@@ -255,7 +262,21 @@ L(movsb):
+ 	cmpq	%r9, %rdi
+ 	/* Avoid slow backward REP MOVSB.  */
+ 	jb	L(more_8x_vec_backward)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	jmp	2f
+# endif
+ 1:
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+	movq	%rsi, %rcx
+	subq	%rdi, %rcx
+2:
+/* Avoid "rep movsb" if RCX, the distance between source and destination,
+   is N*4GB + [1..63] with N >= 0.  */
+	cmpl	$63, %ecx
+	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
+# endif
+ 	mov	%RDX_LP, %RCX_LP
+ 	rep movsb
+ L(nop):
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-39.patch
+++ b/glibc-RHEL-15696-39.patch
@ -0,0 +1,449 @@
+From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001
+From: noah <goldstein.w.n@gmail.com>
+Date: Sat, 3 Apr 2021 04:12:15 -0400
+Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No Bug. This commit updates the large memcpy case (no overlap). The
+update is to perform memcpy on either 2 or 4 contiguous pages at
+once. This 1) helps to alleviate the effects of false memory aliasing
+when destination and source have a close 4k alignment and 2) in most
+cases and for most DRAM units is a modestly more efficient access
+pattern. These changes are a clear performance improvement for
+VEC_SIZE = 16/32, though more ambiguous for VEC_SIZE = 64. test-memcpy,
+test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
+pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
+ 1 file changed, 265 insertions(+), 73 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+	(different number of sections)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index c475fed4..3e2dd6bc 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -32,7 +32,16 @@
+       overlapping addresses.
+    6. If size >= __x86_shared_non_temporal_threshold and there is no
+       overlap between destination and source, use non-temporal store
+-      instead of aligned store.  */
+      instead of aligned store copying from either 2 or 4 pages at
+      once.
+   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+      and source and destination do not page alias, copy from 2 pages
+      at once using non-temporal stores. Page aliasing in this case is
+      considered true if destination's page alignment - sources' page
+      alignment is less than 8 * VEC_SIZE.
+   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+      and destination do page alias copy from 4 pages at once using
+      non-temporal stores.  */
+ 
+ #include <sysdep.h>
+ 
+@@ -64,6 +73,34 @@
+ # endif
+ #endif
+ 
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Byte per page for large_memcpy inner loop.  */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+/* Amount to shift rdx by to compare for memcpy_large_4x.  */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
+ /* Avoid short distance rep movsb only with non-SSE vector.  */
+ #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+ # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+@@ -103,6 +140,28 @@
+ # error Unsupported PREFETCH_SIZE!
+ #endif
+ 
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVNT  vec0, (offset)base; \
+	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1; \
+	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
+	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVNT	vec0, (offset)base; \
+	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
+	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
+	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -390,6 +449,15 @@ L(last_4x_vec):
+ 	VZEROUPPER_RETURN
+ 
+ L(more_8x_vec):
+	/* Check if non-temporal move candidate.  */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	ja	L(large_memcpy_2x)
+#endif
+	/* Entry if rdx is greater than non-temporal threshold but there
+       is overlap.  */
+L(more_8x_vec_check):
+ 	cmpq	%rsi, %rdi
+ 	ja	L(more_8x_vec_backward)
+ 	/* Source == destination is less common.  */
+@@ -416,24 +484,21 @@ L(more_8x_vec):
+ 	subq	%r8, %rdi
+ 	/* Adjust length.  */
+ 	addq	%r8, %rdx
+-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-	/* Check non-temporal store threshold.  */
+-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+-	ja	L(large_forward)
+-#endif
+
+	.p2align 4
+ L(loop_4x_vec_forward):
+ 	/* Copy 4 * VEC a time forward.  */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+-	addq	$(VEC_SIZE * 4), %rsi
+-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$-(VEC_SIZE * 4), %rsi
+	addq	$-(VEC_SIZE * 4), %rdx
+ 	VMOVA	%VEC(0), (%rdi)
+ 	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+ 	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+ 	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+-	addq	$(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	ja	L(loop_4x_vec_forward)
+ 	/* Store the last 4 * VEC.  */
+@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
+ 	subq	%r8, %r9
+ 	/* Adjust length.  */
+ 	subq	%r8, %rdx
+-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-	/* Check non-temporal store threshold.  */
+-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+-	ja	L(large_backward)
+-#endif
+
+	.p2align 4
+ L(loop_4x_vec_backward):
+ 	/* Copy 4 * VEC a time backward.  */
+ 	VMOVU	(%rcx), %VEC(0)
+ 	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+ 	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+ 	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+-	subq	$(VEC_SIZE * 4), %rcx
+-	subq	$(VEC_SIZE * 4), %rdx
+	addq	$-(VEC_SIZE * 4), %rcx
+	addq	$-(VEC_SIZE * 4), %rdx
+ 	VMOVA	%VEC(0), (%r9)
+ 	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+ 	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+ 	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
+-	subq	$(VEC_SIZE * 4), %r9
+	addq	$-(VEC_SIZE * 4), %r9
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	ja	L(loop_4x_vec_backward)
+ 	/* Store the first 4 * VEC.  */
+@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
+ 	VZEROUPPER_RETURN
+ 
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-L(large_forward):
+	.p2align 4
+L(large_memcpy_2x):
+	/* Compute absolute value of difference between source and
+	   destination.  */
+	movq	%rdi, %r9
+	subq	%rsi, %r9
+	movq	%r9, %r8
+	leaq	-1(%r9), %rcx
+	sarq	$63, %r8
+	xorq	%r8, %r9
+	subq	%r8, %r9
+ 	/* Don't use non-temporal store if there is overlap between
+-	   destination and source since destination may be in cache
+-	   when source is loaded.  */
+-	leaq    (%rdi, %rdx), %r10
+-	cmpq    %r10, %rsi
+-	jb	L(loop_4x_vec_forward)
+-L(loop_large_forward):
+	   destination and source since destination may be in cache when
+	   source is loaded.  */
+	cmpq	%r9, %rdx
+	ja	L(more_8x_vec_check)
+
+	/* Cache align destination. First store the first 64 bytes then
+	   adjust alignments.  */
+	VMOVU	(%rsi), %VEC(8)
+#if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+	VMOVU	%VEC(8), (%rdi)
+#if VEC_SIZE < 64
+	VMOVU	%VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32
+	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+	/* Adjust source, destination, and size.  */
+	movq	%rdi, %r8
+	andq	$63, %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$64, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+
+	/* Test if source and destination addresses will alias. If they do
+	   the larger pipeline in large_memcpy_4x alleviated the
+	   performance drop.  */
+	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
+	jz	L(large_memcpy_4x)
+
+	movq	%rdx, %r10
+	shrq	$LOG_4X_MEMCPY_THRESH, %r10
+	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
+	jae	L(large_memcpy_4x)
+
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 2 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+	/* Copy 4x VEC at a time from 2 pages.  */
+	.p2align 4
+L(loop_large_memcpy_2x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	subq	$-LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	subq	$-LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_2x_inner)
+	addq	$PAGE_SIZE, %rdi
+	addq	$PAGE_SIZE, %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_2x_outer)
+	sfence
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_2x_end)
+
+	/* Handle the last 2 * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_2x_tail):
+ 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+-	addq	$PREFETCHED_LOAD_SIZE, %rsi
+-	subq	$PREFETCHED_LOAD_SIZE, %rdx
+-	VMOVNT	%VEC(0), (%rdi)
+-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
+-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
+-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
+-	addq	$PREFETCHED_LOAD_SIZE, %rdi
+-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+-	ja	L(loop_large_forward)
+-	sfence
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$-(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_2x_tail)
+
+L(large_memcpy_2x_end):
+ 	/* Store the last 4 * VEC.  */
+-	VMOVU	%VEC(5), (%rcx)
+-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+-	/* Store the first VEC.  */
+-	VMOVU	%VEC(4), (%r11)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ 
+-L(large_backward):
+-	/* Don't use non-temporal store if there is overlap between
+-	   destination and source since destination may be in cache
+-	   when source is loaded.  */
+-	leaq    (%rcx, %rdx), %r10
+-	cmpq    %r10, %r9
+-	jb	L(loop_4x_vec_backward)
+-L(loop_large_backward):
+-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
+-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
+-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
+-	VMOVU	(%rcx), %VEC(0)
+-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+-	subq	$PREFETCHED_LOAD_SIZE, %rcx
+-	subq	$PREFETCHED_LOAD_SIZE, %rdx
+-	VMOVNT	%VEC(0), (%r9)
+-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
+-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
+-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
+-	subq	$PREFETCHED_LOAD_SIZE, %r9
+-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+-	ja	L(loop_large_backward)
+	.p2align 4
+L(large_memcpy_4x):
+	movq	%rdx, %r10
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 4 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$(LOG_PAGE_SIZE + 2), %r10
+	/* Copy 4x VEC at a time from 4 pages.  */
+	.p2align 4
+L(loop_large_memcpy_4x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+	/* Only one prefetch set per page as doing 4 pages give more time
+	   for prefetcher to keep up.  */
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	subq	$-LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	subq	$-LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_4x_inner)
+	addq	$(PAGE_SIZE * 3), %rdi
+	addq	$(PAGE_SIZE * 3), %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_4x_outer)
+ 	sfence
+-	/* Store the first 4 * VEC.  */
+-	VMOVU	%VEC(4), (%rdi)
+-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+-	/* Store the last VEC.  */
+-	VMOVU	%VEC(8), (%r11)
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_4x_end)
+
+	/* Handle the last 4  * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_4x_tail):
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$-(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
+	/* Store the last 4 * VEC.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+ END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-4.patch
+++ b/glibc-RHEL-15696-4.patch
@ -0,0 +1,151 @@
+From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:29:58 -0800
+Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ#
+ 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memrchr for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/memrchr.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr.
+	* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file.
+---
+ sysdeps/x86_64/memrchr.S                |  4 +-
+ sysdeps/x86_64/multiarch/memrchr-avx2.S |  4 +-
+ sysdeps/x86_64/x32/Makefile             |  3 +-
+ sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++
+ 4 files changed, 63 insertions(+), 5 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
+index b8e3fa1d..dc82f8f7 100644
+--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
+@@ -24,13 +24,13 @@
+ ENTRY (__memrchr)
+ 	movd	%esi, %xmm1
+ 
+-	sub	$16, %rdx
+	sub	$16, %RDX_LP
+ 	jbe	L(length_less16)
+ 
+ 	punpcklbw	%xmm1, %xmm1
+ 	punpcklbw	%xmm1, %xmm1
+ 
+-	add	%rdx, %rdi
+	add	%RDX_LP, %RDI_LP
+ 	pshufd	$0, %xmm1, %xmm1
+ 
+ 	movdqu	(%rdi), %xmm0
+diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+index b41a58bc..ce488dd9 100644
+--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
+ 	vmovd	%esi, %xmm0
+ 	vpbroadcastb %xmm0, %ymm0
+ 
+-	subq	$VEC_SIZE, %rdx
+	sub	$VEC_SIZE, %RDX_LP
+ 	jbe	L(last_vec_or_less)
+ 
+-	addq	%rdx, %rdi
+	add	%RDX_LP, %RDI_LP
+ 
+ 	/* Check the last VEC_SIZE bytes.  */
+ 	vpcmpeqb (%rdi), %ymm0, %ymm1
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 2fe1e5ac..e99dbd7c 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+	 tst-size_t-memrchr
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
+new file mode 100644
+index 00000000..c83699c0
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
+@@ -0,0 +1,57 @@
+/* Test memrchr with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "memrchr"
+#include "test-size_t.h"
+
+IMPL (memrchr, 1)
+
+typedef void * (*proto_t) (const void *, int, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memrchr (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t src = { { page_size }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      void * res = do_memrchr (src, c);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %p != NULL",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-40.patch
+++ b/glibc-RHEL-15696-40.patch
@ -0,0 +1,92 @@
+From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 19 Apr 2021 10:45:07 -0700
+Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+Since strchr-avx2.S updated by
+
+commit 1f745ecc2109890886b161d4791e1406fdfc29b8
+Author: noah <goldstein.w.n@gmail.com>
+Date:   Wed Feb 3 00:38:59 2021 -0500
+
+    x86-64: Refactor and improve performance of strchr-avx2.S
+
+uses sarx:
+
+c4 e2 72 f7 c0       	sarx   %ecx,%eax,%eax
+
+for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
+ifunc-avx2.h.
+---
+ sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
+ 2 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+index e0f30e61..ef72b73f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 695cdba6..85b8863a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strchr.c.  */
+   IFUNC_IMPL (i, name, strchr,
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strchr_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */
+   IFUNC_IMPL (i, name, strchrnul,
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strchrnul_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strchrnul_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
+   IFUNC_IMPL (i, name, wcschr,
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcschr_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcschr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-41.patch
+++ b/glibc-RHEL-15696-41.patch
@ -0,0 +1,265 @@
+From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 17:48:10 -0400
+Subject: [PATCH] x86: Optimize less_vec evex and avx512
+ memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit adds an optimized path for the less_vec memset case
+that uses the avx512vl/avx512bw mask store, avoiding the excessive
+branches. test-memset and test-wmemset are passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 40 ++++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-memset.h       |  6 ++-
+ .../multiarch/memset-avx512-unaligned-erms.S  |  2 +-
+ .../multiarch/memset-evex-unaligned-erms.S    |  2 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 51 +++++++++++++++----
+ 5 files changed, 74 insertions(+), 27 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 85b8863a..d59d65f8 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_chk_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wmemset_avx2_unaligned_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512VL),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512VL),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_avx512_unaligned))
+ 
+ #ifdef SHARED
+@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_chk_avx2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+-			      CPU_FEATURE_USABLE (AVX512VL),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_chk_avx512_unaligned))
+ #endif
+ 
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 19795938..100e3707 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ 	    return OPTIMIZE (avx512_unaligned_erms);
+@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ 	    return OPTIMIZE (evex_unaligned_erms);
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 22e7b187..8ad842fc 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -19,6 +19,6 @@
+ # define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
+-
+# define USE_LESS_VEC_MASK_STORE	1
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index ae0a4d6e..640f0929 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -19,6 +19,6 @@
+ # define SECTION(p)		p##.evex
+ # define MEMSET_SYMBOL(p,s)	p##_evex_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_evex_##s
+-
+# define USE_LESS_VEC_MASK_STORE	1
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index bae5cba4..f877ac9d 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -63,6 +63,8 @@
+ # endif
+ #endif
+ 
+#define PAGE_SIZE 4096
+
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -213,11 +215,38 @@ L(loop):
+ 	cmpq	%rcx, %rdx
+ 	jne	L(loop)
+ 	VZEROUPPER_SHORT_RETURN
+
+	.p2align 4
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+ # endif
+# ifdef USE_LESS_VEC_MASK_STORE
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check. Note that we are using rax which is set in
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
+	 */
+	andl	$(PAGE_SIZE - 1), %edi
+	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
+	   performance degradation when it has to fault supress.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	ja	L(cross_page)
+# if VEC_SIZE > 32
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+	kmovq	%rcx, %k1
+# else
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k1
+# endif
+	vmovdqu8	%VEC(0), (%rax) {%k1}
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(cross_page):
+# endif
+ # if VEC_SIZE > 32
+ 	cmpb	$32, %dl
+ 	jae	L(between_32_63)
+@@ -234,36 +263,36 @@ L(less_vec):
+ 	cmpb	$1, %dl
+ 	ja	L(between_2_3)
+ 	jb	1f
+-	movb	%cl, (%rdi)
+	movb	%cl, (%rax)
+ 1:
+ 	VZEROUPPER_RETURN
+ # if VEC_SIZE > 32
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, -32(%rdi,%rdx)
+-	VMOVU	%YMM0, (%rdi)
+	VMOVU	%YMM0, -32(%rax,%rdx)
+	VMOVU	%YMM0, (%rax)
+ 	VZEROUPPER_RETURN
+ # endif
+ # if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	VMOVU	%XMM0, -16(%rdi,%rdx)
+-	VMOVU	%XMM0, (%rdi)
+	VMOVU	%XMM0, -16(%rax,%rdx)
+	VMOVU	%XMM0, (%rax)
+ 	VZEROUPPER_RETURN
+ # endif
+ 	/* From 8 to 15.  No branch when size == 8.  */
+ L(between_8_15):
+-	movq	%rcx, -8(%rdi,%rdx)
+-	movq	%rcx, (%rdi)
+	movq	%rcx, -8(%rax,%rdx)
+	movq	%rcx, (%rax)
+ 	VZEROUPPER_RETURN
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%ecx, -4(%rdi,%rdx)
+-	movl	%ecx, (%rdi)
+	movl	%ecx, -4(%rax,%rdx)
+	movl	%ecx, (%rax)
+ 	VZEROUPPER_RETURN
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%cx, -2(%rdi,%rdx)
+-	movw	%cx, (%rdi)
+	movw	%cx, -2(%rax,%rdx)
+	movw	%cx, (%rax)
+ 	VZEROUPPER_RETURN
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-42.patch
+++ b/glibc-RHEL-15696-42.patch
@ -0,0 +1,396 @@
+From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 23 Apr 2021 15:56:24 -0400
+Subject: [PATCH] x86: Optimize strchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strchr-avx2.S. The optimizations are all
+small things such as save an ALU in the alignment process, saving a
+few instructions in the loop return, saving some bytes in the main
+loop, and increasing the ILP in the return cases. test-strchr,
+test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++----------
+ 1 file changed, 170 insertions(+), 120 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strchr-avx2.S
+	(rearranged to account for branch changes)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index 919d256c..5884726b 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -49,133 +49,144 @@
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCHR)
+-	movl	%edi, %ecx
+-# ifndef USE_AS_STRCHRNUL
+-	xorl	%edx, %edx
+-# endif
+-
+ 	/* Broadcast CHAR to YMM0.	*/
+ 	vmovd	%esi, %xmm0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	VPBROADCAST	%xmm0, %ymm0
+ 	vpxor	%xmm9, %xmm9, %xmm9
+-	VPBROADCAST %xmm0, %ymm0
+ 
+ 	/* Check if we cross page boundary with one vector load.  */
+-	andl	$(PAGE_SIZE - 1), %ecx
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+-	ja  L(cross_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
+ 	   null byte.  */
+ 	vmovdqu	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jz	L(more_vecs)
+	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+ 	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+ # endif
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
+-
+-	.p2align 4
+-L(more_vecs):
+-	/* Align data for aligned loads in the loop.  */
+-	andq	$-VEC_SIZE, %rdi
+-L(aligned_more):
+-
+-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.	*/
+-	vmovdqa	VEC_SIZE(%rdi), %ymm8
+-	addq	$VEC_SIZE, %rdi
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	vmovdqa	VEC_SIZE(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jz	L(prep_loop_4x)
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+-	tzcntl	%eax, %eax
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+L(zero):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+ # endif
+-	VZEROUPPER
+-	ret
+
+ 
+ 	.p2align 4
+-L(first_vec_x0):
+L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-	addq	%rdi, %rax
+	incq	%rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+ # endif
+	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x1):
+L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-	leaq	VEC_SIZE(%rdi, %rax), %rax
+	addq	$(VEC_SIZE + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+ # endif
+	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+L(first_vec_x3):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+ # endif
+	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+-L(prep_loop_4x):
+-	/* Align data to 4 * VEC_SIZE.	*/
+-	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
+	   on x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vmovdqa	1(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+ 
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+	/* Align data to VEC_SIZE * 4.	*/
+	addq	$(VEC_SIZE * 4 + 1), %rdi
+	andq	$-(VEC_SIZE * 4), %rdi
+ 	.p2align 4
+ L(loop_4x_vec):
+ 	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+-	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+-	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+-	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+	vmovdqa	(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+ 
+ 	/* Leaves only CHARS matching esi as 0.	 */
+ 	vpxor	%ymm5, %ymm0, %ymm1
+@@ -191,63 +202,102 @@ L(loop_4x_vec):
+ 	VPMINU	%ymm1, %ymm2, %ymm5
+ 	VPMINU	%ymm3, %ymm4, %ymm6
+ 
+-	VPMINU	%ymm5, %ymm6, %ymm5
+	VPMINU	%ymm5, %ymm6, %ymm6
+ 
+-	VPCMPEQ %ymm5, %ymm9, %ymm5
+-	vpmovmskb %ymm5, %eax
+	VPCMPEQ	%ymm6, %ymm9, %ymm6
+	vpmovmskb %ymm6, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-	testl	%eax, %eax
+-	jz  L(loop_4x_vec)
+ 
+-	VPCMPEQ %ymm1, %ymm9, %ymm1
+	VPCMPEQ	%ymm1, %ymm9, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x0)
+
+ 
+-	VPCMPEQ %ymm2, %ymm9, %ymm2
+	VPCMPEQ	%ymm5, %ymm9, %ymm2
+ 	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	%ymm3, %ymm9, %ymm3
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
+	.p2align 4
+L(last_vec_x0):
+	tzcntl	%eax, %eax
+	addq	$-(VEC_SIZE * 4), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+-	VPCMPEQ %ymm3, %ymm9, %ymm3
+-	VPCMPEQ %ymm4, %ymm9, %ymm4
+-	vpmovmskb %ymm3, %ecx
+-	vpmovmskb %ymm4, %eax
+-	salq	$32, %rax
+-	orq %rcx, %rax
+-	tzcntq  %rax, %rax
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+ # endif
+-	VZEROUPPER
+-	ret
+
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+ 
+ 	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+ L(cross_page_boundary):
+-	andq	$-VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-
+-	vmovdqa	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	movq	%rdi, %rdx
+	/* Align rdi to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bits.	 */
+-	sarxl	%ecx, %eax, %eax
+	/* Remove the leading bytes. sarxl only uses bits [4:0] of COUNT
+	   so no need to manually mod edx.  */
+	sarxl	%edx, %eax, %eax
+ 	testl	%eax, %eax
+-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rdi
+-	addq	%rdi, %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	xorl	%ecx, %ecx
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdx, %rax), %CHAR_REG
+	leaq	(%rdx, %rax), %rax
+	cmovne	%rcx, %rax
+# else
+	addq	%rdx, %rax
+ # endif
+-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ END (STRCHR)
+ # endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-43.patch
+++ b/glibc-RHEL-15696-43.patch
@ -0,0 +1,532 @@
+From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 23 Apr 2021 15:56:25 -0400
+Subject: [PATCH] x86: Optimize strchr-evex.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strchr-evex.S. The optimizations are
+mostly small things such as save an ALU in the alignment process,
+saving a few instructions in the loop return. The one significant
+change is saving 2 instructions in the 4x loop. test-strchr,
+test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
+ 1 file changed, 218 insertions(+), 174 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
+index ddc86a70..7f9d4ee4 100644
+--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
+@@ -32,13 +32,15 @@
+ #  define VPCMP		vpcmpd
+ #  define VPMINU	vpminud
+ #  define CHAR_REG	esi
+-#  define SHIFT_REG	r8d
+#  define SHIFT_REG	ecx
+#  define CHAR_SIZE	4
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMP		vpcmpb
+ #  define VPMINU	vpminub
+ #  define CHAR_REG	sil
+-#  define SHIFT_REG	ecx
+#  define SHIFT_REG	edx
+#  define CHAR_SIZE	1
+ # endif
+ 
+ # define XMMZERO	xmm16
+@@ -56,23 +58,20 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (STRCHR)
+-	movl	%edi, %ecx
+-# ifndef USE_AS_STRCHRNUL
+-	xorl	%edx, %edx
+-# endif
+-
+ 	/* Broadcast CHAR to YMM0.	*/
+-	VPBROADCAST %esi, %YMM0
+-
+	VPBROADCAST	%esi, %YMM0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+ 
+-	/* Check if we cross page boundary with one vector load.  */
+-	andl	$(PAGE_SIZE - 1), %ecx
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+-	ja  L(cross_page_boundary)
+	/* Check if we cross page boundary with one vector load.
+	   Otherwise it is safe to use an unaligned load.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ 	   null bytes.  */
+@@ -83,251 +82,296 @@ ENTRY (STRCHR)
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ 	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	ktestd	%k0, %k0
+-	jz	L(more_vecs)
+ 	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+ 	addq	%rdi, %rax
+ # endif
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(more_vecs):
+-	/* Align data for aligned loads in the loop.  */
+-	andq	$-VEC_SIZE, %rdi
+-L(aligned_more):
+-
+-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.	*/
+-	VMOVA	VEC_SIZE(%rdi), %YMM1
+-	addq	$VEC_SIZE, %rdi
+-
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	VMOVA	VEC_SIZE(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-
+-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	ktestd	%k0, %k0
+-	jz	L(prep_loop_4x)
+-
+-	kmovd	%k0, %eax
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x3):
+ 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+-# else
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+-# endif
+L(zero):
+	xorl	%eax, %eax
+ 	ret
+# endif
+ 
+ 	.p2align 4
+-L(first_vec_x0):
+L(first_vec_x4):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
+ # else
+-	addq	%rdi, %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	leaq	VEC_SIZE(%rdi, %rax), %rax
+-# endif
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+ L(first_vec_x2):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
+ # else
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+-L(prep_loop_4x):
+-	/* Align data to 4 * VEC_SIZE.	*/
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
+	   data is only aligned to VEC_SIZE. Use two alternating methods
+	   for checking VEC to balance latency and port contention.  */
+
+	/* This method has higher latency but has better port
+	   distribution.  */
+	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	/* This method has higher latency but has better port
+	   distribution.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a null byte in YMM1.  */
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	kortestd	%k0, %k1
+	jnz	L(first_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a null byte in YMM1.  */
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	kortestd	%k0, %k1
+	jnz	L(first_vec_x4)
+
+	/* Align data to VEC_SIZE * 4 for the loop.  */
+	addq	$VEC_SIZE, %rdi
+ 	andq	$-(VEC_SIZE * 4), %rdi
+ 
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
+	   encoding.  */
+ 	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+ 	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
+ 	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
+ 	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
+ 
+-	/* Leaves only CHARS matching esi as 0.  */
+	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	   zero.  */
+ 	vpxorq	%YMM1, %YMM0, %YMM5
+-	vpxorq	%YMM2, %YMM0, %YMM6
+	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
+	   k register. It's possible to save either 1 or 2 instructions
+	   using cmp not equals method for either YMM1 or YMM1 and YMM3
+	   respectively but bottleneck on p5 makes it not worth it.  */
+	VPCMP	$4, %YMM0, %YMM2, %k2
+ 	vpxorq	%YMM3, %YMM0, %YMM7
+-	vpxorq	%YMM4, %YMM0, %YMM8
+-
+-	VPMINU	%YMM5, %YMM1, %YMM5
+-	VPMINU	%YMM6, %YMM2, %YMM6
+-	VPMINU	%YMM7, %YMM3, %YMM7
+-	VPMINU	%YMM8, %YMM4, %YMM8
+-
+-	VPMINU	%YMM5, %YMM6, %YMM1
+-	VPMINU	%YMM7, %YMM8, %YMM2
+-
+-	VPMINU	%YMM1, %YMM2, %YMM1
+-
+-	/* Each bit in K0 represents a CHAR or a null byte.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-	ktestd	%k0, %k0
+	VPCMP	$4, %YMM0, %YMM4, %k4
+
+	/* Use min to select all zeros from either xor or end of string.
+	 */
+	VPMINU	%YMM1, %YMM5, %YMM1
+	VPMINU	%YMM3, %YMM7, %YMM3
+
+	/* Use min + zeromask to select for zeros. Since k2 and k4 will
+	   have 0 as positions that matched with CHAR which will set
+	   zero in the corresponding destination bytes in YMM2 / YMM4.
+	 */
+	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
+	VPMINU	%YMM3, %YMM4, %YMM4
+	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
+
+	VPCMP	$0, %YMMZERO, %YMM4, %k1
+	kmovd	%k1, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+ 	jz	L(loop_4x_vec)
+ 
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM5, %k0
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x1)
+ 
+-	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
+-	VPCMP	$0, %YMMZERO, %YMM6, %k1
+-	kmovd	%k1, %eax
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
+-	VPCMP	$0, %YMMZERO, %YMM7, %k2
+-	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
+-	VPCMP	$0, %YMMZERO, %YMM8, %k3
+	jnz	L(last_vec_x2)
+ 
+	VPCMP	$0, %YMMZERO, %YMM3, %k0
+	kmovd	%k0, %eax
+	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Each bit in K2/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k1
+	sall	$8, %ecx
+	orl	%ecx, %eax
+	tzcntl	%eax, %eax
+ # else
+-	kshiftlq $32, %k3, %k1
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+ # endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was CHAR or null.  */
+	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	ret
+# endif
+ 
+-	tzcntq  %rax, %rax
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+-# else
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was null.  */
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(last_vec_x2):
+	tzcntl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Check if match was null.  */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+ L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align rdi.  */
+ 	andq	$-VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-
+ 	VMOVA	(%rdi), %YMM1
+-
+ 	/* Leaves only CHARS matching esi as 0.  */
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ 	VPCMP	$0, %YMMZERO, %YMM2, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-
+	/* Remove the leading bits.	 */
+ # ifdef USE_AS_WCSCHR
+	movl	%edx, %SHIFT_REG
+ 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+ 	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl    $2, %SHIFT_REG
+	sarl	$2, %SHIFT_REG
+	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
+ # endif
+-
+-	/* Remove the leading bits.	 */
+ 	sarxl	%SHIFT_REG, %eax, %eax
+	/* If eax is zero continue.  */
+ 	testl	%eax, %eax
+-
+-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if match was CHAR or null.  */
+	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of
+	   bytes.  */
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+ # else
+-	addq	%rdi, %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	addq	%rdx, %rax
+ # endif
+ 	ret
+ 
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-44.patch
+++ b/glibc-RHEL-15696-44.patch
@ -0,0 +1,536 @@
+From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 4 May 2021 19:02:40 -0400
+Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+This commit adds a new implementation for EVEX memchr that is not safe
+for RTM because it uses vzeroupper. The benefit is that by using
+ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
+faster than the RTM safe version which cannot use vpcmpeq because
+there is no EVEX encoding for the instruction. All parts of the
+implementation aside from the 4x loop are the same for the two
+versions and the optimization is only relevant for large sizes.
+
+Tigerlake:
+size  , algn  , Pos   , Cur T , New T , Win     , Dif
+512   , 6     , 192   , 9.2   , 9.04  , no-RTM  , 0.16
+512   , 7     , 224   , 9.19  , 8.98  , no-RTM  , 0.21
+2048  , 0     , 256   , 10.74 , 10.54 , no-RTM  , 0.2
+2048  , 0     , 512   , 14.81 , 14.87 , RTM     , 0.06
+2048  , 0     , 1024  , 22.97 , 22.57 , no-RTM  , 0.4
+2048  , 0     , 2048  , 37.49 , 34.51 , no-RTM  , 2.98   <--
+
+Icelake:
+size  , algn  , Pos   , Cur T , New T , Win     , Dif
+512   , 6     , 192   , 7.6   , 7.3   , no-RTM  , 0.3
+512   , 7     , 224   , 7.63  , 7.27  , no-RTM  , 0.36
+2048  , 0     , 256   , 8.48  , 8.38  , no-RTM  , 0.1
+2048  , 0     , 512   , 11.57 , 11.42 , no-RTM  , 0.15
+2048  , 0     , 1024  , 17.92 , 17.38 , no-RTM  , 0.54
+2048  , 0     , 2048  , 30.37 , 27.34 , no-RTM  , 3.03   <--
+
+test-memchr, test-wmemchr, and test-rawmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile             |   7 +-
+ sysdeps/x86_64/multiarch/ifunc-evex.h         |  55 ++++++
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  15 ++
+ sysdeps/x86_64/multiarch/memchr-evex-rtm.S    |   8 +
+ sysdeps/x86_64/multiarch/memchr-evex.S        | 161 ++++++++++++++----
+ sysdeps/x86_64/multiarch/memchr.c             |   2 +-
+ sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   3 +
+ sysdeps/x86_64/multiarch/rawmemchr.c          |   2 +-
+ sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/wmemchr.c            |   2 +-
+ 10 files changed, 217 insertions(+), 41 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
+ create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 65fde4eb..26be4095 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   strncmp-evex \
+ 		   strncpy-evex \
+ 		   strnlen-evex \
+-		   strrchr-evex
+		   strrchr-evex \
+		   memchr-evex-rtm \
+		   rawmemchr-evex-rtm
+ CFLAGS-varshift.c += -msse4
+ CFLAGS-strcspn-c.c += -msse4
+ CFLAGS-strpbrk-c.c += -msse4
+@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcsnlen-evex \
+ 		   wcsrchr-evex \
+ 		   wmemchr-evex \
+-		   wmemcmp-evex-movbe
+		   wmemcmp-evex-movbe \
+		   wmemchr-evex-rtm
+ endif
+ 
+ ifeq ($(subdir),debug)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
+new file mode 100644
+index 00000000..fc391edb
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
+@@ -0,0 +1,55 @@
+/* Common definition for ifunc selection optimized with EVEX.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
+
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	    return OPTIMIZE (evex_rtm);
+
+	  return OPTIMIZE (evex);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
+
+  return OPTIMIZE (sse2);
+}
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index d59d65f8..ac097e8d 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memchr_evex)
+	      IFUNC_IMPL_ADD (array, i, memchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __memchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
+@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __rawmemchr_evex)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __rawmemchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemchr_evex)
+	      IFUNC_IMPL_ADD (array, i, wmemchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wmemchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+new file mode 100644
+index 00000000..19871882
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+@@ -0,0 +1,8 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_evex_rtm
+#endif
+
+#define USE_IN_RTM 1
+#define SECTION(p) p##.evex.rtm
+
+#include "memchr-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index f3fdad4f..4d0ed6d1 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -38,10 +38,32 @@
+ #  define CHAR_SIZE	1
+ # endif
+ 
+	/* In the 4x loop the RTM and non-RTM versions have data pointer
+	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
+	   This is represented by BASE_OFFSET. As well because the RTM
+	   version uses vpcmp which stores a bit per element compared where
+	   the non-RTM version uses vpcmpeq which stores a bit per byte
+	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
+	   version.  */
+# ifdef USE_IN_RTM
+#  define VZEROUPPER
+#  define BASE_OFFSET	(VEC_SIZE * 4)
+#  define RET_SCALE	CHAR_SIZE
+# else
+#  define VZEROUPPER	vzeroupper
+#  define BASE_OFFSET	0
+#  define RET_SCALE	1
+# endif
+
+	/* In the return from 4x loop memchr and rawmemchr versions have
+	   data pointers off by VEC_SIZE * 4 with memchr version being
+	   VEC_SIZE * 4 greater.  */
+ # ifdef USE_AS_RAWMEMCHR
+#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
+ #  define RAW_PTR_REG	rcx
+ #  define ALGN_PTR_REG	rdi
+ # else
+#  define RET_OFFSET	BASE_OFFSET
+ #  define RAW_PTR_REG	rdi
+ #  define ALGN_PTR_REG	rcx
+ # endif
+@@ -57,11 +79,15 @@
+ # define YMM5		ymm21
+ # define YMM6		ymm22
+ 
+# ifndef SECTION
+#  define SECTION(p)	p##.evex
+# endif
+
+ # define VEC_SIZE 32
+ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ # define PAGE_SIZE 4096
+ 
+-	.section .text.evex,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+@@ -237,14 +263,15 @@ L(cross_page_continue):
+ 	/* Check if at last CHAR_PER_VEC * 4 length.  */
+ 	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(last_4x_vec_or_less_cmpeq)
+-	addq	$VEC_SIZE, %rdi
+	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
+	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
+ 
+ 	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ 	 */
+ #  ifdef USE_AS_WMEMCHR
+ 	movl	%edi, %ecx
+ 	andq	$-(4 * VEC_SIZE), %rdi
+-	andl	$(VEC_SIZE * 4 - 1), %ecx
+	subl	%edi, %ecx
+ 	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+ 	sarl	$2, %ecx
+ 	addq	%rcx, %rdx
+@@ -254,15 +281,28 @@ L(cross_page_continue):
+ 	subq	%rdi, %rdx
+ #  endif
+ # else
+-	addq	$VEC_SIZE, %rdi
+	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
+ 	andq	$-(4 * VEC_SIZE), %rdi
+ # endif
+-
+# ifdef USE_IN_RTM
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+# else
+	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
+	   encodable with EVEX registers (ymm16-ymm31).  */
+	vmovdqa64 %YMMMATCH, %ymm0
+# endif
+ 
+ 	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+	/* Two versions of the loop. One that does not require
+	   vzeroupper by not using ymm0-ymm15 and another does that require
+	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
+	   is used at all is because there is no EVEX encoding vpcmpeq and
+	   with vpcmpeq this loop can be performed more efficiently. The
+	   non-vzeroupper version is safe for RTM while the vzeroupper
+	   version should be prefered if RTM are not supported.  */
+# ifdef USE_IN_RTM
+ 	/* It would be possible to save some instructions using 4x VPCMP
+ 	   but bottleneck on port 5 makes it not woth it.  */
+ 	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+@@ -273,12 +313,55 @@ L(loop_4x_vec):
+ 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+ 	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
+ 	VPCMP	$0, %YMM3, %YMMZERO, %k2
+# else
+	/* Since vptern can only take 3x vectors fastest to do 1 vec
+	   seperately with EVEX vpcmp.  */
+#  ifdef USE_AS_WMEMCHR
+	/* vptern can only accept masks for epi32/epi64 so can only save
+	   instruction using not equals mask on vptern with wmemchr.  */
+	VPCMP	$4, (%rdi), %YMMMATCH, %k1
+#  else
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+#  endif
+	/* Compare 3x with vpcmpeq and or them all together with vptern.
+	 */
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+#  ifdef USE_AS_WMEMCHR
+	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
+	   combines result from VEC0 with zero mask.  */
+	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
+	vpmovmskb %ymm4, %ecx
+#  else
+	/* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
+	vpternlogd $254, %ymm2, %ymm3, %ymm4
+	vpmovmskb %ymm4, %ecx
+	kmovd	%k1, %eax
+#  endif
+# endif
+
+ # ifdef USE_AS_RAWMEMCHR
+ 	subq	$-(VEC_SIZE * 4), %rdi
+# endif
+# ifdef USE_IN_RTM
+ 	kortestd %k2, %k3
+# else
+#  ifdef USE_AS_WMEMCHR
+	/* ecx contains not of matches. All 1s means no matches. incl will
+	   overflow and set zeroflag if that is the case.  */
+	incl	%ecx
+#  else
+	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
+	   to ecx is not an issue because if eax is non-zero it will be
+	   used for returning the match. If it is zero the add does
+	   nothing.  */
+	addq	%rax, %rcx
+#  endif
+# endif
+# ifdef USE_AS_RAWMEMCHR
+ 	jz	L(loop_4x_vec)
+ # else
+-	kortestd %k2, %k3
+ 	jnz	L(loop_4x_vec_end)
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+@@ -288,10 +371,11 @@ L(loop_4x_vec):
+ 
+ 	/* Fall through into less than 4 remaining vectors of length case.
+ 	 */
+-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
+	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
+ 	kmovd	%k0, %eax
+-	addq	$(VEC_SIZE * 3), %rdi
+-	.p2align 4
+	VZEROUPPER
+
+ L(last_4x_vec_or_less):
+ 	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
+ 	/* rawmemchr will fall through into this if match was found in
+ 	   loop.  */
+ 
+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
+ 	/* k1 has not of matches with VEC1.  */
+ 	kmovd	%k1, %eax
+-# ifdef USE_AS_WMEMCHR
+#  ifdef USE_AS_WMEMCHR
+ 	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+-# else
+#  else
+ 	incl	%eax
+#  endif
+# else
+	/* eax already has matches for VEC1.  */
+	testl	%eax, %eax
+ # endif
+ 	jnz	L(last_vec_x1_return)
+ 
+# ifdef USE_IN_RTM
+ 	VPCMP	$0, %YMM2, %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+# else
+	vpmovmskb %ymm2, %eax
+# endif
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x2_return)
+ 
+# ifdef USE_IN_RTM
+ 	kmovd	%k2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x3_return)
+ 
+ 	kmovd	%k3, %eax
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+	vpmovmskb %ymm3, %eax
+	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
+	salq	$VEC_SIZE, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
+	VZEROUPPER
+ # endif
+ 	ret
+ 
+ 	.p2align 4
+ L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-#  ifdef USE_AS_WMEMCHR
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
+ 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+-#  else
+-	addq	%rdi, %rax
+-#  endif
+	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	addq	%rdi, %rax
+ # endif
+	VZEROUPPER
+ 	ret
+ 
+ 	.p2align 4
+ L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+-# else
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
+-# endif
+	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
+	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
+	   USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
+	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
+	VZEROUPPER
+ 	ret
+ 
+# ifdef USE_IN_RTM
+ 	.p2align 4
+ L(last_vec_x3_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+-# else
+ 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
+-# endif
+	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+-
+# endif
+ 
+ # ifndef USE_AS_RAWMEMCHR
+ L(last_4x_vec_or_less_cmpeq):
+diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
+index 016f5784..f28aea77 100644
+--- a/sysdeps/x86_64/multiarch/memchr.c
+++ b/sysdeps/x86_64/multiarch/memchr.c
+@@ -24,7 +24,7 @@
+ # undef memchr
+ 
+ # define SYMBOL_NAME memchr
+-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
+ strong_alias (memchr, __memchr)
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+new file mode 100644
+index 00000000..deda1ca3
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+@@ -0,0 +1,3 @@
+#define MEMCHR __rawmemchr_evex_rtm
+#define USE_AS_RAWMEMCHR 1
+#include "memchr-evex-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
+index 8a0bc313..1f764f35 100644
+--- a/sysdeps/x86_64/multiarch/rawmemchr.c
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
+@@ -26,7 +26,7 @@
+ # undef __rawmemchr
+ 
+ # define SYMBOL_NAME rawmemchr
+-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
+ 		       IFUNC_SELECTOR ());
+diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+new file mode 100644
+index 00000000..a346cd35
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+@@ -0,0 +1,3 @@
+#define MEMCHR __wmemchr_evex_rtm
+#define USE_AS_WMEMCHR 1
+#include "memchr-evex-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
+index 6d833702..f9c91915 100644
+--- a/sysdeps/x86_64/multiarch/wmemchr.c
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
+@@ -26,7 +26,7 @@
+ # undef __wmemchr
+ 
+ # define SYMBOL_NAME wmemchr
+-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
+ weak_alias (__wmemchr, wmemchr)
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-45.patch
+++ b/glibc-RHEL-15696-45.patch
@ -0,0 +1,873 @@
+From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 17 May 2021 13:56:52 -0400
+Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memcmp-avx2-movbe.S. The optimizations include
+adding a new vec compare path for small sizes, reorganizing the entry
+control flow, and removing some unnecessary ALU instructions from the
+main loop. test-memcmp and test-wmemcmp are both passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c   |   6 +
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h      |   1 +
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++--------
+ 3 files changed, 402 insertions(+), 281 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index ac097e8d..8be0d78a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, memcmp,
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_avx2_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __memcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
+@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, wmemcmp,
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_avx2_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wmemcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 8043c635..690dffe8 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 9d5c9c72..16fc673e 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -19,17 +19,23 @@
+ #if IS_IN (libc)
+ 
+ /* memcmp/wmemcmp is implemented as:
+-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+-      to avoid branches.
+-   2. Use overlapping compare to avoid branch.
+-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+-      bytes for wmemcmp.
+-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+   1. Use ymm vector compares when possible. The only case where
+      vector compares is not possible for when size < VEC_SIZE
+      and loading from either s1 or s2 would cause a page cross.
+   2. For size from 2 to 7 bytes on page cross, load as big endian
+      with movbe and bswap to avoid branches.
+   3. Use xmm vector compare when size >= 4 bytes for memcmp or
+      size >= 8 bytes for wmemcmp.
+   4. Optimistically compare up to first 4 * VEC_SIZE one at a
+      to check for early mismatches. Only do this if its guranteed the
+      work is not wasted.
+   5. If size is 8 * VEC_SIZE or less, unroll the loop.
+   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
+       area.
+-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+ 
+ # include <sysdep.h>
+ 
+@@ -38,8 +44,10 @@
+ # endif
+ 
+ # ifdef USE_AS_WMEMCMP
+#  define CHAR_SIZE	4
+ #  define VPCMPEQ	vpcmpeqd
+ # else
+#  define CHAR_SIZE	1
+ #  define VPCMPEQ	vpcmpeqb
+ # endif
+ 
+@@ -52,7 +60,7 @@
+ # endif
+ 
+ # define VEC_SIZE 32
+-# define VEC_MASK ((1 << VEC_SIZE) - 1)
+# define PAGE_SIZE	4096
+ 
+ /* Warning!
+            wmemcmp has to use SIGNED comparison for elements.
+@@ -71,136 +79,359 @@ ENTRY (MEMCMP)
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	(%rdi), %ymm1, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* NB: eax must be destination register if going to
+	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   must be ecx.  */
+	incl	%eax
+	jnz	L(return_vec_0)
+ 
+ 	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_vec)
+-
+-	VPCMPEQ	%ymm0, %ymm0, %ymm0
+-	/* More than 2 * VEC.  */
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	ja	L(more_8x_vec)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
+-
+-	/* From 4 * VEC to 8 * VEC, inclusively. */
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
+	jbe	L(last_1x_vec)
+ 
+	/* Check second VEC no matter what.  */
+ 	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	/* If all 4 VEC where equal eax will be all 1s so incl will
+	   overflow and set zero flag.  */
+	incl	%eax
+	jnz	L(return_vec_1)
+ 
+-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	/* Less than 4 * VEC.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_2x_vec)
+ 
+	/* Check third and fourth VEC no matter what.  */
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	vpmovmskb %ymm3, %eax
+	incl	%eax
+	jnz	L(return_vec_2)
+ 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	vpmovmskb %ymm4, %ecx
+	incl	%ecx
+	jnz	L(return_vec_3)
+ 
+-	vpand	%ymm1, %ymm2, %ymm5
+-	vpand	%ymm3, %ymm4, %ymm6
+-	vpand	%ymm5, %ymm6, %ymm5
+	/* Go to 4x VEC loop.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+ 
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
+	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+	   branches.  */
+ 
+	/* Load first two VEC from s2 before adjusting addresses.  */
+	vmovdqu	-(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
+	vmovdqu	-(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
+ 	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ 	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
+ 
+-	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+-	vpand	%ymm2, %ymm1, %ymm5
+	/* Wait to load from s1 until addressed adjust due to
+	   unlamination of microfusion with complex address mode.  */
+	VPCMPEQ	(%rdi), %ymm1, %ymm1
+	VPCMPEQ	(VEC_SIZE)(%rdi), %ymm2, %ymm2
+ 
+ 	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+-	vpand	%ymm3, %ymm5, %ymm5
+-
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+-	vpand	%ymm4, %ymm5, %ymm5
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ 
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
+-	xorl	%eax, %eax
+	/* Reduce VEC0 - VEC4.  */
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm7
+	vpmovmskb %ymm7, %ecx
+	incl	%ecx
+	jnz	L(return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+L(return_vec_1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	VEC_SIZE(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	VEC_SIZE(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(return_vec_2):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
+	/* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
+	.p2align 5
+L(8x_return_vec_0_1_2_3):
+	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	vpmovmskb %ymm1, %eax
+	incl	%eax
+	jnz	L(return_vec_0)
+ 
+-L(last_vec):
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+ 	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+	incl	%eax
+	jnz	L(return_vec_1)
+
+	vpmovmskb %ymm3, %eax
+	incl	%eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(more_8x_vec):
+	/* Set end of s1 in rdx.  */
+	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
+	/* rsi stores s2 - s1. This allows loop to only update one
+	   pointer.  */
+	subq	%rdi, %rsi
+	/* Align s1 pointer.  */
+	andq	$-VEC_SIZE, %rdi
+	/* Adjust because first 4x vec where check already.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(loop_4x_vec):
+	/* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
+	 */
+	vmovdqu	(%rsi, %rdi), %ymm1
+	VPCMPEQ	(%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm7
+	vpmovmskb %ymm7, %ecx
+	incl	%ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check if s1 pointer at end.  */
+	cmpq	%rdx, %rdi
+	jb	L(loop_4x_vec)
+
+	subq	%rdx, %rdi
+	/* rdi has 4 * VEC_SIZE - remaining length.  */
+	cmpl	$(VEC_SIZE * 3), %edi
+	jae	L(8x_last_1x_vec)
+	/* Load regardless of branch.  */
+	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
+	cmpl	$(VEC_SIZE * 2), %edi
+	jae	L(8x_last_2x_vec)
+
+	/* Check last 4 VEC.  */
+	vmovdqu	(%rsi, %rdx), %ymm1
+	VPCMPEQ	(%rdx), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm2
+	VPCMPEQ	VEC_SIZE(%rdx), %ymm2, %ymm2
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
+
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm7
+	vpmovmskb %ymm7, %ecx
+	/* Restore s1 pointer to rdi.  */
+	movq	%rdx, %rdi
+	incl	%ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	VZEROUPPER_RETURN
+
+	/* Only entry is from L(more_8x_vec).  */
+	.p2align 4
+L(8x_last_2x_vec):
+	/* Check second to last VEC. rdx store end pointer of s1 and
+	   ymm3 has already been loaded with second to last VEC from s2.
+	 */
+	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
+	vpmovmskb %ymm3, %eax
+	incl	%eax
+	jnz	L(8x_return_vec_2)
+	/* Check last VEC.  */
+	.p2align 4
+L(8x_last_1x_vec):
+	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
+	vpmovmskb %ymm4, %eax
+	incl	%eax
+	jnz	L(8x_return_vec_3)
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec):
+-	/* A byte or int32 is different within 16 or 32 bytes.  */
+-	tzcntl	%eax, %ecx
+L(last_2x_vec):
+	/* Check second to last VEC.  */
+	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
+	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
+	vpmovmskb %ymm1, %eax
+	incl	%eax
+	jnz	L(return_vec_1_end)
+	/* Check last VEC.  */
+L(last_1x_vec):
+	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
+	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
+	vpmovmskb %ymm1, %eax
+	incl	%eax
+	jnz	L(return_vec_0_end)
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	tzcntl	%eax, %eax
+	addq	%rdx, %rax
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rcx), %edx
+-	cmpl	(%rsi, %rcx), %edx
+-L(wmemcmp_return):
+-	setl	%al
+-	negl	%eax
+-	orl	$1, %eax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	(%rdi, %rcx), %eax
+-	movzbl	(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
+-# ifdef USE_AS_WMEMCMP
+ 	.p2align 4
+-L(4):
+-	xorl	%eax, %eax
+-	movl	(%rdi), %edx
+-	cmpl	(%rsi), %edx
+-	jne	L(wmemcmp_return)
+-	ret
+L(return_vec_1_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+ # else
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.  */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	je	L(exit)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-	ret
+L(return_vec_0_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-VEC_SIZE(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	-VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(exit):
+-	ret
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size = 0 but
+	   is also faster for size = CHAR_SIZE.  */
+	cmpl	$CHAR_SIZE, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ	(%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	incl	%eax
+	/* Result will be zero if s1 and s2 match. Otherwise first set
+	   bit will be first mismatch.  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0)
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(between_2_3):
+L(page_cross_less_vec):
+	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+	   bytes.  */
+	cmpl	$16, %edx
+	jae	L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+	cmpl	$8, %edx
+	jae	L(between_8_15)
+	cmpl	$4, %edx
+	jae	L(between_4_7)
+
+ 	/* Load as big endian to avoid branches.  */
+ 	movzwl	(%rdi), %eax
+ 	movzwl	(%rsi), %ecx
+@@ -208,223 +439,106 @@ L(between_2_3):
+ 	shll	$8, %ecx
+ 	bswap	%eax
+ 	bswap	%ecx
+-	movb	-1(%rdi, %rdx), %al
+-	movb	-1(%rsi, %rdx), %cl
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+ 	/* Subtraction is okay because the upper 8 bits are zero.  */
+ 	subl	%ecx, %eax
+	/* No ymm register was touched.  */
+ 	ret
+ 
+ 	.p2align 4
+-L(1):
+-	movzbl	(%rdi), %eax
+L(one_or_less):
+	jb	L(zero)
+ 	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+ 	subl	%ecx, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+	/* No ymm register was touched.  */
+ 	ret
+ 
+ 	.p2align 4
+-L(less_vec):
+-# ifdef USE_AS_WMEMCMP
+-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+-	cmpb	$4, %dl
+-	je	L(4)
+-	jb	L(zero)
+-# else
+-	cmpb	$1, %dl
+-	je	L(1)
+-	jb	L(zero)
+-	cmpb	$4, %dl
+-	jb	L(between_2_3)
+-	cmpb	$8, %dl
+-	jb	L(between_4_7)
+L(between_8_15):
+ # endif
+-	cmpb	$16, %dl
+-	jae	L(between_16_31)
+-	/* It is between 8 and 15 bytes.  */
+	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+-	VPCMPEQ %xmm1, %xmm2, %xmm2
+	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
+ 	/* Use overlapping loads to avoid branches.  */
+ 	leaq	-8(%rdi, %rdx), %rdi
+ 	leaq	-8(%rsi, %rdx), %rsi
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+-	VPCMPEQ %xmm1, %xmm2, %xmm2
+	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
+	/* No ymm register was touched.  */
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 	vmovdqu	(%rsi), %xmm2
+-	VPCMPEQ (%rdi), %xmm2, %xmm2
+	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
+ 
+ 	/* Use overlapping loads to avoid branches.  */
+
+	vmovdqu	-16(%rsi, %rdx), %xmm2
+ 	leaq	-16(%rdi, %rdx), %rdi
+ 	leaq	-16(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %xmm2
+-	VPCMPEQ (%rdi), %xmm2, %xmm2
+	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
+	/* No ymm register was touched.  */
+ 	ret
+ 
+-	.p2align 4
+-L(more_8x_vec):
+-	/* More than 8 * VEC.  Check the first VEC.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Align the first memory area for aligned loads in the loop.
+-	   Compute how much the first memory area is misaligned.  */
+-	movq	%rdi, %rcx
+-	andl	$(VEC_SIZE - 1), %ecx
+-	/* Get the negative of offset for alignment.  */
+-	subq	$VEC_SIZE, %rcx
+-	/* Adjust the second memory area.  */
+-	subq	%rcx, %rsi
+-	/* Adjust the first memory area which should be aligned now.  */
+-	subq	%rcx, %rdi
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
+-
+-	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+-	vpand	%ymm2, %ymm1, %ymm5
+-
+-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+-	vpand	%ymm3, %ymm5, %ymm5
+-
+-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+-	vpand	%ymm4, %ymm5, %ymm5
+-
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-	addq	$(VEC_SIZE * 4), %rsi
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jae	L(loop_4x_vec)
+-
+-	/* Less than 4 * VEC.  */
+-	cmpq	$VEC_SIZE, %rdx
+-	jbe	L(last_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_2x_vec)
+-
+-L(last_4x_vec):
+-	/* From 2 * VEC to 4 * VEC. */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	VZEROUPPER_RETURN
+-
+-	.p2align 4
+-L(4x_vec_end):
+-	vpmovmskb %ymm1, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	vpmovmskb %ymm2, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x2)
+-	vpmovmskb %ymm4, %eax
+-	subl	$VEC_MASK, %eax
+-	tzcntl	%eax, %ecx
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+-	VZEROUPPER_RETURN
+-
+ 	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rcx), %edx
+-	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	/* No ymm register was touched.  */
+	ret
+ # else
+-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+-	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	jz	L(zero_4_7)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+L(zero_4_7):
+	/* No ymm register was touched.  */
+	ret
+ # endif
+-	VZEROUPPER_RETURN
+
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-46.patch
+++ b/glibc-RHEL-15696-46.patch
@ -0,0 +1,851 @@
+From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 17 May 2021 13:57:24 -0400
+Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memcmp-evex-movbe.S. The optimizations
+include adding a new vec compare path for small sizes, reorganizing the
+entry control flow, removing some unnecessary ALU instructions from the
+main loop, and most importantly replacing the heavy use of vpcmp + kand
+logic with vpxor + vpternlog. test-memcmp and test-wmemcmp are both
+passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
+ 1 file changed, 408 insertions(+), 302 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 9c093972..654dc7ac 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -19,17 +19,22 @@
+ #if IS_IN (libc)
+ 
+ /* memcmp/wmemcmp is implemented as:
+-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+-      to avoid branches.
+-   2. Use overlapping compare to avoid branch.
+-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+-      bytes for wmemcmp.
+-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+   1. Use ymm vector compares when possible. The only case where
+      vector compares are not possible is when size < CHAR_PER_VEC
+      and loading from either s1 or s2 would cause a page cross.
+   2. For size from 2 to 7 bytes on page cross, load as big endian
+      with movbe and bswap to avoid branches.
+   3. Use xmm vector compare when size >= 4 bytes for memcmp or
+      size >= 8 bytes for wmemcmp.
+   4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
+      time to check for early mismatches. Only do this if it's
+      guaranteed the work is not wasted.
+   5. If size is 8 * VEC_SIZE or less, unroll the loop.
+   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
+       area.
+-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+ 
+ # include <sysdep.h>
+ 
+@@ -40,11 +45,21 @@
+ # define VMOVU		vmovdqu64
+ 
+ # ifdef USE_AS_WMEMCMP
+-#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+#  define VPCMP	vpcmpd
+ # else
+-#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+#  define VPCMP	vpcmpub
+ # endif
+ 
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define XMM2		xmm18
+# define YMM0		ymm16
+ # define XMM1		xmm17
+ # define XMM2		xmm18
+ # define YMM1		ymm17
+@@ -54,15 +69,6 @@
+ # define YMM5		ymm21
+ # define YMM6		ymm22
+ 
+-# define VEC_SIZE 32
+-# ifdef USE_AS_WMEMCMP
+-#  define VEC_MASK 0xff
+-#  define XMM_MASK 0xf
+-# else
+-#  define VEC_MASK 0xffffffff
+-#  define XMM_MASK 0xffff
+-# endif
+-
+ /* Warning!
+            wmemcmp has to use SIGNED comparison for elements.
+            memcmp has to use UNSIGNED comparison for elemnts.
+@@ -70,145 +76,370 @@
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (MEMCMP)
+-# ifdef USE_AS_WMEMCMP
+-	shl	$2, %RDX_LP
+-# elif defined __ILP32__
+# ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+ # endif
+-	cmp	$VEC_SIZE, %RDX_LP
+	cmp	$CHAR_PER_VEC, %RDX_LP
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k1
+	VMOVU	(%rsi), %YMM1
+	/* Use compare not equals to directly check for mismatch.  */
+	VPCMP	$4, (%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_vec)
+-
+-	/* More than 2 * VEC.  */
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	ja	L(more_8x_vec)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
+	/* NB: eax must be the destination register if going to
+	   L(return_vec_[0,2]). For L(return_vec_3) the destination
+	   register must be ecx.  */
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 
+-	/* From 4 * VEC to 8 * VEC, inclusively. */
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(last_1x_vec)
+ 
+	/* Check second VEC no matter what.  */
+ 	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_2x_vec)
+ 
+	/* Check third and fourth VEC no matter what.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_3)
+ 
+-	kandd	%k1, %k2, %k5
+-	kandd	%k3, %k4, %k6
+-	kandd	%k5, %k6, %k6
+	/* Zero YMM0. The 4x VEC reduction is done with vpxor + vpternlog,
+	   so a compare against zero is needed to get a mask.  */
+	vpxorq	%XMM0, %XMM0, %XMM0
+ 
+-	kmovd	%k6, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
+	/* Go to 4x VEC loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	ja	L(more_8x_vec)
+ 
+-	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
+	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+	   branches.  */
+ 
+-	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+-	kandd	%k1, %k2, %k5
+	/* Load first two VEC from s2 before adjusting addresses.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+
+	/* Wait to load from s1 until the addresses are adjusted, due to
+	   unlamination of microfusion with complex address mode.  */
+
+	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
+	   will have some 1s.  */
+	vpxorq	(%rdi), %YMM1, %YMM1
+	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+-	kandd	%k3, %k5, %k5
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+-	kandd	%k4, %k5, %k5
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+	   oring with YMM3. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
+ 
+-	kmovd	%k5, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
+-	xorl	%eax, %eax
+	/* NB: aligning 32 here allows for the rest of the jump targets
+	   to be tuned for 32 byte alignment. Most important this ensures
+	   the L(more_8x_vec) loop is 32 byte aligned.  */
+	.p2align 5
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size = 0 but
+	   is also faster for size = CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Keep only mismatch bits below the length in edx (result in eax).  */
+	bzhil	%edx, %eax, %eax
+	jnz	L(return_vec_0)
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+ 
+-L(last_vec):
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
+	   which is good enough for a target not in a loop.  */
+L(return_vec_1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec):
+-	/* A byte or int32 is different within 16 or 32 bytes.  */
+-	tzcntl	%eax, %ecx
+	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
+	   which is good enough for a target not in a loop.  */
+L(return_vec_2):
+	tzcntl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rcx, 4), %edx
+-	cmpl	(%rsi, %rcx, 4), %edx
+-L(wmemcmp_return):
+-	setl	%al
+-	negl	%eax
+-	orl	$1, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	(%rdi, %rcx), %eax
+-	movzbl	(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
+	.p2align 4
+L(8x_return_vec_0_1_2_3):
+	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPCMP	$4, %YMM1, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+
+	VPCMP	$4, %YMM2, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	VPCMP	$4, %YMM3, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+	ret
+
+ 	.p2align 4
+-L(4):
+-	xorl	%eax, %eax
+-	movl	(%rdi), %edx
+-	cmpl	(%rsi), %edx
+-	jne	L(wmemcmp_return)
+L(more_8x_vec):
+	/* Set end of s1 in rdx.  */
+	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
+	/* rsi stores s2 - s1. This allows loop to only update one
+	   pointer.  */
+	subq	%rdi, %rsi
+	/* Align s1 pointer.  */
+	andq	$-VEC_SIZE, %rdi
+	/* Adjust because the first 4x VEC were checked already.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(loop_4x_vec):
+	VMOVU	(%rsi, %rdi), %YMM1
+	vpxorq	(%rdi), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
+	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
+
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdx, %rdi
+	jb	L(loop_4x_vec)
+
+	subq	%rdx, %rdi
+	/* rdi has 4 * VEC_SIZE - remaining length.  */
+	cmpl	$(VEC_SIZE * 3), %edi
+	jae	L(8x_last_1x_vec)
+	/* Load regardless of branch.  */
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
+	cmpl	$(VEC_SIZE * 2), %edi
+	jae	L(8x_last_2x_vec)
+
+	VMOVU	(%rsi, %rdx), %YMM1
+	vpxorq	(%rdx), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	/* Restore s1 pointer to rdi.  */
+	movq	%rdx, %rdi
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
+
+	/* Only entry is from L(more_8x_vec).  */
+	.p2align 4
+L(8x_last_2x_vec):
+	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_2)
+	/* Naturally aligned to 16 bytes.  */
+L(8x_last_1x_vec):
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_3)
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	/* Check second to last VEC.  */
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1_end)
+
+	/* Check last VEC.  */
+	.p2align 4
+L(last_1x_vec):
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0_end)
+ 	ret
+
+	.p2align 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+ # else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.  */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	je	L(exit)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+L(return_vec_0_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	-VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+ 	ret
+ 
+ 	.p2align 4
+-L(exit):
+L(return_vec_1_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+ 	ret
+ 
+
+ 	.p2align 4
+L(page_cross_less_vec):
+	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+	   bytes.  */
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+	cmpl	$8, %edx
+	jae	L(between_8_15)
+	cmpl	$4, %edx
+	jae	L(between_4_7)
+ L(between_2_3):
+ 	/* Load as big endian to avoid branches.  */
+ 	movzwl	(%rdi), %eax
+@@ -217,224 +448,99 @@ L(between_2_3):
+ 	shll	$8, %ecx
+ 	bswap	%eax
+ 	bswap	%ecx
+-	movb	-1(%rdi, %rdx), %al
+-	movb	-1(%rsi, %rdx), %cl
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+ 	/* Subtraction is okay because the upper 8 bits are zero.  */
+ 	subl	%ecx, %eax
+ 	ret
+-
+ 	.p2align 4
+-L(1):
+-	movzbl	(%rdi), %eax
+L(one_or_less):
+	jb	L(zero)
+ 	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+ 	subl	%ecx, %eax
+ 	ret
+-# endif
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+ 
+ 	.p2align 4
+-L(less_vec):
+-# ifdef USE_AS_WMEMCMP
+-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+-	cmpb	$4, %dl
+-	je	L(4)
+-	jb	L(zero)
+-# else
+-	cmpb	$1, %dl
+-	je	L(1)
+-	jb	L(zero)
+-	cmpb	$4, %dl
+-	jb	L(between_2_3)
+-	cmpb	$8, %dl
+-	jb	L(between_4_7)
+L(between_8_15):
+ # endif
+-	cmpb	$16, %dl
+-	jae	L(between_16_31)
+-	/* It is between 8 and 15 bytes.  */
+	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %XMM1
+ 	vmovq	(%rsi), %XMM2
+-	VPCMPEQ %XMM1, %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
+	VPCMP	$4, %XMM1, %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-8(%rdi, %rdx), %rdi
+-	leaq	-8(%rsi, %rdx), %rsi
+	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
+ 	vmovq	(%rdi), %XMM1
+ 	vmovq	(%rsi), %XMM2
+-	VPCMPEQ %XMM1, %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
+	VPCMP	$4, %XMM1, %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 	ret
+ 
+ 	.p2align 4
+-L(between_16_31):
+-	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-	VMOVU	(%rsi), %XMM2
+-	VPCMPEQ (%rdi), %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-16(%rdi, %rdx), %rdi
+-	leaq	-16(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %XMM2
+-	VPCMPEQ (%rdi), %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
+L(zero):
+	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+-L(more_8x_vec):
+-	/* More than 8 * VEC.  Check the first VEC.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Align the first memory area for aligned loads in the loop.
+-	   Compute how much the first memory area is misaligned.  */
+-	movq	%rdi, %rcx
+-	andl	$(VEC_SIZE - 1), %ecx
+-	/* Get the negative of offset for alignment.  */
+-	subq	$VEC_SIZE, %rcx
+-	/* Adjust the second memory area.  */
+-	subq	%rcx, %rsi
+-	/* Adjust the first memory area which should be aligned now.  */
+-	subq	%rcx, %rdi
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
+-
+-	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+-	kandd	%k2, %k1, %k5
+-
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+-	kandd	%k3, %k5, %k5
+-
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+-	kandd	%k4, %k5, %k5
+-
+-	kmovd	%k5, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-	addq	$(VEC_SIZE * 4), %rsi
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jae	L(loop_4x_vec)
+-
+-	/* Less than 4 * VEC.  */
+-	cmpq	$VEC_SIZE, %rdx
+-	jbe	L(last_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_2x_vec)
+-
+-L(last_4x_vec):
+-	/* From 2 * VEC to 4 * VEC. */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	VMOVU	(%rsi), %XMM2
+	VPCMP	$4, (%rdi), %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+ 
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	ret
+-
+-	.p2align 4
+-L(4x_vec_end):
+	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
+	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
+	VPCMP	$4, (%rdi), %XMM2, %k1
+ 	kmovd	%k1, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	kmovd	%k2, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x1)
+-	kmovd	%k3, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x2)
+-	kmovd	%k4, %eax
+-	subl	$VEC_MASK, %eax
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %ecx
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
+-	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+	.p2align 4
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+ 	ret
+# else
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	jz	L(zero_4_7)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+L(zero_4_7):
+ 	ret
+# endif
+
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-47.patch
+++ b/glibc-RHEL-15696-47.patch
@ -0,0 +1,104 @@
+From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 20 May 2021 13:13:51 -0400
+Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit makes a few small improvements to
+memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
+instead of 128. Either alignment will perform equally well in a loop
+and 128 just increases the odds of having to do an extra iteration
+which can be significant overhead for small values. 2) Align some
+targets and the loop. 3) Remove an ALU from the alignment process. 4)
+Reorder the last 4x VEC so that they are stored after the loop. 5)
+Move the condition for leq 8x VEC to before the alignment
+process. test-memset and test-wmemset are both passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ .../multiarch/memset-vec-unaligned-erms.S     | 50 +++++++++++--------
+ 1 file changed, 28 insertions(+), 22 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index f877ac9d..909c33f6 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VZEROUPPER_RETURN
+ 
+	.p2align 4
+ L(stosb_more_2x_vec):
+ 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+ 	ja	L(stosb)
+#else
+	.p2align 4
+ #endif
+ L(more_2x_vec):
+-	cmpq  $(VEC_SIZE * 4), %rdx
+-	ja	L(loop_start)
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_start)
+ 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+ L(return):
+ #if VEC_SIZE > 16
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+@@ -192,28 +197,29 @@ L(return):
+ #endif
+ 
+ L(loop_start):
+-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+-	VMOVU	%VEC(0), (%rdi)
+-	andq	$-(VEC_SIZE * 4), %rcx
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+-	addq	%rdi, %rdx
+-	andq	$-(VEC_SIZE * 4), %rdx
+-	cmpq	%rdx, %rcx
+-	je	L(return)
+	cmpq	$(VEC_SIZE * 8), %rdx
+	jbe	L(loop_end)
+	andq	$-(VEC_SIZE * 2), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
+	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
+	.p2align 4
+ L(loop):
+-	VMOVA	%VEC(0), (%rcx)
+-	VMOVA	%VEC(0), VEC_SIZE(%rcx)
+-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
+-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
+-	addq	$(VEC_SIZE * 4), %rcx
+-	cmpq	%rcx, %rdx
+-	jne	L(loop)
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(0), VEC_SIZE(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rcx, %rdi
+	jb	L(loop)
+L(loop_end):
+	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+	       rdx as length is also unchanged.  */
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+ 	VZEROUPPER_SHORT_RETURN
+ 
+ 	.p2align 4
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-48.patch
+++ b/glibc-RHEL-15696-48.patch
@ -0,0 +1,84 @@
+From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 23 May 2021 19:43:24 -0400
+Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+This patch changes the condition for copy 4x VEC so that if length is
+exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of
+8x VEC case.
+
+Results For Skylake memcpy-avx2-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 9.137   , 6.873   , New , 75.22
+128 , 7   , 0   , 12.933  , 7.732   , New , 59.79
+128 , 0   , 7   , 11.852  , 6.76    , New , 57.04
+128 , 7   , 7   , 12.587  , 6.808   , New , 54.09
+
+Results For Icelake memcpy-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 9.963   , 5.416   , New , 54.36
+128 , 7   , 0   , 16.467  , 8.061   , New , 48.95
+128 , 0   , 7   , 14.388  , 7.644   , New , 53.13
+128 , 7   , 7   , 14.546  , 7.642   , New , 52.54
+
+Results For Tigerlake memcpy-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 8.979   , 4.95    , New , 55.13
+128 , 7   , 0   , 14.245  , 7.122   , New , 50.0
+128 , 0   , 7   , 12.668  , 6.675   , New , 52.69
+128 , 7   , 7   , 13.042  , 6.802   , New , 52.15
+
+Results For Skylake memmove-avx2-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 6.181   , 5.691   , New , 92.07
+128 , 32  , 0   , 6.165   , 5.752   , New , 93.3
+128 , 0   , 7   , 13.923  , 9.37    , New , 67.3
+128 , 7   , 0   , 12.049  , 10.182  , New , 84.5
+
+Results For Icelake memmove-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 5.479   , 4.889   , New , 89.23
+128 , 32  , 0   , 5.127   , 4.911   , New , 95.79
+128 , 0   , 7   , 18.885  , 13.547  , New , 71.73
+128 , 7   , 0   , 15.565  , 14.436  , New , 92.75
+
+Results For Tigerlake memmove-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 5.275   , 4.815   , New , 91.28
+128 , 32  , 0   , 5.376   , 4.565   , New , 84.91
+128 , 0   , 7   , 19.426  , 14.273  , New , 73.47
+128 , 7   , 0   , 15.924  , 14.951  , New , 93.89
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 3e2dd6bc..572cef04 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -417,8 +417,8 @@ L(more_2x_vec):
+ 	cmpq	$(VEC_SIZE * 8), %rdx
+ 	ja	L(more_8x_vec)
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
+-	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
+	jbe	L(last_4x_vec)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+@@ -437,7 +437,7 @@ L(more_2x_vec):
+ 	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+ 	VZEROUPPER_RETURN
+ L(last_4x_vec):
+-	/* Copy from 2 * VEC to 4 * VEC. */
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-49.patch
+++ b/glibc-RHEL-15696-49.patch
@ -0,0 +1,55 @@
+From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 19:19:34 -0400
+Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. The way wcsnlen will check if near the end of maxlen
+is the following macro:
+
+	mov	%r11, %rsi;	\
+	subq	%rax, %rsi;	\
+	andq	$-64, %rax;	\
+	testq	$-64, %rsi;	\
+	je	L(strnlen_ret)
+
+Which works independently of s + maxlen overflowing. So the
+second overflow check is unnecessary for correctness and
+just extra overhead in the common no overflow case.
+
+test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are
+all passing
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strlen-vec.S | 7 -------
+ 1 file changed, 7 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+index 439e486a..b7657282 100644
+--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -71,19 +71,12 @@ L(n_nonzero):
+    suffice.  */
+ 	mov	%RSI_LP, %R10_LP
+ 	sar	$62, %R10_LP
+-	test	%R10_LP, %R10_LP
+ 	jnz	__wcslen_sse4_1
+ 	sal	$2, %RSI_LP
+ # endif
+ 
+-
+ /* Initialize long lived registers.  */
+-
+ 	add	%RDI_LP, %RSI_LP
+-# ifdef AS_WCSLEN
+-/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+-	jbe	__wcslen_sse4_1
+-# endif
+ 	mov	%RSI_LP, %R10_LP
+ 	and	$-64, %R10_LP
+ 	mov	%RSI_LP, %R11_LP
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-5.patch
+++ b/glibc-RHEL-15696-5.patch
@ -0,0 +1,290 @@
+From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:32:24 -0800
+Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memset/wmemset for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use
+	RDX_LP for length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset.
+	* sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise.
+---
+ .../multiarch/memset-avx512-no-vzeroupper.S   |  6 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 34 +++++----
+ sysdeps/x86_64/x32/Makefile                   |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-memset.c        | 73 +++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemset.c       | 20 +++++
+ 5 files changed, 121 insertions(+), 16 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+index 689cc119..99e25519 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+@@ -29,12 +29,16 @@
+ 	.section .text.avx512,"ax",@progbits
+ #if defined PIC
+ ENTRY (MEMSET_CHK)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMSET_CHK)
+ #endif
+ 
+ ENTRY (MEMSET)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
+ 	vpxor	%xmm0, %xmm0, %xmm0
+ 	vmovd	%esi, %xmm1
+ 	lea	(%rdi, %rdx), %rsi
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 270a1d49..9a0fd818 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -65,8 +65,8 @@
+ 	.section SECTION(.text),"ax",@progbits
+ #if VEC_SIZE == 16 && IS_IN (libc)
+ ENTRY (__bzero)
+-	movq	%rdi, %rax /* Set return value.  */
+-	movq	%rsi, %rdx /* Set n.  */
+	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+	mov	%RSI_LP, %RDX_LP /* Set n.  */
+ 	pxor	%xmm0, %xmm0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero)
+ #if IS_IN (libc)
+ # if defined SHARED
+ ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+ # endif
+ 
+ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+-	shlq	$2, %rdx
+	shl	$2, %RDX_LP
+ 	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ 	jmp	L(entry_from_bzero)
+ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+ 
+ #if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
+ L(entry_from_bzero):
+ 	cmpq	$VEC_SIZE, %rdx
+ 	jb	L(less_vec)
+@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned))
+ 
+ # if VEC_SIZE == 16
+ ENTRY (__memset_chk_erms)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memset_chk_erms)
+ 
+ /* Only used to measure performance of REP STOSB.  */
+ ENTRY (__memset_erms)
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jnz	 L(stosb)
+ 	movq	%rdi, %rax
+ 	ret
+@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
+ L(stosb):
+ 	/* Issue vzeroupper before rep stosb.  */
+ 	VZEROUPPER
+-	movq	%rdx, %rcx
+	mov	%RDX_LP, %RCX_LP
+ 	movzbl	%sil, %eax
+-	movq	%rdi, %rdx
+	mov	%RDI_LP, %RDX_LP
+ 	rep stosb
+-	movq	%rdx, %rax
+	mov	%RDX_LP, %RAX_LP
+ 	ret
+ # if VEC_SIZE == 16
+ END (__memset_erms)
+@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms))
+ 
+ # if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+-	cmpq	$VEC_SIZE, %rdx
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index e99dbd7c..98bd9ae9 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -7,9 +7,9 @@ endif
+ 
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+-	 tst-size_t-memrchr
+	 tst-size_t-memrchr tst-size_t-memset
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr tst-size_t-wmemcmp
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
+new file mode 100644
+index 00000000..2c367af6
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
+@@ -0,0 +1,73 @@
+/* Test memset with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef WIDE
+# define TEST_NAME "wmemset"
+#else
+# define TEST_NAME "memset"
+#endif /* WIDE */
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+# define MEMSET wmemset
+# define CHAR wchar_t
+#else
+# define MEMSET memset
+# define CHAR char
+#endif /* WIDE */
+
+IMPL (MEMSET, 1)
+
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memset (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  CHAR ch = 0x23;
+  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      CHAR *p = (CHAR *) do_memset (src, c);
+      size_t i;
+      for (i = 0; i < src.len; i++)
+	if (p[i] != ch)
+	  {
+	    error (0, 0, "Wrong result in function %s", impl->name);
+	    ret = 1;
+	  }
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
+new file mode 100644
+index 00000000..955eb488
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
+@@ -0,0 +1,20 @@
+/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-memset.c"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-50.patch
+++ b/glibc-RHEL-15696-50.patch
@ -0,0 +1,43 @@
+From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
+Author: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>  2021-05-23 21:43:10
+Committer: H.J. Lu <hjl.tools@gmail.com>  2021-06-27 10:56:57
+Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc)
+Child:  1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support)
+Branches: master, remotes/origin/master and many more (41)
+Follows: glibc-2.33.9000
+Precedes: glibc-2.34
+
+    math: redirect roundeven function
+    
+    This patch redirects the roundeven function for further changes.
+    
+    Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+
+Conflicts:
+	*
+	(rewritten for older branch)
+
+diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+index 7bbbb2dc..8728d0f2 100644
+--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-double.h>
+@@ -67,5 +68,6 @@ __roundeven (double x)
+   INSERT_WORDS64 (x, ix);
+   return x;
+ }
+-hidden_def (__roundeven)
+#ifndef __roundeven
+ libm_alias_double (__roundeven, roundeven)
+#endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-51.patch
+++ b/glibc-RHEL-15696-51.patch
@ -0,0 +1,118 @@
+From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
+From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Date: Mon, 24 May 2021 09:43:10 +0800
+Subject: [PATCH] math: redirect roundeven function
+Content-type: text/plain; charset=UTF-8
+
+This patch redirects the roundeven function for further changes.
+
+Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ include/math.h                             | 3 ++-
+ sysdeps/ieee754/dbl-64/s_roundeven.c       | 4 +++-
+ sysdeps/ieee754/float128/s_roundevenf128.c | 1 +
+ sysdeps/ieee754/flt-32/s_roundevenf.c      | 3 +++
+ sysdeps/ieee754/ldbl-128/s_roundevenl.c    | 1 +
+ sysdeps/ieee754/ldbl-96/s_roundevenl.c     | 1 +
+ 6 files changed, 11 insertions(+), 2 deletions(-)
+
+Conflicts:
+	include/math.h
+	(missing MATH_REDIRECT macros)
+
+diff --git a/include/math.h b/include/math.h
+index e21d34b8..1f9f9a54 100644
+--- a/include/math.h
+++ b/include/math.h
+@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling)
+ libm_hidden_proto (__issignalingf)
+ libm_hidden_proto (__exp)
+ libm_hidden_proto (__expf)
+-libm_hidden_proto (__roundeven)
+ 
+ # ifndef __NO_LONG_DOUBLE_MATH
+ libm_hidden_proto (__fpclassifyl)
+@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128)
+ 
+ # if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0)
+ #  ifndef NO_MATH_REDIRECT
+float (roundevenf) (float) asm ("__roundevenf");
+double (roundeven) (double) asm ("__roundeven");
+ /* Declare sqrt for use within GLIBC.  Compilers typically inline sqrt as a
+    single instruction.  Use an asm to avoid use of PLTs if it doesn't.  */
+ float (sqrtf) (float) asm ("__ieee754_sqrtf");
+diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c
+index 1438e81d..61962184 100644
+--- a/sysdeps/ieee754/dbl-64/s_roundeven.c
+++ b/sysdeps/ieee754/dbl-64/s_roundeven.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-double.h>
+@@ -101,5 +102,6 @@ __roundeven (double x)
+   INSERT_WORDS (x, hx, lx);
+   return x;
+ }
+-hidden_def (__roundeven)
+#ifndef __roundeven
+ libm_alias_double (__roundeven, roundeven)
+#endif
+diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c
+index 5a9b3f39..e0faf727 100644
+--- a/sysdeps/ieee754/float128/s_roundevenf128.c
+++ b/sysdeps/ieee754/float128/s_roundevenf128.c
+@@ -1,2 +1,3 @@
+#define NO_MATH_REDIRECT
+ #include <float128_private.h>
+ #include "../ldbl-128/s_roundevenl.c"
+diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c
+index 90f991d5..a661875e 100644
+--- a/sysdeps/ieee754/flt-32/s_roundevenf.c
+++ b/sysdeps/ieee754/flt-32/s_roundevenf.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-float.h>
+@@ -67,4 +68,6 @@ __roundevenf (float x)
+   SET_FLOAT_WORD (x, ix);
+   return x;
+ }
+#ifndef __roundevenf
+ libm_alias_float (__roundeven, roundeven)
+#endif
+diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+index 5fc59af4..b9375b6c 100644
+--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-ldouble.h>
+diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+index be2e4fa4..65031ab7 100644
+--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-ldouble.h>
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-52.patch
+++ b/glibc-RHEL-15696-52.patch
@ -0,0 +1,242 @@
+From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001
+From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Date: Mon, 24 May 2021 09:43:11 +0800
+Subject: [PATCH] x86_64: roundeven with sse4.1 support
+Content-type: text/plain; charset=UTF-8
+
+This patch adds support for the sse4.1 hardware floating point
+roundeven.
+
+Here are some benchmark results on my systems:
+
+=AMD Ryzen 9 3900X 12-Core Processor=
+
+* benchmark result before this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  3.75587e+09 |  3.75114e+09 |
+| iterations |  3.93053e+08 |  4.35402e+08 |
+| max        | 52.592       | 58.71        |
+| min        |  7.98        |  7.22        |
+| mean       |  9.55563     |  8.61535     |
+
+* benchmark result after this commit
+|            |     roundeven |   roundevenf |
+|------------|---------------|--------------|
+| duration   |   3.73815e+09 |  3.73738e+09 |
+| iterations |   5.82692e+08 |  5.91498e+08 |
+| max        |  56.468       | 51.642       |
+| min        |   6.27        |  6.156       |
+| mean       |   6.41532     |  6.3185      |
+
+=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz=
+
+* benchmark result before this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  2.18208e+09 |  2.18258e+09 |
+| iterations |  2.39932e+08 |  2.46924e+08 |
+| max        | 96.378       | 98.035       |
+| min        |  6.776       |  5.94        |
+| mean       |  9.09456     |  8.83907     |
+
+* benchmark result after this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  2.17415e+09 |  2.17005e+09 |
+| iterations |  3.56193e+08 |  4.09824e+08 |
+| max        | 51.693       | 97.192       |
+| min        |  5.926       |  5.093       |
+| mean       |  6.10385     |  5.29507     |
+
+Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/fpu/multiarch/Makefile         |  5 +--
+ sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c  |  2 ++
+ .../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundeven.c    | 31 +++++++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c |  3 ++
+ .../fpu/multiarch/s_roundevenf-sse4_1.S       | 24 ++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundevenf.c   | 31 +++++++++++++++++++
+ 7 files changed, 118 insertions(+), 2 deletions(-)
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+
+diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
+index 9f387248..6ddd1c01 100644
+--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
+@@ -1,11 +1,12 @@
+ ifeq ($(subdir),math)
+ libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
+ 			s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
+-			s_trunc-c s_truncf-c
+			s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
+ 
+ libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
+ 			s_floorf-sse4_1 s_nearbyint-sse4_1 \
+-			s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
+			s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
+			s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
+ 			s_trunc-sse4_1 s_truncf-sse4_1
+ 
+ libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+new file mode 100644
+index 00000000..c7be43cb
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+@@ -0,0 +1,2 @@
+#define __roundeven __roundeven_c
+#include <sysdeps/ieee754/dbl-64/s_roundeven.c>
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+new file mode 100644
+index 00000000..6ae8f6b1
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+@@ -0,0 +1,24 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.section .text.sse4.1,"ax",@progbits
+ENTRY(__roundeven_sse41)
+	roundsd	$8, %xmm0, %xmm0
+	ret
+END(__roundeven_sse41)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+new file mode 100644
+index 00000000..d92eda65
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+@@ -0,0 +1,31 @@
+/* Multiple versions of __roundeven.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <libm-alias-double.h>
+
+#define roundeven __redirect_roundeven
+#define __roundeven __redirect___roundeven
+#include <math.h>
+#undef roundeven
+#undef __roundeven
+
+#define SYMBOL_NAME roundeven
+#include "ifunc-sse4_1.h"
+
+libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ());
+libm_alias_double (__roundeven, roundeven)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+new file mode 100644
+index 00000000..72a6e7d1
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+@@ -0,0 +1,3 @@
+#undef __roundevenf
+#define __roundevenf __roundevenf_c
+#include <sysdeps/ieee754/flt-32/s_roundevenf.c>
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+new file mode 100644
+index 00000000..a76e1080
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+@@ -0,0 +1,24 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.section .text.sse4.1,"ax",@progbits
+ENTRY(__roundevenf_sse41)
+	roundss	$8, %xmm0, %xmm0
+	ret
+END(__roundevenf_sse41)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+new file mode 100644
+index 00000000..2ee196e6
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+@@ -0,0 +1,31 @@
+/* Multiple versions of __roundevenf.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <libm-alias-float.h>
+
+#define roundevenf __redirect_roundevenf
+#define __roundevenf __redirect___roundevenf
+#include <math.h>
+#undef roundevenf
+#undef __roundevenf
+
+#define SYMBOL_NAME roundevenf
+#include "ifunc-sse4_1.h"
+
+libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ());
+libm_alias_float (__roundeven, roundeven)
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-53.patch
+++ b/glibc-RHEL-15696-53.patch
@ -0,0 +1,41 @@
+From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 9 Jan 2022 16:02:28 -0600
+Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755]
+Content-type: text/plain; charset=UTF-8
+
+Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
+__wcscmp_evex. For x86_64 this covers the entire address range so any
+length larger could not possibly be used to bound `s1` or `s2`.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 459eeed0..d5aa6daa 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -97,6 +97,16 @@ ENTRY (STRCMP)
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
+#  ifndef __ILP32__
+	movq	%rdx, %rcx
+	/* Check if length could overflow when multiplied by
+	   sizeof(wchar_t). Checking top 8 bits will cover all potential
+	   overflow cases as well as redirect cases where its impossible to
+	   length to bound a valid memory region. In these cases just use
+	   'wcscmp'.  */
+	shrq	$56, %rcx
+	jnz	__wcscmp_evex
+#  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+ #  endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-54.patch
+++ b/glibc-RHEL-15696-54.patch
@ -0,0 +1,268 @@
+From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 20 Aug 2021 06:42:24 -0700
+Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ
+ #28252]
+Content-type: text/plain; charset=UTF-8
+
+Optimize loads of all bits set into ZMM register in AVX512 SVML codes
+by replacing
+
+	vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
+
+and
+
+	vmovups   .L_2il0floatpacket.13(%rip), %zmmX
+
+with
+	vpternlogd $0xff, %zmmX, %zmmX, %zmmX
+
+This fixes BZ #28252.
+---
+ .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------
+ .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------
+ .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------
+ .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
+ .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------
+ 10 files changed, 11 insertions(+), 64 deletions(-)
+
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+index 24e3b363..07dfed85 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+         vmovaps   %zmm0, %zmm8
+ 
+ /* Check for large arguments path */
+-        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
+        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
+ 
+ /*
+   ARGUMENT RANGE REDUCTION:
+@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_cos_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.16:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.16,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+index ae8af8d8..ddb60e5b 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+ 
+ /* preserve mantissa, set input exponent to 2^(-10) */
+         vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
+-        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
+        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
+         vpsrlq    $32, %zmm4, %zmm6
+ 
+ /* reciprocal approximation good to at least 11 bits */
+@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_log_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.12:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.12,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+index 2d4b14fd..529c454a 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+         andq      $-64, %rsp
+         subq      $1280, %rsp
+         movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
+-        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
+        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
+         vmovups __dAbsMask(%rax), %zmm7
+         vmovups __dInvPI(%rax), %zmm2
+         vmovups __dRShifter(%rax), %zmm1
+@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_sin_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.14:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.14,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+index 2df626c0..e501a53a 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
+ 
+ /* SinPoly = SinR*SinPoly */
+         vfmadd213pd %zmm5, %zmm5, %zmm4
+-        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
+        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
+ 
+ /* Update Cos result's sign */
+         vxorpd    %zmm2, %zmm1, %zmm1
+@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
+ ENTRY (_ZGVeN8vvv_sincos_skx)
+ WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
+ END (_ZGVeN8vvv_sincos_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.15:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.15,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+index 6ea1137b..377af394 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+   X = X - Y*PI1 - Y*PI2 - Y*PI3
+  */
+         vmovaps   %zmm0, %zmm6
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
+        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
+         vmovups __sRShifter(%rax), %zmm3
+         vmovups __sPI1_FMA(%rax), %zmm5
+         vmovups __sA9_FMA(%rax), %zmm9
+@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16v_cosf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+index 89ba0df2..46f33d46 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+         vmovaps   %zmm0, %zmm7
+ 
+ /* compare against threshold */
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
+        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
+         vmovups __sInvLn2(%rax), %zmm4
+         vmovups __sShifter(%rax), %zmm1
+         vmovups __sLn2hi(%rax), %zmm6
+@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+ 
+ #endif
+ END (_ZGVeN16v_expf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+index 4cf0a96f..9e254956 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+         andq      $-64, %rsp
+         subq      $1280, %rsp
+         movq      __svml_slog_data@GOTPCREL(%rip), %rax
+-        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
+        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
+         vmovups _iBrkValue(%rax), %zmm4
+         vmovups _sPoly_7(%rax), %zmm8
+ 
+@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+ 
+ #endif
+ END (_ZGVeN16v_logf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.7:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.7,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+index bdcd50af..e8331ba1 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         vpsrlq    $32, %zmm3, %zmm2
+         vpmovqd   %zmm2, %ymm11
+         vcvtps2pd %ymm14, %zmm13
+-        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+         vmovaps   %zmm14, %zmm26
+         vpandd _ABSMASK(%rax), %zmm1, %zmm8
+         vpcmpd    $1, _INF(%rax), %zmm8, %k2
+@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         vpmovqd   %zmm11, %ymm5
+         vpxord    %zmm10, %zmm10, %zmm10
+         vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
+-        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
+        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
+         vpxord    %zmm11, %zmm11, %zmm11
+         vcvtdq2pd %ymm7, %zmm7
+         vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
+@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16vv_powf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.23:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.23,@object
+-.L_2il0floatpacket.24:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.24,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+index 5fa4bc41..1f46f334 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+ 
+ /* Result sign calculations */
+         vpternlogd $150, %zmm0, %zmm14, %zmm1
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+ 
+ /* Add correction term 0.5 for cos() part */
+         vaddps    %zmm8, %zmm5, %zmm15
+@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
+ ENTRY (_ZGVeN16vvv_sincosf_skx)
+ WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
+ END (_ZGVeN16vvv_sincosf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+index 141f747e..1fc9308a 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+         movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
+ 
+ /* Check for large and special values */
+-        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+         vmovups __sAbsMask(%rax), %zmm5
+         vmovups __sInvPI(%rax), %zmm1
+         vmovups __sRShifter(%rax), %zmm2
+@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16v_sinf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.11:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.11,@object
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-55.patch
+++ b/glibc-RHEL-15696-55.patch
@ -0,0 +1,48 @@
+From fc5bd179ef3a953dff8d1655bd530d0e230ffe71 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 21 Sep 2021 18:31:49 -0500
+Subject: [PATCH] x86: Modify ENTRY in sysdep.h so that p2align can be
+ specified
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+This change adds a new macro ENTRY_P2ALIGN which takes a second
+argument, log2 of the desired function alignment.
+
+The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
+doesn't affect any existing functionality.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86/sysdep.h | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index 01bac0f6..a70bb3a2 100644
+--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
+@@ -78,15 +78,18 @@ enum cf_protection_level
+ #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
+ 
+ /* Define an entry point visible from C.  */
+-#define	ENTRY(name)							      \
+#define	ENTRY_P2ALIGN(name, alignment)					      \
+   .globl C_SYMBOL_NAME(name);						      \
+   .type C_SYMBOL_NAME(name),@function;					      \
+-  .align ALIGNARG(4);							      \
+  .align ALIGNARG(alignment);						      \
+   C_LABEL(name)								      \
+   cfi_startproc;							      \
+   _CET_ENDBR;								      \
+   CALL_MCOUNT
+ 
+/* Common entry 16 byte aligns.  */
+#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
+
+ #undef	END
+ #define END(name)							      \
+   cfi_endproc;								      \
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-56.patch
+++ b/glibc-RHEL-15696-56.patch
@ -0,0 +1,658 @@
+From 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 21 Sep 2021 18:45:03 -0500
+Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and
+ size
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+The frontend optimizations are to:
+1. Reorganize logically connected basic blocks so they are either in
+   the same cache line or adjacent cache lines.
+2. Avoid cases when basic blocks unnecessarily cross cache lines.
+3. Try and 32 byte align any basic blocks possible without sacrificing
+   code size. Smaller / Less hot basic blocks are used for this.
+
+Overall code size shrunk by 168 bytes. This should make up for any
+extra costs due to aligning to 64 bytes.
+
+In general performance before deviated a great deal depending on
+whether entry alignment % 64 was 0, 16, 32, or 48. These changes
+essentially make it so that the current implementation is at least
+equal to the best alignment of the original for any arguments.
+
+The only additional optimization is in the page cross case. Branch on
+equals case was removed from the size == [4, 7] case. As well, the [4,
+7] and [2, 3] cases were swapped as [4, 7] is likely a hotter
+argument size.
+
+test-memcmp and test-wmemcmp are both passing.
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++--------
+ 1 file changed, 242 insertions(+), 192 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 654dc7ac..2761b54f 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -34,7 +34,24 @@
+       area.
+    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+-   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+    1. All code sections that fit are able to run optimally out of the
+       LSD.
+    2. All code sections that fit are able to run optimally out of the
+       DSB
+    3. Basic blocks are contained in minimum number of fetch blocks
+       necessary.
+
+Latency:
+    1. Logically connected basic blocks are put in the same
+       cache-line.
+    2. Logically connected basic blocks that do not fit in the same
+       cache-line are put in adjacent lines. This can get beneficial
+       L2 spatial prefetching and L1 next-line prefetching.  */
+ 
+ # include <sysdep.h>
+ 
+@@ -47,9 +64,11 @@
+ # ifdef USE_AS_WMEMCMP
+ #  define CHAR_SIZE	4
+ #  define VPCMP	vpcmpd
+#  define VPTEST	vptestmd
+ # else
+ #  define CHAR_SIZE	1
+ #  define VPCMP	vpcmpub
+#  define VPTEST	vptestmb
+ # endif
+ 
+ # define VEC_SIZE	32
+@@ -75,7 +94,9 @@
+ */
+ 
+ 	.section .text.evex,"ax",@progbits
+-ENTRY (MEMCMP)
+/* Cache align memcmp entry. This allows for much more thorough
+   frontend optimization.  */
+ENTRY_P2ALIGN (MEMCMP, 6)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
+ 	VPCMP	$4, (%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	/* NB: eax must be destination register if going to
+-	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   L(return_vec_[0,2]). For L(return_vec_3) destination register
+ 	   must be ecx.  */
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_0)
+@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_3)
+ 
+-	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
+-	   compare with zero to get a mask is needed.  */
+-	vpxorq	%XMM0, %XMM0, %XMM0
+-
+ 	/* Go to 4x VEC loop.  */
+ 	cmpq	$(CHAR_PER_VEC * 8), %rdx
+ 	ja	L(more_8x_vec)
+@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+ 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+-	   oring with YMM3. Result is stored in YMM4.  */
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
+-	VPCMP	$4, %YMM4, %YMM0, %k1
+	   oring with YMM1. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	 */
+	VPTEST	%YMM4, %YMM4, %k1
+	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_0_1_2_3)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+-	/* NB: aligning 32 here allows for the rest of the jump targets
+-	   to be tuned for 32 byte alignment. Most important this ensures
+-	   the L(more_8x_vec) loop is 32 byte aligned.  */
+-	.p2align 5
+-L(less_vec):
+-	/* Check if one or less CHAR. This is necessary for size = 0 but
+-	   is also faster for size = CHAR_SIZE.  */
+-	cmpl	$1, %edx
+-	jbe	L(one_or_less)
+	.p2align 4
+L(8x_end_return_vec_0_1_2_3):
+	movq	%rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPTEST	%YMM1, %YMM1, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 
+-	/* Check if loading one VEC from either s1 or s2 could cause a
+-	   page cross. This can have false positives but is by far the
+-	   fastest method.  */
+-	movl	%edi, %eax
+-	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(page_cross_less_vec)
+	VPTEST	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+ 
+-	/* No page cross possible.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMP	$4, (%rdi), %YMM2, %k1
+-	kmovd	%k1, %eax
+-	/* Create mask in ecx for potentially in bound matches.  */
+-	bzhil	%edx, %eax, %eax
+-	jnz	L(return_vec_0)
+	VPTEST	%YMM3, %YMM3, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	/* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
+	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+	   line.  */
+	bsfl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+ 	ret
+ 
+ 	.p2align 4
+@@ -209,10 +240,11 @@ L(return_vec_0):
+ # endif
+ 	ret
+ 
+-	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
+-	   which is good enough for a target not in a loop.  */
+	.p2align 4
+ L(return_vec_1):
+-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+ 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+@@ -226,10 +258,11 @@ L(return_vec_1):
+ # endif
+ 	ret
+ 
+-	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
+-	   which is good enough for a target not in a loop.  */
+	.p2align 4,, 10
+ L(return_vec_2):
+-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+ 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+@@ -243,40 +276,6 @@ L(return_vec_2):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(8x_return_vec_0_1_2_3):
+-	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+-	addq	%rdi, %rsi
+-L(return_vec_0_1_2_3):
+-	VPCMP	$4, %YMM1, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-
+-	VPCMP	$4, %YMM2, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_1)
+-
+-	VPCMP	$4, %YMM3, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_2)
+-L(return_vec_3):
+-	tzcntl	%ecx, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+-	xorl	%edx, %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+ 	.p2align 4
+ L(more_8x_vec):
+ 	/* Set end of s1 in rdx.  */
+@@ -288,21 +287,19 @@ L(more_8x_vec):
+ 	andq	$-VEC_SIZE, %rdi
+ 	/* Adjust because first 4x vec where check already.  */
+ 	subq	$-(VEC_SIZE * 4), %rdi
+
+ 	.p2align 4
+ L(loop_4x_vec):
+ 	VMOVU	(%rsi, %rdi), %YMM1
+ 	vpxorq	(%rdi), %YMM1, %YMM1
+-
+ 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
+ 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
+-
+ 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+-
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(8x_return_vec_0_1_2_3)
+@@ -319,28 +316,25 @@ L(loop_4x_vec):
+ 	cmpl	$(VEC_SIZE * 2), %edi
+ 	jae	L(8x_last_2x_vec)
+ 
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
+ 	VMOVU	(%rsi, %rdx), %YMM1
+ 	vpxorq	(%rdx), %YMM1, %YMM1
+ 
+ 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+ 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+-
+-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+-
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
+-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+-	/* Restore s1 pointer to rdi.  */
+-	movq	%rdx, %rdi
+ 	testl	%ecx, %ecx
+-	jnz	L(8x_return_vec_0_1_2_3)
+	jnz	L(8x_end_return_vec_0_1_2_3)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+ 	/* Only entry is from L(more_8x_vec).  */
+-	.p2align 4
+	.p2align 4,, 10
+ L(8x_last_2x_vec):
+ 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ 	kmovd	%k1, %eax
+@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
+ 	jnz	L(8x_return_vec_3)
+ 	ret
+ 
+-	.p2align 4
+	/* Not ideally aligned (at offset +9 bytes in fetch block) but
+	   not aligning keeps it in the same cache line as
+	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+	   size.  */
+	.p2align 4,, 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 10
+ L(last_2x_vec):
+ 	/* Check second to last VEC.  */
+ 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+@@ -374,26 +392,49 @@ L(last_1x_vec):
+ 	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4
+-L(8x_return_vec_2):
+-	subq	$VEC_SIZE, %rdx
+-L(8x_return_vec_3):
+-	tzcntl	%eax, %eax
+	.p2align 4,, 10
+L(return_vec_1_end):
+	/* Use bsf to save code size. This is necessary to have
+	   L(one_or_less) fit in aligning bytes between.  */
+	bsfl	%eax, %eax
+	addl	%edx, %eax
+ # ifdef USE_AS_WMEMCMP
+-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+-	movl	(VEC_SIZE * 3)(%rax), %ecx
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+ 	setg	%dl
+ 	leal	-1(%rdx, %rdx), %eax
+ # else
+-	addq	%rdx, %rax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+-	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
+	/* NB: L(one_or_less) fits in alignment padding between
+	   L(return_vec_1_end) and L(return_vec_0_end).  */
+# ifdef USE_AS_WMEMCMP
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	ret
+# else
+L(one_or_less):
+	jb	L(zero)
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+	subl	%ecx, %eax
+	ret
+# endif
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+ 	.p2align 4
+ L(return_vec_0_end):
+ 	tzcntl	%eax, %eax
+@@ -412,23 +453,56 @@ L(return_vec_0_end):
+ 	ret
+ 
+ 	.p2align 4
+-L(return_vec_1_end):
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size == 0
+	   but is also faster for size == CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check if any matches where in bounds. Intentionally not
+	   storing result in eax to limit dependency chain if it goes to
+	   L(return_vec_0_lv).  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0_lv)
+	xorl	%eax, %eax
+	ret
+
+	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+	   the jump and ends up fitting in aligning bytes. As well fits on
+	   same cache line as L(less_vec) so also saves a line from having
+	   to be fetched on cold calls to memcmp.  */
+	.p2align 4,, 4
+L(return_vec_0_lv):
+ 	tzcntl	%eax, %eax
+-	addl	%edx, %eax
+ # ifdef USE_AS_WMEMCMP
+-	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+-	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+ 	setg	%dl
+ 	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+-	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
+-
+ 	.p2align 4
+ L(page_cross_less_vec):
+ 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+@@ -439,108 +513,84 @@ L(page_cross_less_vec):
+ 	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+ 	cmpl	$4, %edx
+-	jae	L(between_4_7)
+-L(between_2_3):
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
+-	ret
+-	.p2align 4
+-L(one_or_less):
+-	jb	L(zero)
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
+	jb	L(between_2_3)
+
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* edx is guranteed to be positive int32 in range [4, 7].  */
+	cmovne	%edx, %eax
+	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+	sbbl	%ecx, %ecx
+	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+	   eax doesn't matter.  */
+	orl	%ecx, %eax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 4,, 8
+ L(between_8_15):
+ # endif
+ 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+-	vmovq	(%rdi), %XMM1
+-	vmovq	(%rsi), %XMM2
+-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
+-	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
+-	vmovq	(%rdi), %XMM1
+-	vmovq	(%rsi), %XMM2
+-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-	ret
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4
+	.p2align 4,, 8
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-	VMOVU	(%rsi), %XMM2
+-	VPCMP	$4, (%rdi), %XMM2, %k1
+
+	/* Use movups to save code size.  */
+	movups	(%rsi), %xmm2
+	VPCMP	$4, (%rdi), %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-
+	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-
+-	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
+-	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
+-	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
+-	VPCMP	$4, (%rdi), %XMM2, %k1
+	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-	ret
+-
+-# ifdef USE_AS_WMEMCMP
+-	.p2align 4
+-L(one_or_less):
+-	jb	L(zero)
+-	movl	(%rdi), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi), %ecx
+-	je	L(zero)
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+	jnz	L(return_vec_0_end)
+ 	ret
+-# else
+ 
+-	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	jz	L(zero_4_7)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-L(zero_4_7):
+# ifndef USE_AS_WMEMCMP
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
+ 	ret
+ # endif
+-
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-57.patch
+++ b/glibc-RHEL-15696-57.patch
@ -0,0 +1,510 @@
+From e59ced238482fd71f3e493717f14f6507346741e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 20 Sep 2021 16:20:15 -0500
+Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+Optimizations are:
+
+1. change control flow for L(more_2x_vec) to fall through to loop and
+   jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
+   size and saves jumps for length > 4x VEC_SIZE.
+
+2. For EVEX/AVX512 move L(less_vec) closer to entry.
+
+3. Avoid complex address mode for length > 2x VEC_SIZE
+
+4. Slightly better aligning code for the loop from the perspective of
+   code size and uops.
+
+5. Align targets so they make full use of their fetch block and if
+   possible cache line.
+
+6. Try and reduce total number of icache lines that will need to be
+   pulled in for a given length.
+
+7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
+   jumping to the stosb target in the sse2 code section will almost
+   certainly be to a new page. The new version does increase code size
+   marginally by duplicating the target but should get better iTLB
+   behavior as a result.
+
+test-memset, test-wmemset, and test-bzero are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/memset.S                       |  10 +-
+ .../multiarch/memset-avx2-unaligned-erms.S    |  10 +-
+ .../multiarch/memset-avx512-unaligned-erms.S  |  11 +-
+ .../multiarch/memset-evex-unaligned-erms.S    |  11 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 285 ++++++++++++------
+ 5 files changed, 232 insertions(+), 95 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/memset.S
+	(GNU URL)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index b3426795..8672b030 100644
+--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
+@@ -18,13 +18,15 @@
+    <http://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
+#define USE_WITH_SSE2	1
+ 
+ #define VEC_SIZE	16
+#define MOV_SIZE	3
+#define RET_SIZE	1
+
+ #define VEC(i)		xmm##i
+-/* Don't use movups and movaps since it will get larger nop paddings for
+-   alignment.  */
+-#define VMOVU		movdqu
+-#define VMOVA		movdqa
+#define VMOVU     movups
+#define VMOVA     movaps
+ 
+ #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index ae0860f3..1af668af 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -1,8 +1,14 @@
+ #if IS_IN (libc)
+# define USE_WITH_AVX2	1
+
+ # define VEC_SIZE	32
+# define MOV_SIZE	4
+# define RET_SIZE	4
+
+ # define VEC(i)		ymm##i
+-# define VMOVU		vmovdqu
+-# define VMOVA		vmovdqa
+
+# define VMOVU     vmovdqu
+# define VMOVA     vmovdqa
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+   vmovd d, %xmm0; \
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 8ad842fc..f14d6f84 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -1,11 +1,18 @@
+ #if IS_IN (libc)
+# define USE_WITH_AVX512	1
+
+ # define VEC_SIZE	64
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
+ # define XMM0		xmm16
+ # define YMM0		ymm16
+ # define VEC0		zmm16
+ # define VEC(i)		VEC##i
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
+ # define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index 640f0929..64b09e77 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -1,11 +1,18 @@
+ #if IS_IN (libc)
+# define USE_WITH_EVEX	1
+
+ # define VEC_SIZE	32
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
+ # define XMM0		xmm16
+ # define YMM0		ymm16
+ # define VEC0		ymm16
+ # define VEC(i)		VEC##i
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
+ # define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 909c33f6..f08b7323 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -63,8 +63,27 @@
+ # endif
+ #endif
+ 
+#if VEC_SIZE == 64
+# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
+#else
+# define LOOP_4X_OFFSET	(0)
+#endif
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+# define END_REG	rcx
+# define LOOP_REG	rdi
+#else
+# define END_REG	rdi
+# define LOOP_REG	rdx
+#endif
+
+ #define PAGE_SIZE 4096
+ 
+/* Macro to calculate size of small memset block for aligning
+   purposes.  */
+#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
+
+
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -74,6 +93,7 @@
+ ENTRY (__bzero)
+ 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+ 	mov	%RSI_LP, %RDX_LP /* Set n.  */
+	xorl	%esi, %esi
+ 	pxor	%XMM0, %XMM0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	jb	L(less_vec)
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(0), (%rdi)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-
+-	.p2align 4
+-L(stosb_more_2x_vec):
+-	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+-	ja	L(stosb)
+-#else
+-	.p2align 4
+ #endif
+-L(more_2x_vec):
+-	/* Stores to first 2x VEC before cmp as any path forward will
+-	   require it.  */
+-	VMOVU	%VEC(0), (%rdi)
+-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	ja	L(loop_start)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-L(return):
+-#if VEC_SIZE > 16
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4,, 10
+L(last_2x_vec):
+#ifdef USE_LESS_VEC_MASK_STORE
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+ #else
+-	ret
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+ #endif
+	VZEROUPPER_RETURN
+ 
+-L(loop_start):
+-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	jbe	L(loop_end)
+-	andq	$-(VEC_SIZE * 2), %rdi
+-	subq	$-(VEC_SIZE * 4), %rdi
+-	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
+-	.p2align 4
+-L(loop):
+-	VMOVA	%VEC(0), (%rdi)
+-	VMOVA	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	subq	$-(VEC_SIZE * 4), %rdi
+-	cmpq	%rcx, %rdi
+-	jb	L(loop)
+-L(loop_end):
+-	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+-	       rdx as length is also unchanged.  */
+-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+-	VZEROUPPER_SHORT_RETURN
+-
+-	.p2align 4
+	/* If have AVX512 mask instructions put L(less_vec) close to
+	   entry as it doesn't take much space and is likely a hot target.
+	 */
+#ifdef USE_LESS_VEC_MASK_STORE
+	.p2align 4,, 10
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+ # endif
+-# ifdef USE_LESS_VEC_MASK_STORE
+ 	/* Clear high bits from edi. Only keeping bits relevant to page
+ 	   cross check. Note that we are using rax which is set in
+-	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
+-	 */
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
+ 	andl	$(PAGE_SIZE - 1), %edi
+-	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
+-	   performance degradation when it has to fault supress.  */
+	/* Check if VEC_SIZE store crosses page. Mask stores suffer
+	   serious performance degradation when they have to fault
+	   suppress.  */
+ 	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	/* This is generally considered a cold target.  */
+ 	ja	L(cross_page)
+ # if VEC_SIZE > 32
+ 	movq	$-1, %rcx
+@@ -247,58 +235,185 @@ L(less_vec):
+ 	bzhil	%edx, %ecx, %ecx
+ 	kmovd	%ecx, %k1
+ # endif
+-	vmovdqu8	%VEC(0), (%rax) {%k1}
+	vmovdqu8 %VEC(0), (%rax){%k1}
+ 	VZEROUPPER_RETURN
+ 
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* Include L(stosb_local) here if including L(less_vec) between
+	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+	   L(stosb_more_2x_vec) target.  */
+	.p2align 4,, 10
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
+# endif
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+ 	.p2align 4
+-L(cross_page):
+L(stosb_more_2x_vec):
+	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+	ja	L(stosb_local)
+#endif
+	/* Fallthrough goes to L(loop). Tests for memset (2x, 4x]
+	   and (4x, 8x] jump to target.  */
+L(more_2x_vec):
+
+	/* Two different methods of setting up pointers / compare. The
+	   two methods are based on the fact that EVEX/AVX512 mov
+	   instructions take more bytes than AVX2/SSE2 mov instructions, and
+	   that EVEX/AVX512 machines also have fast LEA_BID. Both methods
+	   set up END_REG to avoid complex address mode. For EVEX/AVX512
+	   this saves code size and keeps a few targets in one fetch block.
+	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+	   LOOP_4X_OFFSET) with LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
+	VMOVU	%VEC(0), (%rax)
+	VMOVU	%VEC(0), VEC_SIZE(%rax)
+
+
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
+	addq	%rdx, %END_REG
+#endif
+
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_2x_vec)
+
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If LOOP_4X_OFFSET is nonzero don't readjust LOOP_REG (rdi), just
+	   add extra offset to addresses in loop. Used for AVX512 to save
+	   space as no way to get (VEC_SIZE * 4) in imm8.  */
+# if LOOP_4X_OFFSET == 0
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+ # endif
+-# if VEC_SIZE > 32
+-	cmpb	$32, %dl
+-	jae	L(between_32_63)
+	/* Avoid imm32 compare here to save code size.  */
+	cmpq	%rdi, %rcx
+#else
+	addq	$-(VEC_SIZE * 4), %END_REG
+	cmpq	$(VEC_SIZE * 8), %rdx
+#endif
+	jbe	L(last_4x_vec)
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* Set LOOP_REG (rdx).  */
+	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
+#endif
+	/* Align dst for loop.  */
+	andq	$(VEC_SIZE * -2), %LOOP_REG
+	.p2align 4
+L(loop):
+	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+	cmpq	%END_REG, %LOOP_REG
+	jb	L(loop)
+	.p2align 4,, MOV_SIZE
+L(last_4x_vec):
+	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return):
+#if VEC_SIZE > 16
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
+	ret
+#endif
+
+	.p2align 4,, 10
+#ifndef USE_LESS_VEC_MASK_STORE
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* If no USE_LESS_VEC_MASK_STORE put L(stosb_local) here. Will be
+	   in range for 2-byte jump encoding.  */
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
+ # endif
+-# if VEC_SIZE > 16
+-	cmpb	$16, %dl
+	/* Define L(less_vec) only if not otherwise defined.  */
+	.p2align 4
+L(less_vec):
+#endif
+L(cross_page):
+#if VEC_SIZE > 32
+	cmpl	$32, %edx
+	jae	L(between_32_63)
+#endif
+#if VEC_SIZE > 16
+	cmpl	$16, %edx
+ 	jae	L(between_16_31)
+-# endif
+-	MOVQ	%XMM0, %rcx
+-	cmpb	$8, %dl
+#endif
+	MOVQ	%XMM0, %rdi
+	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+-	cmpb	$4, %dl
+	cmpl	$4, %edx
+ 	jae	L(between_4_7)
+-	cmpb	$1, %dl
+	cmpl	$1, %edx
+ 	ja	L(between_2_3)
+-	jb	1f
+-	movb	%cl, (%rax)
+-1:
+	jb	L(return)
+	movb	%sil, (%rax)
+ 	VZEROUPPER_RETURN
+-# if VEC_SIZE > 32
+
+	/* Align small targets only if not doing so would cross a fetch
+	   line.  */
+#if VEC_SIZE > 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, -32(%rax,%rdx)
+ 	VMOVU	%YMM0, (%rax)
+	VMOVU	%YMM0, -32(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-# endif
+-# if VEC_SIZE > 16
+-	/* From 16 to 31.  No branch when size == 16.  */
+#endif
+
+#if VEC_SIZE >= 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ L(between_16_31):
+-	VMOVU	%XMM0, -16(%rax,%rdx)
+	/* From 16 to 31.  No branch when size == 16.  */
+ 	VMOVU	%XMM0, (%rax)
+	VMOVU	%XMM0, -16(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-# endif
+-	/* From 8 to 15.  No branch when size == 8.  */
+#endif
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ L(between_8_15):
+-	movq	%rcx, -8(%rax,%rdx)
+-	movq	%rcx, (%rax)
+	/* From 8 to 15.  No branch when size == 8.  */
+	movq	%rdi, (%rax)
+	movq	%rdi, -8(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%ecx, -4(%rax,%rdx)
+-	movl	%ecx, (%rax)
+	movl	%edi, (%rax)
+	movl	%edi, -4(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+
+	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%cx, -2(%rax,%rdx)
+-	movw	%cx, (%rax)
+	movw	%di, (%rax)
+	movb	%dil, -1(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-58.patch
+++ b/glibc-RHEL-15696-58.patch
@ -0,0 +1,45 @@
+From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sat, 23 Oct 2021 01:26:47 -0400
+Subject: [PATCH] x86: Replace sse2 instructions with avx in
+ memcmp-evex-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
+
+It could potentially be dangerous to use SSE2 if this function is ever
+called without using 'vzeroupper' beforehand. While compilers appear
+to use 'vzeroupper' before function calls if AVX2 has been used, using
+SSE2 here is more brittle. Since it is not absolutely necessary it
+should be avoided.
+
+It costs 2 extra bytes but the extra bytes should only eat into
+alignment padding.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 2761b54f..640f6757 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -561,13 +561,13 @@ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 
+ 	/* Use movups to save code size.  */
+-	movups	(%rsi), %xmm2
+	vmovdqu	(%rsi), %xmm2
+ 	VPCMP	$4, (%rdi), %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+ 	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+ 	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-59.patch
+++ b/glibc-RHEL-15696-59.patch
@ -0,0 +1,695 @@
+From c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 29 Oct 2021 12:40:20 -0700
+Subject: [PATCH] x86-64: Improve EVEX strcmp with masked load
+Content-type: text/plain; charset=UTF-8
+
+In strcmp-evex.S, to compare 2 32-byte strings, replace
+
+        VMOVU   (%rdi, %rdx), %YMM0
+        VMOVU   (%rsi, %rdx), %YMM1
+        /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+        VPCMP   $4, %YMM0, %YMM1, %k0
+        VPCMP   $0, %YMMZERO, %YMM0, %k1
+        VPCMP   $0, %YMMZERO, %YMM1, %k2
+        /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+        kord    %k1, %k2, %k1
+        /* Each bit in K1 represents a NULL or a mismatch.  */
+        kord    %k0, %k1, %k1
+        kmovd   %k1, %ecx
+        testl   %ecx, %ecx
+        jne     L(last_vector)
+
+with
+
+        VMOVU   (%rdi, %rdx), %YMM0
+        VPTESTM %YMM0, %YMM0, %k2
+        /* Each bit cleared in K1 represents a mismatch or a null CHAR
+           in YMM0 and 32 bytes at (%rsi, %rdx).  */
+        VPCMP   $0, (%rsi, %rdx), %YMM0, %k1{%k2}
+        kmovd   %k1, %ecx
+        incl    %ecx
+        jne     L(last_vector)
+
+It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
+and Ice Lake.
+
+Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 461 +++++++++++++------------
+ 1 file changed, 243 insertions(+), 218 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index d5aa6daa..82f12ac8 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -41,6 +41,8 @@
+ # ifdef USE_AS_WCSCMP
+ /* Compare packed dwords.  */
+ #  define VPCMP		vpcmpd
+#  define VPMINU	vpminud
+#  define VPTESTM	vptestmd
+ #  define SHIFT_REG32	r8d
+ #  define SHIFT_REG64	r8
+ /* 1 dword char == 4 bytes.  */
+@@ -48,6 +50,8 @@
+ # else
+ /* Compare packed bytes.  */
+ #  define VPCMP		vpcmpb
+#  define VPMINU	vpminub
+#  define VPTESTM	vptestmb
+ #  define SHIFT_REG32	ecx
+ #  define SHIFT_REG64	rcx
+ /* 1 byte char == 1 byte.  */
+@@ -67,6 +71,9 @@
+ # define YMM5		ymm22
+ # define YMM6		ymm23
+ # define YMM7		ymm24
+# define YMM8		ymm25
+# define YMM9		ymm26
+# define YMM10		ymm27
+ 
+ /* Warning!
+            wcscmp/wcsncmp have to use SIGNED comparison for elements.
+@@ -76,7 +83,7 @@
+ /* The main idea of the string comparison (byte or dword) using 256-bit
+    EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
+    latter can be on either packed bytes or dwords depending on
+-   USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
+   USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
+    matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
+    KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
+    are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
+@@ -123,27 +130,21 @@ ENTRY (STRCMP)
+ 	jg	L(cross_page)
+ 	/* Start comparing 4 vectors.  */
+ 	VMOVU	(%rdi), %YMM0
+-	VMOVU	(%rsi), %YMM1
+ 
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+	VPTESTM	%YMM0, %YMM0, %k2
+ 
+-	/* Check for NULL in YMM0.  */
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	/* Check for NULL in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (%rsi).  */
+	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+ 
+-	/* Each bit in K1 represents:
+-	   1. A mismatch in YMM0 and YMM1.  Or
+-	   2. A NULL in YMM0 or YMM1.
+-	 */
+-	kord	%k0, %k1, %k1
+-
+-	ktestd	%k1, %k1
+-	je	L(next_3_vectors)
+ 	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+	je	L(next_3_vectors)
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -172,9 +173,7 @@ L(return):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -210,9 +209,7 @@ L(return_vec_size):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_2_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -248,9 +245,7 @@ L(return_2_vec_size):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_3_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -289,43 +284,45 @@ L(return_3_vec_size):
+ 	.p2align 4
+ L(next_3_vectors):
+ 	VMOVU	VEC_SIZE(%rdi), %YMM0
+-	VMOVU	VEC_SIZE(%rsi), %YMM1
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
+	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
+	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	jne	L(return_vec_size)
+ 
+-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM2
+-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM3
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM4
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM5
+-
+-	/* Each bit in K0 represents a mismatch in YMM2 and YMM4.  */
+-	VPCMP	$4, %YMM2, %YMM4, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM4, %k2
+-	/* Each bit in K1 represents a NULL in YMM2 or YMM4.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
+	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	jne	L(return_2_vec_size)
+ 
+-	/* Each bit in K0 represents a mismatch in YMM3 and YMM5.  */
+-	VPCMP	$4, %YMM3, %YMM5, %k0
+-	VPCMP	$0, %YMMZERO, %YMM3, %k1
+-	VPCMP	$0, %YMMZERO, %YMM5, %k2
+-	/* Each bit in K1 represents a NULL in YMM3 or YMM5.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
+	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (VEC_SIZE * 3)(%rsi).  */
+	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	jne	L(return_3_vec_size)
+ L(main_loop_header):
+ 	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+@@ -375,56 +372,51 @@ L(back_to_loop):
+ 	VMOVA	VEC_SIZE(%rax), %YMM2
+ 	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
+ 	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
+-	VMOVU	(%rdx), %YMM1
+-	VMOVU	VEC_SIZE(%rdx), %YMM3
+-	VMOVU	(VEC_SIZE * 2)(%rdx), %YMM5
+-	VMOVU	(VEC_SIZE * 3)(%rdx), %YMM7
+-
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K4 represents a NULL or a mismatch in YMM0 and
+-	   YMM1.  */
+-	kord	%k0, %k1, %k4
+-
+-	VPCMP	$4, %YMM2, %YMM3, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM3, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K5 represents a NULL or a mismatch in YMM2 and
+-	   YMM3.  */
+-	kord	%k0, %k1, %k5
+-
+-	VPCMP	$4, %YMM4, %YMM5, %k0
+-	VPCMP	$0, %YMMZERO, %YMM4, %k1
+-	VPCMP	$0, %YMMZERO, %YMM5, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K6 represents a NULL or a mismatch in YMM4 and
+-	   YMM5.  */
+-	kord	%k0, %k1, %k6
+-
+-	VPCMP	$4, %YMM6, %YMM7, %k0
+-	VPCMP	$0, %YMMZERO, %YMM6, %k1
+-	VPCMP	$0, %YMMZERO, %YMM7, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K7 represents a NULL or a mismatch in YMM6 and
+-	   YMM7.  */
+-	kord	%k0, %k1, %k7
+-
+-	kord	%k4, %k5, %k0
+-	kord	%k6, %k7, %k1
+-
+-	/* Test each mask (32 bits) individually because for VEC_SIZE
+-	   == 32 is not possible to OR the four masks and keep all bits
+-	   in a 64-bit integer register, differing from SSE2 strcmp
+-	   where ORing is possible.  */
+-	kortestd %k0, %k1
+-	je	L(loop)
+-	ktestd	%k4, %k4
+
+	VPMINU	%YMM0, %YMM2, %YMM8
+	VPMINU	%YMM4, %YMM6, %YMM9
+
+	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
+	VPMINU	%YMM8, %YMM9, %YMM8
+
+	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
+	VPTESTM	%YMM8, %YMM8, %k1
+
+	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
+	vpxorq	(%rdx), %YMM0, %YMM1
+	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
+
+	vporq	%YMM1, %YMM3, %YMM9
+	vporq	%YMM5, %YMM7, %YMM10
+
+	/* A non-zero CHAR in YMM9 represents a mismatch.  */
+	vporq	%YMM9, %YMM10, %YMM9
+
+	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
+	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
+	kmovd   %k0, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+	je	 L(loop)
+
+	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
+	VPTESTM	%YMM0, %YMM0, %k1
+	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+	   in YMM0 and (%rdx).  */
+	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
+	kmovd	%k0, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	je	L(test_vec)
+-	kmovd	%k4, %edi
+-	tzcntl	%edi, %ecx
+	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %ecx
+@@ -466,9 +458,18 @@ L(test_vec):
+ 	cmpq	$VEC_SIZE, %r11
+ 	jbe	L(zero)
+ # endif
+-	ktestd	%k5, %k5
+	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
+	VPTESTM	%YMM2, %YMM2, %k1
+	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+	   in YMM2 and VEC_SIZE(%rdx).  */
+	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
+	kmovd	%k0, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	je	L(test_2_vec)
+-	kmovd	%k5, %ecx
+ 	tzcntl	%ecx, %edi
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -512,9 +513,18 @@ L(test_2_vec):
+ 	cmpq	$(VEC_SIZE * 2), %r11
+ 	jbe	L(zero)
+ # endif
+-	ktestd	%k6, %k6
+	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
+	VPTESTM	%YMM4, %YMM4, %k1
+	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
+	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
+	kmovd	%k0, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	je	L(test_3_vec)
+-	kmovd	%k6, %ecx
+ 	tzcntl	%ecx, %edi
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -558,8 +568,18 @@ L(test_3_vec):
+ 	cmpq	$(VEC_SIZE * 3), %r11
+ 	jbe	L(zero)
+ # endif
+-	kmovd	%k7, %esi
+-	tzcntl	%esi, %ecx
+	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
+	VPTESTM	%YMM6, %YMM6, %k1
+	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
+	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
+	kmovd	%k0, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %ecx
+@@ -615,39 +635,51 @@ L(loop_cross_page):
+ 
+ 	VMOVU	(%rax, %r10), %YMM2
+ 	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
+-	VMOVU	(%rdx, %r10), %YMM4
+-	VMOVU	VEC_SIZE(%rdx, %r10), %YMM5
+-
+-	VPCMP	$4, %YMM4, %YMM2, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM4, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch in YMM2 and
+-	   YMM4.  */
+-	kord	%k0, %k1, %k1
+-
+-	VPCMP	$4, %YMM5, %YMM3, %k3
+-	VPCMP	$0, %YMMZERO, %YMM3, %k4
+-	VPCMP	$0, %YMMZERO, %YMM5, %k5
+-	kord	%k4, %k5, %k4
+-	/* Each bit in K3 represents a NULL or a mismatch in YMM3 and
+-	   YMM5.  */
+-	kord	%k3, %k4, %k3
+
+	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
+	VPTESTM	%YMM2, %YMM2, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM2 and 32 bytes at (%rdx, %r10).  */
+	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
+	kmovd	%k1, %r9d
+	/* Don't use subl since it is the lower 16/32 bits of RDI
+	   below.  */
+	notl	%r9d
+# ifdef USE_AS_WCSCMP
+	/* Only last 8 bits are valid.  */
+	andl	$0xff, %r9d
+# endif
+
+	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
+	VPTESTM	%YMM3, %YMM3, %k4
+	/* Each bit cleared in K3 represents a mismatch or a null CHAR
+	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
+	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+	kmovd	%k3, %edi
+# ifdef USE_AS_WCSCMP
+	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+	notl	%edi
+	andl	$0xff, %edi
+# else
+	incl	%edi
+# endif
+ 
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in K1/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k2
+	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
+	sall	$8, %edi
+ 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+ 	   bytes.  */
+ 	movl	%ecx, %SHIFT_REG32
+ 	sarl	$2, %SHIFT_REG32
+
+	/* Each bit in EDI represents a null CHAR or a mismatch.  */
+	orl	%r9d, %edi
+ # else
+-	kshiftlq $32, %k3, %k2
+-# endif
+	salq	$32, %rdi
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rdi
+	/* Each bit in RDI represents a null CHAR or a mismatch.  */
+	orq	%r9, %rdi
+# endif
+ 
+ 	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
+ 	shrxq	%SHIFT_REG64, %rdi, %rdi
+@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec):
+ 	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
+ 	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
+-	VMOVU	(VEC_SIZE * 2)(%rdx, %r10), %YMM2
+-	VMOVU	(VEC_SIZE * 3)(%rdx, %r10), %YMM3
+-
+-	VPCMP	$4, %YMM0, %YMM2, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM2, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch in YMM0 and
+-	   YMM2.  */
+-	kord	%k0, %k1, %k1
+-
+-	VPCMP	$4, %YMM1, %YMM3, %k3
+-	VPCMP	$0, %YMMZERO, %YMM1, %k4
+-	VPCMP	$0, %YMMZERO, %YMM3, %k5
+-	kord	%k4, %k5, %k4
+-	/* Each bit in K3 represents a NULL or a mismatch in YMM1 and
+-	   YMM3.  */
+-	kord	%k3, %k4, %k3
+ 
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
+	kmovd	%k1, %r9d
+	/* Don't use subl since it is the lower 16/32 bits of RDI
+	   below.  */
+	notl	%r9d
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in K1/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k2
+	/* Only last 8 bits are valid.  */
+	andl	$0xff, %r9d
+# endif
+
+	VPTESTM	%YMM1, %YMM1, %k4
+	/* Each bit cleared in K3 represents a mismatch or a null CHAR
+	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
+	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+	kmovd	%k3, %edi
+# ifdef USE_AS_WCSCMP
+	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+	notl	%edi
+	andl	$0xff, %edi
+ # else
+-	kshiftlq $32, %k3, %k2
+	incl	%edi
+ # endif
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rdi
+# ifdef USE_AS_WCSCMP
+	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
+	sall	$8, %edi
+
+	/* Each bit in EDI represents a null CHAR or a mismatch.  */
+	orl	%r9d, %edi
+# else
+	salq	$32, %rdi
+
+	/* Each bit in RDI represents a null CHAR or a mismatch.  */
+	orq	%r9, %rdi
+# endif
+ 
+ 	xorl	%r8d, %r8d
+ 	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
+@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec):
+ 	/* R8 has number of bytes skipped.  */
+ 	movl	%ecx, %r8d
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+	/* NB: Divide shift count by 4 since each bit in RDI represent 4
+ 	   bytes.  */
+ 	sarl	$2, %ecx
+-# endif
+	/* Skip ECX bytes.  */
+	shrl	%cl, %edi
+# else
+ 	/* Skip ECX bytes.  */
+ 	shrq	%cl, %rdi
+# endif
+ 1:
+ 	/* Before jumping back to the loop, set ESI to the number of
+ 	   VEC_SIZE * 4 blocks before page crossing.  */
+@@ -818,7 +863,7 @@ L(cross_page_loop):
+ 	movzbl	(%rdi, %rdx), %eax
+ 	movzbl	(%rsi, %rdx), %ecx
+ # endif
+-	/* Check null char.  */
+	/* Check null CHAR.  */
+ 	testl	%eax, %eax
+ 	jne	L(cross_page_loop)
+ 	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
+@@ -901,18 +946,17 @@ L(cross_page):
+ 	jg	L(cross_page_1_vector)
+ L(loop_1_vector):
+ 	VMOVU	(%rdi, %rdx), %YMM0
+-	VMOVU	(%rsi, %rdx), %YMM1
+-
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
+	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-	testl	%ecx, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+# else
+	incl	%ecx
+# endif
+ 	jne	L(last_vector)
+ 
+ 	addl	$VEC_SIZE, %edx
+@@ -931,18 +975,17 @@ L(cross_page_1_vector):
+ 	cmpl	$(PAGE_SIZE - 16), %eax
+ 	jg	L(cross_page_1_xmm)
+ 	VMOVU	(%rdi, %rdx), %XMM0
+-	VMOVU	(%rsi, %rdx), %XMM1
+-
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	korw	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korw	%k0, %k1, %k1
+-	kmovw	%k1, %ecx
+-	testl	%ecx, %ecx
+
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
+	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xf, %ecx
+# else
+	subl	$0xffff, %ecx
+# endif
+ 	jne	L(last_vector)
+ 
+ 	addl	$16, %edx
+@@ -965,25 +1008,16 @@ L(cross_page_1_xmm):
+ 	vmovq	(%rdi, %rdx), %XMM0
+ 	vmovq	(%rsi, %rdx), %XMM1
+ 
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	kmovd	%k1, %ecx
+-
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in XMM0 and XMM1.  */
+	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+	kmovb	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	/* Only last 2 bits are valid.  */
+-	andl	$0x3, %ecx
+	subl	$0x3, %ecx
+ # else
+-	/* Only last 8 bits are valid.  */
+-	andl	$0xff, %ecx
+	subl	$0xff, %ecx
+ # endif
+-
+-	testl	%ecx, %ecx
+ 	jne	L(last_vector)
+ 
+ 	addl	$8, %edx
+@@ -1002,25 +1036,16 @@ L(cross_page_8bytes):
+ 	vmovd	(%rdi, %rdx), %XMM0
+ 	vmovd	(%rsi, %rdx), %XMM1
+ 
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in XMM0 and XMM1.  */
+	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-
+ # ifdef USE_AS_WCSCMP
+-	/* Only the last bit is valid.  */
+-	andl	$0x1, %ecx
+	subl	$0x1, %ecx
+ # else
+-	/* Only last 4 bits are valid.  */
+-	andl	$0xf, %ecx
+	subl	$0xf, %ecx
+ # endif
+-
+-	testl	%ecx, %ecx
+ 	jne	L(last_vector)
+ 
+ 	addl	$4, %edx
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-6.patch
+++ b/glibc-RHEL-15696-6.patch
@ -0,0 +1,300 @@
+From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:33:52 -0800
+Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes the strncmp family for x32.  Tested on x86-64 and x32.
+On x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise.
+	* sysdeps/x86_64/strcmp.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp,
+	tst-size_t-strncmp and tst-size_t-wcsncmp.
+	* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise.
+	* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise.
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S      |  6 +-
+ sysdeps/x86_64/multiarch/strcmp-sse42.S     |  6 +-
+ sysdeps/x86_64/strcmp.S                     |  6 +-
+ sysdeps/x86_64/x32/Makefile                 |  6 +-
+ sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-strncmp.c     | 78 +++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wcsncmp.c     | 20 ++++++
+ 7 files changed, 170 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 327e3d87..156c1949 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -79,15 +79,15 @@
+ ENTRY (STRCMP)
+ # ifdef USE_AS_STRNCMP
+ 	/* Check for simple cases (0 or 1) in offset.  */
+-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
+ 	/* Convert units: from wide to byte char.  */
+-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+ #  endif
+ 	/* Register %r11 tracks the maximum offset.  */
+-	movq	%rdx, %r11
+	mov	%RDX_LP, %R11_LP
+ # endif
+ 	movl	%edi, %eax
+ 	xorl	%edx, %edx
+diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+index d3c07bd2..a1ebea46 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+@@ -156,11 +156,11 @@ STRCMP_SSE42:
+ #endif
+ 
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	je	LABEL(strcmp_exitz)
+-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
+ 	je	LABEL(Byte0)
+-	mov	%rdx, %r11
+	mov	%RDX_LP, %R11_LP
+ #endif
+ 	mov	%esi, %ecx
+ 	mov	%edi, %eax
+diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
+index e16945b9..f47c8ad4 100644
+--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
+@@ -135,11 +135,11 @@ ENTRY (STRCMP)
+  * This implementation uses SSE to compare up to 16 bytes at a time.
+  */
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	je	LABEL(strcmp_exitz)
+-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
+ 	je	LABEL(Byte0)
+-	mov	%rdx, %r11
+	mov	%RDX_LP, %R11_LP
+ #endif
+ 	mov	%esi, %ecx
+ 	mov	%edi, %eax
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 98bd9ae9..db302839 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -7,9 +7,11 @@ endif
+ 
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+-	 tst-size_t-memrchr tst-size_t-memset
+	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+	 tst-size_t-strncmp
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
+	 tst-size_t-wcsncmp
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+new file mode 100644
+index 00000000..86233593
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+@@ -0,0 +1,59 @@
+/* Test strncasecmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "strncasecmp"
+#include "test-size_t.h"
+
+IMPL (strncasecmp, 1)
+
+typedef int (*proto_t) (const char *, const char *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_strncasecmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  strncpy ((char *) buf1, (const char *) buf2, page_size);
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_strncasecmp (dest, src);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
+new file mode 100644
+index 00000000..54e6bd83
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
+@@ -0,0 +1,78 @@
+/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef WIDE
+# define TEST_NAME "wcsncmp"
+#else
+# define TEST_NAME "strncmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+
+# define STRNCMP wcsncmp
+# define STRNCPY wcsncpy
+# define CHAR wchar_t
+#else
+# define STRNCMP strncmp
+# define STRNCPY strncpy
+# define CHAR char
+#endif
+
+IMPL (STRNCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+
+static int
+__attribute__ ((noinline, noclone))
+do_strncmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  size_t size = page_size / sizeof (CHAR);
+  parameter_t dest = { { size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_strncmp (dest, src);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+new file mode 100644
+index 00000000..4829647c
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+@@ -0,0 +1,20 @@
+/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-strncmp.c"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-60.patch
+++ b/glibc-RHEL-15696-60.patch
@ -0,0 +1,54 @@
+From 6720d36b6623c5e48c070d86acf61198b33e144e Mon Sep 17 00:00:00 2001
+From: Fangrui Song <maskray@google.com>
+Date: Tue, 2 Nov 2021 20:59:52 -0700
+Subject: [PATCH] x86-64: Replace movzx with movzbl
+Content-type: text/plain; charset=UTF-8
+
+Clang cannot assemble movzx in the AT&T dialect mode.
+
+../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
+ movzx (%rsi), %ecx
+               ^~~~
+
+Change movzx to movzbl, which follows the AT&T dialect and is used
+elsewhere in the file.
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-sse42.S | 4 ++--
+ sysdeps/x86_64/strcmp.S                 | 4 ++--
+ 2 files changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+index a1ebea46..d8fdeb3a 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
+ 	.p2align 4
+ 	// XXX Same as code above
+ LABEL(Byte0):
+-	movzx	(%rsi), %ecx
+-	movzx	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+ 
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
+index f47c8ad4..aa6df898 100644
+--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
+@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):
+ 
+ 	.p2align 4
+ LABEL(Byte0):
+-	movzx	(%rsi), %ecx
+-	movzx	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+ 
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-61.patch
+++ b/glibc-RHEL-15696-61.patch
@ -0,0 +1,56 @@
+From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 30 Apr 2021 05:58:59 -0700
+Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM
+Content-type: text/plain; charset=UTF-8
+
+The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed
+that REP MOVSB became faster after 2112 bytes:
+
+                                      Vector Move       REP MOVSB
+length=2112, align1=0, align2=0:        24.20             24.40
+length=2112, align1=1, align2=0:        26.07             23.13
+length=2112, align1=0, align2=1:        27.18             28.13
+length=2112, align1=1, align2=1:        26.23             25.16
+length=2176, align1=0, align2=0:        23.18             22.52
+length=2176, align1=2, align2=0:        25.45             22.52
+length=2176, align1=0, align2=2:        27.14             27.82
+length=2176, align1=2, align2=2:        22.73             25.56
+length=2240, align1=0, align2=0:        24.62             24.25
+length=2240, align1=3, align2=0:        29.77             27.15
+length=2240, align1=0, align2=3:        35.55             29.93
+length=2240, align1=3, align2=3:        34.49             25.15
+length=2304, align1=0, align2=0:        34.75             26.64
+length=2304, align1=4, align2=0:        32.09             22.63
+length=2304, align1=0, align2=4:        28.43             31.24
+
+Use REP MOVSB for data size > 2112 bytes in memcpy on processors with
+fast short REP MOVSB (FSRM).
+
+	* sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set
+	rep_movsb_threshold to 2112 on processors with fast short REP
+	MOVSB (FSRM).
+---
+ sysdeps/x86/cacheinfo.h | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index f72f634a..cc3941d3 100644
+--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
+@@ -430,6 +430,12 @@ init_cacheinfo (void)
+       rep_movsb_threshold = 2048 * (16 / 16);
+       minimum_rep_movsb_threshold = 16 * 8;
+     }
+
+  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
+     short REP MOVSB (FSRM).  */
+  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+    rep_movsb_threshold = 2112;
+
+   if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
+     __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
+   else
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-62.patch
+++ b/glibc-RHEL-15696-62.patch
@ -0,0 +1,136 @@
+From 475b63702ef38b69558fc3d31a0b66776a70f1d3 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 1 Nov 2021 00:49:52 -0500
+Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in
+ dl-cacheinfo.h
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+This patch doubles the rep_movsb_threshold when using ERMS. Based on
+benchmarks, the vector copy loop, especially now that it handles 4k
+aliasing, is better for these medium-range sizes.
+
+On Skylake with ERMS:
+
+Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
+4096,   0,      0,      0,      0.975
+4096,   0,      0,      1,      0.953
+4096,   12,     0,      0,      0.969
+4096,   12,     0,      1,      0.872
+4096,   44,     0,      0,      0.979
+4096,   44,     0,      1,      0.83
+4096,   0,      12,     0,      1.006
+4096,   0,      12,     1,      0.989
+4096,   0,      44,     0,      0.739
+4096,   0,      44,     1,      0.942
+4096,   12,     12,     0,      1.009
+4096,   12,     12,     1,      0.973
+4096,   44,     44,     0,      0.791
+4096,   44,     44,     1,      0.961
+4096,   2048,   0,      0,      0.978
+4096,   2048,   0,      1,      0.951
+4096,   2060,   0,      0,      0.986
+4096,   2060,   0,      1,      0.963
+4096,   2048,   12,     0,      0.971
+4096,   2048,   12,     1,      0.941
+4096,   2060,   12,     0,      0.977
+4096,   2060,   12,     1,      0.949
+8192,   0,      0,      0,      0.85
+8192,   0,      0,      1,      0.845
+8192,   13,     0,      0,      0.937
+8192,   13,     0,      1,      0.939
+8192,   45,     0,      0,      0.932
+8192,   45,     0,      1,      0.927
+8192,   0,      13,     0,      0.621
+8192,   0,      13,     1,      0.62
+8192,   0,      45,     0,      0.53
+8192,   0,      45,     1,      0.516
+8192,   13,     13,     0,      0.664
+8192,   13,     13,     1,      0.659
+8192,   45,     45,     0,      0.593
+8192,   45,     45,     1,      0.575
+8192,   2048,   0,      0,      0.854
+8192,   2048,   0,      1,      0.834
+8192,   2061,   0,      0,      0.863
+8192,   2061,   0,      1,      0.857
+8192,   2048,   13,     0,      0.63
+8192,   2048,   13,     1,      0.629
+8192,   2061,   13,     0,      0.627
+8192,   2061,   13,     1,      0.62
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/cacheinfo.h      |  8 +++++---
+ sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
+ 2 files changed, 20 insertions(+), 14 deletions(-)
+
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index cc3941d3..ac025e08 100644
+--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
+@@ -411,18 +411,20 @@ init_cacheinfo (void)
+ 
+   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
+   unsigned int minimum_rep_movsb_threshold;
+-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
+     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
+     threshold is 2048 * (VEC_SIZE / 16).  */
+   unsigned int rep_movsb_threshold;
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
+     {
+-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
+       minimum_rep_movsb_threshold = 64 * 8;
+     }
+   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
+ 				    AVX_Fast_Unaligned_Load))
+     {
+-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
+       minimum_rep_movsb_threshold = 32 * 8;
+     }
+   else
+diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
+index 89bf2966..56c6834a 100644
+--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
+@@ -32,17 +32,21 @@ glibc {
+     }
+     x86_rep_movsb_threshold {
+       type: SIZE_T
+-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
+-      # isn't faster on short data.  The memcpy micro benchmark in glibc
+-      # shows that 2KB is the approximate value above which REP MOVSB
+-      # becomes faster than SSE2 optimization on processors with Enhanced
+-      # REP MOVSB.  Since larger register size can move more data with a
+-      # single load and store, the threshold is higher with larger register
+-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
+-      # times of vector size and the default value is 2048 * (vector size
+-      # / 16), the default value and the minimum value must be updated at
+-      # run-time.  NB: Don't set the default value since we can't tell if
+-      # the tunable value is set by user or not [BZ #27069].
+      # Since there is overhead to set up REP MOVSB operation, REP
+      # MOVSB isn't faster on short data.  The memcpy micro benchmark
+      # in glibc shows that 2KB is the approximate value above which
+      # REP MOVSB becomes faster than SSE2 optimization on processors
+      # with Enhanced REP MOVSB.  Since larger register size can move
+      # more data with a single load and store, the threshold is
+      # higher with larger register size.  Micro benchmarks show AVX
+      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
+      # threshold is extrapolated to 16KB.  For machines with FSRM the
+      # threshold is universally set at 2112 bytes.  Note: Since the
+      # REP MOVSB threshold must be greater than 8 times of vector
+      # size and the default value is 4096 * (vector size / 16), the
+      # default value and the minimum value must be updated at
+      # run-time.  NB: Don't set the default value since we can't tell
+      # if the tunable value is set by user or not [BZ #27069].
+       minval: 1
+     }
+     x86_rep_stosb_threshold {
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-63.patch
+++ b/glibc-RHEL-15696-63.patch
--- a/glibc-RHEL-15696-64.patch
+++ b/glibc-RHEL-15696-64.patch
@ -0,0 +1,39 @@
+From 0b82747dc48d5bf0871bdc6da8cb6eec1256355f Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Nov 2021 06:31:51 -0800
+Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_lock_full [BZ
+ #28537]
+Content-type: text/plain; charset=UTF-8
+
+Replace boolean CAS with value CAS to avoid the extra load.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+---
+ nptl/pthread_mutex_lock.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index 29cc143e..60ada70d 100644
+--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
+@@ -292,12 +292,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
+-							oldval | FUTEX_WAITERS,
+-							oldval)
+-		  != 0)
+	      int val;
+	      if ((val = atomic_compare_and_exchange_val_acq
+		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+		    oldval)) != oldval)
+ 		{
+-		  oldval = mutex->__data.__lock;
+		  oldval = val;
+ 		  continue;
+ 		}
+ 	      oldval |= FUTEX_WAITERS;
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-65.patch
+++ b/glibc-RHEL-15696-65.patch
@ -0,0 +1,39 @@
+From 49302b8fdf9103b6fc0a398678668a22fa19574c Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Nov 2021 06:54:01 -0800
+Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_clocklock_common
+ [BZ #28537]
+Content-type: text/plain; charset=UTF-8
+
+Replace boolean CAS with value CAS to avoid the extra load.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+---
+ nptl/pthread_mutex_timedlock.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
+index 888c12fe..c4627ef6 100644
+--- a/nptl/pthread_mutex_timedlock.c
+++ b/nptl/pthread_mutex_timedlock.c
+@@ -269,12 +269,12 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
+-							oldval | FUTEX_WAITERS,
+-							oldval)
+-		  != 0)
+	      int val;
+	      if ((val = atomic_compare_and_exchange_val_acq
+		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+		    oldval)) != oldval)
+ 		{
+-		  oldval = mutex->__data.__lock;
+		  oldval = val;
+ 		  continue;
+ 		}
+ 	      oldval |= FUTEX_WAITERS;
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-66.patch
+++ b/glibc-RHEL-15696-66.patch
@ -0,0 +1,51 @@
+From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 2 Nov 2021 18:33:07 -0700
+Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537]
+Content-type: text/plain; charset=UTF-8
+
+CAS instruction is expensive.  From the x86 CPU's point of view, getting
+a cache line for writing is more expensive than reading.  See Appendix
+A.2 Spinlock in:
+
+https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
+
+The full compare and swap will grab the cache line exclusive and cause
+excessive cache line bouncing.
+
+Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock
+loop if compare may fail, to reduce cache line bouncing on contended locks.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+---
+ nptl/pthread_mutex_lock.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index 60ada70d..eb4d8baa 100644
+--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
+@@ -56,6 +56,11 @@
+ #define FORCE_ELISION(m, s)
+ #endif
+ 
+#ifndef LLL_MUTEX_READ_LOCK
+# define LLL_MUTEX_READ_LOCK(mutex) \
+  atomic_load_relaxed (&(mutex)->__data.__lock)
+#endif
+
+ static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+      __attribute_noinline__;
+ 
+@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 		  break;
+ 		}
+ 	      atomic_spin_nop ();
+	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
+		continue;
+ 	    }
+ 	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
+ 
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-67.patch
+++ b/glibc-RHEL-15696-67.patch
@ -0,0 +1,71 @@
+From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 12 Nov 2021 11:47:42 -0800
+Subject: [PATCH] Move assignment out of the CAS condition
+Content-type: text/plain; charset=UTF-8
+
+Update
+
+commit 49302b8fdf9103b6fc0a398678668a22fa19574c
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Thu Nov 11 06:54:01 2021 -0800
+
+    Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537]
+
+    Replace boolean CAS with value CAS to avoid the extra load.
+
+and
+
+commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Thu Nov 11 06:31:51 2021 -0800
+
+    Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537]
+
+    Replace boolean CAS with value CAS to avoid the extra load.
+
+by moving assignment out of the CAS condition.
+---
+ nptl/pthread_mutex_lock.c      | 7 +++----
+ nptl/pthread_mutex_timedlock.c | 7 +++----
+ 2 files changed, 6 insertions(+), 8 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index eb4d8baa..a633d95e 100644
+--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
+@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      int val;
+-	      if ((val = atomic_compare_and_exchange_val_acq
+-		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+-		    oldval)) != oldval)
+	      int val = atomic_compare_and_exchange_val_acq
+		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
+	      if (val != oldval)
+ 		{
+ 		  oldval = val;
+ 		  continue;
+diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
+index c4627ef6..a76c30b7 100644
+--- a/nptl/pthread_mutex_timedlock.c
+++ b/nptl/pthread_mutex_timedlock.c
+@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      int val;
+-	      if ((val = atomic_compare_and_exchange_val_acq
+-		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+-		    oldval)) != oldval)
+	      int val = atomic_compare_and_exchange_val_acq
+		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
+	      if (val != oldval)
+ 		{
+ 		  oldval = val;
+ 		  continue;
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-68.patch
+++ b/glibc-RHEL-15696-68.patch
@ -0,0 +1,60 @@
+From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 3 Dec 2021 15:29:25 -0800
+Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646]
+Content-type: text/plain; charset=UTF-8
+
+Must use notl %edi here as lower bits are for CHAR comparisons
+potentially out of range thus can be 0 without indicating mismatch.
+This fixes BZ #28646.
+
+Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+Conflicts:
+	string/test-strcmp.c
+	(new check omitted)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 82f12ac8..6f5c4bf9 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -656,12 +656,13 @@ L(loop_cross_page):
+ 	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
+ 	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+ 	kmovd	%k3, %edi
+    /* Must use notl %edi here as lower bits are for CHAR
+	   comparisons potentially out of range thus can be 0 without
+	   indicating mismatch.  */
+	notl	%edi
+ # ifdef USE_AS_WCSCMP
+ 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	notl	%edi
+ 	andl	$0xff, %edi
+-# else
+-	incl	%edi
+ # endif
+ 
+ # ifdef USE_AS_WCSCMP
+@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
+ 	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+ 	kmovd	%k3, %edi
+	/* Must use notl %edi here as lower bits are for CHAR
+	   comparisons potentially out of range thus can be 0 without
+	   indicating mismatch.  */
+	notl	%edi
+ # ifdef USE_AS_WCSCMP
+ 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	notl	%edi
+ 	andl	$0xff, %edi
+-# else
+-	incl	%edi
+ # endif
+ 
+ # ifdef USE_AS_WCSCMP
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-69.patch
+++ b/glibc-RHEL-15696-69.patch
@ -0,0 +1,35 @@
+From ceeffe968c01b1202e482f4855cb6baf5c6cb713 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 6 Dec 2021 07:14:12 -0800
+Subject: [PATCH] x86: Don't set Prefer_No_AVX512 for processors with AVX512
+ and AVX-VNNI
+Content-type: text/plain; charset=UTF-8
+
+Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
+they won't lower CPU frequency when ZMM load and store instructions are
+used.
+---
+ sysdeps/x86/cpu-features.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 956bfb4f..5ff2baa0 100644
+--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
+@@ -525,8 +525,11 @@ init_cpu_features (struct cpu_features *cpu_features)
+ 	  |= bit_arch_Prefer_No_VZEROUPPER;
+       else
+ 	{
+-	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
+-	    |= bit_arch_Prefer_No_AVX512;
+	  /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
+	     when ZMM load and store instructions are used.  */
+	  if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
+	    cpu_features->preferred[index_arch_Prefer_No_AVX512]
+	      |= bit_arch_Prefer_No_AVX512;
+ 
+ 	  /* Avoid RTM abort triggered by VZEROUPPER inside a
+ 	     transactionally executing RTM region.  */
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-7.patch
+++ b/glibc-RHEL-15696-7.patch
@ -0,0 +1,153 @@
+From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:35:18 -0800
+Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ#
+ 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes strncpy for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy.
+	* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file.
+---
+ .../x86_64/multiarch/strcpy-sse2-unaligned.S  |  4 +-
+ sysdeps/x86_64/multiarch/strcpy-ssse3.S       |  6 +-
+ sysdeps/x86_64/x32/Makefile                   |  2 +-
+ sysdeps/x86_64/x32/tst-size_t-strncpy.c       | 58 +++++++++++++++++++
+ 4 files changed, 64 insertions(+), 6 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+	sysdeps/x86_64/multiarch/strcpy-avx2.S
+	(skipped, only needed for x32 arch)
+
+diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+index 72bf7e85..50aca22d 100644
+--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+@@ -40,8 +40,8 @@
+ .text
+ ENTRY (STRCPY)
+ #  ifdef USE_AS_STRNCPY
+-	mov	%rdx, %r8
+-	test	%r8, %r8
+	mov	%RDX_LP, %R8_LP
+	test	%R8_LP, %R8_LP
+ 	jz	L(ExitZero)
+ #  endif
+ 	mov	%rsi, %rcx
+diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+index 9858d0c4..0a62814a 100644
+--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+@@ -31,13 +31,13 @@ ENTRY (STRCPY)
+ 
+ 	mov	%rsi, %rcx
+ #  ifdef USE_AS_STRNCPY
+-	mov	%rdx, %r8
+	mov	%RDX_LP, %R8_LP
+ #  endif
+ 	mov	%rdi, %rdx
+ #  ifdef USE_AS_STRNCPY
+-	test	%r8, %r8
+	test	%R8_LP, %R8_LP
+ 	jz	L(Exit0)
+-	cmp	$8, %r8
+	cmp	$8, %R8_LP
+ 	jbe	L(StrncpyExit8Bytes)
+ # endif
+ 	cmpb	$0, (%rcx)
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index db302839..2a9e20a9 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -8,7 +8,7 @@ endif
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+-	 tst-size_t-strncmp
+	 tst-size_t-strncmp tst-size_t-strncpy
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
+new file mode 100644
+index 00000000..4dec71e6
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
+@@ -0,0 +1,58 @@
+/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "strncpy"
+#include "test-size_t.h"
+
+IMPL (strncpy, 1)
+
+typedef char *(*proto_t) (char *, const char*, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_strncpy (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      do_strncpy (dest, src);
+      int res = strncmp (dest.p, src.p, dest.len);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-70.patch
+++ b/glibc-RHEL-15696-70.patch
@ -0,0 +1,389 @@
+From abddd61de090ae84e380aff68a98bd94ef704667 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 24 Dec 2021 18:54:41 -0600
+Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+Optimizations are twofold.
+
+1) Replace page cross and 0/1 checks with masked load instructions in
+   L(less_vec). In applications this reduces branch-misses in the
+   hot [0, 32] case.
+2) Change controlflow so that L(less_vec) case gets the fall through.
+
+Change 2) helps copies in the [0, 32] size range but comes at the cost
+of copies in the [33, 64] size range.  From profiles of GCC and
+Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
+appears to be the right tradeoff.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++--------------
+ 1 file changed, 56 insertions(+), 193 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 640f6757..d2899e7c 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -62,15 +62,18 @@ Latency:
+ # define VMOVU		vmovdqu64
+ 
+ # ifdef USE_AS_WMEMCMP
+#  define VMOVU_MASK	vmovdqu32
+ #  define CHAR_SIZE	4
+ #  define VPCMP	vpcmpd
+ #  define VPTEST	vptestmd
+ # else
+#  define VMOVU_MASK	vmovdqu8
+ #  define CHAR_SIZE	1
+ #  define VPCMP	vpcmpub
+ #  define VPTEST	vptestmb
+ # endif
+ 
+
+ # define VEC_SIZE	32
+ # define PAGE_SIZE	4096
+ # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	movl	%edx, %edx
+ # endif
+ 	cmp	$CHAR_PER_VEC, %RDX_LP
+-	jb	L(less_vec)
+	/* Fall through for [0, VEC_SIZE] as its the hottest.  */
+	ja	L(more_1x_vec)
+
+	/* Create mask for CHAR's we want to compare. This allows us to
+	   avoid having to include page cross logic.  */
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k2
+
+	/* Safe to load full ymm with mask.  */
+	VMOVU_MASK (%rsi), %YMM2{%k2}
+	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+	ret
+ 
+	.p2align 4
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+
+	.p2align 4
+L(more_1x_vec):
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	(%rsi), %YMM1
+ 	/* Use compare not equals to directly check for mismatch.  */
+-	VPCMP	$4, (%rdi), %YMM1, %k1
+	VPCMP	$4,(%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	/* NB: eax must be destination register if going to
+ 	   L(return_vec_[0,2]). For L(return_vec_3) destination register
+@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 
+ 	/* Check third and fourth VEC no matter what.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_3)
+@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+ 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+ 	   oring with YMM1. Result is stored in YMM4.  */
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ 
+ 	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+-	.p2align 4
+
+	.p2align 4,, 8
+ L(8x_end_return_vec_0_1_2_3):
+ 	movq	%rdx, %rdi
+ L(8x_return_vec_0_1_2_3):
+@@ -222,23 +262,6 @@ L(return_vec_3):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(return_vec_0):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+-	/* NB: no partial register stall here because xorl zero idiom
+-	   above.  */
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rsi, %rax), %ecx
+-	movzbl	(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+ 
+ 	.p2align 4
+ L(return_vec_1):
+@@ -297,7 +320,7 @@ L(loop_4x_vec):
+ 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ 	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+@@ -324,7 +347,7 @@ L(loop_4x_vec):
+ 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+ 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ 	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+@@ -336,14 +359,14 @@ L(loop_4x_vec):
+ 	/* Only entry is from L(more_8x_vec).  */
+ 	.p2align 4,, 10
+ L(8x_last_2x_vec):
+-	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(8x_return_vec_2)
+ 	/* Naturally aligned to 16 bytes.  */
+ L(8x_last_1x_vec):
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+-	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(8x_return_vec_3)
+@@ -392,7 +415,9 @@ L(last_1x_vec):
+ 	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4,, 10
+
+	/* Don't align. Takes 2-fetch blocks either way and aligning
+	   will cause code to spill into another cacheline.  */
+ L(return_vec_1_end):
+ 	/* Use bsf to save code size. This is necessary to have
+ 	   L(one_or_less) fit in aligning bytes between.  */
+@@ -411,31 +436,8 @@ L(return_vec_1_end):
+ # endif
+ 	ret
+ 
+-	/* NB: L(one_or_less) fits in alignment padding between
+-	   L(return_vec_1_end) and L(return_vec_0_end).  */
+-# ifdef USE_AS_WMEMCMP
+-L(one_or_less):
+-	jb	L(zero)
+-	movl	(%rdi), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi), %ecx
+-	je	L(zero)
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-	ret
+-# else
+-L(one_or_less):
+-	jb	L(zero)
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
+-	ret
+-# endif
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-
+-	.p2align 4
+	/* Don't align. Takes 2-fetch blocks either way and aligning
+	   will cause code to spill into another cacheline.  */
+ L(return_vec_0_end):
+ 	tzcntl	%eax, %eax
+ 	addl	%edx, %eax
+@@ -451,146 +453,7 @@ L(return_vec_0_end):
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
+	/* 1-byte until next cache line.  */
+ 
+-	.p2align 4
+-L(less_vec):
+-	/* Check if one or less CHAR. This is necessary for size == 0
+-	   but is also faster for size == CHAR_SIZE.  */
+-	cmpl	$1, %edx
+-	jbe	L(one_or_less)
+-
+-	/* Check if loading one VEC from either s1 or s2 could cause a
+-	   page cross. This can have false positives but is by far the
+-	   fastest method.  */
+-	movl	%edi, %eax
+-	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(page_cross_less_vec)
+-
+-	/* No page cross possible.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMP	$4, (%rdi), %YMM2, %k1
+-	kmovd	%k1, %eax
+-	/* Check if any matches where in bounds. Intentionally not
+-	   storing result in eax to limit dependency chain if it goes to
+-	   L(return_vec_0_lv).  */
+-	bzhil	%edx, %eax, %edx
+-	jnz	L(return_vec_0_lv)
+-	xorl	%eax, %eax
+-	ret
+-
+-	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+-	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+-	   the jump and ends up fitting in aligning bytes. As well fits on
+-	   same cache line as L(less_vec) so also saves a line from having
+-	   to be fetched on cold calls to memcmp.  */
+-	.p2align 4,, 4
+-L(return_vec_0_lv):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+-	/* NB: no partial register stall here because xorl zero idiom
+-	   above.  */
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rsi, %rax), %ecx
+-	movzbl	(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4
+-L(page_cross_less_vec):
+-	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+-	   bytes.  */
+-	cmpl	$(16 / CHAR_SIZE), %edx
+-	jae	L(between_16_31)
+-# ifndef USE_AS_WMEMCMP
+-	cmpl	$8, %edx
+-	jae	L(between_8_15)
+-	cmpl	$4, %edx
+-	jb	L(between_2_3)
+-
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	/* edx is guranteed to be positive int32 in range [4, 7].  */
+-	cmovne	%edx, %eax
+-	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+-	sbbl	%ecx, %ecx
+-	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+-	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+-	   eax doesn't matter.  */
+-	orl	%ecx, %eax
+-	ret
+-
+-	.p2align 4,, 8
+-L(between_8_15):
+-# endif
+-	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+-	vmovq	(%rdi), %xmm1
+-	vmovq	(%rsi), %xmm2
+-	VPCMP	$4, %xmm1, %xmm2, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_lv)
+-	/* Use overlapping loads to avoid branches.  */
+-	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+-	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+-	VPCMP	$4, %xmm1, %xmm2, %k1
+-	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_end)
+-	ret
+-
+-	.p2align 4,, 8
+-L(between_16_31):
+-	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-
+-	/* Use movups to save code size.  */
+-	vmovdqu	(%rsi), %xmm2
+-	VPCMP	$4, (%rdi), %xmm2, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_lv)
+-	/* Use overlapping loads to avoid branches.  */
+-	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+-	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+-	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_end)
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-L(between_2_3):
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
+-	ret
+-# endif
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-71.patch
+++ b/glibc-RHEL-15696-71.patch
@ -0,0 +1,43 @@
+From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001
+From: Jangwoong Kim <6812skiii@gmail.com>
+Date: Tue, 14 Dec 2021 21:30:51 +0900
+Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop
+Content-type: text/plain; charset=UTF-8
+
+The commit:
+"Add LLL_MUTEX_READ_LOCK [BZ #28537]"
+SHA1: d672a98a1af106bd68deb15576710cd61363f7a6
+
+introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop
+if atomic load fails. But, "continue" inside of do-while loop
+does not skip the evaluation of escape expression, thus CAS
+is not skipped.
+
+Replace do-while with while and skip LLL_MUTEX_TRYLOCK if
+LLL_MUTEX_READ_LOCK fails.
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ nptl/pthread_mutex_lock.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index a633d95e..d96a9933 100644
+--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
+@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 		  break;
+ 		}
+ 	      atomic_spin_nop ();
+-	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
+-		continue;
+ 	    }
+-	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
+	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
+		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
+ 
+ 	  mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
+ 	}
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-72.patch
+++ b/glibc-RHEL-15696-72.patch
@ -0,0 +1,146 @@
+From 7835d611af0854e69a0c71e3806f8fe379282d6f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 18 Feb 2022 14:19:15 -0600
+Subject: [PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896]
+Content-type: text/plain; charset=UTF-8
+
+In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
+call strcmp-avx2 and wcscmp-avx2 respectively.  These have no checks
+around vzeroupper and would trigger spurious aborts.  This commit
+fixes that.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
+AVX2 machines with and without RTM.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/Makefile          |  5 ++++-
+ sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++---------
+ sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++
+ 3 files changed, 48 insertions(+), 10 deletions(-)
+ create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c
+
+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+index 2d814915..c2111f49 100644
+--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
+@@ -28,7 +28,9 @@ tests += \
+   tst-strcpy-rtm \
+   tst-strlen-rtm \
+   tst-strncmp-rtm \
+-  tst-strrchr-rtm
+  tst-strrchr-rtm \
+  tst-wcsncmp-rtm \
+# tests
+ 
+ CFLAGS-tst-memchr-rtm.c += -mrtm
+ CFLAGS-tst-memcmp-rtm.c += -mrtm
+@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm
+ CFLAGS-tst-strlen-rtm.c += -mrtm
+ CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
+ CFLAGS-tst-strrchr-rtm.c += -mrtm
+CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
+ endif
+ 
+ ifneq ($(enable-cet),no)
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index 4d0004b5..4e9f094f 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -19,18 +19,32 @@
+ #include <stdint.h>
+ #include <tst-string-rtm.h>
+ 
+#ifdef WIDE
+# define CHAR wchar_t
+# define MEMSET wmemset
+# define STRNCMP wcsncmp
+# define TEST_NAME wcsncmp
+#else /* !WIDE */
+# define CHAR char
+# define MEMSET memset
+# define STRNCMP strncmp
+# define TEST_NAME strncmp
+#endif /* !WIDE */
+
+
+
+ #define LOOP 3000
+ #define STRING_SIZE 1024
+-char string1[STRING_SIZE];
+-char string2[STRING_SIZE];
+CHAR string1[STRING_SIZE];
+CHAR string2[STRING_SIZE];
+ 
+ __attribute__ ((noinline, noclone))
+ static int
+ prepare (void)
+ {
+-  memset (string1, 'a', STRING_SIZE - 1);
+-  memset (string2, 'a', STRING_SIZE - 1);
+-  if (strncmp (string1, string2, STRING_SIZE) == 0)
+  MEMSET (string1, 'a', STRING_SIZE - 1);
+  MEMSET (string2, 'a', STRING_SIZE - 1);
+  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
+     return EXIT_SUCCESS;
+   else
+     return EXIT_FAILURE;
+@@ -40,7 +54,7 @@ __attribute__ ((noinline, noclone))
+ static int
+ function (void)
+ {
+-  if (strncmp (string1, string2, STRING_SIZE) == 0)
+  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
+     return 0;
+   else
+     return 1;
+@@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone))
+ static int
+ function_overflow (void)
+ {
+-  if (strncmp (string1, string2, SIZE_MAX) == 0)
+  if (STRNCMP (string1, string2, SIZE_MAX) == 0)
+     return 0;
+   else
+     return 1;
+@@ -59,9 +73,9 @@ function_overflow (void)
+ static int
+ do_test (void)
+ {
+-  int status = do_test_1 ("strncmp", LOOP, prepare, function);
+  int status = do_test_1 (TEST_NAME, LOOP, prepare, function);
+   if (status != EXIT_SUCCESS)
+     return status;
+-  status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
+  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
+   return status;
+ }
+diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c
+new file mode 100644
+index 00000000..bad3b863
+--- /dev/null
+++ b/sysdeps/x86/tst-wcsncmp-rtm.c
+@@ -0,0 +1,21 @@
+/* Test case for wcsncmp inside a transactionally executing RTM region.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include <wchar.h>
+#include "tst-strncmp-rtm.c"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-73.patch
+++ b/glibc-RHEL-15696-73.patch
@ -0,0 +1,37 @@
+From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 18 Feb 2022 17:00:25 -0600
+Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c
+Content-type: text/plain; charset=UTF-8
+
+Previously TEST_NAME was passing a function pointer. This didn't fail
+because of the -Wno-error flag (to allow for overflow sizes passed
+to strncmp/wcsncmp)
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/tst-strncmp-rtm.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index 4e9f094f..aef9866c 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -23,12 +23,12 @@
+ # define CHAR wchar_t
+ # define MEMSET wmemset
+ # define STRNCMP wcsncmp
+-# define TEST_NAME wcsncmp
+# define TEST_NAME "wcsncmp"
+ #else /* !WIDE */
+ # define CHAR char
+ # define MEMSET memset
+ # define STRNCMP strncmp
+-# define TEST_NAME strncmp
+# define TEST_NAME "strncmp"
+ #endif /* !WIDE */
+ 
+ 
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-74.patch
+++ b/glibc-RHEL-15696-74.patch
--- a/glibc-RHEL-15696-75.patch
+++ b/glibc-RHEL-15696-75.patch
--- a/glibc-RHEL-15696-76.patch
+++ b/glibc-RHEL-15696-76.patch
@ -0,0 +1,33 @@
+From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 4 Feb 2022 11:09:10 -0800
+Subject: [PATCH] x86-64: Fix strcmp-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+Change "movl %edx, %rdx" to "movl %edx, %edx" in:
+
+commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Mon Jan 10 15:35:38 2022 -0600
+
+    x86: Optimize strcmp-avx2.S
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 554ffe4c..04675aa4 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -106,7 +106,7 @@ ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+-	movl	%edx, %rdx
+	movl	%edx, %edx
+ #  endif
+ 	cmp	$1, %RDX_LP
+ 	/* Signed comparison intentional. We use this branch to also
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-77.patch
+++ b/glibc-RHEL-15696-77.patch
@ -0,0 +1,33 @@
+From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 4 Feb 2022 11:11:08 -0800
+Subject: [PATCH] x86-64: Fix strcmp-evex.S
+Content-type: text/plain; charset=UTF-8
+
+Change "movl %edx, %rdx" to "movl %edx, %edx" in:
+
+commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Mon Jan 10 15:35:39 2022 -0600
+
+    x86: Optimize strcmp-evex.S
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 99d8409a..ed56af8e 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -116,7 +116,7 @@ ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+-	movl	%edx, %rdx
+	movl	%edx, %edx
+ #  endif
+ 	cmp	$1, %RDX_LP
+ 	/* Signed comparison intentional. We use this branch to also
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-78.patch
+++ b/glibc-RHEL-15696-78.patch
@ -0,0 +1,459 @@
+From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 6 Feb 2022 00:54:18 -0600
+Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+Split vec generation into multiple steps. This allows the
+broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
+case. This saves an expensive lane-cross instruction and removes
+the need for 'vzeroupper'.
+
+For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
+byte broadcast.
+
+Results for memset-avx2 small (geomean of N = 20 benchset runs).
+
+size, New Time, Old Time, New / Old
+   0,    4.100,    3.831,     0.934
+   1,    5.074,    4.399,     0.867
+   2,    4.433,    4.411,     0.995
+   4,    4.487,    4.415,     0.984
+   8,    4.454,    4.396,     0.987
+  16,    4.502,    4.443,     0.987
+
+All relevant string/wcsmbs tests are passing.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/memset.S                       |  21 ++-
+ .../multiarch/memset-avx2-unaligned-erms.S    |  18 +-
+ .../multiarch/memset-avx512-unaligned-erms.S  |  18 +-
+ .../multiarch/memset-evex-unaligned-erms.S    |  18 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 164 +++++++++++-------
+ 5 files changed, 152 insertions(+), 87 deletions(-)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 8672b030..27debd2b 100644
+--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
+@@ -28,17 +28,22 @@
+ #define VMOVU     movups
+ #define VMOVA     movaps
+ 
+-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-  movq r, %rax; \
+-  punpcklbw %xmm0, %xmm0; \
+-  punpcklwd %xmm0, %xmm0; \
+-  pshufd $0, %xmm0, %xmm0
+  pxor %xmm1, %xmm1; \
+  pshufb %xmm1, %xmm0; \
+  movq r, %rax
+ 
+-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-  movq r, %rax; \
+-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
+ 
+ #define SECTION(p)		p
+ 
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index 1af668af..c0bf2875 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -10,15 +10,18 @@
+ # define VMOVU     vmovdqu
+ # define VMOVA     vmovdqa
+ 
+-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   vmovd d, %xmm0; \
+-  movq r, %rax; \
+-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
+ 
+-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+-  movq r, %rax; \
+-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
+ 
+ # ifndef SECTION
+ #  define SECTION(p)		p##.avx
+@@ -30,5 +33,6 @@
+ #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+ # endif
+ 
+# define USE_XMM_LESS_VEC
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index f14d6f84..5241216a 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -15,13 +15,19 @@
+ 
+ # define VZEROUPPER
+ 
+-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
+ 
+-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
+ 
+ # define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index 64b09e77..63700215 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -15,13 +15,19 @@
+ 
+ # define VZEROUPPER
+ 
+-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
+ 
+-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
+ 
+ # define SECTION(p)		p##.evex
+ # define MEMSET_SYMBOL(p,s)	p##_evex_##s
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index f08b7323..a67f9833 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -58,8 +58,10 @@
+ #ifndef MOVQ
+ # if VEC_SIZE > 16
+ #  define MOVQ				vmovq
+#  define MOVD				vmovd
+ # else
+ #  define MOVQ				movq
+#  define MOVD				movd
+ # endif
+ #endif
+ 
+@@ -72,9 +74,17 @@
+ #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ # define END_REG	rcx
+ # define LOOP_REG	rdi
+# define LESS_VEC_REG	rax
+ #else
+ # define END_REG	rdi
+ # define LOOP_REG	rdx
+# define LESS_VEC_REG	rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL	1
+#else
+# define XMM_SMALL	0
+ #endif
+ 
+ #define PAGE_SIZE 4096
+@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+ 
+ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+ 	shl	$2, %RDX_LP
+-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+-	jmp	L(entry_from_bzero)
+	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+	WMEMSET_VDUP_TO_VEC0_LOW()
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec_no_vdup)
+	WMEMSET_VDUP_TO_VEC0_HIGH()
+	jmp	L(entry_from_wmemset)
+ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+ #endif
+ 
+@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+ L(entry_from_bzero):
+ 	cmpq	$VEC_SIZE, %rdx
+ 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
+ 	cmpq	$(VEC_SIZE * 2), %rdx
+ 	ja	L(more_2x_vec)
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
+-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+ # endif
+ 	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH ()
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+-	 */
+-	VMOVU	%VEC(0), (%rax)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+ 
+-	.p2align 4,, 10
+	.p2align 4,, 4
+ L(last_2x_vec):
+ #ifdef USE_LESS_VEC_MASK_STORE
+-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+ #else
+ 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
+ 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+@@ -212,6 +228,7 @@ L(last_2x_vec):
+ #ifdef USE_LESS_VEC_MASK_STORE
+ 	.p2align 4,, 10
+ L(less_vec):
+L(less_vec_no_vdup):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
+ 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+ 	   and (4x, 8x] jump to target.  */
+ L(more_2x_vec):
+-
+-	/* Two different methods of setting up pointers / compare. The
+-	   two methods are based on the fact that EVEX/AVX512 mov
+-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
+-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
+-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
+-	   this saves code size and keeps a few targets in one fetch block.
+-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
+-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+-	   LOOP_4X_OFFSET) with LEA_BID.  */
+-
+-	/* END_REG is rcx for EVEX/AVX512.  */
+-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+-#endif
+-
+-	/* Stores to first 2x VEC before cmp as any path forward will
+-	   require it.  */
+-	VMOVU	%VEC(0), (%rax)
+-	VMOVU	%VEC(0), VEC_SIZE(%rax)
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
+ 
+ 
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+	   address mode. For EVEX/AVX512 this saves code size and keeps a few
+	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+	   bottlenecks.  */
+ #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+ 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
+ 	addq	%rdx, %END_REG
+@@ -292,6 +299,15 @@ L(more_2x_vec):
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	jbe	L(last_2x_vec)
+ 
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+	   LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
+ 	/* Store next 2x vec regardless.  */
+ 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+@@ -355,65 +371,93 @@ L(stosb_local):
+ 	/* Define L(less_vec) only if not otherwise defined.  */
+ 	.p2align 4
+ L(less_vec):
+	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+	   xmm). This is only does anything for AVX2.  */
+	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
+ #endif
+ L(cross_page):
+ #if VEC_SIZE > 32
+ 	cmpl	$32, %edx
+-	jae	L(between_32_63)
+	jge	L(between_32_63)
+ #endif
+ #if VEC_SIZE > 16
+ 	cmpl	$16, %edx
+-	jae	L(between_16_31)
+	jge	L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, %rcx
+ #endif
+-	MOVQ	%XMM0, %rdi
+ 	cmpl	$8, %edx
+-	jae	L(between_8_15)
+	jge	L(between_8_15)
+ 	cmpl	$4, %edx
+-	jae	L(between_4_7)
+	jge	L(between_4_7)
+ 	cmpl	$1, %edx
+-	ja	L(between_2_3)
+-	jb	L(return)
+-	movb	%sil, (%rax)
+-	VZEROUPPER_RETURN
+	jg	L(between_2_3)
+	jl	L(between_0_0)
+	movb	%sil, (%LESS_VEC_REG)
+L(between_0_0):
+	ret
+ 
+-	/* Align small targets only if not doing so would cross a fetch
+-	   line.  */
+	/* Align small targets only if not doing so would cross a fetch line.
+	 */
+ #if VEC_SIZE > 32
+ 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, (%rax)
+-	VMOVU	%YMM0, -32(%rax, %rdx)
+	VMOVU	%YMM0, (%LESS_VEC_REG)
+	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+ 
+ #if VEC_SIZE >= 32
+-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
+ L(between_16_31):
+ 	/* From 16 to 31.  No branch when size == 16.  */
+-	VMOVU	%XMM0, (%rax)
+-	VMOVU	%XMM0, -16(%rax, %rdx)
+-	VZEROUPPER_RETURN
+	VMOVU	%XMM0, (%LESS_VEC_REG)
+	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	ret
+ #endif
+ 
+-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
+ L(between_8_15):
+ 	/* From 8 to 15.  No branch when size == 8.  */
+-	movq	%rdi, (%rax)
+-	movq	%rdi, -8(%rax, %rdx)
+-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, (%rdi)
+	MOVQ	%XMM0, -8(%rdi, %rdx)
+#else
+	movq	%rcx, (%LESS_VEC_REG)
+	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+	ret
+ 
+-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%edi, (%rax)
+-	movl	%edi, -4(%rax, %rdx)
+-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVD	%XMM0, (%rdi)
+	MOVD	%XMM0, -4(%rdi, %rdx)
+#else
+	movl	%ecx, (%LESS_VEC_REG)
+	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+	ret
+ 
+-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* 4 * XMM_SMALL for the third mov for AVX2.  */
+	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%di, (%rax)
+-	movb	%dil, -1(%rax, %rdx)
+-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	movb	%sil, (%rdi)
+	movb	%sil, 1(%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+#else
+	movw	%cx, (%LESS_VEC_REG)
+	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+	ret
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-79.patch
+++ b/glibc-RHEL-15696-79.patch
@ -0,0 +1,40 @@
+From 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 7 Feb 2022 00:32:23 -0600
+Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2
+ Only)
+Content-type: text/plain; charset=UTF-8
+
+commit b62ace2740a106222e124cc86956448fa07abf4d
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Sun Feb 6 00:54:18 2022 -0600
+
+    x86: Improve vec generation in memset-vec-unaligned-erms.S
+
+Revert usage of 'pshufb' in broadcast logic as it is an SSSE3
+instruction and memset.S is restricted to only SSE2 instructions.
+---
+ sysdeps/x86_64/memset.S | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 27debd2b..4cb4aa71 100644
+--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
+@@ -30,9 +30,10 @@
+ 
+ # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-  pxor %xmm1, %xmm1; \
+-  pshufb %xmm1, %xmm0; \
+-  movq r, %rax
+  movq r, %rax; \
+  punpcklbw %xmm0, %xmm0; \
+  punpcklwd %xmm0, %xmm0; \
+  pshufd $0, %xmm0, %xmm0
+ 
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-8.patch
+++ b/glibc-RHEL-15696-8.patch
@ -0,0 +1,218 @@
+From 5165de69c0908e28a380cbd4bb054e55ea4abc95 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:36:36 -0800
+Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes strnlen/wcsnlen for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length.
+	Clear the upper 32 bits of RSI register.
+	* sysdeps/x86_64/strlen.S: Use RSI_LP for length.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen
+	and tst-size_t-wcsnlen.
+	* sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise.
+---
+ sysdeps/x86_64/multiarch/strlen-avx2.S  |  9 ++--
+ sysdeps/x86_64/strlen.S                 | 12 ++---
+ sysdeps/x86_64/x32/Makefile             |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++
+ 5 files changed, 106 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index fb2418cd..645e0446 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -42,12 +42,15 @@
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+ 	/* Check for zero length.  */
+-	testq	%rsi, %rsi
+	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
+ #  ifdef USE_AS_WCSLEN
+-	shl	$2, %rsi
+	shl	$2, %RSI_LP
+#  elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+ #  endif
+-	movq	%rsi, %r8
+	mov	%RSI_LP, %R8_LP
+ # endif
+ 	movl	%edi, %ecx
+ 	movq	%rdi, %rdx
+diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
+index 01cb5fa8..f845f3d4 100644
+--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
+@@ -59,21 +59,21 @@ ENTRY(strlen)
+ 
+ #ifdef AS_STRNLEN
+ /* Do not read anything when n==0.  */
+-	test	%rsi, %rsi
+	test	%RSI_LP, %RSI_LP
+ 	jne	L(n_nonzero)
+ 	xor	%rax, %rax
+ 	ret
+ L(n_nonzero):
+ # ifdef AS_WCSLEN
+-	shlq	$2, %rsi
+	shl	$2, %RSI_LP
+ # endif
+ 
+ /* Initialize long lived registers.  */
+ 
+-	add	%rdi, %rsi
+-	mov	%rsi, %r10
+-	and	$-64, %r10
+-	mov	%rsi, %r11
+	add	%RDI_LP, %RSI_LP
+	mov	%RSI_LP, %R10_LP
+	and	$-64, %R10_LP
+	mov	%RSI_LP, %R11_LP
+ #endif
+ 
+ 	pxor	%xmm0, %xmm0
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 2a9e20a9..1557724b 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -8,10 +8,10 @@ endif
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+-	 tst-size_t-strncmp tst-size_t-strncpy
+	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+ tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
+-	 tst-size_t-wcsncmp
+	 tst-size_t-wcsncmp tst-size_t-wcsnlen
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
+new file mode 100644
+index 00000000..690a4a8a
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
+@@ -0,0 +1,72 @@
+/* Test strnlen with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef WIDE
+# define TEST_NAME "wcsnlen"
+#else
+# define TEST_NAME "strnlen"
+#endif /* WIDE */
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+# define STRNLEN wcsnlen
+# define CHAR wchar_t
+#else
+# define STRNLEN strnlen
+# define CHAR char
+#endif /* WIDE */
+
+IMPL (STRNLEN, 1)
+
+typedef size_t (*proto_t) (const CHAR *, size_t);
+
+static size_t
+__attribute__ ((noinline, noclone))
+do_strnlen (parameter_t a, parameter_t b)
+{
+  return CALL (&a, a.p, b.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  size_t size = page_size / sizeof (CHAR);
+  parameter_t src = { { 0 }, buf2 };
+  parameter_t c = { { size }, (void *) (uintptr_t) 'a' };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      size_t res = do_strnlen (src, c);
+      if (res != size)
+	{
+	  error (0, 0, "Wrong result in function %s: 0x%x != 0x%x",
+		 impl->name, res, size);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
+new file mode 100644
+index 00000000..093b4bbe
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
+@@ -0,0 +1,20 @@
+/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-strnlen.c"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-80.patch
+++ b/glibc-RHEL-15696-80.patch
@ -0,0 +1,753 @@
+From 3d9f171bfb5325bd5f427e9fc386453358c6e840 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 7 Feb 2022 05:55:15 -0800
+Subject: [PATCH] x86-64: Optimize bzero
+Content-type: text/plain; charset=UTF-8
+
+memset with zero as the value to set is by far the most common case
+(99%+ for Python3 and GCC).
+
+bzero can be optimized slightly further for this case by using a zero-idiom
+xor to broadcast the set value to a register (vector or GPR).
+
+Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/generic/ifunc-init.h                  |   5 +-
+ sysdeps/x86_64/memset.S                       |   8 +
+ sysdeps/x86_64/multiarch/Makefile             | 205 +++++++++++-------
+ sysdeps/x86_64/multiarch/bzero.c              | 106 +++++++++
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 ++++
+ .../memset-avx2-unaligned-erms-rtm.S          |   1 +
+ .../multiarch/memset-avx2-unaligned-erms.S    |   6 +
+ .../multiarch/memset-avx512-unaligned-erms.S  |   3 +
+ .../multiarch/memset-evex-unaligned-erms.S    |   3 +
+ .../multiarch/memset-sse2-unaligned-erms.S    |   1 +
+ .../multiarch/memset-vec-unaligned-erms.S     | 110 +++++++---
+ 11 files changed, 384 insertions(+), 106 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/bzero.c
+
+Conflicts:
+	sysdeps/generic/ifunc-init.h
+	(needs macros from cf4fd28ea453d1a9cec93939bc88b58ccef5437a (memcmpeq))
+	sysdeps/x86_64/multiarch/Makefile
+	(file ordering)
+
+diff --git a/sysdeps/generic/ifunc-init.h b/sysdeps/generic/ifunc-init.h
+index 241e4161..f7a72375 100644
+--- a/sysdeps/generic/ifunc-init.h
+++ b/sysdeps/generic/ifunc-init.h
+@@ -50,5 +50,8 @@
+    '__<symbol>_<variant>' as the optimized implementation and
+    '<symbol>_ifunc_selector' as the IFUNC selector.  */
+ #define REDIRECT_NAME	EVALUATOR1 (__redirect, SYMBOL_NAME)
+-#define OPTIMIZE(name)	EVALUATOR2 (SYMBOL_NAME, name)
+#define OPTIMIZE1(name)	EVALUATOR1 (SYMBOL_NAME, name)
+#define OPTIMIZE2(name)	EVALUATOR2 (SYMBOL_NAME, name)
+/* Default is to use OPTIMIZE2.  */
+#define OPTIMIZE(name)	OPTIMIZE2(name)
+ #define IFUNC_SELECTOR	EVALUATOR1 (SYMBOL_NAME, ifunc_selector)
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 4cb4aa71..a1353f89 100644
+--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
+@@ -35,6 +35,9 @@
+   punpcklwd %xmm0, %xmm0; \
+   pshufd $0, %xmm0, %xmm0
+ 
+# define BZERO_ZERO_VEC0() \
+  pxor %xmm0, %xmm0
+
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+   pshufd $0, %xmm0, %xmm0; \
+@@ -53,6 +56,10 @@
+ # define MEMSET_SYMBOL(p,s)	memset
+ #endif
+ 
+#ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s)	__bzero
+#endif
+
+ #ifndef WMEMSET_SYMBOL
+ # define WMEMSET_CHK_SYMBOL(p,s) p
+ # define WMEMSET_SYMBOL(p,s)	__wmemset
+@@ -63,6 +70,7 @@
+ libc_hidden_builtin_def (memset)
+ 
+ #if IS_IN (libc)
+weak_alias (__bzero, bzero)
+ libc_hidden_def (__wmemset)
+ weak_alias (__wmemset, wmemset)
+ libc_hidden_weak (wmemset)
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 26be4095..37d8d6f0 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -1,85 +1,130 @@
+ ifeq ($(subdir),string)
+ 
+-sysdep_routines += strncat-c stpncpy-c strncpy-c \
+-		   strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3  \
+-		   strcmp-sse4_2 strcmp-avx2 \
+-		   strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \
+-		   memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
+-		   memrchr-sse2 memrchr-avx2 \
+-		   memcmp-sse2 \
+-		   memcmp-avx2-movbe \
+-		   memcmp-sse4 memcpy-ssse3 \
+-		   memmove-ssse3 \
+-		   memcpy-ssse3-back \
+-		   memmove-ssse3-back \
+-		   memmove-avx512-no-vzeroupper \
+-		   strcasecmp_l-sse2 strcasecmp_l-ssse3 \
+-		   strcasecmp_l-sse4_2 strcasecmp_l-avx \
+-		   strncase_l-sse2 strncase_l-ssse3 \
+-		   strncase_l-sse4_2 strncase_l-avx \
+-		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
+-		   strrchr-sse2 strrchr-avx2 \
+-		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+-		   strcat-avx2 strncat-avx2 \
+-		   strcat-ssse3 strncat-ssse3\
+-		   strcpy-avx2 strncpy-avx2 \
+-		   strcpy-sse2 stpcpy-sse2 \
+-		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
+-		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
+-		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+-		   stpcpy-avx2 stpncpy-avx2 \
+-		   strcat-sse2 \
+-		   strcat-sse2-unaligned strncat-sse2-unaligned \
+-		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
+-		   strcspn-sse2 strpbrk-sse2 strspn-sse2 \
+-		   strcspn-c strpbrk-c strspn-c varshift \
+-		   memset-avx512-no-vzeroupper \
+-		   memmove-sse2-unaligned-erms \
+-		   memmove-avx-unaligned-erms \
+-		   memmove-avx512-unaligned-erms \
+-		   memset-sse2-unaligned-erms \
+-		   memset-avx2-unaligned-erms \
+-		   memset-avx512-unaligned-erms \
+-		   memchr-avx2-rtm \
+-		   memcmp-avx2-movbe-rtm \
+-		   memmove-avx-unaligned-erms-rtm \
+-		   memrchr-avx2-rtm \
+-		   memset-avx2-unaligned-erms-rtm \
+-		   rawmemchr-avx2-rtm \
+-		   strchr-avx2-rtm \
+-		   strcmp-avx2-rtm \
+-		   strchrnul-avx2-rtm \
+-		   stpcpy-avx2-rtm \
+-		   stpncpy-avx2-rtm \
+-		   strcat-avx2-rtm \
+-		   strcpy-avx2-rtm \
+-		   strlen-avx2-rtm \
+-		   strncat-avx2-rtm \
+-		   strncmp-avx2-rtm \
+-		   strncpy-avx2-rtm \
+-		   strnlen-avx2-rtm \
+-		   strrchr-avx2-rtm \
+-		   memchr-evex \
+-		   memcmp-evex-movbe \
+-		   memmove-evex-unaligned-erms \
+-		   memrchr-evex \
+-		   memset-evex-unaligned-erms \
+-		   rawmemchr-evex \
+-		   stpcpy-evex \
+-		   stpncpy-evex \
+-		   strcat-evex \
+-		   strchr-evex \
+-		   strchrnul-evex \
+-		   strcmp-evex \
+-		   strcpy-evex \
+-		   strlen-evex \
+-		   strncat-evex \
+-		   strncmp-evex \
+-		   strncpy-evex \
+-		   strnlen-evex \
+-		   strrchr-evex \
+-		   memchr-evex-rtm \
+-		   rawmemchr-evex-rtm
+sysdep_routines += \
+  bzero \
+  memchr-avx2 \
+  memchr-avx2-rtm \
+  memchr-evex \
+  memchr-evex-rtm \
+  memchr-sse2 \
+  memcmp-avx2-movbe \
+  memcmp-avx2-movbe-rtm \
+  memcmp-evex-movbe \
+  memcmp-sse2 \
+  memcmp-sse4 \
+  memcmp-ssse3 \
+  memcpy-ssse3 \
+  memcpy-ssse3-back \
+  memmove-avx-unaligned-erms \
+  memmove-avx-unaligned-erms-rtm \
+  memmove-avx512-no-vzeroupper \
+  memmove-avx512-unaligned-erms \
+  memmove-evex-unaligned-erms \
+  memmove-sse2-unaligned-erms \
+  memmove-ssse3 \
+  memmove-ssse3-back \
+  memrchr-avx2 \
+  memrchr-avx2-rtm \
+  memrchr-evex \
+  memrchr-sse2 \
+  memset-avx2-unaligned-erms \
+  memset-avx2-unaligned-erms-rtm \
+  memset-avx512-no-vzeroupper \
+  memset-avx512-unaligned-erms \
+  memset-evex-unaligned-erms \
+  memset-sse2-unaligned-erms \
+  rawmemchr-avx2 \
+  rawmemchr-avx2-rtm \
+  rawmemchr-evex \
+  rawmemchr-evex-rtm \
+  rawmemchr-sse2 \
+  stpcpy-avx2 \
+  stpcpy-avx2-rtm \
+  stpcpy-evex \
+  stpcpy-sse2 \
+  stpcpy-sse2-unaligned \
+  stpcpy-ssse3 \
+  stpncpy-avx2 \
+  stpncpy-avx2-rtm \
+  stpncpy-c \
+  stpncpy-evex \
+  stpncpy-sse2-unaligned \
+  stpncpy-ssse3 \
+  strcasecmp_l-avx \
+  strcasecmp_l-sse2 \
+  strcasecmp_l-sse4_2 \
+  strcasecmp_l-ssse3 \
+  strcat-avx2 \
+  strcat-avx2-rtm \
+  strcat-evex \
+  strcat-sse2 \
+  strcat-sse2-unaligned \
+  strcat-ssse3 \
+  strchr-avx2 \
+  strchr-avx2-rtm \
+  strchr-evex \
+  strchr-sse2 \
+  strchr-sse2-no-bsf \
+  strchrnul-avx2 \
+  strchrnul-avx2-rtm \
+  strchrnul-evex \
+  strchrnul-sse2 \
+  strcmp-avx2 \
+  strcmp-avx2-rtm \
+  strcmp-evex \
+  strcmp-sse2 \
+  strcmp-sse2-unaligned \
+  strcmp-sse4_2 \
+  strcmp-ssse3 \
+  strcpy-avx2 \
+  strcpy-avx2-rtm \
+  strcpy-evex \
+  strcpy-sse2 \
+  strcpy-sse2-unaligned \
+  strcpy-ssse3 \
+  strcspn-c \
+  strcspn-sse2 \
+  strlen-avx2 \
+  strlen-avx2-rtm \
+  strlen-evex \
+  strlen-sse2 \
+  strncase_l-avx \
+  strncase_l-sse2 \
+  strncase_l-sse4_2 \
+  strncase_l-ssse3 \
+  strncat-avx2 \
+  strncat-avx2-rtm \
+  strncat-c \
+  strncat-evex \
+  strncat-sse2-unaligned \
+  strncat-ssse3 \
+  strncmp-avx2 \
+  strncmp-avx2-rtm \
+  strncmp-evex \
+  strncmp-sse2 \
+  strncmp-sse4_2 \
+  strncmp-ssse3 \
+  strncpy-avx2 \
+  strncpy-avx2-rtm \
+  strncpy-c \
+  strncpy-evex \
+  strncpy-sse2-unaligned \
+  strncpy-ssse3 \
+  strnlen-avx2 \
+  strnlen-avx2-rtm \
+  strnlen-evex \
+  strnlen-sse2 \
+  strpbrk-c \
+  strpbrk-sse2 \
+  strrchr-avx2 \
+  strrchr-avx2-rtm \
+  strrchr-evex \
+  strrchr-sse2 \
+  strspn-c \
+  strspn-sse2 \
+  strstr-sse2-unaligned \
+  varshift \
+# sysdep_routines
+ CFLAGS-varshift.c += -msse4
+ CFLAGS-strcspn-c.c += -msse4
+ CFLAGS-strpbrk-c.c += -msse4
+diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
+new file mode 100644
+index 00000000..58a14b2c
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/bzero.c
+@@ -0,0 +1,106 @@
+/* Multiple versions of bzero.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __bzero __redirect___bzero
+# include <string.h>
+# undef __bzero
+
+# define SYMBOL_NAME __bzero
+# include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
+  attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx512_unaligned_erms);
+
+	  return OPTIMIZE1 (avx512_unaligned);
+	}
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (evex_unaligned_erms);
+
+	  return OPTIMIZE1 (evex_unaligned);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx2_unaligned_erms_rtm);
+
+	  return OPTIMIZE1 (avx2_unaligned_rtm);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx2_unaligned_erms);
+
+	  return OPTIMIZE1 (avx2_unaligned);
+	}
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+    return OPTIMIZE1 (sse2_unaligned_erms);
+
+  return OPTIMIZE1 (sse2_unaligned);
+}
+
+libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
+
+weak_alias (__bzero, bzero)
+#endif
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 8be0d78a..c963d391 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_avx512_no_vzeroupper)
+ 	     )
+ 
+  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
+  IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, 1,
+			      __bzero_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero, 1,
+			      __bzero_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __bzero_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __bzero_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __bzero_avx2_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __bzero_avx2_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_evex_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_avx512_unaligned)
+	     )
+
+   /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
+   IFUNC_IMPL (i, name, rawmemchr,
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+index 8ac3e479..5a5ee6f6 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+@@ -5,6 +5,7 @@
+ 
+ #define SECTION(p) p##.avx.rtm
+ #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
+#define BZERO_SYMBOL(p,s)	p##_avx2_##s##_rtm
+ #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
+ 
+ #include "memset-avx2-unaligned-erms.S"
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index c0bf2875..a093a283 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -14,6 +14,9 @@
+   vmovd d, %xmm0; \
+   movq r, %rax;
+ 
+# define BZERO_ZERO_VEC0() \
+  vpxor %xmm0, %xmm0, %xmm0
+
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+ 
+@@ -29,6 +32,9 @@
+ # ifndef MEMSET_SYMBOL
+ #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+ # endif
+# ifndef BZERO_SYMBOL
+#  define BZERO_SYMBOL(p,s)	p##_avx2_##s
+# endif
+ # ifndef WMEMSET_SYMBOL
+ #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+ # endif
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 5241216a..727c9213 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -19,6 +19,9 @@
+   vpbroadcastb d, %VEC0; \
+   movq r, %rax
+ 
+# define BZERO_ZERO_VEC0() \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   vpbroadcastd d, %VEC0; \
+   movq r, %rax
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index 63700215..5d8fa78f 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -19,6 +19,9 @@
+   vpbroadcastb d, %VEC0; \
+   movq r, %rax
+ 
+# define BZERO_ZERO_VEC0() \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   vpbroadcastd d, %VEC0; \
+   movq r, %rax
+diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+index 56b81f5c..8f579ad6 100644
+--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+@@ -22,6 +22,7 @@
+ 
+ #if IS_IN (libc)
+ # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
+ # define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
+ 
+ # ifdef SHARED
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index a67f9833..06f5f5d7 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -26,6 +26,10 @@
+ 
+ #include <sysdep.h>
+ 
+#ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s)		MEMSET_SYMBOL (p, s)
+#endif
+
+ #ifndef MEMSET_CHK_SYMBOL
+ # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
+ #endif
+@@ -87,6 +91,18 @@
+ # define XMM_SMALL	0
+ #endif
+ 
+#ifdef USE_LESS_VEC_MASK_STORE
+# define SET_REG64	rcx
+# define SET_REG32	ecx
+# define SET_REG16	cx
+# define SET_REG8	cl
+#else
+# define SET_REG64	rsi
+# define SET_REG32	esi
+# define SET_REG16	si
+# define SET_REG8	sil
+#endif
+
+ #define PAGE_SIZE 4096
+ 
+ /* Macro to calculate size of small memset block for aligning
+@@ -96,18 +112,6 @@
+ 
+ #ifndef SECTION
+ # error SECTION is not defined!
+-#endif
+-
+-	.section SECTION(.text),"ax",@progbits
+-#if VEC_SIZE == 16 && IS_IN (libc)
+-ENTRY (__bzero)
+-	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+-	mov	%RSI_LP, %RDX_LP /* Set n.  */
+-	xorl	%esi, %esi
+-	pxor	%XMM0, %XMM0
+-	jmp	L(entry_from_bzero)
+-END (__bzero)
+-weak_alias (__bzero, bzero)
+ #endif
+ 
+ #if IS_IN (libc)
+@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+ 	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ 	WMEMSET_VDUP_TO_VEC0_LOW()
+ 	cmpq	$VEC_SIZE, %rdx
+-	jb	L(less_vec_no_vdup)
+	jb	L(less_vec_from_wmemset)
+ 	WMEMSET_VDUP_TO_VEC0_HIGH()
+ 	jmp	L(entry_from_wmemset)
+ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+ #endif
+ 
+ENTRY (BZERO_SYMBOL(__bzero, unaligned))
+#if VEC_SIZE > 16
+	BZERO_ZERO_VEC0 ()
+#endif
+	mov	%RDI_LP, %RAX_LP
+	mov	%RSI_LP, %RDX_LP
+#ifndef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+#endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec_no_vdup)
+#ifdef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+#endif
+#if VEC_SIZE <= 16
+	BZERO_ZERO_VEC0 ()
+#endif
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+	ja	L(more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+END (BZERO_SYMBOL(__bzero, unaligned))
+
+ #if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+ 	cmp	%RDX_LP, %RCX_LP
+@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+ # endif
+-L(entry_from_bzero):
+ 	cmpq	$VEC_SIZE, %rdx
+ 	jb	L(less_vec)
+ 	MEMSET_VDUP_TO_VEC0_HIGH()
+@@ -187,6 +215,31 @@ END (__memset_erms)
+ END (MEMSET_SYMBOL (__memset, erms))
+ # endif
+ 
+ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
+# if VEC_SIZE > 16
+	BZERO_ZERO_VEC0 ()
+# endif
+	mov	%RDI_LP, %RAX_LP
+	mov	%RSI_LP, %RDX_LP
+# ifndef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec_no_vdup)
+# ifdef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+# endif
+# if VEC_SIZE <= 16
+	BZERO_ZERO_VEC0 ()
+# endif
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+	ja	L(stosb_more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+END (BZERO_SYMBOL(__bzero, unaligned_erms))
+
+ # if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ 	cmp	%RDX_LP, %RCX_LP
+@@ -229,6 +282,7 @@ L(last_2x_vec):
+ 	.p2align 4,, 10
+ L(less_vec):
+ L(less_vec_no_vdup):
+L(less_vec_from_wmemset):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+@@ -374,8 +428,11 @@ L(less_vec):
+ 	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+ 	   xmm). This is only does anything for AVX2.  */
+ 	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_from_wmemset):
+#if VEC_SIZE > 16
+ L(less_vec_no_vdup):
+ #endif
+#endif
+ L(cross_page):
+ #if VEC_SIZE > 32
+ 	cmpl	$32, %edx
+@@ -386,7 +443,10 @@ L(cross_page):
+ 	jge	L(between_16_31)
+ #endif
+ #ifndef USE_XMM_LESS_VEC
+-	MOVQ	%XMM0, %rcx
+	MOVQ	%XMM0, %SET_REG64
+#endif
+#if VEC_SIZE <= 16
+L(less_vec_no_vdup):
+ #endif
+ 	cmpl	$8, %edx
+ 	jge	L(between_8_15)
+@@ -395,7 +455,7 @@ L(cross_page):
+ 	cmpl	$1, %edx
+ 	jg	L(between_2_3)
+ 	jl	L(between_0_0)
+-	movb	%sil, (%LESS_VEC_REG)
+	movb	%SET_REG8, (%LESS_VEC_REG)
+ L(between_0_0):
+ 	ret
+ 
+@@ -428,8 +488,8 @@ L(between_8_15):
+ 	MOVQ	%XMM0, (%rdi)
+ 	MOVQ	%XMM0, -8(%rdi, %rdx)
+ #else
+-	movq	%rcx, (%LESS_VEC_REG)
+-	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+	movq	%SET_REG64, (%LESS_VEC_REG)
+	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
+ #endif
+ 	ret
+ 
+@@ -442,8 +502,8 @@ L(between_4_7):
+ 	MOVD	%XMM0, (%rdi)
+ 	MOVD	%XMM0, -4(%rdi, %rdx)
+ #else
+-	movl	%ecx, (%LESS_VEC_REG)
+-	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+	movl	%SET_REG32, (%LESS_VEC_REG)
+	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
+ #endif
+ 	ret
+ 
+@@ -452,12 +512,12 @@ L(between_4_7):
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+ #ifdef USE_XMM_LESS_VEC
+-	movb	%sil, (%rdi)
+-	movb	%sil, 1(%rdi)
+-	movb	%sil, -1(%rdi, %rdx)
+	movb	%SET_REG8, (%rdi)
+	movb	%SET_REG8, 1(%rdi)
+	movb	%SET_REG8, -1(%rdi, %rdx)
+ #else
+-	movw	%cx, (%LESS_VEC_REG)
+-	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+	movw	%SET_REG16, (%LESS_VEC_REG)
+	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
+ #endif
+ 	ret
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-81.patch
+++ b/glibc-RHEL-15696-81.patch
@ -0,0 +1,33 @@
+From 7912236f4a597deb092650ca79f33504ddb4af28 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sat, 12 Feb 2022 00:45:00 -0600
+Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms
+Content-type: text/plain; charset=UTF-8
+
+commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Mon Feb 7 05:55:15 2022 -0800
+
+    x86-64: Optimize bzero
+
+That commit removed the directive that sets the .text section for the
+code.  This commit adds it back.
+---
+ sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 06f5f5d7..4fb475c0 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -114,6 +114,7 @@
+ # error SECTION is not defined!
+ #endif
+ 
+	.section SECTION(.text), "ax", @progbits
+ #if IS_IN (libc)
+ # if defined SHARED
+ ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-82.patch
+++ b/glibc-RHEL-15696-82.patch
@ -0,0 +1,90 @@
+From e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 15 Feb 2022 20:27:21 -0600
+Subject: [PATCH] x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895]
+Content-type: text/plain; charset=UTF-8
+
+Logic can read before the start of `s1` / `s2` if both `s1` and `s2`
+are near the start of a page. To avoid having the result contaminated by
+these comparisons the `strcmp` variants would mask off these
+comparisons. This was missing in the `strncmp` variants causing
+the bug. This commit adds the masking to `strncmp` so that out of
+range comparisons don't affect the result.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass, as
+does a full xcheck on x86_64 linux.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ string/test-strncmp.c                  | 23 +++++++++++++++++++++++
+ sysdeps/x86_64/multiarch/strcmp-avx2.S |  1 +
+ sysdeps/x86_64/multiarch/strcmp-evex.S |  1 +
+ 3 files changed, 25 insertions(+)
+
+diff --git a/string/test-strncmp.c b/string/test-strncmp.c
+index 927a6daa..e61fffd9 100644
+--- a/string/test-strncmp.c
+++ b/string/test-strncmp.c
+@@ -403,6 +403,28 @@ check2 (void)
+   free (s2);
+ }
+ 
+static void
+check4 (void)
+{
+  /* Regression test for bug 28895.  To trigger it we need: 1) both s1 and
+     s2 within 32 bytes of the end of the page; 2) no mismatch/null byte
+     before the first page cross; 3) a length (`n`) large enough for one
+     string to cross the page; and 4) mismatch/null bytes before the start
+     of the strings.  */
+
+  size_t size = 10;
+  size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1);
+  CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa));
+  CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed));
+  int exp_result;
+
+  STRCPY (s1, L ("tst-tlsmod%"));
+  STRCPY (s2, L ("tst-tls-manydynamic73mod"));
+  exp_result = SIMPLE_STRNCMP (s1, s2, size);
+  FOR_EACH_IMPL (impl, 0)
+  check_result (impl, s1, s2, size, exp_result);
+}
+
+ static void
+ check3 (void)
+ {
+@@ -445,6 +467,7 @@ test_main (void)
+   check1 ();
+   check2 ();
+   check3 ();
+  check4 ();
+ 
+   printf ("%23s", "");
+   FOR_EACH_IMPL (impl, 0)
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 04675aa4..179cc0e3 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -661,6 +661,7 @@ L(ret8):
+ # ifdef USE_AS_STRNCMP
+ 	.p2align 4,, 10
+ L(return_page_cross_end_check):
+	andl	%r10d, %ecx
+ 	tzcntl	%ecx, %ecx
+ 	leal	-VEC_SIZE(%rax, %rcx), %ecx
+ 	cmpl	%ecx, %edx
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index ed56af8e..0dfa62bd 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -689,6 +689,7 @@ L(ret8):
+ # ifdef USE_AS_STRNCMP
+ 	.p2align 4,, 10
+ L(return_page_cross_end_check):
+	andl	%r10d, %ecx
+ 	tzcntl	%ecx, %ecx
+ 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
+ #  ifdef USE_AS_WCSCMP
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-83.patch
+++ b/glibc-RHEL-15696-83.patch
@ -0,0 +1,77 @@
+From 9fef7039a7d04947bc89296ee0d187bc8d89b772 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 24 Mar 2022 15:50:33 -0500
+Subject: [PATCH] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ
+ #28896]
+Content-type: text/plain; charset=UTF-8
+
+Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
+__wcscmp_avx2.
+
+commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Sun Jan 9 16:02:21 2022 -0600
+
+    x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
+
+That commit set the wrong fallback function for `__wcsncmp_avx2_rtm`: it
+fell back to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm`, which
+can cause spurious aborts.
+
+This change will need to be backported.
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/tst-strncmp-rtm.c          | 15 +++++++++++++++
+ sysdeps/x86_64/multiarch/strcmp-avx2.S |  2 +-
+ 2 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index aef9866c..ba6543be 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -70,6 +70,16 @@ function_overflow (void)
+     return 1;
+ }
+ 
+__attribute__ ((noinline, noclone))
+static int
+function_overflow2 (void)
+{
+  if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+ static int
+ do_test (void)
+ {
+@@ -77,5 +87,10 @@ do_test (void)
+   if (status != EXIT_SUCCESS)
+     return status;
+   status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
+  if (status != EXIT_SUCCESS)
+    return status;
+  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
+  if (status != EXIT_SUCCESS)
+    return status;
+   return status;
+ }
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 179cc0e3..782f9472 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -122,7 +122,7 @@ ENTRY(STRCMP)
+ 	   are cases where length is large enough that it can never be a
+ 	   bound on valid memory so just use wcscmp.  */
+ 	shrq	$56, %rcx
+-	jnz	__wcscmp_avx2
+	jnz	OVERFLOW_STRCMP
+ 
+ 	leaq	(, %rdx, 4), %rdx
+ #  endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-84.patch
+++ b/glibc-RHEL-15696-84.patch
@ -0,0 +1,27 @@
+From 1283948f236f209b7d3f44b69a42b96806fa6da0 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 5 Feb 2022 11:06:01 -0800
+Subject: [PATCH] x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ))
+Content-type: text/plain; charset=UTF-8
+
+---
+ sysdeps/x86/sysdep.h | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index a70bb3a2..49b0efe2 100644
+--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
+@@ -111,7 +111,8 @@ enum cf_protection_level
+ /* Local label name for asm code. */
+ #ifndef L
+ /* ELF-like local names start with `.L'.  */
+-# define L(name)	.L##name
+# define LOCAL_LABEL(name) .L##name
+# define L(name)	LOCAL_LABEL(name)
+ #endif
+ 
+ #define atom_text_section .section ".text.atom", "ax"
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-85.patch
+++ b/glibc-RHEL-15696-85.patch
@ -0,0 +1,108 @@
+From c328d0152d4b14cca58407ec68143894c8863004 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 5 Feb 2022 11:52:33 -0800
+Subject: [PATCH] x86_64/multiarch: Sort sysdep_routines and put one entry per
+ line
+Content-type: text/plain; charset=UTF-8
+
+Conflicts:
+	sysdeps/x86_64/multiarch/Makefile
+	(test order changed)
+
+---
+ sysdeps/x86_64/multiarch/Makefile | 78 +++++++++++++++++++------------
+ 1 file changed, 48 insertions(+), 30 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 37d8d6f0..8c9e7812 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -132,37 +132,55 @@ CFLAGS-strspn-c.c += -msse4
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+-		   wmemcmp-avx2-movbe \
+-		   wmemchr-sse2 wmemchr-avx2 \
+-		   wcscmp-sse2 wcscmp-avx2 \
+-		   wcsncmp-sse2 wcsncmp-avx2 \
+-		   wcscpy-ssse3 wcscpy-c \
+-		   wcschr-sse2 wcschr-avx2 \
+-		   wcsrchr-sse2 wcsrchr-avx2 \
+-		   wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
+-		   wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
+-		   wcschr-avx2-rtm \
+-		   wcscmp-avx2-rtm \
+-		   wcslen-avx2-rtm \
+-		   wcsncmp-avx2-rtm \
+-		   wcsnlen-avx2-rtm \
+-		   wcsrchr-avx2-rtm \
+-		   wmemchr-avx2-rtm \
+-		   wmemcmp-avx2-movbe-rtm \
+-		   wcschr-evex \
+-		   wcscmp-evex \
+-		   wcslen-evex \
+-		   wcsncmp-evex \
+-		   wcsnlen-evex \
+-		   wcsrchr-evex \
+-		   wmemchr-evex \
+-		   wmemcmp-evex-movbe \
+-		   wmemchr-evex-rtm
+sysdep_routines += \
+  wcschr-avx2 \
+  wcschr-avx2-rtm \
+  wcschr-evex \
+  wcschr-sse2 \
+  wcscmp-avx2 \
+  wcscmp-avx2-rtm \
+  wcscmp-evex \
+  wcscmp-sse2 \
+  wcscpy-c \
+  wcscpy-ssse3 \
+  wcslen-avx2 \
+  wcslen-avx2-rtm \
+  wcslen-evex \
+  wcslen-sse2 \
+  wcslen-sse4_1 \
+  wcsncmp-avx2 \
+  wcsncmp-avx2-rtm \
+  wcsncmp-evex \
+  wcsncmp-sse2 \
+  wcsnlen-avx2 \
+  wcsnlen-avx2-rtm \
+  wcsnlen-c \
+  wcsnlen-evex \
+  wcsnlen-sse4_1 \
+  wcsrchr-avx2 \
+  wcsrchr-avx2-rtm \
+  wcsrchr-evex \
+  wcsrchr-sse2 \
+  wmemchr-avx2 \
+  wmemchr-avx2-rtm \
+  wmemchr-evex \
+  wmemchr-evex-rtm \
+  wmemchr-sse2 \
+  wmemcmp-avx2-movbe \
+  wmemcmp-avx2-movbe-rtm \
+  wmemcmp-c \
+  wmemcmp-evex-movbe \
+  wmemcmp-sse4 \
+  wmemcmp-ssse3 \
+# sysdep_routines
+ endif
+ 
+ ifeq ($(subdir),debug)
+-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \
+-		   memmove_chk-nonshared memset_chk-nonshared \
+-		   wmemset_chk-nonshared
+sysdep_routines += \
+  memcpy_chk-nonshared \
+  memmove_chk-nonshared \
+  mempcpy_chk-nonshared \
+  memset_chk-nonshared \
+  wmemset_chk-nonshared \
+# sysdep_routines
+ endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-86.patch
+++ b/glibc-RHEL-15696-86.patch
@ -0,0 +1,36 @@
+From 0fb8800029d230b3711bf722b2a47db92d0e273f Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 10 Feb 2022 11:52:50 -0800
+Subject: [PATCH] x86-64: Remove bzero weak alias in SS2 memset
+Content-type: text/plain; charset=UTF-8
+
+commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Mon Feb 7 05:55:15 2022 -0800
+
+    x86-64: Optimize bzero
+
+added the optimized bzero.  Remove the bzero weak alias in SSE2 memset to
+avoid an undefined __bzero in memset-sse2-unaligned-erms.
+---
+ sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+index 8f579ad6..af51362b 100644
+--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+@@ -31,9 +31,7 @@
+ # endif
+ 
+ # undef weak_alias
+-# define weak_alias(original, alias) \
+-	.weak bzero; bzero = __bzero
+-
+# define weak_alias(original, alias)
+ # undef strong_alias
+ # define strong_alias(ignored1, ignored2)
+ #endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-87.patch
+++ b/glibc-RHEL-15696-87.patch
@ -0,0 +1,29 @@
+From bf92893a14ebc161b08b28acc24fa06ae6be19cb Mon Sep 17 00:00:00 2001
+From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date: Thu, 10 Feb 2022 11:23:24 -0300
+Subject: [PATCH] x86_64: Remove bcopy optimizations
+Content-type: text/plain; charset=UTF-8
+
+The symbol is not present in the current POSIX specification and the
+compiler already generates a memmove call.
+---
+ sysdeps/x86_64/multiarch/bcopy.S | 7 -------
+ 1 file changed, 7 deletions(-)
+ delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S
+
+diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
+deleted file mode 100644
+index 639f02bd..00000000
+--- a/sysdeps/x86_64/multiarch/bcopy.S
+++ /dev/null
+@@ -1,7 +0,0 @@
+-#include <sysdep.h>
+-
+-	.text
+-ENTRY(bcopy)
+-	xchg	%rdi, %rsi
+-	jmp	__libc_memmove	/* Branch to IFUNC memmove.  */
+-END(bcopy)
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-88.patch
+++ b/glibc-RHEL-15696-88.patch
@ -0,0 +1,372 @@
+From a6fbf4d51e9ba8063c4f8331564892ead9c67344 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:16 -0500
+Subject: [PATCH] x86: Code cleanup in strchr-avx2 and comment justifying
+ branch
+Content-type: text/plain; charset=UTF-8
+
+Small code cleanup for size: -53 bytes.
+
+Add comment justifying using a branch to do NULL/non-null return.
+
+All string/memory tests pass and no regressions in benchtests.
+
+geometric_mean(N=20) of all benchmarks Original / New: 1.00
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
+ 1 file changed, 107 insertions(+), 97 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index 5884726b..89dd2bf7 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -48,13 +48,13 @@
+ # define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
+ 	/* Broadcast CHAR to YMM0.	*/
+ 	vmovd	%esi, %xmm0
+ 	movl	%edi, %eax
+ 	andl	$(PAGE_SIZE - 1), %eax
+ 	VPBROADCAST	%xmm0, %ymm0
+-	vpxor	%xmm9, %xmm9, %xmm9
+	vpxor	%xmm1, %xmm1, %xmm1
+ 
+ 	/* Check if we cross page boundary with one vector load.  */
+ 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+@@ -62,37 +62,29 @@ ENTRY (STRCHR)
+ 
+ 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
+ 	   null byte.  */
+-	vmovdqu	(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+	vmovdqu	(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+-	/* Found CHAR or the null byte.	 */
+-	cmp	(%rdi, %rax), %CHAR_REG
+-	jne	L(zero)
+-# endif
+-	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
+-
+-	/* .p2align 5 helps keep performance more consistent if ENTRY()
+-	   alignment % 32 was either 16 or 0. As well this makes the
+-	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+-	   easier.  */
+-	.p2align 5
+-L(first_vec_x4):
+-	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 3 + 1), %rdi
+-# ifndef USE_AS_STRCHRNUL
+-	/* Found CHAR or the null byte.	 */
+	/* Found CHAR or the null byte.  */
+ 	cmp	(%rdi, %rax), %CHAR_REG
+	/* NB: Use a branch instead of cmovcc here. The expectation is
+	   that with strchr the user will branch based on input being
+	   null. Since this branch will be 100% predictive of the user
+	   branch a branch miss here should save what otherwise would
+	   be branch miss in the user code. Otherwise using a branch 1)
+	   saves code size and 2) is faster in highly predictable
+	   environments.  */
+ 	jne	L(zero)
+ # endif
+ 	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ # ifndef USE_AS_STRCHRNUL
+ L(zero):
+@@ -103,7 +95,8 @@ L(zero):
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+ 	incq	%rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+@@ -113,9 +106,10 @@ L(first_vec_x1):
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+	.p2align 4,, 10
+ L(first_vec_x2):
+-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+ 	addq	$(VEC_SIZE + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+@@ -125,9 +119,10 @@ L(first_vec_x2):
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+	.p2align 4,, 8
+ L(first_vec_x3):
+-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+ 	addq	$(VEC_SIZE * 2 + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+@@ -137,6 +132,21 @@ L(first_vec_x3):
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+	.p2align 4,, 10
+L(first_vec_x4):
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
+
+ 	.p2align 4
+ L(aligned_more):
+ 	/* Align data to VEC_SIZE - 1. This is the same number of
+@@ -146,90 +156,92 @@ L(aligned_more):
+ L(cross_page_continue):
+ 	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	vmovdqa	1(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+	vmovdqa	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x4)
+-	/* Align data to VEC_SIZE * 4 - 1.	*/
+-	addq	$(VEC_SIZE * 4 + 1), %rdi
+-	andq	$-(VEC_SIZE * 4), %rdi
+	/* Align data to VEC_SIZE * 4 - 1.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+ 	.p2align 4
+ L(loop_4x_vec):
+ 	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa	(%rdi), %ymm5
+-	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+	vmovdqa	1(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm7
+ 
+ 	/* Leaves only CHARS matching esi as 0.	 */
+-	vpxor	%ymm5, %ymm0, %ymm1
+ 	vpxor	%ymm6, %ymm0, %ymm2
+ 	vpxor	%ymm7, %ymm0, %ymm3
+-	vpxor	%ymm8, %ymm0, %ymm4
+ 
+-	VPMINU	%ymm1, %ymm5, %ymm1
+ 	VPMINU	%ymm2, %ymm6, %ymm2
+ 	VPMINU	%ymm3, %ymm7, %ymm3
+-	VPMINU	%ymm4, %ymm8, %ymm4
+ 
+-	VPMINU	%ymm1, %ymm2, %ymm5
+-	VPMINU	%ymm3, %ymm4, %ymm6
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm7
+
+	vpxor	%ymm6, %ymm0, %ymm4
+	vpxor	%ymm7, %ymm0, %ymm5
+
+	VPMINU	%ymm4, %ymm6, %ymm4
+	VPMINU	%ymm5, %ymm7, %ymm5
+ 
+-	VPMINU	%ymm5, %ymm6, %ymm6
+	VPMINU	%ymm2, %ymm3, %ymm6
+	VPMINU	%ymm4, %ymm5, %ymm7
+ 
+-	VPCMPEQ	%ymm6, %ymm9, %ymm6
+-	vpmovmskb %ymm6, %ecx
+	VPMINU	%ymm6, %ymm7, %ymm7
+
+	VPCMPEQ	%ymm7, %ymm1, %ymm7
+	vpmovmskb %ymm7, %ecx
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	testl	%ecx, %ecx
+ 	jz	L(loop_4x_vec)
+ 
+-
+-	VPCMPEQ	%ymm1, %ymm9, %ymm1
+-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x0)
+ 
+ 
+-	VPCMPEQ	%ymm5, %ymm9, %ymm2
+-	vpmovmskb %ymm2, %eax
+	VPCMPEQ	%ymm3, %ymm1, %ymm3
+	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x1)
+ 
+-	VPCMPEQ	%ymm3, %ymm9, %ymm3
+-	vpmovmskb %ymm3, %eax
+	VPCMPEQ	%ymm4, %ymm1, %ymm4
+	vpmovmskb %ymm4, %eax
+ 	/* rcx has combined result from all 4 VEC. It will only be used
+ 	   if the first 3 other VEC all did not contain a match.  */
+ 	salq	$32, %rcx
+ 	orq	%rcx, %rax
+ 	tzcntq	%rax, %rax
+-	subq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+ 	cmp	(%rdi, %rax), %CHAR_REG
+@@ -239,10 +251,11 @@ L(loop_4x_vec):
+ 	VZEROUPPER_RETURN
+ 
+ 
+-	.p2align 4
+	.p2align 4,, 10
+ L(last_vec_x0):
+-	tzcntl	%eax, %eax
+-	addq	$-(VEC_SIZE * 4), %rdi
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$-(VEC_SIZE * 4 - 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+ 	cmp	(%rdi, %rax), %CHAR_REG
+@@ -251,16 +264,11 @@ L(last_vec_x0):
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+-# ifndef USE_AS_STRCHRNUL
+-L(zero_end):
+-	xorl	%eax, %eax
+-	VZEROUPPER_RETURN
+-# endif
+ 
+-	.p2align 4
+	.p2align 4,, 10
+ L(last_vec_x1):
+ 	tzcntl	%eax, %eax
+-	subq	$(VEC_SIZE * 3), %rdi
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+ 	cmp	(%rdi, %rax), %CHAR_REG
+@@ -269,18 +277,23 @@ L(last_vec_x1):
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
+ 
+ 	/* Cold case for crossing page with first load.	 */
+-	.p2align 4
+	.p2align 4,, 8
+ L(cross_page_boundary):
+ 	movq	%rdi, %rdx
+ 	/* Align rdi to VEC_SIZE - 1.  */
+ 	orq	$(VEC_SIZE - 1), %rdi
+-	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
+ 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ 	   so no need to manually mod edx.  */
+ 	sarxl	%edx, %eax, %eax
+@@ -291,13 +304,10 @@ L(cross_page_boundary):
+ 	xorl	%ecx, %ecx
+ 	/* Found CHAR or the null byte.	 */
+ 	cmp	(%rdx, %rax), %CHAR_REG
+-	leaq	(%rdx, %rax), %rax
+-	cmovne	%rcx, %rax
+-# else
+-	addq	%rdx, %rax
+	jne	L(zero_end)
+ # endif
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	addq	%rdx, %rax
+	VZEROUPPER_RETURN
+ 
+ END (STRCHR)
+-# endif
+#endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-89.patch
+++ b/glibc-RHEL-15696-89.patch
@ -0,0 +1,343 @@
+From ec285ea90415458225623ddc0492ae3f705af043 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:18 -0500
+Subject: [PATCH] x86: Code cleanup in strchr-evex and comment justifying
+ branch
+Content-type: text/plain; charset=UTF-8
+
+Small code cleanup for size: -81 bytes.
+
+Add comment justifying using a branch to do NULL/non-null return.
+
+All string/memory tests pass and no regressions in benchtests.
+
+geometric_mean(N=20) of all benchmarks New / Original: .985
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
+ 1 file changed, 80 insertions(+), 66 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
+index 7f9d4ee4..0b49e0ac 100644
+--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
+@@ -30,6 +30,7 @@
+ # ifdef USE_AS_WCSCHR
+ #  define VPBROADCAST	vpbroadcastd
+ #  define VPCMP		vpcmpd
+#  define VPTESTN	vptestnmd
+ #  define VPMINU	vpminud
+ #  define CHAR_REG	esi
+ #  define SHIFT_REG	ecx
+@@ -37,6 +38,7 @@
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMP		vpcmpb
+#  define VPTESTN	vptestnmb
+ #  define VPMINU	vpminub
+ #  define CHAR_REG	sil
+ #  define SHIFT_REG	edx
+@@ -61,13 +63,11 @@
+ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section .text.evex,"ax",@progbits
+-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
+ 	/* Broadcast CHAR to YMM0.	*/
+ 	VPBROADCAST	%esi, %YMM0
+ 	movl	%edi, %eax
+ 	andl	$(PAGE_SIZE - 1), %eax
+-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+-
+ 	/* Check if we cross page boundary with one vector load.
+ 	   Otherwise it is safe to use an unaligned load.  */
+ 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+@@ -81,49 +81,35 @@ ENTRY (STRCHR)
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	/* NB: Use a branch instead of cmovcc here. The expectation is
+	   that with strchr the user will branch based on input being
+	   null. Since this branch will be 100% predictive of the user
+	   branch a branch miss here should save what otherwise would
+	   be branch miss in the user code. Otherwise using a branch 1)
+	   saves code size and 2) is faster in highly predictable
+	   environments.  */
+	jne	L(zero)
+# endif
+ # ifdef USE_AS_WCSCHR
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+ 	 */
+ 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+ 	addq	%rdi, %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	/* Found CHAR or the null byte.	 */
+-	cmp	(%rax), %CHAR_REG
+-	jne	L(zero)
+ # endif
+ 	ret
+ 
+-	/* .p2align 5 helps keep performance more consistent if ENTRY()
+-	   alignment % 32 was either 16 or 0. As well this makes the
+-	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+-	   easier.  */
+-	.p2align 5
+-L(first_vec_x3):
+-	tzcntl	%eax, %eax
+-# ifndef USE_AS_STRCHRNUL
+-	/* Found CHAR or the null byte.	 */
+-	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+-	jne	L(zero)
+-# endif
+-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+-	   bytes.  */
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+-	ret
+ 
+-# ifndef USE_AS_STRCHRNUL
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+ 
+-	.p2align 4
+	.p2align 4,, 10
+ L(first_vec_x4):
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Check to see if first match was CHAR (k0) or null (k1).  */
+@@ -144,9 +130,18 @@ L(first_vec_x4):
+ 	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+
+ 	.p2align 4
+ L(first_vec_x1):
+-	tzcntl	%eax, %eax
+	/* Use bsf here to save 1-byte keeping keeping the block in 1x
+	   fetch block. eax guranteed non-zero.  */
+	bsfl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+ 	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+@@ -158,7 +153,7 @@ L(first_vec_x1):
+ 	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+-	.p2align 4
+	.p2align 4,, 10
+ L(first_vec_x2):
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Check to see if first match was CHAR (k0) or null (k1).  */
+@@ -179,6 +174,21 @@ L(first_vec_x2):
+ 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+	.p2align 4,, 10
+L(first_vec_x3):
+	/* Use bsf here to save 1-byte keeping keeping the block in 1x
+	   fetch block. eax guranteed non-zero.  */
+	bsfl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+ 	.p2align 4
+ L(aligned_more):
+ 	/* Align data to VEC_SIZE.  */
+@@ -195,7 +205,7 @@ L(cross_page_continue):
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+@@ -206,7 +216,7 @@ L(cross_page_continue):
+ 	/* Each bit in K0 represents a CHAR in YMM1.  */
+ 	VPCMP	$0, %YMM1, %YMM0, %k0
+ 	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	VPTESTN	%YMM1, %YMM1, %k1
+ 	kortestd	%k0, %k1
+ 	jnz	L(first_vec_x2)
+ 
+@@ -215,7 +225,7 @@ L(cross_page_continue):
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+@@ -224,7 +234,7 @@ L(cross_page_continue):
+ 	/* Each bit in K0 represents a CHAR in YMM1.  */
+ 	VPCMP	$0, %YMM1, %YMM0, %k0
+ 	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	VPTESTN	%YMM1, %YMM1, %k1
+ 	kortestd	%k0, %k1
+ 	jnz	L(first_vec_x4)
+ 
+@@ -265,33 +275,33 @@ L(loop_4x_vec):
+ 	VPMINU	%YMM3, %YMM4, %YMM4
+ 	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
+ 
+-	VPCMP	$0, %YMMZERO, %YMM4, %k1
+	VPTESTN	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	testl	%ecx, %ecx
+ 	jz	L(loop_4x_vec)
+ 
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+	VPTESTN	%YMM1, %YMM1, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x1)
+ 
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x2)
+ 
+-	VPCMP	$0, %YMMZERO, %YMM3, %k0
+	VPTESTN	%YMM3, %YMM3, %k0
+ 	kmovd	%k0, %eax
+ 	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
+ # ifdef USE_AS_WCSCHR
+ 	sall	$8, %ecx
+ 	orl	%ecx, %eax
+-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
+ # else
+ 	salq	$32, %rcx
+ 	orq	%rcx, %rax
+-	tzcntq	%rax, %rax
+	bsfq	%rax, %rax
+ # endif
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Check if match was CHAR or null.  */
+@@ -303,28 +313,28 @@ L(loop_4x_vec):
+ 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+-# ifndef USE_AS_STRCHRNUL
+-L(zero_end):
+-	xorl	%eax, %eax
+-	ret
+	.p2align 4,, 8
+L(last_vec_x1):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	   */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+ # endif
+ 
+-	.p2align 4
+-L(last_vec_x1):
+-	tzcntl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Check if match was null.  */
+-	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(%rax), %CHAR_REG
+ 	jne	L(zero_end)
+ # endif
+-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+-	   bytes.  */
+-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+
+ 	ret
+ 
+-	.p2align 4
+	.p2align 4,, 8
+ L(last_vec_x2):
+-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Check if match was null.  */
+ 	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+@@ -336,7 +346,7 @@ L(last_vec_x2):
+ 	ret
+ 
+ 	/* Cold case for crossing page with first load.	 */
+-	.p2align 4
+	.p2align 4,, 8
+ L(cross_page_boundary):
+ 	movq	%rdi, %rdx
+ 	/* Align rdi.  */
+@@ -346,9 +356,9 @@ L(cross_page_boundary):
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %eax
+-	/* Remove the leading bits.	 */
+	/* Remove the leading bits.  */
+ # ifdef USE_AS_WCSCHR
+ 	movl	%edx, %SHIFT_REG
+ 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+@@ -360,20 +370,24 @@ L(cross_page_boundary):
+ 	/* If eax is zero continue.  */
+ 	testl	%eax, %eax
+ 	jz	L(cross_page_continue)
+-	tzcntl	%eax, %eax
+-# ifndef USE_AS_STRCHRNUL
+-	/* Check to see if match was CHAR or null.  */
+-	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
+-	jne	L(zero_end)
+-# endif
+	bsfl	%eax, %eax
+
+ # ifdef USE_AS_WCSCHR
+ 	/* NB: Multiply wchar_t count by 4 to get the number of
+ 	   bytes.  */
+ 	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+ # else
+ 	addq	%rdx, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if match was CHAR or null.  */
+	cmp	(%rax), %CHAR_REG
+	je	L(cross_page_ret)
+L(zero_end):
+	xorl	%eax, %eax
+L(cross_page_ret):
+ # endif
+ 	ret
+ 
+ END (STRCHR)
+-# endif
+#endif
+-- 
+GitLab
+
--- a/glibc-RHEL-15696-9.patch
+++ b/glibc-RHEL-15696-9.patch
@ -0,0 +1,206 @@
+From 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 4 Feb 2019 06:31:01 -0800
+Subject: [PATCH] x86-64 memcmp: Use unsigned Jcc instructions on size [BZ
+ #24155]
+Content-type: text/plain; charset=UTF-8
+
+Since the size argument is unsigned, we should use unsigned Jcc
+instructions, instead of signed, to check size.
+
+Tested on x86-64 and x32, with and without --disable-multi-arch.
+
+	[BZ #24155]
+	CVE-2019-7309
+	* NEWS: Updated for CVE-2019-7309.
+	* sysdeps/x86_64/memcmp.S: Use RDX_LP for size.  Clear the
+	upper 32 bits of RDX register for x32.  Use unsigned Jcc
+	instructions, instead of signed.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2.
+	* sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test.
+---
+ sysdeps/x86_64/memcmp.S                  | 20 +++---
+ sysdeps/x86_64/x32/Makefile              |  3 +-
+ sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++
+ 3 files changed, 93 insertions(+), 9 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+	NEWS
+	(removed)
+
+diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
+index bcb4a2e8..45918d37 100644
+--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
+@@ -21,14 +21,18 @@
+ 
+ 	.text
+ ENTRY (memcmp)
+-	test	%rdx, %rdx
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#endif
+	test	%RDX_LP, %RDX_LP
+ 	jz	L(finz)
+ 	cmpq	$1, %rdx
+-	jle	L(finr1b)
+	jbe	L(finr1b)
+ 	subq	%rdi, %rsi
+ 	movq	%rdx, %r10
+ 	cmpq	$32, %r10
+-	jge	L(gt32)
+	jae	L(gt32)
+ 	/* Handle small chunks and last block of less than 32 bytes.  */
+ L(small):
+ 	testq	$1, %r10
+@@ -156,7 +160,7 @@ L(A32):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
+        jae	L(mt16)
+ 	/* Pre-unroll to be ready for unrolled 64B loop.  */
+ 	testq	$32, %rdi
+ 	jz	L(A64)
+@@ -178,7 +182,7 @@ L(A64):
+ 	movq	%r11, %r10
+ 	andq	$-64, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt32)
+        jae	L(mt32)
+ 
+ L(A64main):
+ 	movdqu    (%rdi,%rsi), %xmm0
+@@ -216,7 +220,7 @@ L(mt32):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
+        jae	L(mt16)
+ 
+ L(A32main):
+ 	movdqu    (%rdi,%rsi), %xmm0
+@@ -254,7 +258,7 @@ L(ATR):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
+        jae	L(mt16)
+ 	testq	$16, %rdi
+ 	jz	L(ATR32)
+ 
+@@ -325,7 +329,7 @@ L(ATR64main):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
+        jae	L(mt16)
+ 
+ L(ATR32res):
+ 	movdqa    (%rdi,%rsi), %xmm0
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 1557724b..87489565 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -8,7 +8,8 @@ endif
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+-	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
+	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \
+	 tst-size_t-memcmp-2
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
+new file mode 100644
+index 00000000..d8ae1a08
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
+@@ -0,0 +1,79 @@
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifdef WIDE
+# define TEST_NAME "wmemcmp"
+#else
+# define TEST_NAME "memcmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <inttypes.h>
+# include <wchar.h>
+
+# define MEMCMP wmemcmp
+# define CHAR wchar_t
+#else
+# define MEMCMP memcmp
+# define CHAR char
+#endif
+
+IMPL (MEMCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_memcmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  memcpy (buf1, buf2, page_size);
+
+  CHAR *p = (CHAR *) buf1;
+  p[page_size / sizeof (CHAR) - 1] = (CHAR) 1;
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_memcmp (dest, src);
+      if (res >= 0)
+	{
+	  error (0, 0, "Wrong result in function %s: %i >= 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+-- 
+GitLab
+
--- a/Show More
+++ b/Show More