From 02cfe04e361ac9ecd2663c077179617908a2a2a2 Mon Sep 17 00:00:00 2001
From: DJ Delorie <dj@redhat.com>
Date: Thu, 14 Dec 2023 17:33:45 -0500
Subject: [PATCH] Import Intel hyperscale improvements (RHEL-15696)

Resolves: RHEL-15696

Includes two additional (well, 1.5) upstream patches
to resolve roundeven redirects.
---
 glibc-RHEL-15696-1.patch   |  259 +++
 glibc-RHEL-15696-10.patch  |   41 +
 glibc-RHEL-15696-100.patch |  257 +++
 glibc-RHEL-15696-101.patch |  964 ++++++++++
 glibc-RHEL-15696-102.patch |  263 +++
 glibc-RHEL-15696-103.patch |  876 +++++++++
 glibc-RHEL-15696-104.patch |  501 ++++++
 glibc-RHEL-15696-105.patch |  558 ++++++
 glibc-RHEL-15696-106.patch |   73 +
 glibc-RHEL-15696-107.patch |  226 +++
 glibc-RHEL-15696-108.patch |   55 +
 glibc-RHEL-15696-109.patch |   60 +
 glibc-RHEL-15696-11.patch  |   74 +
 glibc-RHEL-15696-110.patch |   26 +
 glibc-RHEL-15696-12.patch  | 3410 ++++++++++++++++++++++++++++++++++++
 glibc-RHEL-15696-13.patch  | 1488 ++++++++++++++++
 glibc-RHEL-15696-14.patch  |  242 +++
 glibc-RHEL-15696-15.patch  |  254 +++
 glibc-RHEL-15696-16.patch  |  561 ++++++
 glibc-RHEL-15696-17.patch  | 2568 +++++++++++++++++++++++++++
 glibc-RHEL-15696-18.patch  |  735 ++++++++
 glibc-RHEL-15696-19.patch  |  148 ++
 glibc-RHEL-15696-2.patch   |  230 +++
 glibc-RHEL-15696-20.patch  |  164 ++
 glibc-RHEL-15696-21.patch  |   71 +
 glibc-RHEL-15696-22.patch  |   51 +
 glibc-RHEL-15696-23.patch  |  584 ++++++
 glibc-RHEL-15696-24.patch  |  388 ++++
 glibc-RHEL-15696-25.patch  |  767 ++++++++
 glibc-RHEL-15696-26.patch  |  701 ++++++++
 glibc-RHEL-15696-27.patch  |   30 +
 glibc-RHEL-15696-28.patch  |  566 ++++++
 glibc-RHEL-15696-29.patch  |  181 ++
 glibc-RHEL-15696-3.patch   |  396 +++++
 glibc-RHEL-15696-30.patch  |  497 ++++++
 glibc-RHEL-15696-31.patch  |  745 ++++++++
 glibc-RHEL-15696-32.patch  |  158 ++
 glibc-RHEL-15696-33.patch  |   51 +
 glibc-RHEL-15696-34.patch  |  135 ++
 glibc-RHEL-15696-35.patch  |   51 +
 glibc-RHEL-15696-36.patch  |   44 +
 glibc-RHEL-15696-37.patch  |  359 ++++
 glibc-RHEL-15696-38.patch  |   67 +
 glibc-RHEL-15696-39.patch  |  449 +++++
 glibc-RHEL-15696-4.patch   |  151 ++
 glibc-RHEL-15696-40.patch  |   92 +
 glibc-RHEL-15696-41.patch  |  265 +++
 glibc-RHEL-15696-42.patch  |  396 +++++
 glibc-RHEL-15696-43.patch  |  532 ++++++
 glibc-RHEL-15696-44.patch  |  536 ++++++
 glibc-RHEL-15696-45.patch  |  873 +++++++++
 glibc-RHEL-15696-46.patch  |  851 +++++++++
 glibc-RHEL-15696-47.patch  |  104 ++
 glibc-RHEL-15696-48.patch  |   84 +
 glibc-RHEL-15696-49.patch  |   55 +
 glibc-RHEL-15696-5.patch   |  290 +++
 glibc-RHEL-15696-50.patch  |   43 +
 glibc-RHEL-15696-51.patch  |  118 ++
 glibc-RHEL-15696-52.patch  |  242 +++
 glibc-RHEL-15696-53.patch  |   41 +
 glibc-RHEL-15696-54.patch  |  268 +++
 glibc-RHEL-15696-55.patch  |   48 +
 glibc-RHEL-15696-56.patch  |  658 +++++++
 glibc-RHEL-15696-57.patch  |  510 ++++++
 glibc-RHEL-15696-58.patch  |   45 +
 glibc-RHEL-15696-59.patch  |  695 ++++++++
 glibc-RHEL-15696-6.patch   |  300 ++++
 glibc-RHEL-15696-60.patch  |   54 +
 glibc-RHEL-15696-61.patch  |   56 +
 glibc-RHEL-15696-62.patch  |  136 ++
 glibc-RHEL-15696-63.patch  | 2428 +++++++++++++++++++++++++
 glibc-RHEL-15696-64.patch  |   39 +
 glibc-RHEL-15696-65.patch  |   39 +
 glibc-RHEL-15696-66.patch  |   51 +
 glibc-RHEL-15696-67.patch  |   71 +
 glibc-RHEL-15696-68.patch  |   60 +
 glibc-RHEL-15696-69.patch  |   35 +
 glibc-RHEL-15696-7.patch   |  153 ++
 glibc-RHEL-15696-70.patch  |  389 ++++
 glibc-RHEL-15696-71.patch  |   43 +
 glibc-RHEL-15696-72.patch  |  146 ++
 glibc-RHEL-15696-73.patch  |   37 +
 glibc-RHEL-15696-74.patch  | 1798 +++++++++++++++++++
 glibc-RHEL-15696-75.patch  | 1992 +++++++++++++++++++++
 glibc-RHEL-15696-76.patch  |   33 +
 glibc-RHEL-15696-77.patch  |   33 +
 glibc-RHEL-15696-78.patch  |  459 +++++
 glibc-RHEL-15696-79.patch  |   40 +
 glibc-RHEL-15696-8.patch   |  218 +++
 glibc-RHEL-15696-80.patch  |  753 ++++++++
 glibc-RHEL-15696-81.patch  |   33 +
 glibc-RHEL-15696-82.patch  |   90 +
 glibc-RHEL-15696-83.patch  |   77 +
 glibc-RHEL-15696-84.patch  |   27 +
 glibc-RHEL-15696-85.patch  |  108 ++
 glibc-RHEL-15696-86.patch  |   36 +
 glibc-RHEL-15696-87.patch  |   29 +
 glibc-RHEL-15696-88.patch  |  372 ++++
 glibc-RHEL-15696-89.patch  |  343 ++++
 glibc-RHEL-15696-9.patch   |  206 +++
 glibc-RHEL-15696-90.patch  |  147 ++
 glibc-RHEL-15696-91.patch  |  147 ++
 glibc-RHEL-15696-92.patch  |  175 ++
 glibc-RHEL-15696-93.patch  |   55 +
 glibc-RHEL-15696-94.patch  |  168 ++
 glibc-RHEL-15696-95.patch  |  122 ++
 glibc-RHEL-15696-96.patch  |  143 ++
 glibc-RHEL-15696-97.patch  |  759 ++++++++
 glibc-RHEL-15696-98.patch  |  814 +++++++++
 glibc-RHEL-15696-99.patch  |  913 ++++++++++
 glibc.spec                 |  115 +-
 111 files changed, 41462 insertions(+), 1 deletion(-)
 create mode 100644 glibc-RHEL-15696-1.patch
 create mode 100644 glibc-RHEL-15696-10.patch
 create mode 100644 glibc-RHEL-15696-100.patch
 create mode 100644 glibc-RHEL-15696-101.patch
 create mode 100644 glibc-RHEL-15696-102.patch
 create mode 100644 glibc-RHEL-15696-103.patch
 create mode 100644 glibc-RHEL-15696-104.patch
 create mode 100644 glibc-RHEL-15696-105.patch
 create mode 100644 glibc-RHEL-15696-106.patch
 create mode 100644 glibc-RHEL-15696-107.patch
 create mode 100644 glibc-RHEL-15696-108.patch
 create mode 100644 glibc-RHEL-15696-109.patch
 create mode 100644 glibc-RHEL-15696-11.patch
 create mode 100644 glibc-RHEL-15696-110.patch
 create mode 100644 glibc-RHEL-15696-12.patch
 create mode 100644 glibc-RHEL-15696-13.patch
 create mode 100644 glibc-RHEL-15696-14.patch
 create mode 100644 glibc-RHEL-15696-15.patch
 create mode 100644 glibc-RHEL-15696-16.patch
 create mode 100644 glibc-RHEL-15696-17.patch
 create mode 100644 glibc-RHEL-15696-18.patch
 create mode 100644 glibc-RHEL-15696-19.patch
 create mode 100644 glibc-RHEL-15696-2.patch
 create mode 100644 glibc-RHEL-15696-20.patch
 create mode 100644 glibc-RHEL-15696-21.patch
 create mode 100644 glibc-RHEL-15696-22.patch
 create mode 100644 glibc-RHEL-15696-23.patch
 create mode 100644 glibc-RHEL-15696-24.patch
 create mode 100644 glibc-RHEL-15696-25.patch
 create mode 100644 glibc-RHEL-15696-26.patch
 create mode 100644 glibc-RHEL-15696-27.patch
 create mode 100644 glibc-RHEL-15696-28.patch
 create mode 100644 glibc-RHEL-15696-29.patch
 create mode 100644 glibc-RHEL-15696-3.patch
 create mode 100644 glibc-RHEL-15696-30.patch
 create mode 100644 glibc-RHEL-15696-31.patch
 create mode 100644 glibc-RHEL-15696-32.patch
 create mode 100644 glibc-RHEL-15696-33.patch
 create mode 100644 glibc-RHEL-15696-34.patch
 create mode 100644 glibc-RHEL-15696-35.patch
 create mode 100644 glibc-RHEL-15696-36.patch
 create mode 100644 glibc-RHEL-15696-37.patch
 create mode 100644 glibc-RHEL-15696-38.patch
 create mode 100644 glibc-RHEL-15696-39.patch
 create mode 100644 glibc-RHEL-15696-4.patch
 create mode 100644 glibc-RHEL-15696-40.patch
 create mode 100644 glibc-RHEL-15696-41.patch
 create mode 100644 glibc-RHEL-15696-42.patch
 create mode 100644 glibc-RHEL-15696-43.patch
 create mode 100644 glibc-RHEL-15696-44.patch
 create mode 100644 glibc-RHEL-15696-45.patch
 create mode 100644 glibc-RHEL-15696-46.patch
 create mode 100644 glibc-RHEL-15696-47.patch
 create mode 100644 glibc-RHEL-15696-48.patch
 create mode 100644 glibc-RHEL-15696-49.patch
 create mode 100644 glibc-RHEL-15696-5.patch
 create mode 100644 glibc-RHEL-15696-50.patch
 create mode 100644 glibc-RHEL-15696-51.patch
 create mode 100644 glibc-RHEL-15696-52.patch
 create mode 100644 glibc-RHEL-15696-53.patch
 create mode 100644 glibc-RHEL-15696-54.patch
 create mode 100644 glibc-RHEL-15696-55.patch
 create mode 100644 glibc-RHEL-15696-56.patch
 create mode 100644 glibc-RHEL-15696-57.patch
 create mode 100644 glibc-RHEL-15696-58.patch
 create mode 100644 glibc-RHEL-15696-59.patch
 create mode 100644 glibc-RHEL-15696-6.patch
 create mode 100644 glibc-RHEL-15696-60.patch
 create mode 100644 glibc-RHEL-15696-61.patch
 create mode 100644 glibc-RHEL-15696-62.patch
 create mode 100644 glibc-RHEL-15696-63.patch
 create mode 100644 glibc-RHEL-15696-64.patch
 create mode 100644 glibc-RHEL-15696-65.patch
 create mode 100644 glibc-RHEL-15696-66.patch
 create mode 100644 glibc-RHEL-15696-67.patch
 create mode 100644 glibc-RHEL-15696-68.patch
 create mode 100644 glibc-RHEL-15696-69.patch
 create mode 100644 glibc-RHEL-15696-7.patch
 create mode 100644 glibc-RHEL-15696-70.patch
 create mode 100644 glibc-RHEL-15696-71.patch
 create mode 100644 glibc-RHEL-15696-72.patch
 create mode 100644 glibc-RHEL-15696-73.patch
 create mode 100644 glibc-RHEL-15696-74.patch
 create mode 100644 glibc-RHEL-15696-75.patch
 create mode 100644 glibc-RHEL-15696-76.patch
 create mode 100644 glibc-RHEL-15696-77.patch
 create mode 100644 glibc-RHEL-15696-78.patch
 create mode 100644 glibc-RHEL-15696-79.patch
 create mode 100644 glibc-RHEL-15696-8.patch
 create mode 100644 glibc-RHEL-15696-80.patch
 create mode 100644 glibc-RHEL-15696-81.patch
 create mode 100644 glibc-RHEL-15696-82.patch
 create mode 100644 glibc-RHEL-15696-83.patch
 create mode 100644 glibc-RHEL-15696-84.patch
 create mode 100644 glibc-RHEL-15696-85.patch
 create mode 100644 glibc-RHEL-15696-86.patch
 create mode 100644 glibc-RHEL-15696-87.patch
 create mode 100644 glibc-RHEL-15696-88.patch
 create mode 100644 glibc-RHEL-15696-89.patch
 create mode 100644 glibc-RHEL-15696-9.patch
 create mode 100644 glibc-RHEL-15696-90.patch
 create mode 100644 glibc-RHEL-15696-91.patch
 create mode 100644 glibc-RHEL-15696-92.patch
 create mode 100644 glibc-RHEL-15696-93.patch
 create mode 100644 glibc-RHEL-15696-94.patch
 create mode 100644 glibc-RHEL-15696-95.patch
 create mode 100644 glibc-RHEL-15696-96.patch
 create mode 100644 glibc-RHEL-15696-97.patch
 create mode 100644 glibc-RHEL-15696-98.patch
 create mode 100644 glibc-RHEL-15696-99.patch

diff --git a/glibc-RHEL-15696-1.patch b/glibc-RHEL-15696-1.patch
new file mode 100644
index 0000000..804de54
--- /dev/null
+++ b/glibc-RHEL-15696-1.patch
@@ -0,0 +1,259 @@
+From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:23:59 -0800
+Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memchr/wmemchr for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/memchr.S: Use RDX_LP for length.  Clear the
+	upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
+	tst-size_t-wmemchr.
+	* sysdeps/x86_64/x32/test-size_t.h: New file.
+	* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
+	* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
+---
+ sysdeps/x86_64/memchr.S                 | 10 ++--
+ sysdeps/x86_64/multiarch/memchr-avx2.S  |  8 ++-
+ sysdeps/x86_64/x32/Makefile             |  8 +++
+ sysdeps/x86_64/x32/test-size_t.h        | 35 ++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-memchr.c  | 72 +++++++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
+ 6 files changed, 148 insertions(+), 5 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/test-size_t.h
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+	NEWS
+	(removed)
+
+diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
+index feef5d4f..cb320257 100644
+--- a/sysdeps/x86_64/memchr.S
++++ b/sysdeps/x86_64/memchr.S
+@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
+ 	mov	%edi, %ecx
+ 
+ #ifdef USE_AS_WMEMCHR
+-	test	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+-	shl	$2, %rdx
++	shl	$2, %RDX_LP
+ #else
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++# endif
+ 	punpcklbw %xmm1, %xmm1
+-	test	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+ 	punpcklbw %xmm1, %xmm1
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index 5f5e7725..c81da19b 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -40,16 +40,20 @@
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+-	testq	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jz	L(null)
+ # endif
+ 	movl	%edi, %ecx
+ 	/* Broadcast CHAR to YMM0.  */
+ 	vmovd	%esi, %xmm0
+ # ifdef USE_AS_WMEMCHR
+-	shl	$2, %rdx
++	shl	$2, %RDX_LP
+ 	vpbroadcastd %xmm0, %ymm0
+ # else
++#  ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++#  endif
+ 	vpbroadcastb %xmm0, %ymm0
+ # endif
+ 	/* Check if we may cross page boundary with one vector load.  */
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index f2ebc24f..7d528889 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
+ # 64-bit llround.  Add -fno-builtin-lround to silence the compiler.
+ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
++
++ifeq ($(subdir),string)
++tests += tst-size_t-memchr
++endif
++
++ifeq ($(subdir),wcsmbs)
++tests += tst-size_t-wmemchr
++endif
+diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
+new file mode 100644
+index 00000000..78a94086
+--- /dev/null
++++ b/sysdeps/x86_64/x32/test-size_t.h
+@@ -0,0 +1,35 @@
++/* Test string/memory functions with size_t in the lower 32 bits of
++   64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_MAIN
++#include <string/test-string.h>
++
++/* On x32, parameter_t may be passed in a 64-bit register with the LEN
++   field in the lower 32 bits.  When the LEN field of 64-bit register
++   is passed to string/memory function as the size_t parameter, only
++   the lower 32 bits can be used.  */
++typedef struct
++{
++  union
++    {
++      size_t len;
++      void (*fn) (void);
++    };
++  void *p;
++} parameter_t;
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
+new file mode 100644
+index 00000000..29a3daf1
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
+@@ -0,0 +1,72 @@
++/* Test memchr with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#ifndef WIDE
++# define TEST_NAME "memchr"
++#else
++# define TEST_NAME "wmemchr"
++#endif /* WIDE */
++#include "test-size_t.h"
++
++#ifndef WIDE
++# define MEMCHR memchr
++# define CHAR char
++# define UCHAR unsigned char
++#else
++# include <wchar.h>
++# define MEMCHR wmemchr
++# define CHAR wchar_t
++# define UCHAR wchar_t
++#endif /* WIDE */
++
++IMPL (MEMCHR, 1)
++
++typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
++
++static CHAR *
++__attribute__ ((noinline, noclone))
++do_memchr (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
++  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      c.fn = impl->fn;
++      CHAR *res = do_memchr (src, c);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %p != NULL",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+new file mode 100644
+index 00000000..877801d6
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+@@ -0,0 +1,20 @@
++/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include "tst-size_t-memchr.c"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-10.patch b/glibc-RHEL-15696-10.patch
new file mode 100644
index 0000000..10bd49d
--- /dev/null
+++ b/glibc-RHEL-15696-10.patch
@@ -0,0 +1,41 @@
+From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 9 Jan 2022 16:02:21 -0600
+Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
+Content-type: text/plain; charset=UTF-8
+
+Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
+__wcscmp_avx2. For x86_64 this covers the entire address range so any
+length larger could not possibly be used to bound `s1` or `s2`.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 156c1949..8fb8eedc 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -83,6 +83,16 @@ ENTRY (STRCMP)
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
++#  ifndef __ILP32__
++	movq	%rdx, %rcx
++	/* Check if length could overflow when multiplied by
++	   sizeof(wchar_t). Checking top 8 bits will cover all potential
++	   overflow cases as well as redirect cases where its impossible to
++	   length to bound a valid memory region. In these cases just use
++	   'wcscmp'.  */
++	shrq	$56, %rcx
++	jnz	__wcscmp_avx2
++#  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+ #  endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-100.patch b/glibc-RHEL-15696-100.patch
new file mode 100644
index 0000000..0e779e4
--- /dev/null
+++ b/glibc-RHEL-15696-100.patch
@@ -0,0 +1,257 @@
+From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 25 Mar 2022 17:13:33 -0500
+Subject: [PATCH] x86: Small improvements for wcslen
+Content-type: text/plain; charset=UTF-8
+
+Just a few QOL changes.
+    1. Prefer `add` over `lea` as it has more execution units it can run
+       on.
+    2. Don't break macro-fusion between `test` and `jcc`
+    3. Reduce code size by removing gratuitous padding bytes (-90
+       bytes).
+
+geometric_mean(N=20) of all benchmarks New / Original: 0.959
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
+ 1 file changed, 41 insertions(+), 45 deletions(-)
+
+diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
+index 9f5f7232..254bb030 100644
+--- a/sysdeps/x86_64/wcslen.S
++++ b/sysdeps/x86_64/wcslen.S
+@@ -41,82 +41,82 @@ ENTRY (__wcslen)
+ 	pxor	%xmm0, %xmm0
+ 
+ 	lea	32(%rdi), %rax
+-	lea	16(%rdi), %rcx
++	addq	$16, %rdi
+ 	and	$-16, %rax
+ 
+ 	pcmpeqd	(%rax), %xmm0
+ 	pmovmskb %xmm0, %edx
+ 	pxor	%xmm1, %xmm1
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm1
+ 	pmovmskb %xmm1, %edx
+ 	pxor	%xmm2, %xmm2
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm2
+ 	pmovmskb %xmm2, %edx
+ 	pxor	%xmm3, %xmm3
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm3
+ 	pmovmskb %xmm3, %edx
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm0
+ 	pmovmskb %xmm0, %edx
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm1
+ 	pmovmskb %xmm1, %edx
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm2
+ 	pmovmskb %xmm2, %edx
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm3
+ 	pmovmskb %xmm3, %edx
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm0
+ 	pmovmskb %xmm0, %edx
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm1
+ 	pmovmskb %xmm1, %edx
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm2
+ 	pmovmskb %xmm2, %edx
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	(%rax), %xmm3
+ 	pmovmskb %xmm3, %edx
++	addq	$16, %rax
+ 	test	%edx, %edx
+-	lea	16(%rax), %rax
+ 	jnz	L(exit)
+ 
+ 	and	$-0x40, %rax
+@@ -133,104 +133,100 @@ L(aligned_64_loop):
+ 	pminub	%xmm0, %xmm2
+ 	pcmpeqd	%xmm3, %xmm2
+ 	pmovmskb %xmm2, %edx
++	addq	$64, %rax
+ 	test	%edx, %edx
+-	lea	64(%rax), %rax
+ 	jz	L(aligned_64_loop)
+ 
+ 	pcmpeqd	-64(%rax), %xmm3
+ 	pmovmskb %xmm3, %edx
++    addq	$48, %rdi
+ 	test	%edx, %edx
+-	lea	48(%rcx), %rcx
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	%xmm1, %xmm3
+ 	pmovmskb %xmm3, %edx
++    addq	$-16, %rdi
+ 	test	%edx, %edx
+-	lea	-16(%rcx), %rcx
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	-32(%rax), %xmm3
+ 	pmovmskb %xmm3, %edx
++    addq	$-16, %rdi
+ 	test	%edx, %edx
+-	lea	-16(%rcx), %rcx
+ 	jnz	L(exit)
+ 
+ 	pcmpeqd	%xmm6, %xmm3
+ 	pmovmskb %xmm3, %edx
++    addq	$-16, %rdi
+ 	test	%edx, %edx
+-	lea	-16(%rcx), %rcx
+-	jnz	L(exit)
+-
+-	jmp	L(aligned_64_loop)
++	jz	L(aligned_64_loop)
+ 
+ 	.p2align 4
+ L(exit):
+-	sub	%rcx, %rax
++	sub	%rdi, %rax
+ 	shr	$2, %rax
+ 	test	%dl, %dl
+ 	jz	L(exit_high)
+ 
+-	mov	%dl, %cl
+-	and	$15, %cl
++	andl	$15, %edx
+ 	jz	L(exit_1)
+ 	ret
+ 
+-	.p2align 4
++	/* No align here. Naturally aligned % 16 == 1.  */
+ L(exit_high):
+-	mov	%dh, %ch
+-	and	$15, %ch
++	andl	$(15 << 8), %edx
+ 	jz	L(exit_3)
+ 	add	$2, %rax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
+ L(exit_1):
+ 	add	$1, %rax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
+ L(exit_3):
+ 	add	$3, %rax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
+ L(exit_tail0):
+-	xor	%rax, %rax
++	xorl	%eax, %eax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
+ L(exit_tail1):
+-	mov	$1, %rax
++	movl	$1, %eax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
+ L(exit_tail2):
+-	mov	$2, %rax
++	movl	$2, %eax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
+ L(exit_tail3):
+-	mov	$3, %rax
++	movl	$3, %eax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
+ L(exit_tail4):
+-	mov	$4, %rax
++	movl	$4, %eax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
+ L(exit_tail5):
+-	mov	$5, %rax
++	movl	$5, %eax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
+ L(exit_tail6):
+-	mov	$6, %rax
++	movl	$6, %eax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 3
+ L(exit_tail7):
+-	mov	$7, %rax
++	movl	$7, %eax
+ 	ret
+ 
+ END (__wcslen)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-101.patch b/glibc-RHEL-15696-101.patch
new file mode 100644
index 0000000..131ea5b
--- /dev/null
+++ b/glibc-RHEL-15696-101.patch
@@ -0,0 +1,964 @@
+From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 15 Apr 2022 12:28:00 -0500
+Subject: [PATCH] x86: Remove memcmp-sse4.S
+Content-type: text/plain; charset=UTF-8
+
+Code didn't actually use any sse4 instructions since `ptest` was
+removed in:
+
+commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Wed Nov 10 16:18:56 2021 -0600
+
+    x86: Shrink memcmp-sse4.S code size
+
+The new memcmp-sse2 implementation is also faster.
+
+geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
+
+Note there are two regressions preferring SSE2 for Size = 1 and Size =
+65.
+
+Size = 1:
+size, align0, align1, ret, New Time/Old Time
+   1,      1,      1,   0,               1.2
+   1,      1,      1,   1,             1.197
+   1,      1,      1,  -1,               1.2
+
+This is intentional. Size == 1 is significantly less hot based on
+profiles of GCC11 and Python3 than sizes [4, 8] (which is made
+hotter).
+
+Python3 Size = 1        -> 13.64%
+Python3 Size = [4, 8]   -> 60.92%
+
+GCC11   Size = 1        ->  1.29%
+GCC11   Size = [4, 8]   -> 33.86%
+
+size, align0, align1, ret, New Time/Old Time
+   4,      4,      4,   0,             0.622
+   4,      4,      4,   1,             0.797
+   4,      4,      4,  -1,             0.805
+   5,      5,      5,   0,             0.623
+   5,      5,      5,   1,             0.777
+   5,      5,      5,  -1,             0.802
+   6,      6,      6,   0,             0.625
+   6,      6,      6,   1,             0.813
+   6,      6,      6,  -1,             0.788
+   7,      7,      7,   0,             0.625
+   7,      7,      7,   1,             0.799
+   7,      7,      7,  -1,             0.795
+   8,      8,      8,   0,             0.625
+   8,      8,      8,   1,             0.848
+   8,      8,      8,  -1,             0.914
+   9,      9,      9,   0,             0.625
+
+Size = 65:
+size, align0, align1, ret, New Time/Old Time
+  65,      0,      0,   0,             1.103
+  65,      0,      0,   1,             1.216
+  65,      0,      0,  -1,             1.227
+  65,     65,      0,   0,             1.091
+  65,      0,     65,   1,              1.19
+  65,     65,     65,  -1,             1.215
+
+This is because A) the checks in range [65, 96] are now unrolled 2x
+and B) because smaller values <= 16 are now given a hotter path. By
+contrast the SSE4 version has a branch for Size = 80. The unrolled
+version gets better performance for returns which need both
+comparisons.
+
+size, align0, align1, ret, New Time/Old Time
+ 128,      4,      8,   0,             0.858
+ 128,      4,      8,   1,             0.879
+ 128,      4,      8,  -1,             0.888
+
+As well, outside of microbenchmark environments, where branches are not
+fully predictable, the branch will have a real cost.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile          |   2 -
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |   4 -
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h    |   4 -
+ sysdeps/x86_64/multiarch/memcmp-sse4.S     | 804 ---------------------
+ 4 files changed, 814 deletions(-)
+ delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index bca82e38..b503e4b8 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -11,7 +11,6 @@ sysdep_routines += \
+   memcmp-avx2-movbe-rtm \
+   memcmp-evex-movbe \
+   memcmp-sse2 \
+-  memcmp-sse4 \
+   memcmp-ssse3 \
+   memcpy-ssse3 \
+   memcpy-ssse3-back \
+@@ -174,7 +173,6 @@ sysdep_routines += \
+   wmemcmp-avx2-movbe-rtm \
+   wmemcmp-c \
+   wmemcmp-evex-movbe \
+-  wmemcmp-sse4 \
+   wmemcmp-ssse3 \
+ # sysdep_routines
+ endif
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 14314367..450a2917 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_evex_movbe)
+-	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
+-			      __memcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
+ 			      __memcmp_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
+@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_evex_movbe)
+-	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
+-			      __wmemcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
+ 			      __wmemcmp_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 690dffe8..0bc47a7f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -21,7 +21,6 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
+ 	return OPTIMIZE (avx2_movbe);
+     }
+ 
+-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+-    return OPTIMIZE (sse4_1);
+-
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
+     return OPTIMIZE (ssse3);
+ 
+diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+deleted file mode 100644
+index 50060006..00000000
+--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
++++ /dev/null
+@@ -1,804 +0,0 @@
+-/* memcmp with SSE4.1, wmemcmp with SSE4.1
+-   Copyright (C) 2010-2018 Free Software Foundation, Inc.
+-   Contributed by Intel Corporation.
+-   This file is part of the GNU C Library.
+-
+-   The GNU C Library is free software; you can redistribute it and/or
+-   modify it under the terms of the GNU Lesser General Public
+-   License as published by the Free Software Foundation; either
+-   version 2.1 of the License, or (at your option) any later version.
+-
+-   The GNU C Library is distributed in the hope that it will be useful,
+-   but WITHOUT ANY WARRANTY; without even the implied warranty of
+-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+-   Lesser General Public License for more details.
+-
+-   You should have received a copy of the GNU Lesser General Public
+-   License along with the GNU C Library; if not, see
+-   <http://www.gnu.org/licenses/>.  */
+-
+-#if IS_IN (libc)
+-
+-# include <sysdep.h>
+-
+-# ifndef MEMCMP
+-#  define MEMCMP	__memcmp_sse4_1
+-# endif
+-
+-#ifdef USE_AS_WMEMCMP
+-# define CMPEQ	pcmpeqd
+-# define CHAR_SIZE	4
+-#else
+-# define CMPEQ	pcmpeqb
+-# define CHAR_SIZE	1
+-#endif
+-
+-
+-/* Warning!
+-           wmemcmp has to use SIGNED comparison for elements.
+-           memcmp has to use UNSIGNED comparison for elemnts.
+-*/
+-
+-	.section .text.sse4.1,"ax",@progbits
+-ENTRY (MEMCMP)
+-# ifdef USE_AS_WMEMCMP
+-	shl	$2, %RDX_LP
+-# elif defined __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	mov	%edx, %edx
+-# endif
+-	cmp	$79, %RDX_LP
+-	ja	L(79bytesormore)
+-
+-	cmp	$CHAR_SIZE, %RDX_LP
+-	jbe	L(firstbyte)
+-
+-	/* N in (CHAR_SIZE, 79) bytes.  */
+-	cmpl	$32, %edx
+-	ja	L(more_32_bytes)
+-
+-	cmpl	$16, %edx
+-	jae	L(16_to_32_bytes)
+-
+-# ifndef USE_AS_WMEMCMP
+-	cmpl	$8, %edx
+-	jae	L(8_to_16_bytes)
+-
+-	cmpl	$4, %edx
+-	jb	L(2_to_3_bytes)
+-
+-	movl	(%rdi), %eax
+-	movl	(%rsi), %ecx
+-
+-	bswap	%eax
+-	bswap	%ecx
+-
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-
+-	movl	-4(%rdi, %rdx), %edi
+-	movl	-4(%rsi, %rdx), %esi
+-
+-	bswap	%edi
+-	bswap	%esi
+-
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	cmovne	%edx, %eax
+-	sbbl	%ecx, %ecx
+-	orl	%ecx, %eax
+-	ret
+-
+-	.p2align 4,, 8
+-L(2_to_3_bytes):
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	subl	%ecx, %eax
+-	ret
+-
+-	.p2align 4,, 8
+-L(8_to_16_bytes):
+-	movq	(%rdi), %rax
+-	movq	(%rsi), %rcx
+-
+-	bswap	%rax
+-	bswap	%rcx
+-
+-	subq	%rcx, %rax
+-	jne	L(8_to_16_bytes_done)
+-
+-	movq	-8(%rdi, %rdx), %rax
+-	movq	-8(%rsi, %rdx), %rcx
+-
+-	bswap	%rax
+-	bswap	%rcx
+-
+-	subq	%rcx, %rax
+-
+-L(8_to_16_bytes_done):
+-	cmovne	%edx, %eax
+-	sbbl	%ecx, %ecx
+-	orl	%ecx, %eax
+-	ret
+-# else
+-	xorl	%eax, %eax
+-	movl	(%rdi), %ecx
+-	cmpl	(%rsi), %ecx
+-	jne	L(8_to_16_bytes_done)
+-	movl	4(%rdi), %ecx
+-	cmpl	4(%rsi), %ecx
+-	jne	L(8_to_16_bytes_done)
+-	movl	-4(%rdi, %rdx), %ecx
+-	cmpl	-4(%rsi, %rdx), %ecx
+-	jne	L(8_to_16_bytes_done)
+-	ret
+-# endif
+-
+-	.p2align 4,, 3
+-L(ret_zero):
+-	xorl	%eax, %eax
+-L(zero):
+-	ret
+-
+-	.p2align 4,, 8
+-L(firstbyte):
+-	jb	L(ret_zero)
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi), %ecx
+-	cmpl	(%rsi), %ecx
+-	je	L(zero)
+-L(8_to_16_bytes_done):
+-	setg	%al
+-	leal	-1(%rax, %rax), %eax
+-# else
+-	movzbl	(%rdi), %eax
+-	movzbl	(%rsi), %ecx
+-	sub	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4
+-L(vec_return_begin_48):
+-	addq	$16, %rdi
+-	addq	$16, %rsi
+-L(vec_return_begin_32):
+-	bsfl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	32(%rdi, %rax), %ecx
+-	xorl	%edx, %edx
+-	cmpl	32(%rsi, %rax), %ecx
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	32(%rsi, %rax), %ecx
+-	movzbl	32(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4
+-L(vec_return_begin_16):
+-	addq	$16, %rdi
+-	addq	$16, %rsi
+-L(vec_return_begin):
+-	bsfl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rax), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rax), %ecx
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rsi, %rax), %ecx
+-	movzbl	(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4
+-L(vec_return_end_16):
+-	subl	$16, %edx
+-L(vec_return_end):
+-	bsfl	%eax, %eax
+-	addl	%edx, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	-16(%rdi, %rax), %ecx
+-	xorl	%edx, %edx
+-	cmpl	-16(%rsi, %rax), %ecx
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	-16(%rsi, %rax), %ecx
+-	movzbl	-16(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4,, 8
+-L(more_32_bytes):
+-	movdqu	(%rdi), %xmm0
+-	movdqu	(%rsi), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	16(%rdi), %xmm0
+-	movdqu	16(%rsi), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	cmpl	$64, %edx
+-	jbe	L(32_to_64_bytes)
+-	movdqu	32(%rdi), %xmm0
+-	movdqu	32(%rsi), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	.p2align 4,, 6
+-L(32_to_64_bytes):
+-	movdqu	-32(%rdi, %rdx), %xmm0
+-	movdqu	-32(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end_16)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-	.p2align 4
+-L(16_to_32_bytes):
+-	movdqu	(%rdi), %xmm0
+-	movdqu	(%rsi), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-
+-	.p2align 4
+-L(79bytesormore):
+-	movdqu	(%rdi), %xmm0
+-	movdqu	(%rsi), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-
+-	mov	%rsi, %rcx
+-	and	$-16, %rsi
+-	add	$16, %rsi
+-	sub	%rsi, %rcx
+-
+-	sub	%rcx, %rdi
+-	add	%rcx, %rdx
+-	test	$0xf, %rdi
+-	jz	L(2aligned)
+-
+-	cmp	$128, %rdx
+-	ja	L(128bytesormore)
+-
+-	.p2align 4,, 6
+-L(less128bytes):
+-	movdqu	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqu	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqu	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	cmp	$96, %rdx
+-	jb	L(32_to_64_bytes)
+-
+-	addq	$64, %rdi
+-	addq	$64, %rsi
+-	subq	$64, %rdx
+-
+-	.p2align 4,, 6
+-L(last_64_bytes):
+-	movdqu	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqu	-32(%rdi, %rdx), %xmm0
+-	movdqu	-32(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end_16)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-	.p2align 4
+-L(128bytesormore):
+-	cmp	$256, %rdx
+-	ja	L(unaligned_loop)
+-L(less256bytes):
+-	movdqu	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqu	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqu	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	addq	$64, %rdi
+-	addq	$64, %rsi
+-
+-	movdqu	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqu	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqu	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqu	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	addq	$-128, %rdx
+-	subq	$-64, %rsi
+-	subq	$-64, %rdi
+-
+-	cmp	$64, %rdx
+-	ja	L(less128bytes)
+-
+-	cmp	$32, %rdx
+-	ja	L(last_64_bytes)
+-
+-	movdqu	-32(%rdi, %rdx), %xmm0
+-	movdqu	-32(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end_16)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-	.p2align 4
+-L(unaligned_loop):
+-# ifdef DATA_CACHE_SIZE_HALF
+-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
+-# else
+-	mov	__x86_data_cache_size_half(%rip), %R8_LP
+-# endif
+-	movq	%r8, %r9
+-	addq	%r8, %r8
+-	addq	%r9, %r8
+-	cmpq	%r8, %rdx
+-	ja	L(L2_L3_cache_unaligned)
+-	sub	$64, %rdx
+-	.p2align 4
+-L(64bytesormore_loop):
+-	movdqu	(%rdi), %xmm0
+-	movdqu	16(%rdi), %xmm1
+-	movdqu	32(%rdi), %xmm2
+-	movdqu	48(%rdi), %xmm3
+-
+-	CMPEQ	(%rsi), %xmm0
+-	CMPEQ	16(%rsi), %xmm1
+-	CMPEQ	32(%rsi), %xmm2
+-	CMPEQ	48(%rsi), %xmm3
+-
+-	pand	%xmm0, %xmm1
+-	pand	%xmm2, %xmm3
+-	pand	%xmm1, %xmm3
+-
+-	pmovmskb %xmm3, %eax
+-	incw	%ax
+-	jnz	L(64bytesormore_loop_end)
+-
+-	add	$64, %rsi
+-	add	$64, %rdi
+-	sub	$64, %rdx
+-	ja	L(64bytesormore_loop)
+-
+-	.p2align 4,, 6
+-L(loop_tail):
+-	addq	%rdx, %rdi
+-	movdqu	(%rdi), %xmm0
+-	movdqu	16(%rdi), %xmm1
+-	movdqu	32(%rdi), %xmm2
+-	movdqu	48(%rdi), %xmm3
+-
+-	addq	%rdx, %rsi
+-	movdqu	(%rsi), %xmm4
+-	movdqu	16(%rsi), %xmm5
+-	movdqu	32(%rsi), %xmm6
+-	movdqu	48(%rsi), %xmm7
+-
+-	CMPEQ	%xmm4, %xmm0
+-	CMPEQ	%xmm5, %xmm1
+-	CMPEQ	%xmm6, %xmm2
+-	CMPEQ	%xmm7, %xmm3
+-
+-	pand	%xmm0, %xmm1
+-	pand	%xmm2, %xmm3
+-	pand	%xmm1, %xmm3
+-
+-	pmovmskb %xmm3, %eax
+-	incw	%ax
+-	jnz	L(64bytesormore_loop_end)
+-	ret
+-
+-L(L2_L3_cache_unaligned):
+-	subq	$64, %rdx
+-	.p2align 4
+-L(L2_L3_unaligned_128bytes_loop):
+-	prefetchnta 0x1c0(%rdi)
+-	prefetchnta 0x1c0(%rsi)
+-
+-	movdqu	(%rdi), %xmm0
+-	movdqu	16(%rdi), %xmm1
+-	movdqu	32(%rdi), %xmm2
+-	movdqu	48(%rdi), %xmm3
+-
+-	CMPEQ	(%rsi), %xmm0
+-	CMPEQ	16(%rsi), %xmm1
+-	CMPEQ	32(%rsi), %xmm2
+-	CMPEQ	48(%rsi), %xmm3
+-
+-	pand	%xmm0, %xmm1
+-	pand	%xmm2, %xmm3
+-	pand	%xmm1, %xmm3
+-
+-	pmovmskb %xmm3, %eax
+-	incw	%ax
+-	jnz	L(64bytesormore_loop_end)
+-
+-	add	$64, %rsi
+-	add	$64, %rdi
+-	sub	$64, %rdx
+-	ja	L(L2_L3_unaligned_128bytes_loop)
+-	jmp	L(loop_tail)
+-
+-
+-	/* This case is for machines which are sensitive for unaligned
+-	 * instructions.  */
+-	.p2align 4
+-L(2aligned):
+-	cmp	$128, %rdx
+-	ja	L(128bytesormorein2aligned)
+-L(less128bytesin2aligned):
+-	movdqa	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqa	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqa	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqa	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	cmp	$96, %rdx
+-	jb	L(32_to_64_bytes)
+-
+-	addq	$64, %rdi
+-	addq	$64, %rsi
+-	subq	$64, %rdx
+-
+-	.p2align 4,, 6
+-L(aligned_last_64_bytes):
+-	movdqa	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqa	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqu	-32(%rdi, %rdx), %xmm0
+-	movdqu	-32(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end_16)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-	.p2align 4
+-L(128bytesormorein2aligned):
+-	cmp	$256, %rdx
+-	ja	L(aligned_loop)
+-L(less256bytesin2alinged):
+-	movdqa	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqa	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqa	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqa	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	addq	$64, %rdi
+-	addq	$64, %rsi
+-
+-	movdqa	(%rdi), %xmm1
+-	CMPEQ	(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin)
+-
+-	movdqa	16(%rdi), %xmm1
+-	CMPEQ	16(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_16)
+-
+-	movdqa	32(%rdi), %xmm1
+-	CMPEQ	32(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_32)
+-
+-	movdqa	48(%rdi), %xmm1
+-	CMPEQ	48(%rsi), %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_begin_48)
+-
+-	addq	$-128, %rdx
+-	subq	$-64, %rsi
+-	subq	$-64, %rdi
+-
+-	cmp	$64, %rdx
+-	ja	L(less128bytesin2aligned)
+-
+-	cmp	$32, %rdx
+-	ja	L(aligned_last_64_bytes)
+-
+-	movdqu	-32(%rdi, %rdx), %xmm0
+-	movdqu	-32(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end_16)
+-
+-	movdqu	-16(%rdi, %rdx), %xmm0
+-	movdqu	-16(%rsi, %rdx), %xmm1
+-	CMPEQ	%xmm0, %xmm1
+-	pmovmskb %xmm1, %eax
+-	incw	%ax
+-	jnz	L(vec_return_end)
+-	ret
+-
+-	.p2align 4
+-L(aligned_loop):
+-# ifdef DATA_CACHE_SIZE_HALF
+-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
+-# else
+-	mov	__x86_data_cache_size_half(%rip), %R8_LP
+-# endif
+-	movq	%r8, %r9
+-	addq	%r8, %r8
+-	addq	%r9, %r8
+-	cmpq	%r8, %rdx
+-	ja	L(L2_L3_cache_aligned)
+-
+-	sub	$64, %rdx
+-	.p2align 4
+-L(64bytesormore_loopin2aligned):
+-	movdqa	(%rdi), %xmm0
+-	movdqa	16(%rdi), %xmm1
+-	movdqa	32(%rdi), %xmm2
+-	movdqa	48(%rdi), %xmm3
+-
+-	CMPEQ	(%rsi), %xmm0
+-	CMPEQ	16(%rsi), %xmm1
+-	CMPEQ	32(%rsi), %xmm2
+-	CMPEQ	48(%rsi), %xmm3
+-
+-	pand	%xmm0, %xmm1
+-	pand	%xmm2, %xmm3
+-	pand	%xmm1, %xmm3
+-
+-	pmovmskb %xmm3, %eax
+-	incw	%ax
+-	jnz	L(64bytesormore_loop_end)
+-	add	$64, %rsi
+-	add	$64, %rdi
+-	sub	$64, %rdx
+-	ja	L(64bytesormore_loopin2aligned)
+-	jmp	L(loop_tail)
+-
+-L(L2_L3_cache_aligned):
+-	subq	$64, %rdx
+-	.p2align 4
+-L(L2_L3_aligned_128bytes_loop):
+-	prefetchnta 0x1c0(%rdi)
+-	prefetchnta 0x1c0(%rsi)
+-	movdqa	(%rdi), %xmm0
+-	movdqa	16(%rdi), %xmm1
+-	movdqa	32(%rdi), %xmm2
+-	movdqa	48(%rdi), %xmm3
+-
+-	CMPEQ	(%rsi), %xmm0
+-	CMPEQ	16(%rsi), %xmm1
+-	CMPEQ	32(%rsi), %xmm2
+-	CMPEQ	48(%rsi), %xmm3
+-
+-	pand	%xmm0, %xmm1
+-	pand	%xmm2, %xmm3
+-	pand	%xmm1, %xmm3
+-
+-	pmovmskb %xmm3, %eax
+-	incw	%ax
+-	jnz	L(64bytesormore_loop_end)
+-
+-	addq	$64, %rsi
+-	addq	$64, %rdi
+-	subq	$64, %rdx
+-	ja	L(L2_L3_aligned_128bytes_loop)
+-	jmp	L(loop_tail)
+-
+-	.p2align 4
+-L(64bytesormore_loop_end):
+-	pmovmskb %xmm0, %ecx
+-	incw	%cx
+-	jnz	L(loop_end_ret)
+-
+-	pmovmskb %xmm1, %ecx
+-	notw	%cx
+-	sall	$16, %ecx
+-	jnz	L(loop_end_ret)
+-
+-	pmovmskb %xmm2, %ecx
+-	notw	%cx
+-	shlq	$32, %rcx
+-	jnz	L(loop_end_ret)
+-
+-	addq	$48, %rdi
+-	addq	$48, %rsi
+-	movq	%rax, %rcx
+-
+-	.p2align 4,, 6
+-L(loop_end_ret):
+-	bsfq	%rcx, %rcx
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rcx), %eax
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rcx), %eax
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rdi, %rcx), %eax
+-	movzbl	(%rsi, %rcx), %ecx
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-END (MEMCMP)
+-#endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-102.patch b/glibc-RHEL-15696-102.patch
new file mode 100644
index 0000000..8cb20ad
--- /dev/null
+++ b/glibc-RHEL-15696-102.patch
@@ -0,0 +1,263 @@
+From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 15 Apr 2022 12:28:01 -0500
+Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+Old code was both inefficient and wasted code size. New code is smaller
+(-62 bytes) with comparable or better performance in the page cross case.
+
+geometric_mean(N=20) of page cross cases New / Original: 0.960
+
+size, align0, align1, ret, New Time/Old Time
+   1,   4095,      0,   0,             1.001
+   1,   4095,      0,   1,             0.999
+   1,   4095,      0,  -1,               1.0
+   2,   4094,      0,   0,               1.0
+   2,   4094,      0,   1,               1.0
+   2,   4094,      0,  -1,               1.0
+   3,   4093,      0,   0,               1.0
+   3,   4093,      0,   1,               1.0
+   3,   4093,      0,  -1,               1.0
+   4,   4092,      0,   0,             0.987
+   4,   4092,      0,   1,               1.0
+   4,   4092,      0,  -1,               1.0
+   5,   4091,      0,   0,             0.984
+   5,   4091,      0,   1,             1.002
+   5,   4091,      0,  -1,             1.005
+   6,   4090,      0,   0,             0.993
+   6,   4090,      0,   1,             1.001
+   6,   4090,      0,  -1,             1.003
+   7,   4089,      0,   0,             0.991
+   7,   4089,      0,   1,               1.0
+   7,   4089,      0,  -1,             1.001
+   8,   4088,      0,   0,             0.875
+   8,   4088,      0,   1,             0.881
+   8,   4088,      0,  -1,             0.888
+   9,   4087,      0,   0,             0.872
+   9,   4087,      0,   1,             0.879
+   9,   4087,      0,  -1,             0.883
+  10,   4086,      0,   0,             0.878
+  10,   4086,      0,   1,             0.886
+  10,   4086,      0,  -1,             0.873
+  11,   4085,      0,   0,             0.878
+  11,   4085,      0,   1,             0.881
+  11,   4085,      0,  -1,             0.879
+  12,   4084,      0,   0,             0.873
+  12,   4084,      0,   1,             0.889
+  12,   4084,      0,  -1,             0.875
+  13,   4083,      0,   0,             0.873
+  13,   4083,      0,   1,             0.863
+  13,   4083,      0,  -1,             0.863
+  14,   4082,      0,   0,             0.838
+  14,   4082,      0,   1,             0.869
+  14,   4082,      0,  -1,             0.877
+  15,   4081,      0,   0,             0.841
+  15,   4081,      0,   1,             0.869
+  15,   4081,      0,  -1,             0.876
+  16,   4080,      0,   0,             0.988
+  16,   4080,      0,   1,              0.99
+  16,   4080,      0,  -1,             0.989
+  17,   4079,      0,   0,             0.978
+  17,   4079,      0,   1,             0.981
+  17,   4079,      0,  -1,              0.98
+  18,   4078,      0,   0,             0.981
+  18,   4078,      0,   1,              0.98
+  18,   4078,      0,  -1,             0.985
+  19,   4077,      0,   0,             0.977
+  19,   4077,      0,   1,             0.979
+  19,   4077,      0,  -1,             0.986
+  20,   4076,      0,   0,             0.977
+  20,   4076,      0,   1,             0.986
+  20,   4076,      0,  -1,             0.984
+  21,   4075,      0,   0,             0.977
+  21,   4075,      0,   1,             0.983
+  21,   4075,      0,  -1,             0.988
+  22,   4074,      0,   0,             0.983
+  22,   4074,      0,   1,             0.994
+  22,   4074,      0,  -1,             0.993
+  23,   4073,      0,   0,              0.98
+  23,   4073,      0,   1,             0.992
+  23,   4073,      0,  -1,             0.995
+  24,   4072,      0,   0,             0.989
+  24,   4072,      0,   1,             0.989
+  24,   4072,      0,  -1,             0.991
+  25,   4071,      0,   0,              0.99
+  25,   4071,      0,   1,             0.999
+  25,   4071,      0,  -1,             0.996
+  26,   4070,      0,   0,             0.993
+  26,   4070,      0,   1,             0.995
+  26,   4070,      0,  -1,             0.998
+  27,   4069,      0,   0,             0.993
+  27,   4069,      0,   1,             0.999
+  27,   4069,      0,  -1,               1.0
+  28,   4068,      0,   0,             0.997
+  28,   4068,      0,   1,               1.0
+  28,   4068,      0,  -1,             0.999
+  29,   4067,      0,   0,             0.996
+  29,   4067,      0,   1,             0.999
+  29,   4067,      0,  -1,             0.999
+  30,   4066,      0,   0,             0.991
+  30,   4066,      0,   1,             1.001
+  30,   4066,      0,  -1,             0.999
+  31,   4065,      0,   0,             0.988
+  31,   4065,      0,   1,             0.998
+  31,   4065,      0,  -1,             0.998
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
+ 1 file changed, 61 insertions(+), 37 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 16fc673e..99258cf5 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -429,22 +429,21 @@ L(page_cross_less_vec):
+ # ifndef USE_AS_WMEMCMP
+ 	cmpl	$8, %edx
+ 	jae	L(between_8_15)
++	/* Fall through for [4, 7].  */
+ 	cmpl	$4, %edx
+-	jae	L(between_4_7)
++	jb	L(between_2_3)
+ 
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
++	movbe	(%rdi), %eax
++	movbe	(%rsi), %ecx
++	shlq	$32, %rax
++	shlq	$32, %rcx
++	movbe	-4(%rdi, %rdx), %edi
++	movbe	-4(%rsi, %rdx), %esi
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	/* Fast path for return zero.  */
++	jnz	L(ret_nonzero)
+ 	/* No ymm register was touched.  */
+ 	ret
+ 
+@@ -457,9 +456,33 @@ L(one_or_less):
+ 	/* No ymm register was touched.  */
+ 	ret
+ 
++	.p2align 4,, 5
++L(ret_nonzero):
++	sbbl	%eax, %eax
++	orl	$1, %eax
++	/* No ymm register was touched.  */
++	ret
++
++	.p2align 4,, 2
++L(zero):
++	xorl	%eax, %eax
++	/* No ymm register was touched.  */
++	ret
++
+ 	.p2align 4
+ L(between_8_15):
+-# endif
++	movbe	(%rdi), %rax
++	movbe	(%rsi), %rcx
++	subq	%rcx, %rax
++	jnz	L(ret_nonzero)
++	movbe	-8(%rdi, %rdx), %rax
++	movbe	-8(%rsi, %rdx), %rcx
++	subq	%rcx, %rax
++	/* Fast path for return zero.  */
++	jnz	L(ret_nonzero)
++	/* No ymm register was touched.  */
++	ret
++# else
+ 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+@@ -475,16 +498,13 @@ L(between_8_15):
+ 	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+ 	subl	$0xffff, %eax
++	/* Fast path for return zero.  */
+ 	jnz	L(return_vec_0)
+ 	/* No ymm register was touched.  */
+ 	ret
++# endif
+ 
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-
+-	.p2align 4
++	.p2align 4,, 10
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 	vmovdqu	(%rsi), %xmm2
+@@ -501,11 +521,17 @@ L(between_16_31):
+ 	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+ 	subl	$0xffff, %eax
++	/* Fast path for return zero.  */
+ 	jnz	L(return_vec_0)
+ 	/* No ymm register was touched.  */
+ 	ret
+ 
+ # ifdef USE_AS_WMEMCMP
++	.p2align 4,, 2
++L(zero):
++	xorl	%eax, %eax
++	ret
++
+ 	.p2align 4
+ L(one_or_less):
+ 	jb	L(zero)
+@@ -520,22 +546,20 @@ L(one_or_less):
+ # else
+ 
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	jz	L(zero_4_7)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-L(zero_4_7):
++L(between_2_3):
++	/* Load as big endian to avoid branches.  */
++	movzwl	(%rdi), %eax
++	movzwl	(%rsi), %ecx
++	bswap	%eax
++	bswap	%ecx
++	shrl	%eax
++	shrl	%ecx
++	movzbl	-1(%rdi, %rdx), %edi
++	movzbl	-1(%rsi, %rdx), %esi
++	orl	%edi, %eax
++	orl	%esi, %ecx
++	/* Subtraction is okay because the upper bit is zero.  */
++	subl	%ecx, %eax
+ 	/* No ymm register was touched.  */
+ 	ret
+ # endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-103.patch b/glibc-RHEL-15696-103.patch
new file mode 100644
index 0000000..c080e54
--- /dev/null
+++ b/glibc-RHEL-15696-103.patch
@@ -0,0 +1,876 @@
+From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 21 Apr 2022 20:52:28 -0500
+Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2
+Content-type: text/plain; charset=UTF-8
+
+The new code unrolls the main loop slightly without adding too much
+overhead and minimizes the comparisons for the search CHAR.
+
+Geometric Mean of all benchmarks New / Old: 0.741
+See email for all results.
+
+Full xcheck passes on x86_64 with and without multiarch enabled.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
+ sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
+ sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++---------
+ sysdeps/x86_64/wcsrchr.S                | 266 +-----------
+ 4 files changed, 338 insertions(+), 443 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/wcsrchr.S
+	(copyright header)
+
+diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
+index 0ec76fe9..6bb1284b 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
++++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
+@@ -17,7 +17,7 @@
+    <http://www.gnu.org/licenses/>.  */
+ 
+ #if IS_IN (libc)
+-# define strrchr __strrchr_sse2
++# define STRRCHR __strrchr_sse2
+ 
+ # undef weak_alias
+ # define weak_alias(strrchr, rindex)
+diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+index d015e953..f26d53b5 100644
+--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
++++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+@@ -17,7 +17,6 @@
+    <http://www.gnu.org/licenses/>.  */
+ 
+ #if IS_IN (libc)
+-# define wcsrchr __wcsrchr_sse2
++# define STRRCHR	__wcsrchr_sse2
+ #endif
+-
+ #include "../wcsrchr.S"
+diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
+index aca98e7e..a58cc220 100644
+--- a/sysdeps/x86_64/strrchr.S
++++ b/sysdeps/x86_64/strrchr.S
+@@ -19,210 +19,360 @@
+ 
+ #include <sysdep.h>
+ 
++#ifndef STRRCHR
++# define STRRCHR	strrchr
++#endif
++
++#ifdef USE_AS_WCSRCHR
++# define PCMPEQ	pcmpeqd
++# define CHAR_SIZE	4
++# define PMINU	pminud
++#else
++# define PCMPEQ	pcmpeqb
++# define CHAR_SIZE	1
++# define PMINU	pminub
++#endif
++
++#define PAGE_SIZE	4096
++#define VEC_SIZE	16
++
+ 	.text
+-ENTRY (strrchr)
+-	movd	%esi, %xmm1
++ENTRY(STRRCHR)
++	movd	%esi, %xmm0
+ 	movq	%rdi, %rax
+-	andl	$4095, %eax
+-	punpcklbw	%xmm1, %xmm1
+-	cmpq	$4032, %rax
+-	punpcklwd	%xmm1, %xmm1
+-	pshufd	$0, %xmm1, %xmm1
++	andl	$(PAGE_SIZE - 1), %eax
++#ifndef USE_AS_WCSRCHR
++	punpcklbw %xmm0, %xmm0
++	punpcklwd %xmm0, %xmm0
++#endif
++	pshufd	$0, %xmm0, %xmm0
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+ 	ja	L(cross_page)
+-	movdqu	(%rdi), %xmm0
++
++L(cross_page_continue):
++	movups	(%rdi), %xmm1
+ 	pxor	%xmm2, %xmm2
+-	movdqa	%xmm0, %xmm3
+-	pcmpeqb	%xmm1, %xmm0
+-	pcmpeqb	%xmm2, %xmm3
+-	pmovmskb	%xmm0, %ecx
+-	pmovmskb	%xmm3, %edx
+-	testq	%rdx, %rdx
+-	je	L(next_48_bytes)
+-	leaq	-1(%rdx), %rax
+-	xorq	%rdx, %rax
+-	andq	%rcx, %rax
+-	je	L(exit)
+-	bsrq	%rax, %rax
++	PCMPEQ	%xmm1, %xmm2
++	pmovmskb %xmm2, %ecx
++	testl	%ecx, %ecx
++	jz	L(aligned_more)
++
++	PCMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	leal	-1(%rcx), %edx
++	xorl	%edx, %ecx
++	andl	%ecx, %eax
++	jz	L(ret0)
++	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
++	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
++	   search CHAR is zero we are correct. Either way `andq
++	   -CHAR_SIZE, %rax` gets the correct result.  */
++#ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++#endif
++L(ret0):
+ 	ret
+ 
++	/* Returns for first vec x1/x2 have hard coded backward search
++	   path for earlier matches.  */
+ 	.p2align 4
+-L(next_48_bytes):
+-	movdqu	16(%rdi), %xmm4
+-	movdqa	%xmm4, %xmm5
+-	movdqu	32(%rdi), %xmm3
+-	pcmpeqb	%xmm1, %xmm4
+-	pcmpeqb	%xmm2, %xmm5
+-	movdqu	48(%rdi), %xmm0
+-	pmovmskb	%xmm5, %edx
+-	movdqa	%xmm3, %xmm5
+-	pcmpeqb	%xmm1, %xmm3
+-	pcmpeqb	%xmm2, %xmm5
+-	pcmpeqb	%xmm0, %xmm2
+-	salq	$16, %rdx
+-	pmovmskb	%xmm3, %r8d
+-	pmovmskb	%xmm5, %eax
+-	pmovmskb	%xmm2, %esi
+-	salq	$32, %r8
+-	salq	$32, %rax
+-	pcmpeqb	%xmm1, %xmm0
+-	orq	%rdx, %rax
+-	movq	%rsi, %rdx
+-	pmovmskb	%xmm4, %esi
+-	salq	$48, %rdx
+-	salq	$16, %rsi
+-	orq	%r8, %rsi
+-	orq	%rcx, %rsi
+-	pmovmskb	%xmm0, %ecx
+-	salq	$48, %rcx
+-	orq	%rcx, %rsi
+-	orq	%rdx, %rax
+-	je	L(loop_header2)
+-	leaq	-1(%rax), %rcx
+-	xorq	%rax, %rcx
+-	andq	%rcx, %rsi
+-	je	L(exit)
+-	bsrq	%rsi, %rsi
+-	leaq	(%rdi,%rsi), %rax
++L(first_vec_x0_test):
++	PCMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	testl	%eax, %eax
++	jz	L(ret0)
++	bsrl	%eax, %eax
++	addq	%r8, %rax
++#ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++#endif
+ 	ret
+ 
+ 	.p2align 4
+-L(loop_header2):
+-	testq	%rsi, %rsi
+-	movq	%rdi, %rcx
+-	je	L(no_c_found)
+-L(loop_header):
+-	addq	$64, %rdi
+-	pxor	%xmm7, %xmm7
+-	andq	$-64, %rdi
+-	jmp	L(loop_entry)
++L(first_vec_x1):
++	PCMPEQ	%xmm0, %xmm2
++	pmovmskb %xmm2, %eax
++	leal	-1(%rcx), %edx
++	xorl	%edx, %ecx
++	andl	%ecx, %eax
++	jz	L(first_vec_x0_test)
++	bsrl	%eax, %eax
++	leaq	(VEC_SIZE)(%rdi, %rax), %rax
++#ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++#endif
++	ret
+ 
+ 	.p2align 4
+-L(loop64):
+-	testq	%rdx, %rdx
+-	cmovne	%rdx, %rsi
+-	cmovne	%rdi, %rcx
+-	addq	$64, %rdi
+-L(loop_entry):
+-	movdqa	32(%rdi), %xmm3
+-	pxor	%xmm6, %xmm6
+-	movdqa	48(%rdi), %xmm2
+-	movdqa	%xmm3, %xmm0
+-	movdqa	16(%rdi), %xmm4
+-	pminub	%xmm2, %xmm0
+-	movdqa	(%rdi), %xmm5
+-	pminub	%xmm4, %xmm0
+-	pminub	%xmm5, %xmm0
+-	pcmpeqb	%xmm7, %xmm0
+-	pmovmskb	%xmm0, %eax
+-	movdqa	%xmm5, %xmm0
+-	pcmpeqb	%xmm1, %xmm0
+-	pmovmskb	%xmm0, %r9d
+-	movdqa	%xmm4, %xmm0
+-	pcmpeqb	%xmm1, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	movdqa	%xmm3, %xmm0
+-	pcmpeqb	%xmm1, %xmm0
+-	salq	$16, %rdx
+-	pmovmskb	%xmm0, %r10d
+-	movdqa	%xmm2, %xmm0
+-	pcmpeqb	%xmm1, %xmm0
+-	salq	$32, %r10
+-	orq	%r10, %rdx
+-	pmovmskb	%xmm0, %r8d
+-	orq	%r9, %rdx
+-	salq	$48, %r8
+-	orq	%r8, %rdx
++L(first_vec_x1_test):
++	PCMPEQ	%xmm0, %xmm2
++	pmovmskb %xmm2, %eax
+ 	testl	%eax, %eax
+-	je	L(loop64)
+-	pcmpeqb	%xmm6, %xmm4
+-	pcmpeqb	%xmm6, %xmm3
+-	pcmpeqb	%xmm6, %xmm5
+-	pmovmskb	%xmm4, %eax
+-	pmovmskb	%xmm3, %r10d
+-	pcmpeqb	%xmm6, %xmm2
+-	pmovmskb	%xmm5, %r9d
+-	salq	$32, %r10
+-	salq	$16, %rax
+-	pmovmskb	%xmm2, %r8d
+-	orq	%r10, %rax
+-	orq	%r9, %rax
+-	salq	$48, %r8
+-	orq	%r8, %rax
+-	leaq	-1(%rax), %r8
+-	xorq	%rax, %r8
+-	andq	%r8, %rdx
+-	cmovne	%rdi, %rcx
+-	cmovne	%rdx, %rsi
+-	bsrq	%rsi, %rsi
+-	leaq	(%rcx,%rsi), %rax
++	jz	L(first_vec_x0_test)
++	bsrl	%eax, %eax
++	leaq	(VEC_SIZE)(%rdi, %rax), %rax
++#ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++#endif
++	ret
++
++	.p2align 4
++L(first_vec_x2):
++	PCMPEQ	%xmm0, %xmm3
++	pmovmskb %xmm3, %eax
++	leal	-1(%rcx), %edx
++	xorl	%edx, %ecx
++	andl	%ecx, %eax
++	jz	L(first_vec_x1_test)
++	bsrl	%eax, %eax
++	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++#ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++#endif
++	ret
++
++	.p2align 4
++L(aligned_more):
++	/* Save original pointer if match was in VEC 0.  */
++	movq	%rdi, %r8
++	andq	$-VEC_SIZE, %rdi
++
++	movaps	VEC_SIZE(%rdi), %xmm2
++	pxor	%xmm3, %xmm3
++	PCMPEQ	%xmm2, %xmm3
++	pmovmskb %xmm3, %ecx
++	testl	%ecx, %ecx
++	jnz	L(first_vec_x1)
++
++	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
++	pxor	%xmm4, %xmm4
++	PCMPEQ	%xmm3, %xmm4
++	pmovmskb %xmm4, %ecx
++	testl	%ecx, %ecx
++	jnz	L(first_vec_x2)
++
++	addq	$VEC_SIZE, %rdi
++	/* Save pointer again before realigning.  */
++	movq	%rdi, %rsi
++	andq	$-(VEC_SIZE * 2), %rdi
++	.p2align 4
++L(first_loop):
++	/* Do 2x VEC at a time.  */
++	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
++	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
++	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
++	   detecting zero. Note if this is found to be a bottleneck it
++	   may be worth adding an SSE4.1 wcsrchr implementation.  */
++#ifdef USE_AS_WCSRCHR
++	movaps	%xmm5, %xmm6
++	pxor	%xmm8, %xmm8
++
++	PCMPEQ	%xmm8, %xmm5
++	PCMPEQ	%xmm4, %xmm8
++	por	%xmm5, %xmm8
++#else
++	movaps	%xmm5, %xmm6
++	PMINU	%xmm4, %xmm5
++#endif
++
++	movaps	%xmm4, %xmm9
++	PCMPEQ	%xmm0, %xmm4
++	PCMPEQ	%xmm0, %xmm6
++	movaps	%xmm6, %xmm7
++	por	%xmm4, %xmm6
++#ifndef USE_AS_WCSRCHR
++	pxor	%xmm8, %xmm8
++	PCMPEQ	%xmm5, %xmm8
++#endif
++	pmovmskb %xmm8, %ecx
++	pmovmskb %xmm6, %eax
++
++	addq	$(VEC_SIZE * 2), %rdi
++	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
++	   macro-fuse with `jz`.  */
++	addl	%ecx, %eax
++	jz	L(first_loop)
++
++	/* Check if there is zero match.  */
++	testl	%ecx, %ecx
++	jz	L(second_loop_match)
++
++	/* Check if there was a match in last iteration.  */
++	subl	%ecx, %eax
++	jnz	L(new_match)
++
++L(first_loop_old_match):
++	PCMPEQ	%xmm0, %xmm2
++	PCMPEQ	%xmm0, %xmm3
++	pmovmskb %xmm2, %ecx
++	pmovmskb %xmm3, %eax
++	addl	%eax, %ecx
++	jz	L(first_vec_x0_test)
++	/* NB: We could move this shift to before the branch and save a
++	   bit of code size / performance on the fall through. The
++	   branch leads to the null case which generally seems hotter
++	   than char in first 3x VEC.  */
++	sall	$16, %eax
++	orl	%ecx, %eax
++
++	bsrl	%eax, %eax
++	addq	%rsi, %rax
++#ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++#endif
++	ret
++
++	.p2align 4
++L(new_match):
++	pxor	%xmm6, %xmm6
++	PCMPEQ	%xmm9, %xmm6
++	pmovmskb %xmm6, %eax
++	sall	$16, %ecx
++	orl	%eax, %ecx
++
++	/* We can't reuse either of the old comparisons: since we mask
++	   off zeros after the first zero (instead of using the full
++	   comparison) we can't guarantee no interference between match
++	   after end of string and valid match.  */
++	pmovmskb %xmm4, %eax
++	pmovmskb %xmm7, %edx
++	sall	$16, %edx
++	orl	%edx, %eax
++
++	leal	-1(%ecx), %edx
++	xorl	%edx, %ecx
++	andl	%ecx, %eax
++	jz	L(first_loop_old_match)
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++#ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++#endif
+ 	ret
+ 
++	/* Save minimum state for getting most recent match. We can
++	   throw out all previous work.  */
+ 	.p2align 4
+-L(no_c_found):
+-	movl	$1, %esi
+-	xorl	%ecx, %ecx
+-	jmp	L(loop_header)
++L(second_loop_match):
++	movq	%rdi, %rsi
++	movaps	%xmm4, %xmm2
++	movaps	%xmm7, %xmm3
+ 
+ 	.p2align 4
+-L(exit):
+-	xorl	%eax, %eax
++L(second_loop):
++	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
++	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
++	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
++	   detecting zero. Note if this is found to be a bottleneck it
++	   may be worth adding an SSE4.1 wcsrchr implementation.  */
++#ifdef USE_AS_WCSRCHR
++	movaps	%xmm5, %xmm6
++	pxor	%xmm8, %xmm8
++
++	PCMPEQ	%xmm8, %xmm5
++	PCMPEQ	%xmm4, %xmm8
++	por	%xmm5, %xmm8
++#else
++	movaps	%xmm5, %xmm6
++	PMINU	%xmm4, %xmm5
++#endif
++
++	movaps	%xmm4, %xmm9
++	PCMPEQ	%xmm0, %xmm4
++	PCMPEQ	%xmm0, %xmm6
++	movaps	%xmm6, %xmm7
++	por	%xmm4, %xmm6
++#ifndef USE_AS_WCSRCHR
++	pxor	%xmm8, %xmm8
++	PCMPEQ	%xmm5, %xmm8
++#endif
++
++	pmovmskb %xmm8, %ecx
++	pmovmskb %xmm6, %eax
++
++	addq	$(VEC_SIZE * 2), %rdi
++	/* Either null term or new occurrence of CHAR.  */
++	addl	%ecx, %eax
++	jz	L(second_loop)
++
++	/* No null term so must be new occurrence of CHAR.  */
++	testl	%ecx, %ecx
++	jz	L(second_loop_match)
++
++
++	subl	%ecx, %eax
++	jnz	L(second_loop_new_match)
++
++L(second_loop_old_match):
++	pmovmskb %xmm2, %ecx
++	pmovmskb %xmm3, %eax
++	sall	$16, %eax
++	orl	%ecx, %eax
++	bsrl	%eax, %eax
++	addq	%rsi, %rax
++#ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++#endif
+ 	ret
+ 
+ 	.p2align 4
++L(second_loop_new_match):
++	pxor	%xmm6, %xmm6
++	PCMPEQ	%xmm9, %xmm6
++	pmovmskb %xmm6, %eax
++	sall	$16, %ecx
++	orl	%eax, %ecx
++
++	/* We can't reuse either of the old comparisons: since we mask
++	   off zeros after the first zero (instead of using the full
++	   comparison) we can't guarantee no interference between match
++	   after end of string and valid match.  */
++	pmovmskb %xmm4, %eax
++	pmovmskb %xmm7, %edx
++	sall	$16, %edx
++	orl	%edx, %eax
++
++	leal	-1(%ecx), %edx
++	xorl	%edx, %ecx
++	andl	%ecx, %eax
++	jz	L(second_loop_old_match)
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++#ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++#endif
++	ret
++
++	.p2align 4,, 4
+ L(cross_page):
+-	movq	%rdi, %rax
+-	pxor	%xmm0, %xmm0
+-	andq	$-64, %rax
+-	movdqu	(%rax), %xmm5
+-	movdqa	%xmm5, %xmm6
+-	movdqu	16(%rax), %xmm4
+-	pcmpeqb	%xmm1, %xmm5
+-	pcmpeqb	%xmm0, %xmm6
+-	movdqu	32(%rax), %xmm3
+-	pmovmskb	%xmm6, %esi
+-	movdqa	%xmm4, %xmm6
+-	movdqu	48(%rax), %xmm2
+-	pcmpeqb	%xmm1, %xmm4
+-	pcmpeqb	%xmm0, %xmm6
+-	pmovmskb	%xmm6, %edx
+-	movdqa	%xmm3, %xmm6
+-	pcmpeqb	%xmm1, %xmm3
+-	pcmpeqb	%xmm0, %xmm6
+-	pcmpeqb	%xmm2, %xmm0
+-	salq	$16, %rdx
+-	pmovmskb	%xmm3, %r9d
+-	pmovmskb	%xmm6, %r8d
+-	pmovmskb	%xmm0, %ecx
+-	salq	$32, %r9
+-	salq	$32, %r8
+-	pcmpeqb	%xmm1, %xmm2
+-	orq	%r8, %rdx
+-	salq	$48, %rcx
+-	pmovmskb	%xmm5, %r8d
+-	orq	%rsi, %rdx
+-	pmovmskb	%xmm4, %esi
+-	orq	%rcx, %rdx
+-	pmovmskb	%xmm2, %ecx
+-	salq	$16, %rsi
+-	salq	$48, %rcx
+-	orq	%r9, %rsi
+-	orq	%r8, %rsi
+-	orq	%rcx, %rsi
++	movq	%rdi, %rsi
++	andq	$-VEC_SIZE, %rsi
++	movaps	(%rsi), %xmm1
++	pxor	%xmm2, %xmm2
++	PCMPEQ	%xmm1, %xmm2
++	pmovmskb %xmm2, %edx
+ 	movl	%edi, %ecx
+-	subl	%eax, %ecx
+-	shrq	%cl, %rdx
+-	shrq	%cl, %rsi
+-	testq	%rdx, %rdx
+-	je	L(loop_header2)
+-	leaq	-1(%rdx), %rax
+-	xorq	%rdx, %rax
+-	andq	%rax, %rsi
+-	je	L(exit)
+-	bsrq	%rsi, %rax
++	andl	$(VEC_SIZE - 1), %ecx
++	sarl	%cl, %edx
++	jz	L(cross_page_continue)
++	PCMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	sarl	%cl, %eax
++	leal	-1(%rdx), %ecx
++	xorl	%edx, %ecx
++	andl	%ecx, %eax
++	jz	L(ret1)
++	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
++#ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++#endif
++L(ret1):
+ 	ret
+-END (strrchr)
++END(STRRCHR)
+ 
+-weak_alias (strrchr, rindex)
+-libc_hidden_builtin_def (strrchr)
++#ifndef USE_AS_WCSRCHR
++	weak_alias (STRRCHR, rindex)
++	libc_hidden_builtin_def (STRRCHR)
++#endif
+diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
+index 2f388537..ae3cfa7d 100644
+--- a/sysdeps/x86_64/wcsrchr.S
++++ b/sysdeps/x86_64/wcsrchr.S
+@@ -17,266 +17,12 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+-#include <sysdep.h>
+ 
+-	.text
+-ENTRY (wcsrchr)
++#define USE_AS_WCSRCHR	1
++#define NO_PMINU	1
+ 
+-	movd	%rsi, %xmm1
+-	mov	%rdi, %rcx
+-	punpckldq %xmm1, %xmm1
+-	pxor	%xmm2, %xmm2
+-	punpckldq %xmm1, %xmm1
+-	and	$63, %rcx
+-	cmp	$48, %rcx
+-	ja	L(crosscache)
++#ifndef STRRCHR
++# define STRRCHR	wcsrchr
++#endif
+ 
+-	movdqu	(%rdi), %xmm0
+-	pcmpeqd	%xmm0, %xmm2
+-	pcmpeqd	%xmm1, %xmm0
+-	pmovmskb %xmm2, %rcx
+-	pmovmskb %xmm0, %rax
+-	add	$16, %rdi
+-
+-	test	%rax, %rax
+-	jnz	L(unaligned_match1)
+-
+-	test	%rcx, %rcx
+-	jnz	L(return_null)
+-
+-	and	$-16, %rdi
+-	xor	%r8, %r8
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(unaligned_match1):
+-	test	%rcx, %rcx
+-	jnz	L(prolog_find_zero_1)
+-
+-	mov	%rax, %r8
+-	mov	%rdi, %rsi
+-	and	$-16, %rdi
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(crosscache):
+-	and	$15, %rcx
+-	and	$-16, %rdi
+-	pxor	%xmm3, %xmm3
+-	movdqa	(%rdi), %xmm0
+-	pcmpeqd	%xmm0, %xmm3
+-	pcmpeqd	%xmm1, %xmm0
+-	pmovmskb %xmm3, %rdx
+-	pmovmskb %xmm0, %rax
+-	shr	%cl, %rdx
+-	shr	%cl, %rax
+-	add	$16, %rdi
+-
+-	test	%rax, %rax
+-	jnz	L(unaligned_match)
+-
+-	test	%rdx, %rdx
+-	jnz	L(return_null)
+-
+-	xor	%r8, %r8
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(unaligned_match):
+-	test	%rdx, %rdx
+-	jnz	L(prolog_find_zero)
+-
+-	mov	%rax, %r8
+-	lea	(%rdi, %rcx), %rsi
+-
+-/* Loop start on aligned string.  */
+-	.p2align 4
+-L(loop):
+-	movdqa	(%rdi), %xmm0
+-	pcmpeqd	%xmm0, %xmm2
+-	add	$16, %rdi
+-	pcmpeqd	%xmm1, %xmm0
+-	pmovmskb %xmm2, %rcx
+-	pmovmskb %xmm0, %rax
+-	or	%rax, %rcx
+-	jnz	L(matches)
+-
+-	movdqa	(%rdi), %xmm3
+-	pcmpeqd	%xmm3, %xmm2
+-	add	$16, %rdi
+-	pcmpeqd	%xmm1, %xmm3
+-	pmovmskb %xmm2, %rcx
+-	pmovmskb %xmm3, %rax
+-	or	%rax, %rcx
+-	jnz	L(matches)
+-
+-	movdqa	(%rdi), %xmm4
+-	pcmpeqd	%xmm4, %xmm2
+-	add	$16, %rdi
+-	pcmpeqd	%xmm1, %xmm4
+-	pmovmskb %xmm2, %rcx
+-	pmovmskb %xmm4, %rax
+-	or	%rax, %rcx
+-	jnz	L(matches)
+-
+-	movdqa	(%rdi), %xmm5
+-	pcmpeqd	%xmm5, %xmm2
+-	add	$16, %rdi
+-	pcmpeqd	%xmm1, %xmm5
+-	pmovmskb %xmm2, %rcx
+-	pmovmskb %xmm5, %rax
+-	or	%rax, %rcx
+-	jz	L(loop)
+-
+-	.p2align 4
+-L(matches):
+-	test	%rax, %rax
+-	jnz	L(match)
+-L(return_value):
+-	test	%r8, %r8
+-	jz	L(return_null)
+-	mov	%r8, %rax
+-	mov	%rsi, %rdi
+-
+-	test	$15 << 4, %ah
+-	jnz	L(match_fourth_wchar)
+-	test	%ah, %ah
+-	jnz	L(match_third_wchar)
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(match):
+-	pmovmskb %xmm2, %rcx
+-	test	%rcx, %rcx
+-	jnz	L(find_zero)
+-	mov	%rax, %r8
+-	mov	%rdi, %rsi
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(find_zero):
+-	test	$15, %cl
+-	jnz	L(find_zero_in_first_wchar)
+-	test	%cl, %cl
+-	jnz	L(find_zero_in_second_wchar)
+-	test	$15, %ch
+-	jnz	L(find_zero_in_third_wchar)
+-
+-	and	$1 << 13 - 1, %rax
+-	jz	L(return_value)
+-
+-	test	$15 << 4, %ah
+-	jnz	L(match_fourth_wchar)
+-	test	%ah, %ah
+-	jnz	L(match_third_wchar)
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(find_zero_in_first_wchar):
+-	test	$1, %rax
+-	jz	L(return_value)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(find_zero_in_second_wchar):
+-	and	$1 << 5 - 1, %rax
+-	jz	L(return_value)
+-
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(find_zero_in_third_wchar):
+-	and	$1 << 9 - 1, %rax
+-	jz	L(return_value)
+-
+-	test	%ah, %ah
+-	jnz	L(match_third_wchar)
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(prolog_find_zero):
+-	add	%rcx, %rdi
+-	mov     %rdx, %rcx
+-L(prolog_find_zero_1):
+-	test	$15, %cl
+-	jnz	L(prolog_find_zero_in_first_wchar)
+-	test	%cl, %cl
+-	jnz	L(prolog_find_zero_in_second_wchar)
+-	test	$15, %ch
+-	jnz	L(prolog_find_zero_in_third_wchar)
+-
+-	and	$1 << 13 - 1, %rax
+-	jz	L(return_null)
+-
+-	test	$15 << 4, %ah
+-	jnz	L(match_fourth_wchar)
+-	test	%ah, %ah
+-	jnz	L(match_third_wchar)
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(prolog_find_zero_in_first_wchar):
+-	test	$1, %rax
+-	jz	L(return_null)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(prolog_find_zero_in_second_wchar):
+-	and	$1 << 5 - 1, %rax
+-	jz	L(return_null)
+-
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(prolog_find_zero_in_third_wchar):
+-	and	$1 << 9 - 1, %rax
+-	jz	L(return_null)
+-
+-	test	%ah, %ah
+-	jnz	L(match_third_wchar)
+-	test	$15 << 4, %al
+-	jnz	L(match_second_wchar)
+-	lea	-16(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(match_second_wchar):
+-	lea	-12(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(match_third_wchar):
+-	lea	-8(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(match_fourth_wchar):
+-	lea	-4(%rdi), %rax
+-	ret
+-
+-	.p2align 4
+-L(return_null):
+-	xor	%rax, %rax
+-	ret
+-
+-END (wcsrchr)
++#include "../strrchr.S"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-104.patch b/glibc-RHEL-15696-104.patch
new file mode 100644
index 0000000..1cb312a
--- /dev/null
+++ b/glibc-RHEL-15696-104.patch
@@ -0,0 +1,501 @@
+From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 21 Apr 2022 20:52:29 -0500
+Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2
+Content-type: text/plain; charset=UTF-8
+
+The new code unrolls the main loop slightly without adding too much
+overhead and minimizes the comparisons for the search CHAR.
+
+Geometric Mean of all benchmarks New / Old: 0.832
+See email for all results.
+
+Full xcheck passes on x86_64 with and without multiarch enabled.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
+ 1 file changed, 269 insertions(+), 157 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+index c949410b..3d26fad4 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+@@ -27,9 +27,13 @@
+ # ifdef USE_AS_WCSRCHR
+ #  define VPBROADCAST	vpbroadcastd
+ #  define VPCMPEQ	vpcmpeqd
++#  define VPMIN	vpminud
++#  define CHAR_SIZE	4
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMPEQ	vpcmpeqb
++#  define VPMIN	vpminub
++#  define CHAR_SIZE	1
+ # endif
+ 
+ # ifndef VZEROUPPER
+@@ -41,196 +45,304 @@
+ # endif
+ 
+ # define VEC_SIZE	32
++# define PAGE_SIZE	4096
+ 
+-	.section SECTION(.text),"ax",@progbits
+-ENTRY (STRRCHR)
+-	movd	%esi, %xmm4
+-	movl	%edi, %ecx
++	.section SECTION(.text), "ax", @progbits
++ENTRY(STRRCHR)
++	movd	%esi, %xmm7
++	movl	%edi, %eax
+ 	/* Broadcast CHAR to YMM4.  */
+-	VPBROADCAST %xmm4, %ymm4
++	VPBROADCAST %xmm7, %ymm7
+ 	vpxor	%xmm0, %xmm0, %xmm0
+ 
+-	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
++	/* Shift here instead of `andl` to save code size (saves a fetch
++	   block).  */
++	sall	$20, %eax
++	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
++	ja	L(cross_page)
+ 
++L(page_cross_continue):
+ 	vmovdqu	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %ecx
+-	vpmovmskb %ymm3, %eax
+-	addq	$VEC_SIZE, %rdi
++	/* Check end of string match.  */
++	VPCMPEQ	%ymm1, %ymm0, %ymm6
++	vpmovmskb %ymm6, %ecx
++	testl	%ecx, %ecx
++	jz	L(aligned_more)
++
++	/* Only check match with search CHAR if needed.  */
++	VPCMPEQ	%ymm1, %ymm7, %ymm1
++	vpmovmskb %ymm1, %eax
++	/* Check if match before first zero.  */
++	blsmskl	%ecx, %ecx
++	andl	%ecx, %eax
++	jz	L(ret0)
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
++	   search CHAR is zero we are correct. Either way `andq
++	   -CHAR_SIZE, %rax` gets the correct result.  */
++# ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++# endif
++L(ret0):
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
++
++	/* Returns for first vec x1/x2 have hard coded backward search
++	   path for earlier matches.  */
++	.p2align 4,, 10
++L(first_vec_x1):
++	VPCMPEQ	%ymm2, %ymm7, %ymm6
++	vpmovmskb %ymm6, %eax
++	blsmskl	%ecx, %ecx
++	andl	%ecx, %eax
++	jnz	L(first_vec_x1_return)
++
++	.p2align 4,, 4
++L(first_vec_x0_test):
++	VPCMPEQ	%ymm1, %ymm7, %ymm6
++	vpmovmskb %ymm6, %eax
++	testl	%eax, %eax
++	jz	L(ret1)
++	bsrl	%eax, %eax
++	addq	%r8, %rax
++# ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++# endif
++L(ret1):
++	VZEROUPPER_RETURN
+ 
++	.p2align 4,, 10
++L(first_vec_x0_x1_test):
++	VPCMPEQ	%ymm2, %ymm7, %ymm6
++	vpmovmskb %ymm6, %eax
++	/* Check ymm2 for search CHAR match. If no match then check ymm1
++	   before returning.  */
+ 	testl	%eax, %eax
+-	jnz	L(first_vec)
++	jz	L(first_vec_x0_test)
++	.p2align 4,, 4
++L(first_vec_x1_return):
++	bsrl	%eax, %eax
++	leaq	1(%rdi, %rax), %rax
++# ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++# endif
++	VZEROUPPER_RETURN
+ 
+-	testl	%ecx, %ecx
+-	jnz	L(return_null)
+ 
+-	andq	$-VEC_SIZE, %rdi
+-	xorl	%edx, %edx
+-	jmp	L(aligned_loop)
++	.p2align 4,, 10
++L(first_vec_x2):
++	VPCMPEQ	%ymm3, %ymm7, %ymm6
++	vpmovmskb %ymm6, %eax
++	blsmskl	%ecx, %ecx
++	/* If no in-range search CHAR match in ymm3 then need to check
++	   ymm1/ymm2 for an earlier match (we delay checking search
++	   CHAR matches until needed).  */
++	andl	%ecx, %eax
++	jz	L(first_vec_x0_x1_test)
++	bsrl	%eax, %eax
++	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
++# ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++# endif
++	VZEROUPPER_RETURN
++
+ 
+ 	.p2align 4
+-L(first_vec):
+-	/* Check if there is a nul CHAR.  */
++L(aligned_more):
++	/* Save original pointer if match was in VEC 0.  */
++	movq	%rdi, %r8
++
++	/* Align src.  */
++	orq	$(VEC_SIZE - 1), %rdi
++	vmovdqu	1(%rdi), %ymm2
++	VPCMPEQ	%ymm2, %ymm0, %ymm6
++	vpmovmskb %ymm6, %ecx
+ 	testl	%ecx, %ecx
+-	jnz	L(char_and_nul_in_first_vec)
++	jnz	L(first_vec_x1)
+ 
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
+-	movq	%rdi, %rsi
+-	andq	$-VEC_SIZE, %rdi
+-	jmp	L(aligned_loop)
++	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
++	VPCMPEQ	%ymm3, %ymm0, %ymm6
++	vpmovmskb %ymm6, %ecx
++	testl	%ecx, %ecx
++	jnz	L(first_vec_x2)
+ 
++	/* Save pointer again before realigning.  */
++	movq	%rdi, %rsi
++	addq	$(VEC_SIZE + 1), %rdi
++	andq	$-(VEC_SIZE * 2), %rdi
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-	vmovdqa	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %edx
+-	vpmovmskb %ymm3, %eax
+-	shrl	%cl, %edx
+-	shrl	%cl, %eax
+-	addq	$VEC_SIZE, %rdi
+-
+-	/* Check if there is a CHAR.  */
++L(first_aligned_loop):
++	/* Do 2x VEC at a time. Any more and the cost of finding the
++	   match outweighs loop benefit.  */
++	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
++	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
++
++	VPCMPEQ	%ymm4, %ymm7, %ymm6
++	VPMIN	%ymm4, %ymm5, %ymm8
++	VPCMPEQ	%ymm5, %ymm7, %ymm10
++	vpor	%ymm6, %ymm10, %ymm5
++	VPCMPEQ	%ymm8, %ymm0, %ymm8
++	vpor	%ymm5, %ymm8, %ymm9
++
++	vpmovmskb %ymm9, %eax
++	addq	$(VEC_SIZE * 2), %rdi
++	/* No zero or search CHAR.  */
+ 	testl	%eax, %eax
+-	jnz	L(found_char)
+-
+-	testl	%edx, %edx
+-	jnz	L(return_null)
++	jz	L(first_aligned_loop)
+ 
+-	jmp	L(aligned_loop)
+-
+-	.p2align 4
+-L(found_char):
+-	testl	%edx, %edx
+-	jnz	L(char_and_nul)
++	/* If no zero CHAR then go to second loop (this allows us to
++	   throw away all prior work).  */
++	vpmovmskb %ymm8, %ecx
++	testl	%ecx, %ecx
++	jz	L(second_aligned_loop_prep)
+ 
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
+-	leaq	(%rdi, %rcx), %rsi
++	/* Search char could be zero so we need to get the true match.
++	 */
++	vpmovmskb %ymm5, %eax
++	testl	%eax, %eax
++	jnz	L(first_aligned_loop_return)
+ 
+-	.p2align 4
+-L(aligned_loop):
+-	vmovdqa	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	addq	$VEC_SIZE, %rdi
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %ecx
+-	vpmovmskb %ymm3, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
+-
+-	vmovdqa	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	add	$VEC_SIZE, %rdi
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %ecx
++	.p2align 4,, 4
++L(first_vec_x1_or_x2):
++	VPCMPEQ	%ymm3, %ymm7, %ymm3
++	VPCMPEQ	%ymm2, %ymm7, %ymm2
+ 	vpmovmskb %ymm3, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
+-
+-	vmovdqa	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	addq	$VEC_SIZE, %rdi
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %ecx
+-	vpmovmskb %ymm3, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
+-
+-	vmovdqa	(%rdi), %ymm1
+-	VPCMPEQ	%ymm1, %ymm0, %ymm2
+-	addq	$VEC_SIZE, %rdi
+-	VPCMPEQ	%ymm1, %ymm4, %ymm3
+-	vpmovmskb %ymm2, %ecx
+-	vpmovmskb %ymm3, %eax
+-	orl	%eax, %ecx
+-	jz	L(aligned_loop)
+-
+-	.p2align 4
+-L(char_nor_null):
+-	/* Find a CHAR or a nul CHAR in a loop.  */
+-	testl	%eax, %eax
+-	jnz	L(match)
+-L(return_value):
+-	testl	%edx, %edx
+-	jz	L(return_null)
+-	movl	%edx, %eax
+-	movq	%rsi, %rdi
++	vpmovmskb %ymm2, %edx
++	/* Use add for macro-fusion.  */
++	addq	%rax, %rdx
++	jz	L(first_vec_x0_test)
++	/* NB: We could move this shift to before the branch and save a
++	   bit of code size / performance on the fall through. The
++	   branch leads to the null case which generally seems hotter
++	   than char in first 3x VEC.  */
++	salq	$32, %rax
++	addq	%rdx, %rax
++	bsrq	%rax, %rax
++	leaq	1(%rsi, %rax), %rax
++# ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++# endif
++	VZEROUPPER_RETURN
+ 
++	.p2align 4,, 8
++L(first_aligned_loop_return):
++	VPCMPEQ	%ymm4, %ymm0, %ymm4
++	vpmovmskb %ymm4, %edx
++	salq	$32, %rcx
++	orq	%rdx, %rcx
++
++	vpmovmskb %ymm10, %eax
++	vpmovmskb %ymm6, %edx
++	salq	$32, %rax
++	orq	%rdx, %rax
++	blsmskq	%rcx, %rcx
++	andq	%rcx, %rax
++	jz	L(first_vec_x1_or_x2)
++
++	bsrq	%rax, %rax
++	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
+ # ifdef USE_AS_WCSRCHR
+-	/* Keep the first bit for each matching CHAR for bsr.  */
+-	andl	$0x11111111, %eax
++	andq	$-CHAR_SIZE, %rax
+ # endif
+-	bsrl	%eax, %eax
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
++	VZEROUPPER_RETURN
+ 
++	/* Search char cannot be zero.  */
+ 	.p2align 4
+-L(match):
+-	/* Find a CHAR.  Check if there is a nul CHAR.  */
+-	vpmovmskb %ymm2, %ecx
+-	testl	%ecx, %ecx
+-	jnz	L(find_nul)
+-
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
++L(second_aligned_loop_set_furthest_match):
++	/* Save VEC and pointer from most recent match.  */
++L(second_aligned_loop_prep):
+ 	movq	%rdi, %rsi
+-	jmp	L(aligned_loop)
++	vmovdqu	%ymm6, %ymm2
++	vmovdqu	%ymm10, %ymm3
+ 
+ 	.p2align 4
+-L(find_nul):
+-# ifdef USE_AS_WCSRCHR
+-	/* Keep the first bit for each matching CHAR for bsr.  */
+-	andl	$0x11111111, %ecx
+-	andl	$0x11111111, %eax
+-# endif
+-	/* Mask out any matching bits after the nul CHAR.  */
+-	movl	%ecx, %r8d
+-	subl	$1, %r8d
+-	xorl	%ecx, %r8d
+-	andl	%r8d, %eax
++L(second_aligned_loop):
++	/* Search 2x at a time.  */
++	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
++	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
++
++	VPCMPEQ	%ymm4, %ymm7, %ymm6
++	VPMIN	%ymm4, %ymm5, %ymm1
++	VPCMPEQ	%ymm5, %ymm7, %ymm10
++	vpor	%ymm6, %ymm10, %ymm5
++	VPCMPEQ	%ymm1, %ymm0, %ymm1
++	vpor	%ymm5, %ymm1, %ymm9
++
++	vpmovmskb %ymm9, %eax
++	addq	$(VEC_SIZE * 2), %rdi
+ 	testl	%eax, %eax
+-	/* If there is no CHAR here, return the remembered one.  */
+-	jz	L(return_value)
+-	bsrl	%eax, %eax
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-	VZEROUPPER_RETURN
+-
+-	.p2align 4
+-L(char_and_nul):
+-	/* Find both a CHAR and a nul CHAR.  */
+-	addq	%rcx, %rdi
+-	movl	%edx, %ecx
+-L(char_and_nul_in_first_vec):
+-# ifdef USE_AS_WCSRCHR
+-	/* Keep the first bit for each matching CHAR for bsr.  */
+-	andl	$0x11111111, %ecx
+-	andl	$0x11111111, %eax
+-# endif
+-	/* Mask out any matching bits after the nul CHAR.  */
+-	movl	%ecx, %r8d
+-	subl	$1, %r8d
+-	xorl	%ecx, %r8d
+-	andl	%r8d, %eax
++	jz	L(second_aligned_loop)
++	vpmovmskb %ymm1, %ecx
++	testl	%ecx, %ecx
++	jz	L(second_aligned_loop_set_furthest_match)
++	vpmovmskb %ymm5, %eax
+ 	testl	%eax, %eax
+-	/* Return null pointer if the nul CHAR comes first.  */
+-	jz	L(return_null)
+-	bsrl	%eax, %eax
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
++	jnz	L(return_new_match)
++
++	/* This is the hot path. We know CHAR is inbounds and that
++	   ymm3/ymm2 have latest match.  */
++	.p2align 4,, 4
++L(return_old_match):
++	vpmovmskb %ymm3, %eax
++	vpmovmskb %ymm2, %edx
++	salq	$32, %rax
++	orq	%rdx, %rax
++	bsrq	%rax, %rax
++	/* Search char cannot be zero so safe to just use lea for
++	   wcsrchr.  */
++	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(return_null):
+-	xorl	%eax, %eax
++	/* Last iteration also potentially has a match.  */
++	.p2align 4,, 8
++L(return_new_match):
++	VPCMPEQ	%ymm4, %ymm0, %ymm4
++	vpmovmskb %ymm4, %edx
++	salq	$32, %rcx
++	orq	%rdx, %rcx
++
++	vpmovmskb %ymm10, %eax
++	vpmovmskb %ymm6, %edx
++	salq	$32, %rax
++	orq	%rdx, %rax
++	blsmskq	%rcx, %rcx
++	andq	%rcx, %rax
++	jz	L(return_old_match)
++	bsrq	%rax, %rax
++	/* Search char cannot be zero so safe to just use lea for
++	   wcsrchr.  */
++	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
+ 	VZEROUPPER_RETURN
+ 
+-END (STRRCHR)
++	.p2align 4,, 4
++L(cross_page):
++	movq	%rdi, %rsi
++	andq	$-VEC_SIZE, %rsi
++	vmovdqu	(%rsi), %ymm1
++	VPCMPEQ	%ymm1, %ymm0, %ymm6
++	vpmovmskb %ymm6, %ecx
++	/* Shift out zero CHAR matches that are before the beginning of
++	   src (rdi).  */
++	shrxl	%edi, %ecx, %ecx
++	testl	%ecx, %ecx
++	jz	L(page_cross_continue)
++	VPCMPEQ	%ymm1, %ymm7, %ymm1
++	vpmovmskb %ymm1, %eax
++
++	/* Shift out search CHAR matches that are before the beginning of
++	   src (rdi).  */
++	shrxl	%edi, %eax, %eax
++	blsmskl	%ecx, %ecx
++	/* Check if any search CHAR match in range.  */
++	andl	%ecx, %eax
++	jz	L(ret2)
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++# ifdef USE_AS_WCSRCHR
++	andq	$-CHAR_SIZE, %rax
++# endif
++L(ret2):
++	VZEROUPPER_RETURN
++END(STRRCHR)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-105.patch b/glibc-RHEL-15696-105.patch
new file mode 100644
index 0000000..e0a157f
--- /dev/null
+++ b/glibc-RHEL-15696-105.patch
@@ -0,0 +1,558 @@
+From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 21 Apr 2022 20:52:30 -0500
+Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex
+Content-type: text/plain; charset=UTF-8
+
+The new code unrolls the main loop slightly without adding too much
+overhead and minimizes the comparisons for the search CHAR.
+
+Geometric Mean of all benchmarks New / Old: 0.755
+See email for all results.
+
+Full xcheck passes on x86_64 with and without multiarch enabled.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
+ 1 file changed, 290 insertions(+), 181 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
+index f920b5a5..f5b6d755 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
+@@ -24,242 +24,351 @@
+ #  define STRRCHR	__strrchr_evex
+ # endif
+ 
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
++# define VMOVU	vmovdqu64
++# define VMOVA	vmovdqa64
+ 
+ # ifdef USE_AS_WCSRCHR
++#  define SHIFT_REG	esi
++
++#  define kunpck	kunpckbw
++#  define kmov_2x	kmovd
++#  define maskz_2x	ecx
++#  define maskm_2x	eax
++#  define CHAR_SIZE	4
++#  define VPMIN	vpminud
++#  define VPTESTN	vptestnmd
+ #  define VPBROADCAST	vpbroadcastd
+-#  define VPCMP		vpcmpd
+-#  define SHIFT_REG	r8d
++#  define VPCMP	vpcmpd
+ # else
++#  define SHIFT_REG	edi
++
++#  define kunpck	kunpckdq
++#  define kmov_2x	kmovq
++#  define maskz_2x	rcx
++#  define maskm_2x	rax
++
++#  define CHAR_SIZE	1
++#  define VPMIN	vpminub
++#  define VPTESTN	vptestnmb
+ #  define VPBROADCAST	vpbroadcastb
+-#  define VPCMP		vpcmpb
+-#  define SHIFT_REG	ecx
++#  define VPCMP	vpcmpb
+ # endif
+ 
+ # define XMMZERO	xmm16
+ # define YMMZERO	ymm16
+ # define YMMMATCH	ymm17
+-# define YMM1		ymm18
++# define YMMSAVE	ymm18
++
++# define YMM1	ymm19
++# define YMM2	ymm20
++# define YMM3	ymm21
++# define YMM4	ymm22
++# define YMM5	ymm23
++# define YMM6	ymm24
++# define YMM7	ymm25
++# define YMM8	ymm26
+ 
+-# define VEC_SIZE	32
+ 
+-	.section .text.evex,"ax",@progbits
+-ENTRY (STRRCHR)
+-	movl	%edi, %ecx
++# define VEC_SIZE	32
++# define PAGE_SIZE	4096
++	.section .text.evex, "ax", @progbits
++ENTRY(STRRCHR)
++	movl	%edi, %eax
+ 	/* Broadcast CHAR to YMMMATCH.  */
+ 	VPBROADCAST %esi, %YMMMATCH
+ 
+-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+-
+-	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jg	L(cross_page_boundary)
+ 
++L(page_cross_continue):
+ 	VMOVU	(%rdi), %YMM1
+-
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	/* k0 has a 1 for each zero CHAR in YMM1.  */
++	VPTESTN	%YMM1, %YMM1, %k0
+ 	kmovd	%k0, %ecx
+-	kmovd	%k1, %eax
+-
+-	addq	$VEC_SIZE, %rdi
+-
+-	testl	%eax, %eax
+-	jnz	L(first_vec)
+-
+ 	testl	%ecx, %ecx
+-	jnz	L(return_null)
+-
+-	andq	$-VEC_SIZE, %rdi
+-	xorl	%edx, %edx
+-	jmp	L(aligned_loop)
+-
+-	.p2align 4
+-L(first_vec):
+-	/* Check if there is a null byte.  */
+-	testl	%ecx, %ecx
+-	jnz	L(char_and_nul_in_first_vec)
+-
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
+-	movq	%rdi, %rsi
+-	andq	$-VEC_SIZE, %rdi
+-	jmp	L(aligned_loop)
+-
+-	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
++	jz	L(aligned_more)
++	/* fallthrough: zero CHAR in first VEC.  */
+ 
++	/* K1 has a 1 for each search CHAR match in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k1, %eax
++	/* Build mask up until first zero CHAR (used to mask off
++	   potential search CHAR matches past the end of the string).
++	 */
++	blsmskl	%ecx, %ecx
++	andl	%ecx, %eax
++	jz	L(ret0)
++	/* Get last match (the `andl` removed any out of bounds
++	   matches).  */
++	bsrl	%eax, %eax
+ # ifdef USE_AS_WCSRCHR
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+-	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl	$2, %SHIFT_REG
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
++# else
++	addq	%rdi, %rax
+ # endif
++L(ret0):
++	ret
+ 
+-	VMOVA	(%rdi), %YMM1
+-
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
++	/* Returns for first vec x1/x2/x3 have hard coded backward
++	   search path for earlier matches.  */
++	.p2align 4,, 6
++L(first_vec_x1):
++	VPCMP	$0, %YMMMATCH, %YMM2, %k1
++	kmovd	%k1, %eax
++	blsmskl	%ecx, %ecx
++	/* eax non-zero if search CHAR in range.  */
++	andl	%ecx, %eax
++	jnz	L(first_vec_x1_return)
++
++	/* fallthrough: no match in YMM2 then need to check for earlier
++	   matches (in YMM1).  */
++	.p2align 4,, 4
++L(first_vec_x0_test):
+ 	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+-	kmovd	%k0, %edx
+ 	kmovd	%k1, %eax
+-
+-	shrxl	%SHIFT_REG, %edx, %edx
+-	shrxl	%SHIFT_REG, %eax, %eax
+-	addq	$VEC_SIZE, %rdi
+-
+-	/* Check if there is a CHAR.  */
+ 	testl	%eax, %eax
+-	jnz	L(found_char)
+-
+-	testl	%edx, %edx
+-	jnz	L(return_null)
+-
+-	jmp	L(aligned_loop)
+-
+-	.p2align 4
+-L(found_char):
+-	testl	%edx, %edx
+-	jnz	L(char_and_nul)
+-
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
+-	leaq	(%rdi, %rcx), %rsi
++	jz	L(ret1)
++	bsrl	%eax, %eax
++# ifdef USE_AS_WCSRCHR
++	leaq	(%rsi, %rax, CHAR_SIZE), %rax
++# else
++	addq	%rsi, %rax
++# endif
++L(ret1):
++	ret
+ 
+-	.p2align 4
+-L(aligned_loop):
+-	VMOVA	(%rdi), %YMM1
+-	addq	$VEC_SIZE, %rdi
++	.p2align 4,, 10
++L(first_vec_x1_or_x2):
++	VPCMP	$0, %YMM3, %YMMMATCH, %k3
++	VPCMP	$0, %YMM2, %YMMMATCH, %k2
++	/* K2 and K3 have 1 for any search CHAR match. Test if any
++	   matches between either of them. Otherwise check YMM1.  */
++	kortestd %k2, %k3
++	jz	L(first_vec_x0_test)
++
++	/* Guaranteed that YMM2 and YMM3 are within range so merge the
++	   two bitmasks then get last result.  */
++	kunpck	%k2, %k3, %k3
++	kmovq	%k3, %rax
++	bsrq	%rax, %rax
++	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+-	kmovd	%k0, %ecx
++	.p2align 4,, 6
++L(first_vec_x3):
++	VPCMP	$0, %YMMMATCH, %YMM4, %k1
+ 	kmovd	%k1, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
++	blsmskl	%ecx, %ecx
++	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
++	andl	%ecx, %eax
++	jz	L(first_vec_x1_or_x2)
++	bsrl	%eax, %eax
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-	VMOVA	(%rdi), %YMM1
+-	add	$VEC_SIZE, %rdi
++	.p2align 4,, 6
++L(first_vec_x0_x1_test):
++	VPCMP	$0, %YMMMATCH, %YMM2, %k1
++	kmovd	%k1, %eax
++	/* Check YMM2 for last match first. If no match try YMM1.  */
++	testl	%eax, %eax
++	jz	L(first_vec_x0_test)
++	.p2align 4,, 4
++L(first_vec_x1_return):
++	bsrl	%eax, %eax
++	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+-	kmovd	%k0, %ecx
++	.p2align 4,, 10
++L(first_vec_x2):
++	VPCMP	$0, %YMMMATCH, %YMM3, %k1
+ 	kmovd	%k1, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
++	blsmskl	%ecx, %ecx
++	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
++	 */
++	andl	%ecx, %eax
++	jz	L(first_vec_x0_x1_test)
++	bsrl	%eax, %eax
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-	VMOVA	(%rdi), %YMM1
+-	addq	$VEC_SIZE, %rdi
+ 
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	.p2align 4
++L(aligned_more):
++	/* Need to keep original pointer in case YMM1 has last match.  */
++	movq	%rdi, %rsi
++	andq	$-VEC_SIZE, %rdi
++	VMOVU	VEC_SIZE(%rdi), %YMM2
++	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %ecx
+-	kmovd	%k1, %eax
+-	orl	%eax, %ecx
+-	jnz	L(char_nor_null)
++	testl	%ecx, %ecx
++	jnz	L(first_vec_x1)
+ 
+-	VMOVA	(%rdi), %YMM1
+-	addq	$VEC_SIZE, %rdi
++	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
++	VPTESTN	%YMM3, %YMM3, %k0
++	kmovd	%k0, %ecx
++	testl	%ecx, %ecx
++	jnz	L(first_vec_x2)
+ 
+-	/* Each bit in K0 represents a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
++	VPTESTN	%YMM4, %YMM4, %k0
+ 	kmovd	%k0, %ecx
+-	kmovd	%k1, %eax
+-	orl	%eax, %ecx
+-	jz	L(aligned_loop)
++	movq	%rdi, %r8
++	testl	%ecx, %ecx
++	jnz	L(first_vec_x3)
+ 
++	andq	$-(VEC_SIZE * 2), %rdi
+ 	.p2align 4
+-L(char_nor_null):
+-	/* Find a CHAR or a null byte in a loop.  */
++L(first_aligned_loop):
++	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
++	   they don't store a match.  */
++	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
++	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
++
++	VPCMP	$0, %YMM5, %YMMMATCH, %k2
++	vpxord	%YMM6, %YMMMATCH, %YMM7
++
++	VPMIN	%YMM5, %YMM6, %YMM8
++	VPMIN	%YMM8, %YMM7, %YMM7
++
++	VPTESTN	%YMM7, %YMM7, %k1
++	subq	$(VEC_SIZE * -2), %rdi
++	kortestd %k1, %k2
++	jz	L(first_aligned_loop)
++
++	VPCMP	$0, %YMM6, %YMMMATCH, %k3
++	VPTESTN	%YMM8, %YMM8, %k1
++	ktestd	%k1, %k1
++	jz	L(second_aligned_loop_prep)
++
++	kortestd %k2, %k3
++	jnz	L(return_first_aligned_loop)
++
++	.p2align 4,, 6
++L(first_vec_x1_or_x2_or_x3):
++	VPCMP	$0, %YMM4, %YMMMATCH, %k4
++	kmovd	%k4, %eax
+ 	testl	%eax, %eax
+-	jnz	L(match)
+-L(return_value):
+-	testl	%edx, %edx
+-	jz	L(return_null)
+-	movl	%edx, %eax
+-	movq	%rsi, %rdi
++	jz	L(first_vec_x1_or_x2)
+ 	bsrl	%eax, %eax
+-# ifdef USE_AS_WCSRCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-# endif
++	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+-	.p2align 4
+-L(match):
+-	/* Find a CHAR.  Check if there is a null byte.  */
+-	kmovd	%k0, %ecx
+-	testl	%ecx, %ecx
+-	jnz	L(find_nul)
++	.p2align 4,, 8
++L(return_first_aligned_loop):
++	VPTESTN	%YMM5, %YMM5, %k0
++	kunpck	%k0, %k1, %k0
++	kmov_2x	%k0, %maskz_2x
++
++	blsmsk	%maskz_2x, %maskz_2x
++	kunpck	%k2, %k3, %k3
++	kmov_2x	%k3, %maskm_2x
++	and	%maskz_2x, %maskm_2x
++	jz	L(first_vec_x1_or_x2_or_x3)
+ 
+-	/* Remember the match and keep searching.  */
+-	movl	%eax, %edx
++	bsr	%maskm_2x, %maskm_2x
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
++
++	.p2align 4
++	/* We can throw away the work done for the first 4x checks here
++	   as we have a later match. This is the 'fast' path per se.
++	 */
++L(second_aligned_loop_prep):
++L(second_aligned_loop_set_furthest_match):
+ 	movq	%rdi, %rsi
+-	jmp	L(aligned_loop)
++	kunpck	%k2, %k3, %k4
+ 
+ 	.p2align 4
+-L(find_nul):
+-	/* Mask out any matching bits after the null byte.  */
+-	movl	%ecx, %r8d
+-	subl	$1, %r8d
+-	xorl	%ecx, %r8d
+-	andl	%r8d, %eax
+-	testl	%eax, %eax
+-	/* If there is no CHAR here, return the remembered one.  */
+-	jz	L(return_value)
+-	bsrl	%eax, %eax
++L(second_aligned_loop):
++	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
++	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
++
++	VPCMP	$0, %YMM1, %YMMMATCH, %k2
++	vpxord	%YMM2, %YMMMATCH, %YMM3
++
++	VPMIN	%YMM1, %YMM2, %YMM4
++	VPMIN	%YMM3, %YMM4, %YMM3
++
++	VPTESTN	%YMM3, %YMM3, %k1
++	subq	$(VEC_SIZE * -2), %rdi
++	kortestd %k1, %k2
++	jz	L(second_aligned_loop)
++
++	VPCMP	$0, %YMM2, %YMMMATCH, %k3
++	VPTESTN	%YMM4, %YMM4, %k1
++	ktestd	%k1, %k1
++	jz	L(second_aligned_loop_set_furthest_match)
++
++	kortestd %k2, %k3
++	/* branch here because there is a significant advantage in terms
++	   of output dependency chance in using edx.  */
++	jnz	L(return_new_match)
++L(return_old_match):
++	kmovq	%k4, %rax
++	bsrq	%rax, %rax
++	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
++	ret
++
++L(return_new_match):
++	VPTESTN	%YMM1, %YMM1, %k0
++	kunpck	%k0, %k1, %k0
++	kmov_2x	%k0, %maskz_2x
++
++	blsmsk	%maskz_2x, %maskz_2x
++	kunpck	%k2, %k3, %k3
++	kmov_2x	%k3, %maskm_2x
++	and	%maskz_2x, %maskm_2x
++	jz	L(return_old_match)
++
++	bsr	%maskm_2x, %maskm_2x
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
++
++L(cross_page_boundary):
++	/* eax contains all the page offset bits of src (rdi). `xor rdi,
++	   rax` sets pointer with all page offset bits cleared so
++	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
++	   before page cross (guaranteed to be safe to read). Doing this
++	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
++	   a bit of code size.  */
++	xorq	%rdi, %rax
++	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
++	VPTESTN	%YMM1, %YMM1, %k0
++	kmovd	%k0, %ecx
++
++	/* Shift out zero CHAR matches that are before the beginning of
++	   src (rdi).  */
+ # ifdef USE_AS_WCSRCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
++	movl	%edi, %esi
++	andl	$(VEC_SIZE - 1), %esi
++	shrl	$2, %esi
+ # endif
+-	ret
++	shrxl	%SHIFT_REG, %ecx, %ecx
+ 
+-	.p2align 4
+-L(char_and_nul):
+-	/* Find both a CHAR and a null byte.  */
+-	addq	%rcx, %rdi
+-	movl	%edx, %ecx
+-L(char_and_nul_in_first_vec):
+-	/* Mask out any matching bits after the null byte.  */
+-	movl	%ecx, %r8d
+-	subl	$1, %r8d
+-	xorl	%ecx, %r8d
+-	andl	%r8d, %eax
+-	testl	%eax, %eax
+-	/* Return null pointer if the null byte comes first.  */
+-	jz	L(return_null)
++	testl	%ecx, %ecx
++	jz	L(page_cross_continue)
++
++	/* Found zero CHAR so need to test for search CHAR.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k1, %eax
++	/* Shift out search CHAR matches that are before the beginning of
++	   src (rdi).  */
++	shrxl	%SHIFT_REG, %eax, %eax
++
++	/* Check if any search CHAR match in range.  */
++	blsmskl	%ecx, %ecx
++	andl	%ecx, %eax
++	jz	L(ret3)
+ 	bsrl	%eax, %eax
+ # ifdef USE_AS_WCSRCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	leaq	-VEC_SIZE(%rdi, %rax), %rax
++	addq	%rdi, %rax
+ # endif
++L(ret3):
+ 	ret
+ 
+-	.p2align 4
+-L(return_null):
+-	xorl	%eax, %eax
+-	ret
+-
+-END (STRRCHR)
++END(STRRCHR)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-106.patch b/glibc-RHEL-15696-106.patch
new file mode 100644
index 0000000..f3bdb17
--- /dev/null
+++ b/glibc-RHEL-15696-106.patch
@@ -0,0 +1,73 @@
+From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 27 Apr 2022 15:13:02 -0500
+Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interace in fast-jitter.h
+Content-type: text/plain; charset=UTF-8
+
+'get_fast_jitter' is meant to be used purely for performance
+purposes. In all cases it's used it should be acceptable to get no
+randomness (see default case). An example use case is in setting
+jitter for retries between threads at a lock. There is a
+performance benefit to having jitter, but only if the jitter can
+be generated very quickly and ultimately there is no serious issue
+if no jitter is generated.
+
+The implementation generally uses 'HP_TIMING_NOW' iff it is
+inlined (avoid any potential syscall paths).
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++
+ 1 file changed, 42 insertions(+)
+ create mode 100644 sysdeps/generic/fast-jitter.h
+
+diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h
+new file mode 100644
+index 00000000..4dd53e34
+--- /dev/null
++++ b/sysdeps/generic/fast-jitter.h
+@@ -0,0 +1,42 @@
++/* Fallback for fast jitter just return 0.
++   Copyright (C) 2019-2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#ifndef _FAST_JITTER_H
++# define _FAST_JITTER_H
++
++# include <stdint.h>
++# include <hp-timing.h>
++
++/* Baseline just return 0.  We could create jitter using a clock or
++   'random_bits' but that may imply a syscall and the goal of
++   'get_fast_jitter' is minimal overhead "randomness" when such
++   randomness helps performance.  Adding high overhead to the function
++   defeats the purpose.  */
++static inline uint32_t
++get_fast_jitter (void)
++{
++# if HP_TIMING_INLINE
++  hp_timing_t jitter;
++  HP_TIMING_NOW (jitter);
++  return (uint32_t) jitter;
++# else
++  return 0;
++# endif
++}
++
++#endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-107.patch b/glibc-RHEL-15696-107.patch
new file mode 100644
index 0000000..738cc23
--- /dev/null
+++ b/glibc-RHEL-15696-107.patch
@@ -0,0 +1,226 @@
+From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
+From: Wangyang Guo <wangyang.guo@intel.com>
+Date: Fri, 6 May 2022 01:50:10 +0000
+Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
+Content-type: text/plain; charset=UTF-8
+
+When multiple threads are waiting for a lock at the same time, once the
+lock owner releases it, the waiters all see the lock available and try to
+acquire it, which may cause an expensive CAS storm.
+
+Binary exponential backoff with random jitter is introduced. As try-lock
+attempt increases, there is more likely that a larger number threads
+compete for adaptive mutex lock, so increase wait time in exponential.
+A random jitter is also added to avoid synchronous try-lock from other
+threads.
+
+v2: Remove read-check before try-lock for performance.
+
+v3:
+1. Restore read-check since it works well in some platform.
+2. Make backoff arch dependent, and enable it for x86_64.
+3. Limit max backoff to reduce latency in large critical section.
+
+v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h
+
+v5: Commit log updated for regression in large critical section.
+
+Result of pthread-mutex-locks bench
+
+Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
+First Row: thread number
+First Col: critical section length
+Values: backoff vs upstream, time based, low is better
+
+non-critical-length: 1
+	1	2	4	8	16	32	64	112	140
+0	0.99	0.58	0.52	0.49	0.43	0.44	0.46	0.52	0.54
+1	0.98	0.43	0.56	0.50	0.44	0.45	0.50	0.56	0.57
+2	0.99	0.41	0.57	0.51	0.45	0.47	0.48	0.60	0.61
+4	0.99	0.45	0.59	0.53	0.48	0.49	0.52	0.64	0.65
+8	1.00	0.66	0.71	0.63	0.56	0.59	0.66	0.72	0.71
+16	0.97	0.78	0.91	0.73	0.67	0.70	0.79	0.80	0.80
+32	0.95	1.17	0.98	0.87	0.82	0.86	0.89	0.90	0.90
+64	0.96	0.95	1.01	1.01	0.98	1.00	1.03	0.99	0.99
+128	0.99	1.01	1.01	1.17	1.08	1.12	1.02	0.97	1.02
+
+non-critical-length: 32
+	1	2	4	8	16	32	64	112	140
+0	1.03	0.97	0.75	0.65	0.58	0.58	0.56	0.70	0.70
+1	0.94	0.95	0.76	0.65	0.58	0.58	0.61	0.71	0.72
+2	0.97	0.96	0.77	0.66	0.58	0.59	0.62	0.74	0.74
+4	0.99	0.96	0.78	0.66	0.60	0.61	0.66	0.76	0.77
+8	0.99	0.99	0.84	0.70	0.64	0.66	0.71	0.80	0.80
+16	0.98	0.97	0.95	0.76	0.70	0.73	0.81	0.85	0.84
+32	1.04	1.12	1.04	0.89	0.82	0.86	0.93	0.91	0.91
+64	0.99	1.15	1.07	1.00	0.99	1.01	1.05	0.99	0.99
+128	1.00	1.21	1.20	1.22	1.25	1.31	1.12	1.10	0.99
+
+non-critical-length: 128
+	1	2	4	8	16	32	64	112	140
+0	1.02	1.00	0.99	0.67	0.61	0.61	0.61	0.74	0.73
+1	0.95	0.99	1.00	0.68	0.61	0.60	0.60	0.74	0.74
+2	1.00	1.04	1.00	0.68	0.59	0.61	0.65	0.76	0.76
+4	1.00	0.96	0.98	0.70	0.63	0.63	0.67	0.78	0.77
+8	1.01	1.02	0.89	0.73	0.65	0.67	0.71	0.81	0.80
+16	0.99	0.96	0.96	0.79	0.71	0.73	0.80	0.84	0.84
+32	0.99	0.95	1.05	0.89	0.84	0.85	0.94	0.92	0.91
+64	1.00	0.99	1.16	1.04	1.00	1.02	1.06	0.99	0.99
+128	1.00	1.06	0.98	1.14	1.39	1.26	1.08	1.02	0.98
+
+There is regression in large critical section. But adaptive mutex is
+aimed for "quick" locks. Small critical section is more common when
+users choose to use adaptive pthread_mutex.
+
+Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+
+Conflicts:
+	pthreadP.h
+	(had been moved)
+	nptl/pthread_mutex_lock.c
+	(max_adaptive_count renamed)
+
+---
+ nptl/pthreadP.h                             |  1 +
+ nptl/pthread_mutex_lock.c                   | 16 +++++++--
+ sysdeps/nptl/pthread_mutex_backoff.h        | 35 ++++++++++++++++++
+ sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
+ 4 files changed, 89 insertions(+), 2 deletions(-)
+ create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
+ create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h
+
+diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
+index 7ddc166c..1550e3b6 100644
+--- a/nptl/pthreadP.h
++++ b/nptl/pthreadP.h
+@@ -33,6 +33,7 @@
+ #include <kernel-features.h>
+ #include <errno.h>
+ #include <internal-signals.h>
++#include <pthread_mutex_backoff.h>
+ 
+ 
+ /* Atomic operations on TLS memory.  */
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index d96a9933..c7770fc9 100644
+--- a/nptl/pthread_mutex_lock.c
++++ b/nptl/pthread_mutex_lock.c
+@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 	  int cnt = 0;
+ 	  int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
+ 			     mutex->__data.__spins * 2 + 10);
++	  int spin_count, exp_backoff = 1;
++	  unsigned int jitter = get_jitter ();
+ 	  do
+ 	    {
+-	      if (cnt++ >= max_cnt)
++	      /* In each loop, spin count is exponential backoff plus
++		 random jitter, random range is [0, exp_backoff-1].  */
++	      spin_count = exp_backoff + (jitter & (exp_backoff - 1));
++	      cnt += spin_count;
++	      if (cnt >= max_cnt)
+ 		{
++		  /* If cnt exceeds max spin count, just go to wait
++		     queue.  */
+ 		  LLL_MUTEX_LOCK (mutex);
+ 		  break;
+ 		}
+-	      atomic_spin_nop ();
++	      do
++		atomic_spin_nop ();
++	      while (--spin_count > 0);
++	      /* Prepare for next loop.  */
++	      exp_backoff = get_next_backoff (exp_backoff);
+ 	    }
+ 	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
+ 		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
+diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
+new file mode 100644
+index 00000000..5b26c22a
+--- /dev/null
++++ b/sysdeps/nptl/pthread_mutex_backoff.h
+@@ -0,0 +1,35 @@
++/* Pthread mutex backoff configuration.
++   Copyright (C) 2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++#ifndef _PTHREAD_MUTEX_BACKOFF_H
++#define _PTHREAD_MUTEX_BACKOFF_H 1
++
++static inline unsigned int
++get_jitter (void)
++{
++  /* Arch dependent random jitter, return 0 disables random.  */
++  return 0;
++}
++
++static inline int
++get_next_backoff (int backoff)
++{
++  /* Next backoff, return 1 disables mutex backoff.  */
++  return 1;
++}
++
++#endif
+diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
+new file mode 100644
+index 00000000..ec74c3d9
+--- /dev/null
++++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
+@@ -0,0 +1,39 @@
++/* Pthread mutex backoff configuration.
++   Copyright (C) 2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++#ifndef _PTHREAD_MUTEX_BACKOFF_H
++#define _PTHREAD_MUTEX_BACKOFF_H 1
++
++#include <fast-jitter.h>
++
++static inline unsigned int
++get_jitter (void)
++{
++  return get_fast_jitter ();
++}
++
++#define MAX_BACKOFF 16
++
++static inline int
++get_next_backoff (int backoff)
++{
++  /* Binary exponential backoff. Limiting max backoff
++     can reduce latency in large critical section.  */
++  return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
++}
++
++#endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-108.patch b/glibc-RHEL-15696-108.patch
new file mode 100644
index 0000000..17bf7d8
--- /dev/null
+++ b/glibc-RHEL-15696-108.patch
@@ -0,0 +1,55 @@
+From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 15 Feb 2022 08:18:15 -0600
+Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
+ #28896]
+Content-type: text/plain; charset=UTF-8
+
+In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
+call strcmp-avx2 and wcscmp-avx2 respectively. These would have
+no checks around vzeroupper and would trigger spurious
+aborts. This commit fixes that.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
+AVX2 machines with and without RTM.
+
+Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
+ 1 file changed, 2 insertions(+), 6 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strcmp-avx2.S
+	(split into two patches due to upstream bug differences)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 28cc98b6..e267c6cb 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -345,10 +345,10 @@ L(one_or_less):
+ 	movq	%LOCALE_REG, %rdx
+ #  endif
+ 	jb	L(ret_zero)
+-#  ifdef USE_AS_WCSCMP
+ 	/* 'nbe' covers the case where length is negative (large
+ 	   unsigned).  */
+-	jnbe	__wcscmp_avx2
++	jnbe	OVERFLOW_STRCMP
++#  ifdef USE_AS_WCSCMP
+ 	movl	(%rdi), %edx
+ 	xorl	%eax, %eax
+ 	cmpl	(%rsi), %edx
+@@ -357,10 +357,6 @@ L(one_or_less):
+ 	negl	%eax
+ 	orl	$1, %eax
+ #  else
+-	/* 'nbe' covers the case where length is negative (large
+-	   unsigned).  */
+-
+-	jnbe	__strcmp_avx2
+ 	movzbl	(%rdi), %eax
+ 	movzbl	(%rsi), %ecx
+ 	TOLOWER_gpr (%rax, %eax)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-109.patch b/glibc-RHEL-15696-109.patch
new file mode 100644
index 0000000..8aaa314
--- /dev/null
+++ b/glibc-RHEL-15696-109.patch
@@ -0,0 +1,60 @@
+From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
+From: Stefan Liebler <stli@linux.ibm.com>
+Date: Mon, 28 Jun 2021 13:01:07 +0200
+Subject: s390x: Update math: redirect roundeven function
+
+After recent commit
+447954a206837b5f153869cfeeeab44631c3fac9
+"math: redirect roundeven function", building on
+s390x fails with:
+Error: symbol `__roundevenl' is already defined
+
+Similar to aarch64/riscv fix, this patch redirects target
+specific functions for s390x:
+commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
+"Update math: redirect roundeven function"
+
+diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
+index 40b07e054b..0773adfed0 100644
+--- a/sysdeps/s390/fpu/s_roundeven.c
++++ b/sysdeps/s390/fpu/s_roundeven.c
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
++# define NO_MATH_REDIRECT
+ # include <math.h>
+ # include <libm-alias-double.h>
+ 
+@@ -31,7 +32,6 @@ __roundeven (double x)
+   __asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
+   return y;
+ }
+-hidden_def (__roundeven)
+ libm_alias_double (__roundeven, roundeven)
+ 
+ #else
+diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
+index d2fbf3d2b6..289785bc4a 100644
+--- a/sysdeps/s390/fpu/s_roundevenf.c
++++ b/sysdeps/s390/fpu/s_roundevenf.c
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
++# define NO_MATH_REDIRECT
+ # include <math.h>
+ # include <libm-alias-float.h>
+ 
+diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
+index 29ab7a8616..94b6459ab4 100644
+--- a/sysdeps/s390/fpu/s_roundevenl.c
++++ b/sysdeps/s390/fpu/s_roundevenl.c
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
++# define NO_MATH_REDIRECT
+ # include <math.h>
+ # include <math_private.h>
+ # include <libm-alias-ldouble.h>
diff --git a/glibc-RHEL-15696-11.patch b/glibc-RHEL-15696-11.patch
new file mode 100644
index 0000000..54d7eff
--- /dev/null
+++ b/glibc-RHEL-15696-11.patch
@@ -0,0 +1,74 @@
+From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 26 Feb 2021 05:36:59 -0800
+Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
+Content-type: text/plain; charset=UTF-8
+
+1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
+by VZEROUPPER inside a transactionally executing RTM region.
+2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
+loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
+1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp.  Add
+Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
+---
+ sysdeps/x86/cpu-features.c                    | 20 +++++++++++++++++--
+ sysdeps/x86/cpu-tunables.c                    |  2 ++
+ ...cpu-features-preferred_feature_index_1.def |  1 +
+ 3 files changed, 21 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 91042505..3610ee5c 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
+ 	cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+ 	  |= bit_arch_Prefer_No_VZEROUPPER;
+       else
+-	cpu_features->preferred[index_arch_Prefer_No_AVX512]
+-	  |= bit_arch_Prefer_No_AVX512;
++	{
++	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
++	    |= bit_arch_Prefer_No_AVX512;
++
++	  /* Avoid RTM abort triggered by VZEROUPPER inside a
++	     transactionally executing RTM region.  */
++	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
++	      |= bit_arch_Prefer_No_VZEROUPPER;
++
++	  /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
++	     requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
++	     requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
++	     AVX2 strcmp is faster than EVEX strcmp.  */
++	  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
++	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
++	      |= bit_arch_Prefer_AVX2_STRCMP;
++	}
+     }
+   /* This spells out "AuthenticAMD".  */
+   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
+index 3173b2b9..73adbaba 100644
+--- a/sysdeps/x86/cpu-tunables.c
++++ b/sysdeps/x86/cpu-tunables.c
+@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
+ 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ 						Fast_Copy_Backward,
+ 						disable, 18);
++	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
++		(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
+ 	    }
+ 	  break;
+ 	case 19:
+diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+index 17a5cc42..4ca70b40 100644
+--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
+ BIT (Prefer_FSRM)
+ BIT (Prefer_No_AVX512)
+ BIT (MathVec_Prefer_No_AVX512)
++BIT (Prefer_AVX2_STRCMP)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-110.patch b/glibc-RHEL-15696-110.patch
new file mode 100644
index 0000000..c499761
--- /dev/null
+++ b/glibc-RHEL-15696-110.patch
@@ -0,0 +1,26 @@
+From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 23 Jun 2021 13:29:41 -0700
+Subject: Update math: redirect roundeven function
+
+Redirect target specific roundeven functions for aarch64, ldbl-128ibm
+and riscv.
+
+Conflicts:
+	sysdeps/aarch64/*
+	(not needed)
+	sysdeps/riscv/*
+	(not supported)
+
+diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+index 6701970f4a..90eecf496b 100644
+--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
++++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ 
diff --git a/glibc-RHEL-15696-12.patch b/glibc-RHEL-15696-12.patch
new file mode 100644
index 0000000..85b568e
--- /dev/null
+++ b/glibc-RHEL-15696-12.patch
@@ -0,0 +1,3410 @@
+From 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 06:24:52 -0800
+Subject: [PATCH] x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-avx2.h, strchr.c, strcmp.c, strncmp.c and wcsnlen.c to
+select the function optimized with 256-bit EVEX instructions using
+YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL, AVX512BW
+and BMI2 since VZEROUPPER isn't needed at function exit.
+
+For strcmp/strncmp, prefer AVX2 strcmp/strncmp if Prefer_AVX2_STRCMP
+is set.
+---
+ sysdeps/x86_64/multiarch/Makefile          |   21 +-
+ sysdeps/x86_64/multiarch/ifunc-avx2.h      |   14 +-
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |   81 ++
+ sysdeps/x86_64/multiarch/memchr-evex.S     |  381 +++++++
+ sysdeps/x86_64/multiarch/memrchr-evex.S    |  337 +++++++
+ sysdeps/x86_64/multiarch/rawmemchr-evex.S  |    4 +
+ sysdeps/x86_64/multiarch/strchr-evex.S     |  335 +++++++
+ sysdeps/x86_64/multiarch/strchr.c          |   14 +-
+ sysdeps/x86_64/multiarch/strchrnul-evex.S  |    3 +
+ sysdeps/x86_64/multiarch/strcmp-evex.S     | 1043 ++++++++++++++++++++
+ sysdeps/x86_64/multiarch/strcmp.c          |   15 +-
+ sysdeps/x86_64/multiarch/strlen-evex.S     |  436 ++++++++
+ sysdeps/x86_64/multiarch/strncmp-evex.S    |    3 +
+ sysdeps/x86_64/multiarch/strncmp.c         |   15 +-
+ sysdeps/x86_64/multiarch/strnlen-evex.S    |    4 +
+ sysdeps/x86_64/multiarch/strrchr-evex.S    |  265 +++++
+ sysdeps/x86_64/multiarch/wcschr-evex.S     |    3 +
+ sysdeps/x86_64/multiarch/wcscmp-evex.S     |    4 +
+ sysdeps/x86_64/multiarch/wcslen-evex.S     |    4 +
+ sysdeps/x86_64/multiarch/wcsncmp-evex.S    |    5 +
+ sysdeps/x86_64/multiarch/wcsnlen-evex.S    |    5 +
+ sysdeps/x86_64/multiarch/wcsnlen.c         |   14 +-
+ sysdeps/x86_64/multiarch/wcsrchr-evex.S    |    3 +
+ sysdeps/x86_64/multiarch/wmemchr-evex.S    |    4 +
+ 24 files changed, 2996 insertions(+), 17 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/memrchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcmp-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strlen-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncmp-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcscmp-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex.S
+
+Conflicts:
+	sysdeps/x86_64/multiarch/wcsnlen.c
+	(account for missing upstream macros)
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 9477538a..5ce85882 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -39,7 +39,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memmove-avx512-unaligned-erms \
+ 		   memset-sse2-unaligned-erms \
+ 		   memset-avx2-unaligned-erms \
+-		   memset-avx512-unaligned-erms
++		   memset-avx512-unaligned-erms \
++		   memchr-evex \
++		   memrchr-evex \
++		   rawmemchr-evex \
++		   strchr-evex \
++		   strchrnul-evex \
++		   strcmp-evex \
++		   strlen-evex \
++		   strncmp-evex \
++		   strnlen-evex \
++		   strrchr-evex
+ CFLAGS-varshift.c += -msse4
+ CFLAGS-strcspn-c.c += -msse4
+ CFLAGS-strpbrk-c.c += -msse4
+@@ -56,7 +66,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcschr-sse2 wcschr-avx2 \
+ 		   wcsrchr-sse2 wcsrchr-avx2 \
+ 		   wcsnlen-sse4_1 wcsnlen-c \
+-		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2
++		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
++		   wcschr-evex \
++		   wcscmp-evex \
++		   wcslen-evex \
++		   wcsncmp-evex \
++		   wcsnlen-evex \
++		   wcsrchr-evex \
++		   wmemchr-evex
+ endif
+ 
+ ifeq ($(subdir),debug)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+index 5c88640a..7081b0c9 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+@@ -21,16 +21,24 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   return OPTIMIZE (sse2);
+ }
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index fe13505c..bd7d9f19 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -43,6 +43,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, memchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __memchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
+@@ -121,6 +126,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, memrchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __memrchr_evex)
++
+ 	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
+ 
+ #ifdef SHARED
+@@ -179,6 +189,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __rawmemchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, rawmemchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __rawmemchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+@@ -186,6 +201,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, strlen,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
+@@ -193,6 +212,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strnlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, strnlen,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strnlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
+@@ -255,6 +278,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, strchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __strchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
+ 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
+ 
+@@ -263,6 +291,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strchrnul_avx2)
++	      IFUNC_IMPL_ADD (array, i, strchrnul,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __strchrnul_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strrchr.c.  */
+@@ -270,6 +303,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, strrchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strrchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strcmp.c.  */
+@@ -277,6 +314,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strcmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strcmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __strcmp_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
+ 			      __strcmp_sse42)
+ 	      IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
+@@ -370,6 +412,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcschr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcschr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcschr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsrchr.c.  */
+@@ -377,6 +424,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsrchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcsrchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcscmp.c.  */
+@@ -384,6 +436,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcscmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcscmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcscmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcscmp_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsncmp.c.  */
+@@ -391,6 +448,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsncmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsncmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsncmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcsncmp_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
+@@ -404,6 +466,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcslen_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcslen,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcslen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+@@ -411,6 +478,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsnlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsnlen,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wcsnlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      CPU_FEATURE_USABLE (SSE4_1),
+ 			      __wcsnlen_sse4_1)
+@@ -421,6 +493,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wmemchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wmemchr_evex)
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
+@@ -568,6 +645,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strncmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strncmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strncmp_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
+ 			      __strncmp_sse42)
+ 	      IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+new file mode 100644
+index 00000000..6dd5d67b
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -0,0 +1,381 @@
++/* memchr/wmemchr optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef MEMCHR
++#  define MEMCHR	__memchr_evex
++# endif
++
++# ifdef USE_AS_WMEMCHR
++#  define VPBROADCAST	vpbroadcastd
++#  define VPCMP		vpcmpd
++#  define SHIFT_REG	r8d
++# else
++#  define VPBROADCAST	vpbroadcastb
++#  define VPCMP		vpcmpb
++#  define SHIFT_REG	ecx
++# endif
++
++# define XMMMATCH	xmm16
++# define YMMMATCH	ymm16
++# define YMM1		ymm17
++# define YMM2		ymm18
++# define YMM3		ymm19
++# define YMM4		ymm20
++# define YMM5		ymm21
++# define YMM6		ymm22
++
++# define VEC_SIZE 32
++
++	.section .text.evex,"ax",@progbits
++ENTRY (MEMCHR)
++# ifndef USE_AS_RAWMEMCHR
++	/* Check for zero length.  */
++	test	%RDX_LP, %RDX_LP
++	jz	L(zero)
++# endif
++	movl	%edi, %ecx
++# ifdef USE_AS_WMEMCHR
++	shl	$2, %RDX_LP
++# else
++#  ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++#  endif
++# endif
++	/* Broadcast CHAR to YMMMATCH.  */
++	VPBROADCAST %esi, %YMMMATCH
++	/* Check if we may cross page boundary with one vector load.  */
++	andl	$(2 * VEC_SIZE - 1), %ecx
++	cmpl	$VEC_SIZE, %ecx
++	ja	L(cros_page_boundary)
++
++	/* Check the first VEC_SIZE bytes.  */
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++
++# ifndef USE_AS_RAWMEMCHR
++	jnz	L(first_vec_x0_check)
++	/* Adjust length and check the end of data.  */
++	subq	$VEC_SIZE, %rdx
++	jbe	L(zero)
++# else
++	jnz	L(first_vec_x0)
++# endif
++
++	/* Align data for aligned loads in the loop.  */
++	addq	$VEC_SIZE, %rdi
++	andl	$(VEC_SIZE - 1), %ecx
++	andq	$-VEC_SIZE, %rdi
++
++# ifndef USE_AS_RAWMEMCHR
++	/* Adjust length.  */
++	addq	%rcx, %rdx
++
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_4x_vec_or_less)
++# endif
++	jmp	L(more_4x_vec)
++
++	.p2align 4
++L(cros_page_boundary):
++	andl	$(VEC_SIZE - 1), %ecx
++# ifdef USE_AS_WMEMCHR
++	/* NB: Divide shift count by 4 since each bit in K1 represents 4
++	   bytes.  */
++	movl	%ecx, %SHIFT_REG
++	sarl	$2, %SHIFT_REG
++# endif
++	andq	$-VEC_SIZE, %rdi
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	/* Remove the leading bytes.  */
++	sarxl	%SHIFT_REG, %eax, %eax
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++# ifndef USE_AS_RAWMEMCHR
++	/* Check the end of data.  */
++	cmpq	%rax, %rdx
++	jbe	L(zero)
++# endif
++	addq	%rdi, %rax
++	addq	%rcx, %rax
++	ret
++
++	.p2align 4
++L(aligned_more):
++# ifndef USE_AS_RAWMEMCHR
++        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
++	   instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
++	   overflow.  */
++	negq	%rcx
++	addq	$VEC_SIZE, %rcx
++
++	/* Check the end of data.  */
++	subq	%rcx, %rdx
++	jbe	L(zero)
++# endif
++
++	addq	$VEC_SIZE, %rdi
++
++# ifndef USE_AS_RAWMEMCHR
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_4x_vec_or_less)
++# endif
++
++L(more_4x_vec):
++	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.  */
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x3)
++
++	addq	$(VEC_SIZE * 4), %rdi
++
++# ifndef USE_AS_RAWMEMCHR
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_4x_vec_or_less)
++# endif
++
++	/* Align data to 4 * VEC_SIZE.  */
++	movq	%rdi, %rcx
++	andl	$(4 * VEC_SIZE - 1), %ecx
++	andq	$-(4 * VEC_SIZE), %rdi
++
++# ifndef USE_AS_RAWMEMCHR
++	/* Adjust length.  */
++	addq	%rcx, %rdx
++# endif
++
++	.p2align 4
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
++	kord	%k1, %k2, %k5
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
++
++	kord	%k3, %k4, %k6
++	kortestd %k5, %k6
++	jnz	L(4x_vec_end)
++
++	addq	$(VEC_SIZE * 4), %rdi
++
++# ifdef USE_AS_RAWMEMCHR
++	jmp	L(loop_4x_vec)
++# else
++	subq	$(VEC_SIZE * 4), %rdx
++	ja	L(loop_4x_vec)
++
++L(last_4x_vec_or_less):
++	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
++	addl	$(VEC_SIZE * 2), %edx
++	jle	L(last_2x_vec)
++
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++
++	jnz	L(first_vec_x2_check)
++	subl	$VEC_SIZE, %edx
++	jle	L(zero)
++
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++
++	jnz	L(first_vec_x3_check)
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(last_2x_vec):
++	addl	$(VEC_SIZE * 2), %edx
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++
++	jnz	L(first_vec_x0_check)
++	subl	$VEC_SIZE, %edx
++	jle	L(zero)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1_check)
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(first_vec_x0_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rdx
++	jbe	L(zero)
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(first_vec_x1_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rdx
++	jbe	L(zero)
++	addq	$VEC_SIZE, %rax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(first_vec_x2_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rdx
++	jbe	L(zero)
++	addq	$(VEC_SIZE * 2), %rax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(first_vec_x3_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rdx
++	jbe	L(zero)
++	addq	$(VEC_SIZE * 3), %rax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
++	ret
++# endif
++
++	.p2align 4
++L(first_vec_x0):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(%rdi, %rax, 4), %rax
++# else
++	addq	%rdi, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
++# else
++	addq	$VEC_SIZE, %rax
++	addq	%rdi, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
++# else
++	addq	$(VEC_SIZE * 2), %rax
++	addq	%rdi, %rax
++# endif
++	ret
++
++	.p2align 4
++L(4x_vec_end):
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++	kmovd	%k2, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++	kmovd	%k3, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++	kmovd	%k4, %eax
++	testl	%eax, %eax
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
++# else
++	addq	$(VEC_SIZE * 3), %rax
++	addq	%rdi, %rax
++# endif
++	ret
++
++END (MEMCHR)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
+new file mode 100644
+index 00000000..16bf8e02
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
+@@ -0,0 +1,337 @@
++/* memrchr optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# define VMOVA		vmovdqa64
++
++# define YMMMATCH	ymm16
++
++# define VEC_SIZE 32
++
++	.section .text.evex,"ax",@progbits
++ENTRY (__memrchr_evex)
++	/* Broadcast CHAR to YMMMATCH.  */
++	vpbroadcastb %esi, %YMMMATCH
++
++	sub	$VEC_SIZE, %RDX_LP
++	jbe	L(last_vec_or_less)
++
++	add	%RDX_LP, %RDI_LP
++
++	/* Check the last VEC_SIZE bytes.  */
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x0)
++
++	subq	$(VEC_SIZE * 4), %rdi
++	movl	%edi, %ecx
++	andl	$(VEC_SIZE - 1), %ecx
++	jz	L(aligned_more)
++
++	/* Align data for aligned loads in the loop.  */
++	addq	$VEC_SIZE, %rdi
++	addq	$VEC_SIZE, %rdx
++	andq	$-VEC_SIZE, %rdi
++	subq	%rcx, %rdx
++
++	.p2align 4
++L(aligned_more):
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_4x_vec_or_less)
++
++	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.  */
++	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x3)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
++	kmovd	%k2, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2)
++
++	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
++	kmovd	%k3, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x1)
++
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
++	kmovd	%k4, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x0)
++
++	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
++	   There are some overlaps with above if data isn't aligned
++	   to 4 * VEC_SIZE.  */
++	movl	%edi, %ecx
++	andl	$(VEC_SIZE * 4 - 1), %ecx
++	jz	L(loop_4x_vec)
++
++	addq	$(VEC_SIZE * 4), %rdi
++	addq	$(VEC_SIZE * 4), %rdx
++	andq	$-(VEC_SIZE * 4), %rdi
++	subq	%rcx, %rdx
++
++	.p2align 4
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time backward.  */
++	subq	$(VEC_SIZE * 4), %rdi
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_4x_vec_or_less)
++
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
++	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
++	kord	%k1, %k2, %k5
++	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
++	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
++
++	kord	%k3, %k4, %k6
++	kortestd %k5, %k6
++	jz	L(loop_4x_vec)
++
++	/* There is a match.  */
++	kmovd	%k4, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x3)
++
++	kmovd	%k3, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2)
++
++	kmovd	%k2, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x1)
++
++	kmovd	%k1, %eax
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_4x_vec_or_less):
++	addl	$(VEC_SIZE * 4), %edx
++	cmpl	$(VEC_SIZE * 2), %edx
++	jbe	L(last_2x_vec)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x3)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
++	kmovd	%k2, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2)
++
++	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
++	kmovd	%k3, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x1_check)
++	cmpl	$(VEC_SIZE * 3), %edx
++	jbe	L(zero)
++
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
++	kmovd	%k4, %eax
++	testl	%eax, %eax
++	jz	L(zero)
++	bsrl	%eax, %eax
++	subq	$(VEC_SIZE * 4), %rdx
++	addq	%rax, %rdx
++	jl	L(zero)
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_2x_vec):
++	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x3_check)
++	cmpl	$VEC_SIZE, %edx
++	jbe	L(zero)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jz	L(zero)
++	bsrl	%eax, %eax
++	subq	$(VEC_SIZE * 2), %rdx
++	addq	%rax, %rdx
++	jl	L(zero)
++	addl	$(VEC_SIZE * 2), %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x0):
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x1):
++	bsrl	%eax, %eax
++	addl	$VEC_SIZE, %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x2):
++	bsrl	%eax, %eax
++	addl	$(VEC_SIZE * 2), %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x3):
++	bsrl	%eax, %eax
++	addl	$(VEC_SIZE * 3), %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x1_check):
++	bsrl	%eax, %eax
++	subq	$(VEC_SIZE * 3), %rdx
++	addq	%rax, %rdx
++	jl	L(zero)
++	addl	$VEC_SIZE, %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_x3_check):
++	bsrl	%eax, %eax
++	subq	$VEC_SIZE, %rdx
++	addq	%rax, %rdx
++	jl	L(zero)
++	addl	$(VEC_SIZE * 3), %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(last_vec_or_less_aligned):
++	movl	%edx, %ecx
++
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
++
++	movl	$1, %edx
++	/* Support rdx << 32.  */
++	salq	%cl, %rdx
++	subq	$1, %rdx
++
++	kmovd	%k1, %eax
++
++	/* Remove the trailing bytes.  */
++	andl	%edx, %eax
++	testl	%eax, %eax
++	jz	L(zero)
++
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	ret
++
++	.p2align 4
++L(last_vec_or_less):
++	addl	$VEC_SIZE, %edx
++
++	/* Check for zero length.  */
++	testl	%edx, %edx
++	jz	L(zero)
++
++	movl	%edi, %ecx
++	andl	$(VEC_SIZE - 1), %ecx
++	jz	L(last_vec_or_less_aligned)
++
++	movl	%ecx, %esi
++	movl	%ecx, %r8d
++	addl	%edx, %esi
++	andq	$-VEC_SIZE, %rdi
++
++	subl	$VEC_SIZE, %esi
++	ja	L(last_vec_2x_aligned)
++
++	/* Check the last VEC.  */
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
++	kmovd	%k1, %eax
++
++	/* Remove the leading and trailing bytes.  */
++	sarl	%cl, %eax
++	movl	%edx, %ecx
++
++	movl	$1, %edx
++	sall	%cl, %edx
++	subl	$1, %edx
++
++	andl	%edx, %eax
++	testl	%eax, %eax
++	jz	L(zero)
++
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	addq	%r8, %rax
++	ret
++
++	.p2align 4
++L(last_vec_2x_aligned):
++	movl	%esi, %ecx
++
++	/* Check the last VEC.  */
++	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
++
++	movl	$1, %edx
++	sall	%cl, %edx
++	subl	$1, %edx
++
++	kmovd	%k1, %eax
++
++	/* Remove the trailing bytes.  */
++	andl	%edx, %eax
++
++	testl	%eax, %eax
++	jnz	L(last_vec_x1)
++
++	/* Check the second last VEC.  */
++	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
++
++	movl	%r8d, %ecx
++
++	kmovd	%k1, %eax
++
++	/* Remove the leading bytes.  Must use unsigned right shift for
++	   bsrl below.  */
++	shrl	%cl, %eax
++	testl	%eax, %eax
++	jz	L(zero)
++
++	bsrl	%eax, %eax
++	addq	%rdi, %rax
++	addq	%r8, %rax
++	ret
++END (__memrchr_evex)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+new file mode 100644
+index 00000000..ec942b77
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+@@ -0,0 +1,4 @@
++#define MEMCHR __rawmemchr_evex
++#define USE_AS_RAWMEMCHR 1
++
++#include "memchr-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
+new file mode 100644
+index 00000000..ddc86a70
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strchr-evex.S
+@@ -0,0 +1,335 @@
++/* strchr/strchrnul optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef STRCHR
++#  define STRCHR	__strchr_evex
++# endif
++
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++
++# ifdef USE_AS_WCSCHR
++#  define VPBROADCAST	vpbroadcastd
++#  define VPCMP		vpcmpd
++#  define VPMINU	vpminud
++#  define CHAR_REG	esi
++#  define SHIFT_REG	r8d
++# else
++#  define VPBROADCAST	vpbroadcastb
++#  define VPCMP		vpcmpb
++#  define VPMINU	vpminub
++#  define CHAR_REG	sil
++#  define SHIFT_REG	ecx
++# endif
++
++# define XMMZERO	xmm16
++
++# define YMMZERO	ymm16
++# define YMM0		ymm17
++# define YMM1		ymm18
++# define YMM2		ymm19
++# define YMM3		ymm20
++# define YMM4		ymm21
++# define YMM5		ymm22
++# define YMM6		ymm23
++# define YMM7		ymm24
++# define YMM8		ymm25
++
++# define VEC_SIZE 32
++# define PAGE_SIZE 4096
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRCHR)
++	movl	%edi, %ecx
++# ifndef USE_AS_STRCHRNUL
++	xorl	%edx, %edx
++# endif
++
++	/* Broadcast CHAR to YMM0.	*/
++	VPBROADCAST %esi, %YMM0
++
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++
++	/* Check if we cross page boundary with one vector load.  */
++	andl	$(PAGE_SIZE - 1), %ecx
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
++	ja  L(cross_page_boundary)
++
++	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
++	   null bytes.  */
++	VMOVU	(%rdi), %YMM1
++
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	ktestd	%k0, %k0
++	jz	L(more_vecs)
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
++	/* Found CHAR or the null byte.	 */
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(%rdi, %rax, 4), %rax
++# else
++	addq	%rdi, %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++	.p2align 4
++L(more_vecs):
++	/* Align data for aligned loads in the loop.  */
++	andq	$-VEC_SIZE, %rdi
++L(aligned_more):
++
++	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.	*/
++	VMOVA	VEC_SIZE(%rdi), %YMM1
++	addq	$VEC_SIZE, %rdi
++
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	VMOVA	VEC_SIZE(%rdi), %YMM1
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++
++	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	ktestd	%k0, %k0
++	jz	L(prep_loop_4x)
++
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
++	/* Found CHAR or the null byte.	 */
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
++# else
++	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x0):
++	tzcntl	%eax, %eax
++	/* Found CHAR or the null byte.	 */
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(%rdi, %rax, 4), %rax
++# else
++	addq	%rdi, %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++	/* Found CHAR or the null byte.	 */
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
++# else
++	leaq	VEC_SIZE(%rdi, %rax), %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++	/* Found CHAR or the null byte.	 */
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
++# else
++	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++L(prep_loop_4x):
++	/* Align data to 4 * VEC_SIZE.	*/
++	andq	$-(VEC_SIZE * 4), %rdi
++
++	.p2align 4
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
++	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
++	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
++	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
++
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM5
++	vpxorq	%YMM2, %YMM0, %YMM6
++	vpxorq	%YMM3, %YMM0, %YMM7
++	vpxorq	%YMM4, %YMM0, %YMM8
++
++	VPMINU	%YMM5, %YMM1, %YMM5
++	VPMINU	%YMM6, %YMM2, %YMM6
++	VPMINU	%YMM7, %YMM3, %YMM7
++	VPMINU	%YMM8, %YMM4, %YMM8
++
++	VPMINU	%YMM5, %YMM6, %YMM1
++	VPMINU	%YMM7, %YMM8, %YMM2
++
++	VPMINU	%YMM1, %YMM2, %YMM1
++
++	/* Each bit in K0 represents a CHAR or a null byte.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++
++	addq	$(VEC_SIZE * 4), %rdi
++
++	ktestd	%k0, %k0
++	jz	L(loop_4x_vec)
++
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM5, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
++	VPCMP	$0, %YMMZERO, %YMM6, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
++	VPCMP	$0, %YMMZERO, %YMM7, %k2
++	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
++	VPCMP	$0, %YMMZERO, %YMM8, %k3
++
++# ifdef USE_AS_WCSCHR
++	/* NB: Each bit in K2/K3 represents 4-byte element.  */
++	kshiftlw $8, %k3, %k1
++# else
++	kshiftlq $32, %k3, %k1
++# endif
++
++	/* Each bit in K1 represents a CHAR or a null byte.  */
++	korq	%k1, %k2, %k1
++	kmovq	%k1, %rax
++
++	tzcntq  %rax, %rax
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
++# else
++	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++	/* Cold case for crossing page with first load.	 */
++	.p2align 4
++L(cross_page_boundary):
++	andq	$-VEC_SIZE, %rdi
++	andl	$(VEC_SIZE - 1), %ecx
++
++	VMOVA	(%rdi), %YMM1
++
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++
++# ifdef USE_AS_WCSCHR
++	/* NB: Divide shift count by 4 since each bit in K0 represents 4
++	   bytes.  */
++	movl	%ecx, %SHIFT_REG
++	sarl    $2, %SHIFT_REG
++# endif
++
++	/* Remove the leading bits.	 */
++	sarxl	%SHIFT_REG, %eax, %eax
++	testl	%eax, %eax
++
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++	addq	%rcx, %rdi
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	(%rdi, %rax, 4), %rax
++# else
++	addq	%rdi, %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	ret
++
++END (STRCHR)
++# endif
+diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
+index 32954713..be05e197 100644
+--- a/sysdeps/x86_64/multiarch/strchr.c
++++ b/sysdeps/x86_64/multiarch/strchr.c
+@@ -29,16 +29,24 @@
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF))
+     return OPTIMIZE (sse2_no_bsf);
+diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S
+new file mode 100644
+index 00000000..064fe7ca
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S
+@@ -0,0 +1,3 @@
++#define STRCHR __strchrnul_evex
++#define USE_AS_STRCHRNUL 1
++#include "strchr-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+new file mode 100644
+index 00000000..459eeed0
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -0,0 +1,1043 @@
++/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef STRCMP
++#  define STRCMP	__strcmp_evex
++# endif
++
++# define PAGE_SIZE	4096
++
++/* VEC_SIZE = Number of bytes in a ymm register */
++# define VEC_SIZE	32
++
++/* Shift for dividing by (VEC_SIZE * 4).  */
++# define DIVIDE_BY_VEC_4_SHIFT	7
++# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
++#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
++# endif
++
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++
++# ifdef USE_AS_WCSCMP
++/* Compare packed dwords.  */
++#  define VPCMP		vpcmpd
++#  define SHIFT_REG32	r8d
++#  define SHIFT_REG64	r8
++/* 1 dword char == 4 bytes.  */
++#  define SIZE_OF_CHAR	4
++# else
++/* Compare packed bytes.  */
++#  define VPCMP		vpcmpb
++#  define SHIFT_REG32	ecx
++#  define SHIFT_REG64	rcx
++/* 1 byte char == 1 byte.  */
++#  define SIZE_OF_CHAR	1
++# endif
++
++# define XMMZERO	xmm16
++# define XMM0		xmm17
++# define XMM1		xmm18
++
++# define YMMZERO	ymm16
++# define YMM0		ymm17
++# define YMM1		ymm18
++# define YMM2		ymm19
++# define YMM3		ymm20
++# define YMM4		ymm21
++# define YMM5		ymm22
++# define YMM6		ymm23
++# define YMM7		ymm24
++
++/* Warning!
++           wcscmp/wcsncmp have to use SIGNED comparison for elements.
++           strcmp/strncmp have to use UNSIGNED comparison for elements.
++*/
++
++/* The main idea of the string comparison (byte or dword) using 256-bit
++   EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
++   latter can be on either packed bytes or dwords depending on
++   USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
++   matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
++   KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
++   are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
++   instructions.  Main loop (away from page boundary) compares 4
++   vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
++   bytes) on each loop.
++
++   The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
++   is the same as strcmp, except that a maximum offset is tracked.  If
++   the maximum offset is reached before a difference is found, zero is
++   returned.  */
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRCMP)
++# ifdef USE_AS_STRNCMP
++	/* Check for simple cases (0 or 1) in offset.  */
++	cmp	$1, %RDX_LP
++	je	L(char0)
++	jb	L(zero)
++#  ifdef USE_AS_WCSCMP
++	/* Convert units: from wide to byte char.  */
++	shl	$2, %RDX_LP
++#  endif
++	/* Register %r11 tracks the maximum offset.  */
++	mov	%RDX_LP, %R11_LP
++# endif
++	movl	%edi, %eax
++	xorl	%edx, %edx
++	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++	orl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
++	jg	L(cross_page)
++	/* Start comparing 4 vectors.  */
++	VMOVU	(%rdi), %YMM0
++	VMOVU	(%rsi), %YMM1
++
++	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
++	VPCMP	$4, %YMM0, %YMM1, %k0
++
++	/* Check for NULL in YMM0.  */
++	VPCMP	$0, %YMMZERO, %YMM0, %k1
++	/* Check for NULL in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k2
++	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
++	kord	%k1, %k2, %k1
++
++	/* Each bit in K1 represents:
++	   1. A mismatch in YMM0 and YMM1.  Or
++	   2. A NULL in YMM0 or YMM1.
++	 */
++	kord	%k0, %k1, %k1
++
++	ktestd	%k1, %k1
++	je	L(next_3_vectors)
++	kmovd	%k1, %ecx
++	tzcntl	%ecx, %edx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edx
++# endif
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the mismatched index (%rdx) is after the maximum
++	   offset (%r11).   */
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++# ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rdx), %ecx
++	cmpl	(%rsi, %rdx), %ecx
++	je	L(return)
++L(wcscmp_return):
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
++L(return):
++# else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %edx
++	subl	%edx, %eax
++# endif
++	ret
++
++	.p2align 4
++L(return_vec_size):
++	kmovd	%k1, %ecx
++	tzcntl	%ecx, %edx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edx
++# endif
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
++	   the maximum offset (%r11).  */
++	addq	$VEC_SIZE, %rdx
++	cmpq	%r11, %rdx
++	jae	L(zero)
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rdx), %ecx
++	cmpl	(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	VEC_SIZE(%rdi, %rdx), %ecx
++	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	VEC_SIZE(%rdi, %rdx), %eax
++	movzbl	VEC_SIZE(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(return_2_vec_size):
++	kmovd	%k1, %ecx
++	tzcntl	%ecx, %edx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edx
++# endif
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
++	   after the maximum offset (%r11).  */
++	addq	$(VEC_SIZE * 2), %rdx
++	cmpq	%r11, %rdx
++	jae	L(zero)
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rdx), %ecx
++	cmpl	(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
++	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(return_3_vec_size):
++	kmovd	%k1, %ecx
++	tzcntl	%ecx, %edx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edx
++# endif
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
++	   after the maximum offset (%r11).  */
++	addq	$(VEC_SIZE * 3), %rdx
++	cmpq	%r11, %rdx
++	jae	L(zero)
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rdx), %ecx
++	cmpl	(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(next_3_vectors):
++	VMOVU	VEC_SIZE(%rdi), %YMM0
++	VMOVU	VEC_SIZE(%rsi), %YMM1
++	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
++	VPCMP	$4, %YMM0, %YMM1, %k0
++	VPCMP	$0, %YMMZERO, %YMM0, %k1
++	VPCMP	$0, %YMMZERO, %YMM1, %k2
++	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	ktestd	%k1, %k1
++	jne	L(return_vec_size)
++
++	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM2
++	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM3
++	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM4
++	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM5
++
++	/* Each bit in K0 represents a mismatch in YMM2 and YMM4.  */
++	VPCMP	$4, %YMM2, %YMM4, %k0
++	VPCMP	$0, %YMMZERO, %YMM2, %k1
++	VPCMP	$0, %YMMZERO, %YMM4, %k2
++	/* Each bit in K1 represents a NULL in YMM2 or YMM4.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	ktestd	%k1, %k1
++	jne	L(return_2_vec_size)
++
++	/* Each bit in K0 represents a mismatch in YMM3 and YMM5.  */
++	VPCMP	$4, %YMM3, %YMM5, %k0
++	VPCMP	$0, %YMMZERO, %YMM3, %k1
++	VPCMP	$0, %YMMZERO, %YMM5, %k2
++	/* Each bit in K1 represents a NULL in YMM3 or YMM5.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	ktestd	%k1, %k1
++	jne	L(return_3_vec_size)
++L(main_loop_header):
++	leaq	(VEC_SIZE * 4)(%rdi), %rdx
++	movl	$PAGE_SIZE, %ecx
++	/* Align load via RAX.  */
++	andq	$-(VEC_SIZE * 4), %rdx
++	subq	%rdi, %rdx
++	leaq	(%rdi, %rdx), %rax
++# ifdef USE_AS_STRNCMP
++	/* Starting from this point, the maximum offset, or simply the
++	   'offset', DECREASES by the same amount when base pointers are
++	   moved forward.  Return 0 when:
++	     1) On match: offset <= the matched vector index.
++	     2) On mismatch, offset is before the mismatched index.
++	 */
++	subq	%rdx, %r11
++	jbe	L(zero)
++# endif
++	addq	%rsi, %rdx
++	movq	%rdx, %rsi
++	andl	$(PAGE_SIZE - 1), %esi
++	/* Number of bytes before page crossing.  */
++	subq	%rsi, %rcx
++	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
++	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
++	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
++	movl	%ecx, %esi
++	jmp	L(loop_start)
++
++	.p2align 4
++L(loop):
++# ifdef USE_AS_STRNCMP
++	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
++	   the maximum offset (%r11) by the same amount.  */
++	subq	$(VEC_SIZE * 4), %r11
++	jbe	L(zero)
++# endif
++	addq	$(VEC_SIZE * 4), %rax
++	addq	$(VEC_SIZE * 4), %rdx
++L(loop_start):
++	testl	%esi, %esi
++	leal	-1(%esi), %esi
++	je	L(loop_cross_page)
++L(back_to_loop):
++	/* Main loop, comparing 4 vectors at a time.  */
++	VMOVA	(%rax), %YMM0
++	VMOVA	VEC_SIZE(%rax), %YMM2
++	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
++	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
++	VMOVU	(%rdx), %YMM1
++	VMOVU	VEC_SIZE(%rdx), %YMM3
++	VMOVU	(VEC_SIZE * 2)(%rdx), %YMM5
++	VMOVU	(VEC_SIZE * 3)(%rdx), %YMM7
++
++	VPCMP	$4, %YMM0, %YMM1, %k0
++	VPCMP	$0, %YMMZERO, %YMM0, %k1
++	VPCMP	$0, %YMMZERO, %YMM1, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K4 represents a NULL or a mismatch in YMM0 and
++	   YMM1.  */
++	kord	%k0, %k1, %k4
++
++	VPCMP	$4, %YMM2, %YMM3, %k0
++	VPCMP	$0, %YMMZERO, %YMM2, %k1
++	VPCMP	$0, %YMMZERO, %YMM3, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K5 represents a NULL or a mismatch in YMM2 and
++	   YMM3.  */
++	kord	%k0, %k1, %k5
++
++	VPCMP	$4, %YMM4, %YMM5, %k0
++	VPCMP	$0, %YMMZERO, %YMM4, %k1
++	VPCMP	$0, %YMMZERO, %YMM5, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K6 represents a NULL or a mismatch in YMM4 and
++	   YMM5.  */
++	kord	%k0, %k1, %k6
++
++	VPCMP	$4, %YMM6, %YMM7, %k0
++	VPCMP	$0, %YMMZERO, %YMM6, %k1
++	VPCMP	$0, %YMMZERO, %YMM7, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K7 represents a NULL or a mismatch in YMM6 and
++	   YMM7.  */
++	kord	%k0, %k1, %k7
++
++	kord	%k4, %k5, %k0
++	kord	%k6, %k7, %k1
++
++	/* Test each mask (32 bits) individually because for VEC_SIZE
++	   == 32 it is not possible to OR the four masks and keep all
++	   bits in a 64-bit integer register, differing from SSE2 strcmp
++	   where ORing is possible.  */
++	kortestd %k0, %k1
++	je	L(loop)
++	ktestd	%k4, %k4
++	je	L(test_vec)
++	kmovd	%k4, %edi
++	tzcntl	%edi, %ecx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %ecx
++# endif
++# ifdef USE_AS_STRNCMP
++	cmpq	%rcx, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %edi
++	cmpl	(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %edi
++	cmpl	(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(test_vec):
++# ifdef USE_AS_STRNCMP
++	/* The first vector matched.  Return 0 if the maximum offset
++	   (%r11) <= VEC_SIZE.  */
++	cmpq	$VEC_SIZE, %r11
++	jbe	L(zero)
++# endif
++	ktestd	%k5, %k5
++	je	L(test_2_vec)
++	kmovd	%k5, %ecx
++	tzcntl	%ecx, %edi
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edi
++# endif
++# ifdef USE_AS_STRNCMP
++	addq	$VEC_SIZE, %rdi
++	cmpq	%rdi, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rdi), %ecx
++	cmpl	(%rdx, %rdi), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rdi), %eax
++	movzbl	(%rdx, %rdi), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	VEC_SIZE(%rsi, %rdi), %ecx
++	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	VEC_SIZE(%rax, %rdi), %eax
++	movzbl	VEC_SIZE(%rdx, %rdi), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(test_2_vec):
++# ifdef USE_AS_STRNCMP
++	/* The first 2 vectors matched.  Return 0 if the maximum offset
++	   (%r11) <= 2 * VEC_SIZE.  */
++	cmpq	$(VEC_SIZE * 2), %r11
++	jbe	L(zero)
++# endif
++	ktestd	%k6, %k6
++	je	L(test_3_vec)
++	kmovd	%k6, %ecx
++	tzcntl	%ecx, %edi
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edi
++# endif
++# ifdef USE_AS_STRNCMP
++	addq	$(VEC_SIZE * 2), %rdi
++	cmpq	%rdi, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rdi), %ecx
++	cmpl	(%rdx, %rdi), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rdi), %eax
++	movzbl	(%rdx, %rdi), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
++	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
++	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(test_3_vec):
++# ifdef USE_AS_STRNCMP
++	/* The first 3 vectors matched.  Return 0 if the maximum offset
++	   (%r11) <= 3 * VEC_SIZE.  */
++	cmpq	$(VEC_SIZE * 3), %r11
++	jbe	L(zero)
++# endif
++	kmovd	%k7, %esi
++	tzcntl	%esi, %ecx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %ecx
++# endif
++# ifdef USE_AS_STRNCMP
++	addq	$(VEC_SIZE * 3), %rcx
++	cmpq	%rcx, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %esi
++	cmpl	(%rdx, %rcx), %esi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
++	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(loop_cross_page):
++	xorl	%r10d, %r10d
++	movq	%rdx, %rcx
++	/* Align load via RDX.  We load the extra ECX bytes which should
++	   be ignored.  */
++	andl	$((VEC_SIZE * 4) - 1), %ecx
++	/* R10 is -RCX.  */
++	subq	%rcx, %r10
++
++	/* This works only if VEC_SIZE * 2 == 64. */
++# if (VEC_SIZE * 2) != 64
++#  error (VEC_SIZE * 2) != 64
++# endif
++
++	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
++	cmpl	$(VEC_SIZE * 2), %ecx
++	jge	L(loop_cross_page_2_vec)
++
++	VMOVU	(%rax, %r10), %YMM2
++	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
++	VMOVU	(%rdx, %r10), %YMM4
++	VMOVU	VEC_SIZE(%rdx, %r10), %YMM5
++
++	VPCMP	$4, %YMM4, %YMM2, %k0
++	VPCMP	$0, %YMMZERO, %YMM2, %k1
++	VPCMP	$0, %YMMZERO, %YMM4, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch in YMM2 and
++	   YMM4.  */
++	kord	%k0, %k1, %k1
++
++	VPCMP	$4, %YMM5, %YMM3, %k3
++	VPCMP	$0, %YMMZERO, %YMM3, %k4
++	VPCMP	$0, %YMMZERO, %YMM5, %k5
++	kord	%k4, %k5, %k4
++	/* Each bit in K3 represents a NULL or a mismatch in YMM3 and
++	   YMM5.  */
++	kord	%k3, %k4, %k3
++
++# ifdef USE_AS_WCSCMP
++	/* NB: Each bit in K1/K3 represents 4-byte element.  */
++	kshiftlw $8, %k3, %k2
++	/* NB: Divide shift count by 4 since each bit in K1 represents 4
++	   bytes.  */
++	movl	%ecx, %SHIFT_REG32
++	sarl	$2, %SHIFT_REG32
++# else
++	kshiftlq $32, %k3, %k2
++# endif
++
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	korq	%k1, %k2, %k1
++	kmovq	%k1, %rdi
++
++	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
++	shrxq	%SHIFT_REG64, %rdi, %rdi
++	testq	%rdi, %rdi
++	je	L(loop_cross_page_2_vec)
++	tzcntq	%rdi, %rcx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %ecx
++# endif
++# ifdef USE_AS_STRNCMP
++	cmpq	%rcx, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %edi
++	cmpl	(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %edi
++	cmpl	(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++	.p2align 4
++L(loop_cross_page_2_vec):
++	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
++	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
++	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
++	VMOVU	(VEC_SIZE * 2)(%rdx, %r10), %YMM2
++	VMOVU	(VEC_SIZE * 3)(%rdx, %r10), %YMM3
++
++	VPCMP	$4, %YMM0, %YMM2, %k0
++	VPCMP	$0, %YMMZERO, %YMM0, %k1
++	VPCMP	$0, %YMMZERO, %YMM2, %k2
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch in YMM0 and
++	   YMM2.  */
++	kord	%k0, %k1, %k1
++
++	VPCMP	$4, %YMM1, %YMM3, %k3
++	VPCMP	$0, %YMMZERO, %YMM1, %k4
++	VPCMP	$0, %YMMZERO, %YMM3, %k5
++	kord	%k4, %k5, %k4
++	/* Each bit in K3 represents a NULL or a mismatch in YMM1 and
++	   YMM3.  */
++	kord	%k3, %k4, %k3
++
++# ifdef USE_AS_WCSCMP
++	/* NB: Each bit in K1/K3 represents 4-byte element.  */
++	kshiftlw $8, %k3, %k2
++# else
++	kshiftlq $32, %k3, %k2
++# endif
++
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	korq	%k1, %k2, %k1
++	kmovq	%k1, %rdi
++
++	xorl	%r8d, %r8d
++	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
++	subl	$(VEC_SIZE * 2), %ecx
++	jle	1f
++	/* R8 has number of bytes skipped.  */
++	movl	%ecx, %r8d
++# ifdef USE_AS_WCSCMP
++	/* NB: Divide shift count by 4 since each bit in K1 represents 4
++	   bytes.  */
++	sarl	$2, %ecx
++# endif
++	/* Skip ECX bytes.  */
++	shrq	%cl, %rdi
++1:
++	/* Before jumping back to the loop, set ESI to the number of
++	   VEC_SIZE * 4 blocks before page crossing.  */
++	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
++
++	testq	%rdi, %rdi
++# ifdef USE_AS_STRNCMP
++	/* At this point, if %rdi value is 0, it already tested
++	   VEC_SIZE*4+%r10 byte starting from %rax. This label
++	   checks whether strncmp maximum offset reached or not.  */
++	je	L(string_nbyte_offset_check)
++# else
++	je	L(back_to_loop)
++# endif
++	tzcntq	%rdi, %rcx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %ecx
++# endif
++	addq	%r10, %rcx
++	/* Adjust for number of bytes skipped.  */
++	addq	%r8, %rcx
++# ifdef USE_AS_STRNCMP
++	addq	$(VEC_SIZE * 2), %rcx
++	subq	%rcx, %r11
++	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(%rsi, %rcx), %edi
++	cmpl	(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rax, %rcx), %eax
++	movzbl	(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# else
++#  ifdef USE_AS_WCSCMP
++	movq	%rax, %rsi
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
++	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
++	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
++	subl	%edx, %eax
++#  endif
++# endif
++	ret
++
++# ifdef USE_AS_STRNCMP
++L(string_nbyte_offset_check):
++	leaq	(VEC_SIZE * 4)(%r10), %r10
++	cmpq	%r10, %r11
++	jbe	L(zero)
++	jmp	L(back_to_loop)
++# endif
++
++	.p2align 4
++L(cross_page_loop):
++	/* Check one byte/dword at a time.  */
++# ifdef USE_AS_WCSCMP
++	cmpl	%ecx, %eax
++# else
++	subl	%ecx, %eax
++# endif
++	jne	L(different)
++	addl	$SIZE_OF_CHAR, %edx
++	cmpl	$(VEC_SIZE * 4), %edx
++	je	L(main_loop_header)
++# ifdef USE_AS_STRNCMP
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++# ifdef USE_AS_WCSCMP
++	movl	(%rdi, %rdx), %eax
++	movl	(%rsi, %rdx), %ecx
++# else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %ecx
++# endif
++	/* Check null char.  */
++	testl	%eax, %eax
++	jne	L(cross_page_loop)
++	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
++	   comparisons.  */
++	subl	%ecx, %eax
++# ifndef USE_AS_WCSCMP
++L(different):
++# endif
++	ret
++
++# ifdef USE_AS_WCSCMP
++	.p2align 4
++L(different):
++	/* Use movl to avoid modifying EFLAGS.  */
++	movl	$0, %eax
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
++	ret
++# endif
++
++# ifdef USE_AS_STRNCMP
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(char0):
++#  ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi), %ecx
++	cmpl	(%rsi), %ecx
++	jne	L(wcscmp_return)
++#  else
++	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
++	subl	%ecx, %eax
++#  endif
++	ret
++# endif
++
++	.p2align 4
++L(last_vector):
++	addq	%rdx, %rdi
++	addq	%rdx, %rsi
++# ifdef USE_AS_STRNCMP
++	subq	%rdx, %r11
++# endif
++	tzcntl	%ecx, %edx
++# ifdef USE_AS_WCSCMP
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %edx
++# endif
++# ifdef USE_AS_STRNCMP
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++# ifdef USE_AS_WCSCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rdx), %ecx
++	cmpl	(%rsi, %rdx), %ecx
++	jne	L(wcscmp_return)
++# else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %edx
++	subl	%edx, %eax
++# endif
++	ret
++
++	/* Comparing on page boundary region requires special treatment:
++	   It must be done one vector at a time, starting with the wider
++	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
++	   (xmm) still passes the boundary, byte comparison must be done.
++	 */
++	.p2align 4
++L(cross_page):
++	/* Try one ymm vector at a time.  */
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jg	L(cross_page_1_vector)
++L(loop_1_vector):
++	VMOVU	(%rdi, %rdx), %YMM0
++	VMOVU	(%rsi, %rdx), %YMM1
++
++	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
++	VPCMP	$4, %YMM0, %YMM1, %k0
++	VPCMP	$0, %YMMZERO, %YMM0, %k1
++	VPCMP	$0, %YMMZERO, %YMM1, %k2
++	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	kmovd	%k1, %ecx
++	testl	%ecx, %ecx
++	jne	L(last_vector)
++
++	addl	$VEC_SIZE, %edx
++
++	addl	$VEC_SIZE, %eax
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the current offset (%rdx) >= the maximum offset
++	   (%r11).  */
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jle	L(loop_1_vector)
++L(cross_page_1_vector):
++	/* Less than 32 bytes to check, try one xmm vector.  */
++	cmpl	$(PAGE_SIZE - 16), %eax
++	jg	L(cross_page_1_xmm)
++	VMOVU	(%rdi, %rdx), %XMM0
++	VMOVU	(%rsi, %rdx), %XMM1
++
++	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
++	VPCMP	$4, %XMM0, %XMM1, %k0
++	VPCMP	$0, %XMMZERO, %XMM0, %k1
++	VPCMP	$0, %XMMZERO, %XMM1, %k2
++	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
++	korw	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	korw	%k0, %k1, %k1
++	kmovw	%k1, %ecx
++	testl	%ecx, %ecx
++	jne	L(last_vector)
++
++	addl	$16, %edx
++# ifndef USE_AS_WCSCMP
++	addl	$16, %eax
++# endif
++# ifdef USE_AS_STRNCMP
++	/* Return 0 if the current offset (%rdx) >= the maximum offset
++	   (%r11).  */
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++
++L(cross_page_1_xmm):
++# ifndef USE_AS_WCSCMP
++	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
++	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
++	cmpl	$(PAGE_SIZE - 8), %eax
++	jg	L(cross_page_8bytes)
++	vmovq	(%rdi, %rdx), %XMM0
++	vmovq	(%rsi, %rdx), %XMM1
++
++	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
++	VPCMP	$4, %XMM0, %XMM1, %k0
++	VPCMP	$0, %XMMZERO, %XMM0, %k1
++	VPCMP	$0, %XMMZERO, %XMM1, %k2
++	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	kmovd	%k1, %ecx
++
++# ifdef USE_AS_WCSCMP
++	/* Only last 2 bits are valid.  */
++	andl	$0x3, %ecx
++# else
++	/* Only last 8 bits are valid.  */
++	andl	$0xff, %ecx
++# endif
++
++	testl	%ecx, %ecx
++	jne	L(last_vector)
++
++	addl	$8, %edx
++	addl	$8, %eax
++#  ifdef USE_AS_STRNCMP
++	/* Return 0 if the current offset (%rdx) >= the maximum offset
++	   (%r11).  */
++	cmpq	%r11, %rdx
++	jae	L(zero)
++#  endif
++
++L(cross_page_8bytes):
++	/* Less than 8 bytes to check, try 4 byte vector.  */
++	cmpl	$(PAGE_SIZE - 4), %eax
++	jg	L(cross_page_4bytes)
++	vmovd	(%rdi, %rdx), %XMM0
++	vmovd	(%rsi, %rdx), %XMM1
++
++	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
++	VPCMP	$4, %XMM0, %XMM1, %k0
++	VPCMP	$0, %XMMZERO, %XMM0, %k1
++	VPCMP	$0, %XMMZERO, %XMM1, %k2
++	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
++	kord	%k1, %k2, %k1
++	/* Each bit in K1 represents a NULL or a mismatch.  */
++	kord	%k0, %k1, %k1
++	kmovd	%k1, %ecx
++
++# ifdef USE_AS_WCSCMP
++	/* Only the last bit is valid.  */
++	andl	$0x1, %ecx
++# else
++	/* Only last 4 bits are valid.  */
++	andl	$0xf, %ecx
++# endif
++
++	testl	%ecx, %ecx
++	jne	L(last_vector)
++
++	addl	$4, %edx
++#  ifdef USE_AS_STRNCMP
++	/* Return 0 if the current offset (%rdx) >= the maximum offset
++	   (%r11).  */
++	cmpq	%r11, %rdx
++	jae	L(zero)
++#  endif
++
++L(cross_page_4bytes):
++# endif
++	/* Less than 4 bytes to check, try one byte/dword at a time.  */
++# ifdef USE_AS_STRNCMP
++	cmpq	%r11, %rdx
++	jae	L(zero)
++# endif
++# ifdef USE_AS_WCSCMP
++	movl	(%rdi, %rdx), %eax
++	movl	(%rsi, %rdx), %ecx
++# else
++	movzbl	(%rdi, %rdx), %eax
++	movzbl	(%rsi, %rdx), %ecx
++# endif
++	testl	%eax, %eax
++	jne	L(cross_page_loop)
++	subl	%ecx, %eax
++	ret
++END (STRCMP)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
+index 3f433fbc..c5f38510 100644
+--- a/sysdeps/x86_64/multiarch/strcmp.c
++++ b/sysdeps/x86_64/multiarch/strcmp.c
+@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
++	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+     return OPTIMIZE (sse2_unaligned);
+diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
+new file mode 100644
+index 00000000..cd022509
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strlen-evex.S
+@@ -0,0 +1,436 @@
++/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef STRLEN
++#  define STRLEN	__strlen_evex
++# endif
++
++# define VMOVA		vmovdqa64
++
++# ifdef USE_AS_WCSLEN
++#  define VPCMP		vpcmpd
++#  define VPMINU	vpminud
++#  define SHIFT_REG	r9d
++# else
++#  define VPCMP		vpcmpb
++#  define VPMINU	vpminub
++#  define SHIFT_REG	ecx
++# endif
++
++# define XMMZERO	xmm16
++# define YMMZERO	ymm16
++# define YMM1		ymm17
++# define YMM2		ymm18
++# define YMM3		ymm19
++# define YMM4		ymm20
++# define YMM5		ymm21
++# define YMM6		ymm22
++
++# define VEC_SIZE 32
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRLEN)
++# ifdef USE_AS_STRNLEN
++	/* Check for zero length.  */
++	test	%RSI_LP, %RSI_LP
++	jz	L(zero)
++#  ifdef USE_AS_WCSLEN
++	shl	$2, %RSI_LP
++#  elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%esi, %esi
++#  endif
++	mov	%RSI_LP, %R8_LP
++# endif
++	movl	%edi, %ecx
++	movq	%rdi, %rdx
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++
++	/* Check if we may cross page boundary with one vector load.  */
++	andl	$(2 * VEC_SIZE - 1), %ecx
++	cmpl	$VEC_SIZE, %ecx
++	ja	L(cros_page_boundary)
++
++	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
++	   null byte.  */
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++
++# ifdef USE_AS_STRNLEN
++	jnz	L(first_vec_x0_check)
++	/* Adjust length and check the end of data.  */
++	subq	$VEC_SIZE, %rsi
++	jbe	L(max)
++# else
++	jnz	L(first_vec_x0)
++# endif
++
++	/* Align data for aligned loads in the loop.  */
++	addq	$VEC_SIZE, %rdi
++	andl	$(VEC_SIZE - 1), %ecx
++	andq	$-VEC_SIZE, %rdi
++
++# ifdef USE_AS_STRNLEN
++	/* Adjust length.  */
++	addq	%rcx, %rsi
++
++	subq	$(VEC_SIZE * 4), %rsi
++	jbe	L(last_4x_vec_or_less)
++# endif
++	jmp	L(more_4x_vec)
++
++	.p2align 4
++L(cros_page_boundary):
++	andl	$(VEC_SIZE - 1), %ecx
++	andq	$-VEC_SIZE, %rdi
++
++# ifdef USE_AS_WCSLEN
++	/* NB: Divide shift count by 4 since each bit in K0 represents 4
++	   bytes.  */
++	movl	%ecx, %SHIFT_REG
++	sarl	$2, %SHIFT_REG
++# endif
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++
++	/* Shift out the mask bits of the bytes before the string start.  */
++	sarxl	%SHIFT_REG, %eax, %eax
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++# ifdef USE_AS_STRNLEN
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	jbe	L(max)
++# endif
++	addq	%rdi, %rax
++	addq	%rcx, %rax
++	subq	%rdx, %rax
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++# endif
++	ret
++
++	.p2align 4
++L(aligned_more):
++# ifdef USE_AS_STRNLEN
++        /* "rcx" is less than VEC_SIZE.  Calculate "rsi + rcx - VEC_SIZE"
++	    with "rsi - (VEC_SIZE - rcx)" instead of "(rsi + rcx) - VEC_SIZE"
++	    to avoid possible addition overflow.  */
++	negq	%rcx
++	addq	$VEC_SIZE, %rcx
++
++	/* Check the end of data.  */
++	subq	%rcx, %rsi
++	jbe	L(max)
++# endif
++
++	addq	$VEC_SIZE, %rdi
++
++# ifdef USE_AS_STRNLEN
++	subq	$(VEC_SIZE * 4), %rsi
++	jbe	L(last_4x_vec_or_less)
++# endif
++
++L(more_4x_vec):
++	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.  */
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x3)
++
++	addq	$(VEC_SIZE * 4), %rdi
++
++# ifdef USE_AS_STRNLEN
++	subq	$(VEC_SIZE * 4), %rsi
++	jbe	L(last_4x_vec_or_less)
++# endif
++
++	/* Align data to 4 * VEC_SIZE.  */
++	movq	%rdi, %rcx
++	andl	$(4 * VEC_SIZE - 1), %ecx
++	andq	$-(4 * VEC_SIZE), %rdi
++
++# ifdef USE_AS_STRNLEN
++	/* Adjust length.  */
++	addq	%rcx, %rsi
++# endif
++
++	.p2align 4
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	VMOVA	(%rdi), %YMM1
++	VMOVA	VEC_SIZE(%rdi), %YMM2
++	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
++	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
++
++	VPMINU	%YMM1, %YMM2, %YMM5
++	VPMINU	%YMM3, %YMM4, %YMM6
++
++	VPMINU	%YMM5, %YMM6, %YMM5
++	VPCMP	$0, %YMM5, %YMMZERO, %k0
++	ktestd	%k0, %k0
++	jnz	L(4x_vec_end)
++
++	addq	$(VEC_SIZE * 4), %rdi
++
++# ifndef USE_AS_STRNLEN
++	jmp	L(loop_4x_vec)
++# else
++	subq	$(VEC_SIZE * 4), %rsi
++	ja	L(loop_4x_vec)
++
++L(last_4x_vec_or_less):
++	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
++	addl	$(VEC_SIZE * 2), %esi
++	jle	L(last_2x_vec)
++
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2_check)
++	subl	$VEC_SIZE, %esi
++	jle	L(max)
++
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x3_check)
++	movq	%r8, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(last_2x_vec):
++	addl	$(VEC_SIZE * 2), %esi
++
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0_check)
++	subl	$VEC_SIZE, %esi
++	jle	L(max)
++
++	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1_check)
++	movq	%r8, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(first_vec_x0_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	jbe	L(max)
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(first_vec_x1_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	jbe	L(max)
++	addq	$VEC_SIZE, %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(first_vec_x2_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	jbe	L(max)
++	addq	$(VEC_SIZE * 2), %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(first_vec_x3_check):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	jbe	L(max)
++	addq	$(VEC_SIZE * 3), %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(max):
++	movq	%r8, %rax
++#  ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++#  endif
++	ret
++
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
++	ret
++# endif
++
++	.p2align 4
++L(first_vec_x0):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	addq	$VEC_SIZE, %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	addq	$(VEC_SIZE * 2), %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++# endif
++	ret
++
++	.p2align 4
++L(4x_vec_end):
++	VPCMP	$0, %YMM1, %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x0)
++	VPCMP	$0, %YMM2, %YMMZERO, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++	VPCMP	$0, %YMM3, %YMMZERO, %k2
++	kmovd	%k2, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++	VPCMP	$0, %YMM4, %YMMZERO, %k3
++	kmovd	%k3, %eax
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	sall	$2, %eax
++# endif
++	addq	$(VEC_SIZE * 3), %rax
++	addq	%rdi, %rax
++	subq	%rdx, %rax
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rax
++# endif
++	ret
++
++END (STRLEN)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S
+new file mode 100644
+index 00000000..a1d53e8c
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncmp-evex.S
+@@ -0,0 +1,3 @@
++#define STRCMP	__strncmp_evex
++#define USE_AS_STRNCMP 1
++#include "strcmp-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
+index 686d654f..4c15542f 100644
+--- a/sysdeps/x86_64/multiarch/strncmp.c
++++ b/sysdeps/x86_64/multiarch/strncmp.c
+@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
++	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
+diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
+new file mode 100644
+index 00000000..722022f3
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
+@@ -0,0 +1,4 @@
++#define STRLEN __strnlen_evex
++#define USE_AS_STRNLEN 1
++
++#include "strlen-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
+new file mode 100644
+index 00000000..f920b5a5
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
+@@ -0,0 +1,265 @@
++/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef STRRCHR
++#  define STRRCHR	__strrchr_evex
++# endif
++
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++
++# ifdef USE_AS_WCSRCHR
++#  define VPBROADCAST	vpbroadcastd
++#  define VPCMP		vpcmpd
++#  define SHIFT_REG	r8d
++# else
++#  define VPBROADCAST	vpbroadcastb
++#  define VPCMP		vpcmpb
++#  define SHIFT_REG	ecx
++# endif
++
++# define XMMZERO	xmm16
++# define YMMZERO	ymm16
++# define YMMMATCH	ymm17
++# define YMM1		ymm18
++
++# define VEC_SIZE	32
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRRCHR)
++	movl	%edi, %ecx
++	/* Broadcast CHAR to YMMMATCH.  */
++	VPBROADCAST %esi, %YMMMATCH
++
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++
++	/* Check if we may cross page boundary with one vector load.  */
++	andl	$(2 * VEC_SIZE - 1), %ecx
++	cmpl	$VEC_SIZE, %ecx
++	ja	L(cros_page_boundary)
++
++	VMOVU	(%rdi), %YMM1
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %ecx
++	kmovd	%k1, %eax
++
++	addq	$VEC_SIZE, %rdi
++
++	testl	%eax, %eax
++	jnz	L(first_vec)
++
++	testl	%ecx, %ecx
++	jnz	L(return_null)
++
++	andq	$-VEC_SIZE, %rdi
++	xorl	%edx, %edx
++	jmp	L(aligned_loop)
++
++	.p2align 4
++L(first_vec):
++	/* Check if there is a null byte.  */
++	testl	%ecx, %ecx
++	jnz	L(char_and_nul_in_first_vec)
++
++	/* Remember the match and keep searching.  */
++	movl	%eax, %edx
++	movq	%rdi, %rsi
++	andq	$-VEC_SIZE, %rdi
++	jmp	L(aligned_loop)
++
++	.p2align 4
++L(cros_page_boundary):
++	andl	$(VEC_SIZE - 1), %ecx
++	andq	$-VEC_SIZE, %rdi
++
++# ifdef USE_AS_WCSRCHR
++	/* NB: Divide shift count by 4 since each bit in K1 represent 4
++	   bytes.  */
++	movl	%ecx, %SHIFT_REG
++	sarl	$2, %SHIFT_REG
++# endif
++
++	VMOVA	(%rdi), %YMM1
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %edx
++	kmovd	%k1, %eax
++
++	shrxl	%SHIFT_REG, %edx, %edx
++	shrxl	%SHIFT_REG, %eax, %eax
++	addq	$VEC_SIZE, %rdi
++
++	/* Check if there is a CHAR.  */
++	testl	%eax, %eax
++	jnz	L(found_char)
++
++	testl	%edx, %edx
++	jnz	L(return_null)
++
++	jmp	L(aligned_loop)
++
++	.p2align 4
++L(found_char):
++	testl	%edx, %edx
++	jnz	L(char_and_nul)
++
++	/* Remember the match and keep searching.  */
++	movl	%eax, %edx
++	leaq	(%rdi, %rcx), %rsi
++
++	.p2align 4
++L(aligned_loop):
++	VMOVA	(%rdi), %YMM1
++	addq	$VEC_SIZE, %rdi
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %ecx
++	kmovd	%k1, %eax
++	orl	%eax, %ecx
++	jnz	L(char_nor_null)
++
++	VMOVA	(%rdi), %YMM1
++	add	$VEC_SIZE, %rdi
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %ecx
++	kmovd	%k1, %eax
++	orl	%eax, %ecx
++	jnz	L(char_nor_null)
++
++	VMOVA	(%rdi), %YMM1
++	addq	$VEC_SIZE, %rdi
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %ecx
++	kmovd	%k1, %eax
++	orl	%eax, %ecx
++	jnz	L(char_nor_null)
++
++	VMOVA	(%rdi), %YMM1
++	addq	$VEC_SIZE, %rdi
++
++	/* Each bit in K0 represents a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMMMATCH, %YMM1, %k1
++	kmovd	%k0, %ecx
++	kmovd	%k1, %eax
++	orl	%eax, %ecx
++	jz	L(aligned_loop)
++
++	.p2align 4
++L(char_nor_null):
++	/* Find a CHAR or a null byte in a loop.  */
++	testl	%eax, %eax
++	jnz	L(match)
++L(return_value):
++	testl	%edx, %edx
++	jz	L(return_null)
++	movl	%edx, %eax
++	movq	%rsi, %rdi
++	bsrl	%eax, %eax
++# ifdef USE_AS_WCSRCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
++# else
++	leaq	-VEC_SIZE(%rdi, %rax), %rax
++# endif
++	ret
++
++	.p2align 4
++L(match):
++	/* Find a CHAR.  Check if there is a null byte.  */
++	kmovd	%k0, %ecx
++	testl	%ecx, %ecx
++	jnz	L(find_nul)
++
++	/* Remember the match and keep searching.  */
++	movl	%eax, %edx
++	movq	%rdi, %rsi
++	jmp	L(aligned_loop)
++
++	.p2align 4
++L(find_nul):
++	/* Mask out any matching bits after the null byte.  */
++	movl	%ecx, %r8d
++	subl	$1, %r8d
++	xorl	%ecx, %r8d
++	andl	%r8d, %eax
++	testl	%eax, %eax
++	/* If there is no CHAR here, return the remembered one.  */
++	jz	L(return_value)
++	bsrl	%eax, %eax
++# ifdef USE_AS_WCSRCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
++# else
++	leaq	-VEC_SIZE(%rdi, %rax), %rax
++# endif
++	ret
++
++	.p2align 4
++L(char_and_nul):
++	/* Find both a CHAR and a null byte.  */
++	addq	%rcx, %rdi
++	movl	%edx, %ecx
++L(char_and_nul_in_first_vec):
++	/* Mask out any matching bits after the null byte.  */
++	movl	%ecx, %r8d
++	subl	$1, %r8d
++	xorl	%ecx, %r8d
++	andl	%r8d, %eax
++	testl	%eax, %eax
++	/* Return null pointer if the null byte comes first.  */
++	jz	L(return_null)
++	bsrl	%eax, %eax
++# ifdef USE_AS_WCSRCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
++	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
++# else
++	leaq	-VEC_SIZE(%rdi, %rax), %rax
++# endif
++	ret
++
++	.p2align 4
++L(return_null):
++	xorl	%eax, %eax
++	ret
++
++END (STRRCHR)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S
+new file mode 100644
+index 00000000..7cb8f1e4
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcschr-evex.S
+@@ -0,0 +1,3 @@
++#define STRCHR __wcschr_evex
++#define USE_AS_WCSCHR 1
++#include "strchr-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S
+new file mode 100644
+index 00000000..42e73e51
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S
+@@ -0,0 +1,4 @@
++#define STRCMP __wcscmp_evex
++#define USE_AS_WCSCMP 1
++
++#include "strcmp-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S
+new file mode 100644
+index 00000000..bdafa83b
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcslen-evex.S
+@@ -0,0 +1,4 @@
++#define STRLEN __wcslen_evex
++#define USE_AS_WCSLEN 1
++
++#include "strlen-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
+new file mode 100644
+index 00000000..8a8e3107
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
+@@ -0,0 +1,5 @@
++#define STRCMP __wcsncmp_evex
++#define USE_AS_STRNCMP 1
++#define USE_AS_WCSCMP 1
++
++#include "strcmp-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
+new file mode 100644
+index 00000000..24773bb4
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
+@@ -0,0 +1,5 @@
++#define STRLEN __wcsnlen_evex
++#define USE_AS_WCSLEN 1
++#define USE_AS_STRNLEN 1
++
++#include "strlen-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
+index b3144c93..84254b83 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen.c
++++ b/sysdeps/x86_64/multiarch/wcsnlen.c
+@@ -29,16 +29,24 @@
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+     return OPTIMIZE (sse4_1);
+diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
+new file mode 100644
+index 00000000..c64602f7
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
+@@ -0,0 +1,3 @@
++#define STRRCHR __wcsrchr_evex
++#define USE_AS_WCSRCHR 1
++#include "strrchr-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S
+new file mode 100644
+index 00000000..06cd0f9f
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S
+@@ -0,0 +1,4 @@
++#define MEMCHR __wmemchr_evex
++#define USE_AS_WMEMCHR 1
++
++#include "memchr-evex.S"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-13.patch b/glibc-RHEL-15696-13.patch
new file mode 100644
index 0000000..a88a3bc
--- /dev/null
+++ b/glibc-RHEL-15696-13.patch
@@ -0,0 +1,1488 @@
+From 525bc2a32c9710df40371f951217c6ae7a923aee Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 06:36:50 -0800
+Subject: [PATCH] x86-64: Add strcpy family functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-strcpy.h to select the function optimized with 256-bit EVEX
+instructions using YMM16-YMM31 registers to avoid RTM abort with usable
+AVX512VL and AVX512BW since VZEROUPPER isn't needed at function exit.
+---
+ sysdeps/x86_64/multiarch/Makefile          |    6 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |   24 +
+ sysdeps/x86_64/multiarch/ifunc-strcpy.h    |   13 +-
+ sysdeps/x86_64/multiarch/stpcpy-evex.S     |    3 +
+ sysdeps/x86_64/multiarch/stpncpy-evex.S    |    4 +
+ sysdeps/x86_64/multiarch/strcat-evex.S     |  283 ++++++
+ sysdeps/x86_64/multiarch/strcpy-evex.S     | 1003 ++++++++++++++++++++
+ sysdeps/x86_64/multiarch/strncat-evex.S    |    3 +
+ sysdeps/x86_64/multiarch/strncpy-evex.S    |    3 +
+ 9 files changed, 1339 insertions(+), 3 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/stpcpy-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/stpncpy-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcat-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcpy-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncat-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncpy-evex.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 5ce85882..46783cd1 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -43,11 +43,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memchr-evex \
+ 		   memrchr-evex \
+ 		   rawmemchr-evex \
++		   stpcpy-evex \
++		   stpncpy-evex \
++		   strcat-evex \
+ 		   strchr-evex \
+ 		   strchrnul-evex \
+ 		   strcmp-evex \
++		   strcpy-evex \
+ 		   strlen-evex \
++		   strncat-evex \
+ 		   strncmp-evex \
++		   strncpy-evex \
+ 		   strnlen-evex \
+ 		   strrchr-evex
+ CFLAGS-varshift.c += -msse4
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index bd7d9f19..082e4da3 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -224,6 +224,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __stpncpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __stpncpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, stpncpy,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __stpncpy_evex)
+ 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
+ 			      __stpncpy_sse2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
+@@ -234,6 +238,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __stpcpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __stpcpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, stpcpy,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __stpcpy_evex)
+ 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
+ 
+@@ -268,6 +276,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strcat,
+ 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2),
+ 			      __strcat_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcat,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strcat_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
+ 			      __strcat_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
+@@ -330,6 +342,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strcpy,
+ 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __strcpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcpy,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strcpy_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __strcpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
+@@ -373,6 +389,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strncat,
+ 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2),
+ 			      __strncat_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncat,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strncat_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
+ 			      __strncat_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
+@@ -383,6 +403,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strncpy,
+ 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __strncpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncpy,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strncpy_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __strncpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
+diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+index 100dca5c..deae6348 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
++++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+@@ -25,16 +25,23 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	return OPTIMIZE (evex);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+     return OPTIMIZE (sse2_unaligned);
+diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S
+new file mode 100644
+index 00000000..7c6f26cd
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STPCPY	/* strcpy-evex.S then sets %rax to the end of the copy instead of the start.  */
++#define STRCPY __stpcpy_evex	/* Symbol passed to ENTRY () in strcpy-evex.S.  */
++#include "strcpy-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
+new file mode 100644
+index 00000000..1570014d
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
+@@ -0,0 +1,4 @@
++#define USE_AS_STPCPY	/* %rax ends up at the end of the copied bytes.  */
++#define USE_AS_STRNCPY	/* strcpy-evex.S takes a length in %rdx (%RDX_LP) and keeps it in %r8.  */
++#define STRCPY __stpncpy_evex	/* Symbol passed to ENTRY () in strcpy-evex.S.  */
++#include "strcpy-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
+new file mode 100644
+index 00000000..97c3d85b
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcat-evex.S
+@@ -0,0 +1,283 @@
++/* strcat with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# include <sysdep.h>
++
++# ifndef STRCAT	/* Default entry symbol unless the includer already chose one.  */
++#  define STRCAT  __strcat_evex
++# endif
++/* EVEX vector moves: VMOVU accepts unaligned addresses, VMOVA requires aligned ones.  */
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++
++/* XMMZERO/YMMZERO name register 16, zeroed at entry; YMM0/YMM1 are scratch for the scan loop.  */
++# define XMMZERO	xmm16
++# define YMMZERO	ymm16
++# define YMM0		ymm17
++# define YMM1		ymm18
++/* Makes the strcpy-evex.S included at the end of this file skip its own ENTRY and setup code.  */
++# define USE_AS_STRCAT
++
++/* Number of bytes in a vector register (256-bit YMM).  */
++# define VEC_SIZE	32
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRCAT)
++	mov	%rdi, %r9	/* Keep the first argument; it becomes the return value.  */
++# ifdef USE_AS_STRNCAT
++	mov	%rdx, %r8	/* Third argument, checked for zero before the copy starts.  */
++# endif
++/* Find the null byte of the string at %rdi; every exit label leaves its offset in %rax.  */
++	xor	%eax, %eax
++	mov	%edi, %ecx
++	and	$((VEC_SIZE * 4) - 1), %ecx	/* %ecx = %rdi mod (VEC_SIZE * 4).  */
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++	cmp	$(VEC_SIZE * 3), %ecx
++	ja	L(fourth_vector_boundary)	/* A VEC_SIZE load at %rdi would cross a VEC_SIZE * 4 boundary.  */
++	vpcmpb	$0, (%rdi), %YMMZERO, %k0	/* Mask bit i is set when byte i is zero.  */
++	kmovd	%k0, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_first_vector)
++	mov	%rdi, %rax
++	and	$-VEC_SIZE, %rax	/* Round down to a VEC_SIZE boundary.  */
++	jmp	L(align_vec_size_start)
++L(fourth_vector_boundary):
++	mov	%rdi, %rax
++	and	$-VEC_SIZE, %rax
++	vpcmpb	$0, (%rax), %YMMZERO, %k0	/* Aligned load; it can include bytes below %rdi.  */
++	mov	$-1, %r10d
++	sub	%rax, %rcx	/* Low 5 bits of %cl equal %rdi mod VEC_SIZE.  */
++	shl	%cl, %r10d	/* All-ones mask with the bits for bytes below %rdi cleared.  */
++	kmovd	%k0, %edx
++	and	%r10d, %edx
++	jnz	L(exit)
++
++L(align_vec_size_start):	/* %rax is VEC_SIZE-aligned and the string bytes below VEC_SIZE(%rax) are checked.  */
++	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
++	kmovd	%k0, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_second_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_third_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
++	kmovd	%k2, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fourth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
++	kmovd	%k3, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fifth_vector)
++/* Next four vectors.  (VEC_SIZE * 5)(%rax) is VEC_SIZE(%rax) after the add, hence the second-vector exit.  */
++	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
++	add	$(VEC_SIZE * 4), %rax
++	kmovd	%k4, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_second_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_third_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
++	kmovd	%k2, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fourth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
++	kmovd	%k3, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fifth_vector)
++/* Next four vectors, same pattern.  */
++	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
++	kmovd	%k4, %edx
++	add	$(VEC_SIZE * 4), %rax
++	test	%edx, %edx
++	jnz	L(exit_null_on_second_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_third_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
++	kmovd	%k2, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fourth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
++	kmovd	%k3, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fifth_vector)
++/* Next four vectors, same pattern.  */
++	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
++	add	$(VEC_SIZE * 4), %rax
++	kmovd	%k4, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_second_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_third_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
++	kmovd	%k2, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fourth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
++	kmovd	%k3, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fifth_vector)
++/* No null in the sixteen vectors above.  Check single vectors until %rax is a multiple of VEC_SIZE * 4.  */
++	test	$((VEC_SIZE * 4) - 1), %rax
++	jz	L(align_four_vec_loop)
++
++	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
++	add	$(VEC_SIZE * 5), %rax	/* %rax = address of the vector just compared, as L(exit) needs.  */
++	kmovd	%k4, %edx
++	test	%edx, %edx
++	jnz	L(exit)
++
++	test	$((VEC_SIZE * 4) - 1), %rax
++	jz	L(align_four_vec_loop)
++
++	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
++	add	$VEC_SIZE, %rax
++	kmovd	%k0, %edx
++	test	%edx, %edx
++	jnz	L(exit)
++
++	test	$((VEC_SIZE * 4) - 1), %rax
++	jz	L(align_four_vec_loop)
++
++	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
++	add	$VEC_SIZE, %rax
++	kmovd	%k0, %edx
++	test	%edx, %edx
++	jnz	L(exit)
++
++	test	$((VEC_SIZE * 4) - 1), %rax
++	jz	L(align_four_vec_loop)
++
++	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
++	add	$VEC_SIZE, %rax
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit)
++
++	add	$VEC_SIZE, %rax	/* Move past the vector just checked.  */
++
++	.p2align 4
++L(align_four_vec_loop):	/* Each pass examines the VEC_SIZE * 4 bytes at %rax (VMOVA needs %rax aligned).  */
++	VMOVA	(%rax), %YMM0
++	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
++	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
++	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
++	vpminub	%YMM0, %YMM1, %YMM0	/* Byte-wise minimum over all four vectors.  */
++	/* A zero byte in any of the four vectors leaves a zero in YMM0, making K0 != 0.  */
++	vpcmpb	$0, %YMM0, %YMMZERO, %k0
++	add	$(VEC_SIZE * 4), %rax
++	ktestd	%k0, %k0	/* Sets ZF when K0 is all zero, i.e. no null byte yet.  */
++	jz	L(align_four_vec_loop)
++/* Found.  Rebase %rax so the four vectors sit at VEC_SIZE .. VEC_SIZE * 4, as the exit labels expect.  */
++	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
++	sub	$(VEC_SIZE * 5), %rax
++	kmovd	%k0, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_second_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_third_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
++	kmovd	%k2, %edx
++	test	%edx, %edx
++	jnz	L(exit_null_on_fourth_vector)
++
++	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
++	kmovd	%k3, %edx	/* Not tested: the other three vectors had no null, so this one does.  */
++	sub	%rdi, %rax	/* Same arithmetic as L(exit_null_on_fifth_vector).  */
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	add	$(VEC_SIZE * 4), %rax
++	jmp	L(StartStrcpyPart)
++
++	.p2align 4
++L(exit):	/* %rax = address of the vector holding the null; %edx = its zero-byte mask.  */
++	sub	%rdi, %rax	/* Address -> offset from %rdi (negative if %rax was rounded down below %rdi).  */
++L(exit_null_on_first_vector):	/* %rax is already an offset here (zero from the entry code).  */
++	bsf	%rdx, %rdx	/* Index of the lowest set mask bit = position of the null in the vector.  */
++	add	%rdx, %rax
++	jmp	L(StartStrcpyPart)
++/* The variants below add the vector's displacement from %rax to the same sum.  */
++	.p2align 4
++L(exit_null_on_second_vector):
++	sub	%rdi, %rax
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	add	$VEC_SIZE, %rax
++	jmp	L(StartStrcpyPart)
++
++	.p2align 4
++L(exit_null_on_third_vector):
++	sub	%rdi, %rax
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	add	$(VEC_SIZE * 2), %rax
++	jmp	L(StartStrcpyPart)
++
++	.p2align 4
++L(exit_null_on_fourth_vector):
++	sub	%rdi, %rax
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	add	$(VEC_SIZE * 3), %rax
++	jmp	L(StartStrcpyPart)
++
++	.p2align 4
++L(exit_null_on_fifth_vector):
++	sub	%rdi, %rax
++	bsf	%rdx, %rdx
++	add	%rdx, %rax
++	add	$(VEC_SIZE * 4), %rax	/* No jump: control falls through to L(StartStrcpyPart).  */
++
++	.p2align 4
++L(StartStrcpyPart):	/* %rax = offset of the null byte in the string that starts at %r9.  */
++	lea	(%r9, %rax), %rdi	/* Copy destination: the existing null byte.  */
++	mov	%rsi, %rcx	/* strcpy-evex.S computes the source alignment from %ecx.  */
++	mov	%r9, %rax      /* Result: the original first argument.  */
++/* For strncat a zero length leaves through L(ExitZero), which is not defined in this file.  */
++# ifdef USE_AS_STRNCAT
++	test	%r8, %r8
++	jz	L(ExitZero)
++#  define USE_AS_STRNCPY
++# endif
++/* The copy is the code of strcpy-evex.S, entered by fall-through; USE_AS_STRCAT makes it omit its ENTRY.  */
++# include "strcpy-evex.S"
++#endif
+diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
+new file mode 100644
+index 00000000..a343a1a6
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
+@@ -0,0 +1,1003 @@
++/* strcpy with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++# ifndef USE_AS_STRCAT
++#  include <sysdep.h>
++
++#  ifndef STRCPY
++#   define STRCPY  __strcpy_evex
++#  endif
++
++# endif
++
++# define VMOVU		vmovdqu64
++# define VMOVA		vmovdqa64
++
++/* Number of bytes in a vector register */
++# ifndef VEC_SIZE
++#  define VEC_SIZE	32
++# endif
++
++# define XMM2		xmm18
++# define XMM3		xmm19
++
++# define YMM2		ymm18
++# define YMM3		ymm19
++# define YMM4		ymm20
++# define YMM5		ymm21
++# define YMM6		ymm22
++# define YMM7		ymm23
++
++# ifndef USE_AS_STRCAT
++
++/* zero register */
++#  define XMMZERO	xmm16
++#  define YMMZERO	ymm16
++#  define YMM1		ymm17
++
++	.section .text.evex,"ax",@progbits
++ENTRY (STRCPY)
++#  ifdef USE_AS_STRNCPY
++	mov	%RDX_LP, %R8_LP
++	test	%R8_LP, %R8_LP
++	jz	L(ExitZero)
++#  endif
++	mov	%rsi, %rcx
++#  ifndef USE_AS_STPCPY
++	mov	%rdi, %rax      /* save result */
++#  endif
++
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++# endif
++
++	and	$((VEC_SIZE * 4) - 1), %ecx
++	cmp	$(VEC_SIZE * 2), %ecx
++	jbe	L(SourceStringAlignmentLessTwoVecSize)
++
++	and	$-VEC_SIZE, %rsi
++	and	$(VEC_SIZE - 1), %ecx
++
++	vpcmpb	$0, (%rsi), %YMMZERO, %k0
++	kmovd	%k0, %edx
++	shr	%cl, %rdx
++
++# ifdef USE_AS_STRNCPY
++#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
++	mov	$VEC_SIZE, %r10
++	sub	%rcx, %r10
++	cmp	%r10, %r8
++#  else
++	mov	$(VEC_SIZE + 1), %r10
++	sub	%rcx, %r10
++	cmp	%r10, %r8
++#  endif
++	jbe	L(CopyVecSizeTailCase2OrCase3)
++# endif
++	test	%edx, %edx
++	jnz	L(CopyVecSizeTail)
++
++	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
++	kmovd	%k1, %edx
++
++# ifdef USE_AS_STRNCPY
++	add	$VEC_SIZE, %r10
++	cmp	%r10, %r8
++	jbe	L(CopyTwoVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++	jnz	L(CopyTwoVecSize)
++
++	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
++	VMOVU	%YMM2, (%rdi)
++
++/* If source address alignment != destination address alignment */
++	.p2align 4
++L(UnalignVecSizeBoth):
++	sub	%rcx, %rdi
++# ifdef USE_AS_STRNCPY
++	add	%rcx, %r8
++	sbb	%rcx, %rcx
++	or	%rcx, %r8
++# endif
++	mov	$VEC_SIZE, %rcx
++	VMOVA	(%rsi, %rcx), %YMM2
++	VMOVU	%YMM2, (%rdi, %rcx)
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
++	vpcmpb	$0, %YMM2, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$(VEC_SIZE * 3), %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec2)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVU	%YMM2, (%rdi, %rcx)
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
++	vpcmpb	$0, %YMM3, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec3)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVU	%YMM3, (%rdi, %rcx)
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
++	vpcmpb	$0, %YMM4, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec4)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVU	%YMM4, (%rdi, %rcx)
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
++	vpcmpb	$0, %YMM2, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec2)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVU	%YMM2, (%rdi, %rcx)
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
++	vpcmpb	$0, %YMM2, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec2)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
++	VMOVU	%YMM2, (%rdi, %rcx)
++	vpcmpb	$0, %YMM3, %YMMZERO, %k0
++	kmovd	%k0, %edx
++	add	$VEC_SIZE, %rcx
++# ifdef USE_AS_STRNCPY
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++# endif
++	test	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec3)
++# else
++	jnz	L(CopyVecSize)
++# endif
++
++	VMOVU	%YMM3, (%rdi, %rcx)
++	mov	%rsi, %rdx
++	lea	VEC_SIZE(%rsi, %rcx), %rsi
++	and	$-(VEC_SIZE * 4), %rsi
++	sub	%rsi, %rdx
++	sub	%rdx, %rdi
++# ifdef USE_AS_STRNCPY
++	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
++# endif
++L(UnalignedFourVecSizeLoop):
++	VMOVA	(%rsi), %YMM4
++	VMOVA	VEC_SIZE(%rsi), %YMM5
++	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
++	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
++	vpminub	%YMM5, %YMM4, %YMM2
++	vpminub	%YMM7, %YMM6, %YMM3
++	vpminub	%YMM2, %YMM3, %YMM2
++	/* If K7 != 0, there is a null byte.  */
++	vpcmpb	$0, %YMM2, %YMMZERO, %k7
++	kmovd	%k7, %edx
++# ifdef USE_AS_STRNCPY
++	sub	$(VEC_SIZE * 4), %r8
++	jbe	L(UnalignedLeaveCase2OrCase3)
++# endif
++	test	%edx, %edx
++	jnz	L(UnalignedFourVecSizeLeave)
++
++L(UnalignedFourVecSizeLoop_start):
++	add	$(VEC_SIZE * 4), %rdi
++	add	$(VEC_SIZE * 4), %rsi
++	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
++	VMOVA	(%rsi), %YMM4
++	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
++	VMOVA	VEC_SIZE(%rsi), %YMM5
++	vpminub	%YMM5, %YMM4, %YMM2
++	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
++	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
++	VMOVU	%YMM7, -VEC_SIZE(%rdi)
++	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
++	vpminub	%YMM7, %YMM6, %YMM3
++	vpminub	%YMM2, %YMM3, %YMM2
++	/* If K7 != 0, there is a null byte.  */
++	vpcmpb	$0, %YMM2, %YMMZERO, %k7
++	kmovd	%k7, %edx
++# ifdef USE_AS_STRNCPY
++	sub	$(VEC_SIZE * 4), %r8
++	jbe	L(UnalignedLeaveCase2OrCase3)
++# endif
++	test	%edx, %edx
++	jz	L(UnalignedFourVecSizeLoop_start)
++
++L(UnalignedFourVecSizeLeave):
++	vpcmpb	$0, %YMM4, %YMMZERO, %k1
++	kmovd	%k1, %edx
++	test	%edx, %edx
++	jnz	L(CopyVecSizeUnaligned_0)
++
++	vpcmpb	$0, %YMM5, %YMMZERO, %k2
++	kmovd	%k2, %ecx
++	test	%ecx, %ecx
++	jnz	L(CopyVecSizeUnaligned_16)
++
++	vpcmpb	$0, %YMM6, %YMMZERO, %k3
++	kmovd	%k3, %edx
++	test	%edx, %edx
++	jnz	L(CopyVecSizeUnaligned_32)
++
++	vpcmpb	$0, %YMM7, %YMMZERO, %k4
++	kmovd	%k4, %ecx
++	bsf	%ecx, %edx
++	VMOVU	%YMM4, (%rdi)
++	VMOVU	%YMM5, VEC_SIZE(%rdi)
++	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++# ifdef USE_AS_STPCPY
++	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
++# endif
++	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
++	add	$(VEC_SIZE - 1), %r8
++	sub	%rdx, %r8
++	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
++	jmp	L(StrncpyFillTailWithZero)
++# else
++	add	$(VEC_SIZE * 3), %rsi
++	add	$(VEC_SIZE * 3), %rdi
++	jmp	L(CopyVecSizeExit)
++# endif
++
++/* If source address alignment == destination address alignment */
++
++L(SourceStringAlignmentLessTwoVecSize):
++	VMOVU	(%rsi), %YMM3
++	VMOVU	VEC_SIZE(%rsi), %YMM2
++	vpcmpb	$0, %YMM3, %YMMZERO, %k0
++	kmovd	%k0, %edx
++
++# ifdef USE_AS_STRNCPY
++#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
++	cmp	$VEC_SIZE, %r8
++#  else
++	cmp	$(VEC_SIZE + 1), %r8
++#  endif
++	jbe	L(CopyVecSizeTail1Case2OrCase3)
++# endif
++	test	%edx, %edx
++	jnz	L(CopyVecSizeTail1)
++
++	VMOVU	%YMM3, (%rdi)
++	vpcmpb	$0, %YMM2, %YMMZERO, %k0
++	kmovd	%k0, %edx
++
++# ifdef USE_AS_STRNCPY
++#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
++	cmp	$(VEC_SIZE * 2), %r8
++#  else
++	cmp	$((VEC_SIZE * 2) + 1), %r8
++#  endif
++	jbe	L(CopyTwoVecSize1Case2OrCase3)
++# endif
++	test	%edx, %edx
++	jnz	L(CopyTwoVecSize1)
++
++	and	$-VEC_SIZE, %rsi
++	and	$(VEC_SIZE - 1), %ecx
++	jmp	L(UnalignVecSizeBoth)
++
++/*------End of main part with loops---------------------*/
++
++/* Case1 */
++
++# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
++	.p2align 4
++L(CopyVecSize):
++	add	%rcx, %rdi
++# endif
++L(CopyVecSizeTail):
++	add	%rcx, %rsi
++L(CopyVecSizeTail1):
++	bsf	%edx, %edx
++L(CopyVecSizeExit):
++	cmp	$32, %edx
++	jae	L(Exit32_63)
++	cmp	$16, %edx
++	jae	L(Exit16_31)
++	cmp	$8, %edx
++	jae	L(Exit8_15)
++	cmp	$4, %edx
++	jae	L(Exit4_7)
++	cmp	$3, %edx
++	je	L(Exit3)
++	cmp	$1, %edx
++	ja	L(Exit2)
++	je	L(Exit1)
++	movb	$0, (%rdi)
++# ifdef USE_AS_STPCPY
++	lea	(%rdi), %rax
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	$1, %r8
++	lea	1(%rdi), %rdi
++	jnz	L(StrncpyFillTailWithZero)
++# endif
++	ret
++
++	.p2align 4
++L(CopyTwoVecSize1):
++	add	$VEC_SIZE, %rsi
++	add	$VEC_SIZE, %rdi
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	$VEC_SIZE, %r8
++# endif
++	jmp	L(CopyVecSizeTail1)
++
++	.p2align 4
++L(CopyTwoVecSize):
++	bsf	%edx, %edx
++	add	%rcx, %rsi
++	add	$VEC_SIZE, %edx
++	sub	%ecx, %edx
++	jmp	L(CopyVecSizeExit)
++
++	.p2align 4
++L(CopyVecSizeUnaligned_0):
++	bsf	%edx, %edx
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++# ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax
++# endif
++	VMOVU	%YMM4, (%rdi)
++	add	$((VEC_SIZE * 4) - 1), %r8
++	sub	%rdx, %r8
++	lea	1(%rdi, %rdx), %rdi
++	jmp	L(StrncpyFillTailWithZero)
++# else
++	jmp	L(CopyVecSizeExit)
++# endif
++
++	.p2align 4
++L(CopyVecSizeUnaligned_16):
++	bsf	%ecx, %edx
++	VMOVU	%YMM4, (%rdi)
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++# ifdef USE_AS_STPCPY
++	lea	VEC_SIZE(%rdi, %rdx), %rax
++# endif
++	VMOVU	%YMM5, VEC_SIZE(%rdi)
++	add	$((VEC_SIZE * 3) - 1), %r8
++	sub	%rdx, %r8
++	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
++	jmp	L(StrncpyFillTailWithZero)
++# else
++	add	$VEC_SIZE, %rsi
++	add	$VEC_SIZE, %rdi
++	jmp	L(CopyVecSizeExit)
++# endif
++
++	.p2align 4
++L(CopyVecSizeUnaligned_32):
++	bsf	%edx, %edx
++	VMOVU	%YMM4, (%rdi)
++	VMOVU	%YMM5, VEC_SIZE(%rdi)
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++# ifdef USE_AS_STPCPY
++	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
++# endif
++	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
++	add	$((VEC_SIZE * 2) - 1), %r8
++	sub	%rdx, %r8
++	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
++	jmp	L(StrncpyFillTailWithZero)
++# else
++	add	$(VEC_SIZE * 2), %rsi
++	add	$(VEC_SIZE * 2), %rdi
++	jmp	L(CopyVecSizeExit)
++# endif
++
++# ifdef USE_AS_STRNCPY
++#  ifndef USE_AS_STRCAT
++	.p2align 4
++L(CopyVecSizeUnalignedVec6):
++	VMOVU	%YMM6, (%rdi, %rcx)
++	jmp	L(CopyVecSizeVecExit)
++
++	.p2align 4
++L(CopyVecSizeUnalignedVec5):
++	VMOVU	%YMM5, (%rdi, %rcx)
++	jmp	L(CopyVecSizeVecExit)
++
++	.p2align 4
++L(CopyVecSizeUnalignedVec4):
++	VMOVU	%YMM4, (%rdi, %rcx)
++	jmp	L(CopyVecSizeVecExit)
++
++	.p2align 4
++L(CopyVecSizeUnalignedVec3):
++	VMOVU	%YMM3, (%rdi, %rcx)
++	jmp	L(CopyVecSizeVecExit)
++#  endif
++
++/* Case2 */
++
++	.p2align 4
++L(CopyVecSizeCase2):
++	add	$VEC_SIZE, %r8
++	add	%rcx, %rdi
++	add	%rcx, %rsi
++	bsf	%edx, %edx
++	cmp	%r8d, %edx
++	jb	L(CopyVecSizeExit)
++	jmp	L(StrncpyExit)
++
++	.p2align 4
++L(CopyTwoVecSizeCase2):
++	add	%rcx, %rsi
++	bsf	%edx, %edx
++	add	$VEC_SIZE, %edx
++	sub	%ecx, %edx
++	cmp	%r8d, %edx
++	jb	L(CopyVecSizeExit)
++	jmp	L(StrncpyExit)
++
++L(CopyVecSizeTailCase2):
++	add	%rcx, %rsi
++	bsf	%edx, %edx
++	cmp	%r8d, %edx
++	jb	L(CopyVecSizeExit)
++	jmp	L(StrncpyExit)
++
++L(CopyVecSizeTail1Case2):
++	bsf	%edx, %edx
++	cmp	%r8d, %edx
++	jb	L(CopyVecSizeExit)
++	jmp	L(StrncpyExit)
++
++/* Case2 or Case3,  Case3 */
++
++	.p2align 4
++L(CopyVecSizeCase2OrCase3):
++	test	%rdx, %rdx
++	jnz	L(CopyVecSizeCase2)
++L(CopyVecSizeCase3):
++	add	$VEC_SIZE, %r8
++	add	%rcx, %rdi
++	add	%rcx, %rsi
++	jmp	L(StrncpyExit)
++
++	.p2align 4
++L(CopyTwoVecSizeCase2OrCase3):
++	test	%rdx, %rdx
++	jnz	L(CopyTwoVecSizeCase2)
++	add	%rcx, %rsi
++	jmp	L(StrncpyExit)
++
++	.p2align 4
++L(CopyVecSizeTailCase2OrCase3):
++	test	%rdx, %rdx
++	jnz	L(CopyVecSizeTailCase2)
++	add	%rcx, %rsi
++	jmp	L(StrncpyExit)
++
++	.p2align 4
++L(CopyTwoVecSize1Case2OrCase3):
++	add	$VEC_SIZE, %rdi
++	add	$VEC_SIZE, %rsi
++	sub	$VEC_SIZE, %r8
++L(CopyVecSizeTail1Case2OrCase3):
++	test	%rdx, %rdx
++	jnz	L(CopyVecSizeTail1Case2)
++	jmp	L(StrncpyExit)
++# endif
++
++/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
++
++	.p2align 4
++L(Exit1):	/* L(CopyVecSizeExit) comes here when %edx == 1: one character plus the null.  */
++	movzwl	(%rsi), %edx
++	mov	%dx, (%rdi)	/* Both bytes go out as a single 16-bit store.  */
++# ifdef USE_AS_STPCPY
++	lea	1(%rdi), %rax	/* Address of the copied null byte.  */
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	$2, %r8	/* Two bytes written are taken off the length in %r8.  */
++	lea	2(%rdi), %rdi
++	jnz	L(StrncpyFillTailWithZero)	/* Flags come from the sub; lea does not change them.  */
++# endif
++	ret
++
++	.p2align 4
++L(Exit2):	/* L(CopyVecSizeExit) comes here when %edx == 2: two characters plus the null.  */
++	movzwl	(%rsi), %ecx
++	mov	%cx, (%rdi)
++	movb	$0, 2(%rdi)	/* The null is stored directly rather than copied.  */
++# ifdef USE_AS_STPCPY
++	lea	2(%rdi), %rax	/* Address of the null byte.  */
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	$3, %r8	/* Three bytes written are taken off the length in %r8.  */
++	lea	3(%rdi), %rdi
++	jnz	L(StrncpyFillTailWithZero)	/* Flags come from the sub; lea does not change them.  */
++# endif
++	ret
++
++	.p2align 4
++L(Exit3):	/* L(CopyVecSizeExit) comes here when %edx == 3: three characters plus the null.  */
++	mov	(%rsi), %edx
++	mov	%edx, (%rdi)	/* All four bytes go out as a single 32-bit store.  */
++# ifdef USE_AS_STPCPY
++	lea	3(%rdi), %rax	/* Address of the copied null byte.  */
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	$4, %r8	/* Four bytes written are taken off the length in %r8.  */
++	lea	4(%rdi), %rdi
++	jnz	L(StrncpyFillTailWithZero)	/* Flags come from the sub; lea does not change them.  */
++# endif
++	ret
++
++	.p2align 4
++L(Exit4_7):	/* Null at index %rdx (4..7 from L(CopyVecSizeExit)); copy bytes 0..%rdx.  */
++	mov	(%rsi), %ecx
++	mov	%ecx, (%rdi)
++	mov	-3(%rsi, %rdx), %ecx	/* Second 4-byte copy ends on the null and may overlap the first.  */
++	mov	%ecx, -3(%rdi, %rdx)
++# ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax	/* Address of the copied null byte.  */
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	%rdx, %r8
++	sub	$1, %r8	/* In total %rdx + 1 bytes are taken off the length in %r8.  */
++	lea	1(%rdi, %rdx), %rdi
++	jnz	L(StrncpyFillTailWithZero)	/* Flags come from the last sub; lea does not change them.  */
++# endif
++	ret
++
++	.p2align 4
++L(Exit8_15):	/* Null at index %rdx (8..15 from L(CopyVecSizeExit)); copy bytes 0..%rdx.  */
++	mov	(%rsi), %rcx
++	mov	-7(%rsi, %rdx), %r9	/* Second 8-byte piece ends on the null; %r9 is used as scratch.  */
++	mov	%rcx, (%rdi)
++	mov	%r9, -7(%rdi, %rdx)
++# ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax	/* Address of the copied null byte.  */
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	%rdx, %r8
++	sub	$1, %r8	/* In total %rdx + 1 bytes are taken off the length in %r8.  */
++	lea	1(%rdi, %rdx), %rdi
++	jnz	L(StrncpyFillTailWithZero)	/* Flags come from the last sub; lea does not change them.  */
++# endif
++	ret
++
++	.p2align 4
++L(Exit16_31):	/* Null at index %rdx (16..31 from L(CopyVecSizeExit)); copy bytes 0..%rdx.  */
++	VMOVU	(%rsi), %XMM2
++	VMOVU	-15(%rsi, %rdx), %XMM3	/* Second 16-byte piece ends on the null and may overlap the first.  */
++	VMOVU	%XMM2, (%rdi)
++	VMOVU	%XMM3, -15(%rdi, %rdx)
++# ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax	/* Address of the copied null byte.  */
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub %rdx, %r8
++	sub $1, %r8	/* In total %rdx + 1 bytes are taken off the length in %r8.  */
++	lea 1(%rdi, %rdx), %rdi
++	jnz L(StrncpyFillTailWithZero)	/* Flags come from the last sub; lea does not change them.  */
++# endif
++	ret
++
++	.p2align 4
++L(Exit32_63):	/* Null at index %rdx (32 or more from L(CopyVecSizeExit)); copy bytes 0..%rdx.  */
++	VMOVU	(%rsi), %YMM2
++	VMOVU	-31(%rsi, %rdx), %YMM3	/* Second 32-byte piece ends on the null and may overlap the first.  */
++	VMOVU	%YMM2, (%rdi)
++	VMOVU	%YMM3, -31(%rdi, %rdx)
++# ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax	/* Address of the copied null byte.  */
++# endif
++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
++	sub	%rdx, %r8
++	sub	$1, %r8	/* In total %rdx + 1 bytes are taken off the length in %r8.  */
++	lea	1(%rdi, %rdx), %rdi
++	jnz	L(StrncpyFillTailWithZero)	/* Flags come from the last sub; lea does not change them.  */
++# endif
++	ret
++
++# ifdef USE_AS_STRNCPY
++
++	.p2align 4
++L(StrncpyExit1):	/* Copies exactly one byte; the caller of this label is outside this chunk.  */
++	movzbl	(%rsi), %edx
++	mov	%dl, (%rdi)
++#  ifdef USE_AS_STPCPY
++	lea	1(%rdi), %rax	/* One past the byte just written.  */
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, 1(%rdi)	/* Only the strcat build stores a null after the copied byte.  */
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit2):	/* Copies exactly two bytes; the caller of this label is outside this chunk.  */
++	movzwl	(%rsi), %edx
++	mov	%dx, (%rdi)
++#  ifdef USE_AS_STPCPY
++	lea	2(%rdi), %rax	/* One past the bytes just written.  */
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, 2(%rdi)	/* Only the strcat build stores a null after the copied bytes.  */
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit3_4):	/* Copies %r8 bytes; presumably %r8 is 3 or 4 here (dispatcher not shown).  */
++	movzwl	(%rsi), %ecx
++	movzwl	-2(%rsi, %r8), %edx	/* Second 2-byte piece ends at offset %r8 and may overlap the first.  */
++	mov	%cx, (%rdi)
++	mov	%dx, -2(%rdi, %r8)
++#  ifdef USE_AS_STPCPY
++	lea	(%rdi, %r8), %rax	/* One past the bytes just written.  */
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi, %r8)	/* Only the strcat build stores a null after the copied bytes.  */
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit5_8):	/* Copies %r8 bytes; presumably %r8 is 5..8 here (dispatcher not shown).  */
++	mov	(%rsi), %ecx
++	mov	-4(%rsi, %r8), %edx	/* Second 4-byte piece ends at offset %r8 and may overlap the first.  */
++	mov	%ecx, (%rdi)
++	mov	%edx, -4(%rdi, %r8)
++#  ifdef USE_AS_STPCPY
++	lea	(%rdi, %r8), %rax	/* One past the bytes just written.  */
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi, %r8)	/* Only the strcat build stores a null after the copied bytes.  */
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit9_16):	/* Copies %r8 bytes; presumably %r8 is 9..16 here (dispatcher not shown).  */
++	mov	(%rsi), %rcx
++	mov	-8(%rsi, %r8), %rdx	/* Second 8-byte piece ends at offset %r8 and may overlap the first.  */
++	mov	%rcx, (%rdi)
++	mov	%rdx, -8(%rdi, %r8)
++#  ifdef USE_AS_STPCPY
++	lea	(%rdi, %r8), %rax	/* One past the bytes just written.  */
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi, %r8)	/* Only the strcat build stores a null after the copied bytes.  */
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit17_32):	/* Copies %r8 bytes; presumably %r8 is 17..32 here (dispatcher not shown).  */
++	VMOVU	(%rsi), %XMM2
++	VMOVU	-16(%rsi, %r8), %XMM3	/* Second 16-byte piece ends at offset %r8 and may overlap the first.  */
++	VMOVU	%XMM2, (%rdi)
++	VMOVU	%XMM3, -16(%rdi, %r8)
++#  ifdef USE_AS_STPCPY
++	lea	(%rdi, %r8), %rax	/* One past the bytes just written.  */
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi, %r8)	/* Only the strcat build stores a null after the copied bytes.  */
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit33_64):	/* Copies %r8 bytes; presumably %r8 is 33..64 here (dispatcher not shown).  */
++	/* The first VEC_SIZE bytes, then the VEC_SIZE bytes that end at offset %r8.  */
++	VMOVU	(%rsi), %YMM2
++	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
++	VMOVU	%YMM2, (%rdi)
++	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
++#  ifdef USE_AS_STPCPY
++	lea	(%rdi, %r8), %rax	/* One past the bytes just written.  */
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi, %r8)	/* Only the strcat build stores a null after the copied bytes.  */
++#  endif
++	ret
++
++	.p2align 4
++L(StrncpyExit65):	/* Copies exactly 65 bytes; the caller of this label is outside this chunk.  */
++	/* 32 bytes at offset 0, 32 bytes at offset 32, 1 byte at offset 64.  */
++	VMOVU	(%rsi), %YMM2
++	VMOVU	32(%rsi), %YMM3
++	mov	64(%rsi), %cl
++	VMOVU	%YMM2, (%rdi)
++	VMOVU	%YMM3, 32(%rdi)
++	mov	%cl, 64(%rdi)
++#  ifdef USE_AS_STPCPY
++	lea	65(%rdi), %rax	/* One past the bytes just written.  */
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, 65(%rdi)	/* Only the strcat build stores a null after the copied bytes.  */
++#  endif
++	ret
++
++#  ifndef USE_AS_STRCAT
++
++	.p2align 4
++L(Fill1):
++	mov	%dl, (%rdi)
++	ret
++
++	.p2align 4
++L(Fill2):
++	mov	%dx, (%rdi)
++	ret
++
++	.p2align 4
++L(Fill3_4):
++	mov	%dx, (%rdi)
++	mov     %dx, -2(%rdi, %r8)
++	ret
++
++	.p2align 4
++L(Fill5_8):
++	mov	%edx, (%rdi)
++	mov     %edx, -4(%rdi, %r8)
++	ret
++
++	.p2align 4
++L(Fill9_16):
++	mov	%rdx, (%rdi)
++	mov	%rdx, -8(%rdi, %r8)
++	ret
++
++	.p2align 4
++L(Fill17_32):
++	VMOVU	%XMMZERO, (%rdi)
++	VMOVU	%XMMZERO, -16(%rdi, %r8)
++	ret
++
++	.p2align 4
++L(CopyVecSizeUnalignedVec2):
++	VMOVU	%YMM2, (%rdi, %rcx)
++
++	.p2align 4
++L(CopyVecSizeVecExit):
++	bsf	%edx, %edx
++	add	$(VEC_SIZE - 1), %r8
++	add	%rcx, %rdi
++#   ifdef USE_AS_STPCPY
++	lea	(%rdi, %rdx), %rax
++#   endif
++	sub	%rdx, %r8
++	lea	1(%rdi, %rdx), %rdi
++
++	.p2align 4
++L(StrncpyFillTailWithZero):
++	xor	%edx, %edx
++	sub	$VEC_SIZE, %r8
++	jbe	L(StrncpyFillExit)
++
++	VMOVU	%YMMZERO, (%rdi)
++	add	$VEC_SIZE, %rdi
++
++	mov	%rdi, %rsi
++	and	$(VEC_SIZE - 1), %esi
++	sub	%rsi, %rdi
++	add	%rsi, %r8
++	sub	$(VEC_SIZE * 4), %r8
++	jb	L(StrncpyFillLessFourVecSize)
++
++L(StrncpyFillLoopVmovdqa):
++	VMOVA	%YMMZERO, (%rdi)
++	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
++	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
++	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
++	add	$(VEC_SIZE * 4), %rdi
++	sub	$(VEC_SIZE * 4), %r8
++	jae	L(StrncpyFillLoopVmovdqa)
++
++L(StrncpyFillLessFourVecSize):
++	add	$(VEC_SIZE * 2), %r8
++	jl	L(StrncpyFillLessTwoVecSize)
++	VMOVA	%YMMZERO, (%rdi)
++	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
++	add	$(VEC_SIZE * 2), %rdi
++	sub	$VEC_SIZE, %r8
++	jl	L(StrncpyFillExit)
++	VMOVA	%YMMZERO, (%rdi)
++	add	$VEC_SIZE, %rdi
++	jmp	L(Fill)
++
++	.p2align 4
++L(StrncpyFillLessTwoVecSize):
++	add	$VEC_SIZE, %r8
++	jl	L(StrncpyFillExit)
++	VMOVA	%YMMZERO, (%rdi)
++	add	$VEC_SIZE, %rdi
++	jmp	L(Fill)
++
++	.p2align 4
++L(StrncpyFillExit):
++	add	$VEC_SIZE, %r8
++L(Fill):
++	cmp	$17, %r8d
++	jae	L(Fill17_32)
++	cmp	$9, %r8d
++	jae	L(Fill9_16)
++	cmp	$5, %r8d
++	jae	L(Fill5_8)
++	cmp	$3, %r8d
++	jae	L(Fill3_4)
++	cmp	$1, %r8d
++	ja	L(Fill2)
++	je	L(Fill1)
++	ret
++
++/* end of ifndef USE_AS_STRCAT */
++#  endif
++
++	.p2align 4
++L(UnalignedLeaveCase2OrCase3):
++	test	%rdx, %rdx
++	jnz	L(UnalignedFourVecSizeLeaveCase2)
++L(UnalignedFourVecSizeLeaveCase3):
++	lea	(VEC_SIZE * 4)(%r8), %rcx
++	and	$-VEC_SIZE, %rcx
++	add	$(VEC_SIZE * 3), %r8
++	jl	L(CopyVecSizeCase3)
++	VMOVU	%YMM4, (%rdi)
++	sub	$VEC_SIZE, %r8
++	jb	L(CopyVecSizeCase3)
++	VMOVU	%YMM5, VEC_SIZE(%rdi)
++	sub	$VEC_SIZE, %r8
++	jb	L(CopyVecSizeCase3)
++	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
++	sub	$VEC_SIZE, %r8
++	jb	L(CopyVecSizeCase3)
++	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
++#  ifdef USE_AS_STPCPY
++	lea	(VEC_SIZE * 4)(%rdi), %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (VEC_SIZE * 4)(%rdi)
++#  endif
++	ret
++
++	.p2align 4
++L(UnalignedFourVecSizeLeaveCase2):
++	xor	%ecx, %ecx
++	vpcmpb	$0, %YMM4, %YMMZERO, %k1
++	kmovd	%k1, %edx
++	add	$(VEC_SIZE * 3), %r8
++	jle	L(CopyVecSizeCase2OrCase3)
++	test	%edx, %edx
++#  ifndef USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec4)
++#  else
++	jnz	L(CopyVecSize)
++#  endif
++	vpcmpb	$0, %YMM5, %YMMZERO, %k2
++	kmovd	%k2, %edx
++	VMOVU	%YMM4, (%rdi)
++	add	$VEC_SIZE, %rcx
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++	test	%edx, %edx
++#  ifndef USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec5)
++#  else
++	jnz	L(CopyVecSize)
++#  endif
++
++	vpcmpb	$0, %YMM6, %YMMZERO, %k3
++	kmovd	%k3, %edx
++	VMOVU	%YMM5, VEC_SIZE(%rdi)
++	add	$VEC_SIZE, %rcx
++	sub	$VEC_SIZE, %r8
++	jbe	L(CopyVecSizeCase2OrCase3)
++	test	%edx, %edx
++#  ifndef USE_AS_STRCAT
++	jnz	L(CopyVecSizeUnalignedVec6)
++#  else
++	jnz	L(CopyVecSize)
++#  endif
++
++	vpcmpb	$0, %YMM7, %YMMZERO, %k4
++	kmovd	%k4, %edx
++	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
++	lea	VEC_SIZE(%rdi, %rcx), %rdi
++	lea	VEC_SIZE(%rsi, %rcx), %rsi
++	bsf	%edx, %edx
++	cmp	%r8d, %edx
++	jb	L(CopyVecSizeExit)
++L(StrncpyExit):
++	cmp	$65, %r8d
++	je	L(StrncpyExit65)
++	cmp	$33, %r8d
++	jae	L(StrncpyExit33_64)
++	cmp	$17, %r8d
++	jae	L(StrncpyExit17_32)
++	cmp	$9, %r8d
++	jae	L(StrncpyExit9_16)
++	cmp	$5, %r8d
++	jae	L(StrncpyExit5_8)
++	cmp	$3, %r8d
++	jae	L(StrncpyExit3_4)
++	cmp	$1, %r8d
++	ja	L(StrncpyExit2)
++	je	L(StrncpyExit1)
++#  ifdef USE_AS_STPCPY
++	mov	%rdi, %rax
++#  endif
++#  ifdef USE_AS_STRCAT
++	movb	$0, (%rdi)
++#  endif
++	ret
++
++	.p2align 4
++L(ExitZero):
++#  ifndef USE_AS_STRCAT
++	mov	%rdi, %rax
++#  endif
++	ret
++
++# endif
++
++# ifndef USE_AS_STRCAT
++END (STRCPY)
++# else
++END (STRCAT)
++# endif
++#endif
+diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
+new file mode 100644
+index 00000000..8884f023
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncat-evex.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STRNCAT
++#define STRCAT __strncat_evex
++#include "strcat-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
+new file mode 100644
+index 00000000..40e391f0
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STRNCPY
++#define STRCPY __strncpy_evex
++#include "strcpy-evex.S"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-14.patch b/glibc-RHEL-15696-14.patch
new file mode 100644
index 0000000..84a4593
--- /dev/null
+++ b/glibc-RHEL-15696-14.patch
@@ -0,0 +1,242 @@
+From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 06:46:08 -0800
+Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
+instructions using YMM16-YMM31 registers to avoid RTM abort with usable
+AVX512VL since VZEROUPPER isn't needed at function exit.
+---
+ sysdeps/x86_64/multiarch/Makefile             |  1 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 36 +++++++++++++++++++
+ sysdeps/x86_64/multiarch/ifunc-memmove.h      | 21 +++++++++--
+ .../multiarch/memmove-evex-unaligned-erms.S   | 33 +++++++++++++++++
+ .../multiarch/memmove-vec-unaligned-erms.S    | 24 ++++++++-----
+ 5 files changed, 104 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 46783cd1..4563fc56 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memset-avx2-unaligned-erms \
+ 		   memset-avx512-unaligned-erms \
+ 		   memchr-evex \
++		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
+ 		   rawmemchr-evex \
+ 		   stpcpy-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 082e4da3..6bd3abfc 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memmove_chk_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memmove_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __memmove_chk_ssse3_back)
+@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memmove,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memmove_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, memmove,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memmove_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_avx512_no_vzeroupper)
+@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memcpy_chk_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memcpy_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __memcpy_chk_ssse3_back)
+@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memcpy,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memcpy_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, memcpy,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __memcpy_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __memcpy_ssse3_back)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
+@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __mempcpy_chk_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __mempcpy_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __mempcpy_chk_ssse3_back)
+@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, mempcpy,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __mempcpy_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, mempcpy,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __mempcpy_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __mempcpy_ssse3_back)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+index 5e5f0299..6f8bce5f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
+   attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
++  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx_unaligned_erms);
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (evex_unaligned_erms);
++
++	  return OPTIMIZE (evex_unaligned);
++	}
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx_unaligned_erms);
+ 
+-      return OPTIMIZE (avx_unaligned);
++	  return OPTIMIZE (avx_unaligned);
++	}
+     }
+ 
+   if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+new file mode 100644
+index 00000000..0cbce8f9
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+@@ -0,0 +1,33 @@
++#if IS_IN (libc)	/* Parameter block: builds the shared memmove template with ymm16-ymm31.  */
++# define VEC_SIZE	32	/* Vector width in bytes (YMM).  */
++# define XMM0		xmm16	/* XMM0/XMM1: used by the template's 16..31-byte path.  */
++# define XMM1		xmm17
++# define YMM0		ymm16	/* YMM0/YMM1: used by the template's 32..63-byte path (VEC_SIZE > 32 only).  */
++# define YMM1		ymm17
++# define VEC0		ymm16	/* VEC0..VEC15 map onto ymm16..ymm31.  */
++# define VEC1		ymm17
++# define VEC2		ymm18
++# define VEC3		ymm19
++# define VEC4		ymm20
++# define VEC5		ymm21
++# define VEC6		ymm22
++# define VEC7		ymm23
++# define VEC8		ymm24
++# define VEC9		ymm25
++# define VEC10		ymm26
++# define VEC11		ymm27
++# define VEC12		ymm28
++# define VEC13		ymm29
++# define VEC14		ymm30
++# define VEC15		ymm31
++# define VEC(i)		VEC##i	/* VEC(n) pastes to VECn, i.e. one of the registers above.  */
++# define VMOVNT		vmovntdq	/* Mnemonic substituted for VMOVNT (non-temporal store).  */
++# define VMOVU		vmovdqu64	/* Mnemonic substituted for VMOVU (unaligned move).  */
++# define VMOVA		vmovdqa64	/* Mnemonic substituted for VMOVA (aligned move).  */
++# define VZEROUPPER	/* Defined empty so the template's "#ifndef VZEROUPPER" default (vzeroupper) is not used.  */
++
++# define SECTION(p)		p##.evex	/* Appends ".evex" to the section name passed by the template.  */
++# define MEMMOVE_SYMBOL(p,s)	p##_evex_##s	/* Builds symbol names of the form <p>_evex_<s>.  */
++
++# include "memmove-vec-unaligned-erms.S"	/* The implementation itself; everything above only parameterizes it.  */
++#endif
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 274aa1c7..08e21692 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -48,6 +48,14 @@
+ # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
+ #endif
+ 
++#ifndef XMM0
++# define XMM0				xmm0
++#endif
++
++#ifndef YMM0
++# define YMM0				ymm0
++#endif
++
+ #ifndef VZEROUPPER
+ # if VEC_SIZE > 16
+ #  define VZEROUPPER vzeroupper
+@@ -277,20 +285,20 @@ L(less_vec):
+ #if VEC_SIZE > 32
+ L(between_32_63):
+ 	/* From 32 to 63.  No branch when size == 32.  */
+-	vmovdqu	(%rsi), %ymm0
+-	vmovdqu	-32(%rsi,%rdx), %ymm1
+-	vmovdqu	%ymm0, (%rdi)
+-	vmovdqu	%ymm1, -32(%rdi,%rdx)
++	VMOVU	(%rsi), %YMM0
++	VMOVU	-32(%rsi,%rdx), %YMM1
++	VMOVU	%YMM0, (%rdi)
++	VMOVU	%YMM1, -32(%rdi,%rdx)
+ 	VZEROUPPER
+ 	ret
+ #endif
+ #if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	vmovdqu	(%rsi), %xmm0
+-	vmovdqu	-16(%rsi,%rdx), %xmm1
+-	vmovdqu	%xmm0, (%rdi)
+-	vmovdqu	%xmm1, -16(%rdi,%rdx)
++	VMOVU	(%rsi), %XMM0
++	VMOVU	-16(%rsi,%rdx), %XMM1
++	VMOVU	%XMM0, (%rdi)
++	VMOVU	%XMM1, -16(%rdi,%rdx)
+ 	ret
+ #endif
+ L(between_8_15):
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-15.patch b/glibc-RHEL-15696-15.patch
new file mode 100644
index 0000000..72cd8cf
--- /dev/null
+++ b/glibc-RHEL-15696-15.patch
@@ -0,0 +1,254 @@
+From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 07:15:03 -0800
+Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
+with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
+abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
+function exit.
+---
+ sysdeps/x86_64/multiarch/Makefile             |  1 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 22 +++++++++++++++++
+ sysdeps/x86_64/multiarch/ifunc-memset.h       | 24 +++++++++++++++----
+ sysdeps/x86_64/multiarch/ifunc-wmemset.h      | 13 ++++++----
+ .../multiarch/memset-evex-unaligned-erms.S    | 24 +++++++++++++++++++
+ .../multiarch/memset-vec-unaligned-erms.S     | 20 +++++++++++-----
+ 6 files changed, 90 insertions(+), 14 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 4563fc56..1cc0a10e 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memchr-evex \
+ 		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
++		   memset-evex-unaligned-erms \
+ 		   rawmemchr-evex \
+ 		   stpcpy-evex \
+ 		   stpncpy-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 6bd3abfc..7cf83485 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_chk_avx2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memset_chk,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __memset_chk_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, __memset_chk,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memset_chk_avx512_unaligned_erms)
+@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_avx2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memset,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __memset_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, memset,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memset_avx512_unaligned_erms)
+@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_avx2_unaligned)
++	      IFUNC_IMPL_ADD (array, i, wmemset,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __wmemset_avx512_unaligned))
+@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_chk_avx2_unaligned)
++	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
++			      CPU_FEATURE_USABLE (AVX512VL),
++			      __wmemset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __wmemset_chk_avx512_unaligned))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 708bd72e..6f31f4dc 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
+   attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
++  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx2_unaligned_erms);
+-      else
+-	return OPTIMIZE (avx2_unaligned);
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (evex_unaligned_erms);
++
++	  return OPTIMIZE (evex_unaligned);
++	}
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx2_unaligned_erms);
++
++	  return OPTIMIZE (avx2_unaligned);
++	}
+     }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+index eb242210..9290c4bf 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+@@ -20,6 +20,7 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
+ 
+ static inline void *
+@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
++	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
++	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx512_unaligned);
+-      else
++
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
++	return OPTIMIZE (evex_unaligned);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2_unaligned);
+     }
+ 
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+new file mode 100644
+index 00000000..ae0a4d6e
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -0,0 +1,24 @@
++#if IS_IN (libc)	/* Parameter block: builds the shared memset template with ymm16.  */
++# define VEC_SIZE	32	/* Vector width in bytes (YMM).  */
++# define XMM0		xmm16	/* Used by the template's MOVQ and 16..31-byte path.  */
++# define YMM0		ymm16	/* Used by the template's 32..63-byte path (VEC_SIZE > 32 only).  */
++# define VEC0		ymm16	/* The single fill-pattern register.  */
++# define VEC(i)		VEC##i	/* VEC(0) pastes to VEC0.  */
++# define VMOVU		vmovdqu64	/* Mnemonic substituted for VMOVU (unaligned move).  */
++# define VMOVA		vmovdqa64	/* Mnemonic substituted for VMOVA (aligned move).  */
++# define VZEROUPPER	/* Defined empty so the template's "#ifndef VZEROUPPER" default (vzeroupper) is not used.  */
++
++# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
++  movq r, %rax; \
++  vpbroadcastb d, %VEC0	/* %rax = r; replicate the byte in d across VEC0.  */
++
++# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
++  movq r, %rax; \
++  vpbroadcastd d, %VEC0	/* %rax = r; replicate the 32-bit value in d across VEC0.  */
++
++# define SECTION(p)		p##.evex	/* Appends ".evex" to the section name passed by the template.  */
++# define MEMSET_SYMBOL(p,s)	p##_evex_##s	/* Builds symbol names of the form <p>_evex_<s>.  */
++# define WMEMSET_SYMBOL(p,s)	p##_evex_##s	/* Same naming scheme for the wmemset entry points.  */
++
++# include "memset-vec-unaligned-erms.S"	/* The implementation itself; everything above only parameterizes it.  */
++#endif
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 9a0fd818..71e91a8f 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -34,6 +34,14 @@
+ # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
+ #endif
+ 
++#ifndef XMM0
++# define XMM0				xmm0
++#endif
++
++#ifndef YMM0
++# define YMM0				ymm0
++#endif
++
+ #ifndef VZEROUPPER
+ # if VEC_SIZE > 16
+ #  define VZEROUPPER			vzeroupper
+@@ -67,7 +75,7 @@
+ ENTRY (__bzero)
+ 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+ 	mov	%RSI_LP, %RDX_LP /* Set n.  */
+-	pxor	%xmm0, %xmm0
++	pxor	%XMM0, %XMM0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+ weak_alias (__bzero, bzero)
+@@ -223,7 +231,7 @@ L(less_vec):
+ 	cmpb	$16, %dl
+ 	jae	L(between_16_31)
+ # endif
+-	MOVQ	%xmm0, %rcx
++	MOVQ	%XMM0, %rcx
+ 	cmpb	$8, %dl
+ 	jae	L(between_8_15)
+ 	cmpb	$4, %dl
+@@ -238,16 +246,16 @@ L(less_vec):
+ # if VEC_SIZE > 32
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	vmovdqu	%ymm0, -32(%rdi,%rdx)
+-	vmovdqu	%ymm0, (%rdi)
++	VMOVU	%YMM0, -32(%rdi,%rdx)
++	VMOVU	%YMM0, (%rdi)
+ 	VZEROUPPER
+ 	ret
+ # endif
+ # if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	vmovdqu	%xmm0, -16(%rdi,%rdx)
+-	vmovdqu	%xmm0, (%rdi)
++	VMOVU	%XMM0, -16(%rdi,%rdx)
++	VMOVU	%XMM0, (%rdi)
+ 	VZEROUPPER
+ 	ret
+ # endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-16.patch b/glibc-RHEL-15696-16.patch
new file mode 100644
index 0000000..b3f443d
--- /dev/null
+++ b/glibc-RHEL-15696-16.patch
@@ -0,0 +1,561 @@
+From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 07:20:28 -0800
+Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
+instructions using YMM16-YMM31 registers to avoid RTM abort with usable
+AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
+exit.
+---
+ sysdeps/x86_64/multiarch/Makefile             |   4 +-
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  10 +
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h       |  13 +-
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S  | 440 ++++++++++++++++++
+ sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S |   4 +
+ 5 files changed, 467 insertions(+), 4 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 1cc0a10e..9d79b138 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memset-avx2-unaligned-erms \
+ 		   memset-avx512-unaligned-erms \
+ 		   memchr-evex \
++		   memcmp-evex-movbe \
+ 		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
+ 		   memset-evex-unaligned-erms \
+@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcsncmp-evex \
+ 		   wcsnlen-evex \
+ 		   wcsrchr-evex \
+-		   wmemchr-evex
++		   wmemchr-evex \
++		   wmemcmp-evex-movbe
+ endif
+ 
+ ifeq ($(subdir),debug)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 7cf83485..c8da910e 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_avx2_movbe)
++	      IFUNC_IMPL_ADD (array, i, memcmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (MOVBE)),
++			      __memcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
+ 			      __memcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
+@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_avx2_movbe)
++	      IFUNC_IMPL_ADD (array, i, wmemcmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (MOVBE)),
++			      __wmemcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
+ 			      __wmemcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 6c1f3153..3ca1f0a6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2_movbe);
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	return OPTIMIZE (evex_movbe);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2_movbe);
++    }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+     return OPTIMIZE (sse4_1);
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+new file mode 100644
+index 00000000..9c093972
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -0,0 +1,440 @@
++/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#if IS_IN (libc)
++
++/* memcmp/wmemcmp is implemented as:
++   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
++      to avoid branches.
++   2. Use overlapping compare to avoid branch.
++   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
++      bytes for wmemcmp.
++   4. If size is 8 * VEC_SIZE or less, unroll the loop.
++   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
++      area.
++   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
++   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
++   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
++
++# include <sysdep.h>
++
++# ifndef MEMCMP
++#  define MEMCMP	__memcmp_evex_movbe
++# endif
++
++# define VMOVU		vmovdqu64
++
++# ifdef USE_AS_WMEMCMP
++#  define VPCMPEQ	vpcmpeqd
++# else
++#  define VPCMPEQ	vpcmpeqb
++# endif
++
++# define XMM1		xmm17
++# define XMM2		xmm18
++# define YMM1		ymm17
++# define YMM2		ymm18
++# define YMM3		ymm19
++# define YMM4		ymm20
++# define YMM5		ymm21
++# define YMM6		ymm22
++
++# define VEC_SIZE 32
++# ifdef USE_AS_WMEMCMP
++#  define VEC_MASK 0xff
++#  define XMM_MASK 0xf
++# else
++#  define VEC_MASK 0xffffffff
++#  define XMM_MASK 0xffff
++# endif
++
++/* Warning!
++           wmemcmp has to use SIGNED comparison for elements.
++           memcmp has to use UNSIGNED comparison for elemnts.
++*/
++
++	.section .text.evex,"ax",@progbits
++ENTRY (MEMCMP)
++# ifdef USE_AS_WMEMCMP
++	shl	$2, %RDX_LP
++# elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++# endif
++	cmp	$VEC_SIZE, %RDX_LP
++	jb	L(less_vec)
++
++	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k1
++	kmovd	%k1, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++	cmpq	$(VEC_SIZE * 2), %rdx
++	jbe	L(last_vec)
++
++	/* More than 2 * VEC.  */
++	cmpq	$(VEC_SIZE * 8), %rdx
++	ja	L(more_8x_vec)
++	cmpq	$(VEC_SIZE * 4), %rdx
++	jb	L(last_4x_vec)
++
++	/* From 4 * VEC to 8 * VEC, inclusively. */
++	VMOVU	(%rsi), %YMM1
++	VPCMPEQ (%rdi), %YMM1, %k1
++
++	VMOVU	VEC_SIZE(%rsi), %YMM2
++	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
++
++	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
++	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
++
++	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
++	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
++
++	kandd	%k1, %k2, %k5
++	kandd	%k3, %k4, %k6
++	kandd	%k5, %k6, %k6
++
++	kmovd	%k6, %eax
++	cmpl	$VEC_MASK, %eax
++	jne	L(4x_vec_end)
++
++	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
++	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
++	VMOVU	(%rsi), %YMM1
++	VPCMPEQ (%rdi), %YMM1, %k1
++
++	VMOVU	VEC_SIZE(%rsi), %YMM2
++	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
++	kandd	%k1, %k2, %k5
++
++	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
++	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
++	kandd	%k3, %k5, %k5
++
++	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
++	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
++	kandd	%k4, %k5, %k5
++
++	kmovd	%k5, %eax
++	cmpl	$VEC_MASK, %eax
++	jne	L(4x_vec_end)
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(last_2x_vec):
++	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++L(last_vec):
++	/* Use overlapping loads to avoid branches.  */
++	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
++	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++	ret
++
++	.p2align 4
++L(first_vec):
++	/* A byte or int32 is different within 16 or 32 bytes.  */
++	tzcntl	%eax, %ecx
++# ifdef USE_AS_WMEMCMP
++	xorl	%eax, %eax
++	movl	(%rdi, %rcx, 4), %edx
++	cmpl	(%rsi, %rcx, 4), %edx
++L(wmemcmp_return):
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
++# else
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %edx
++	sub	%edx, %eax
++# endif
++	ret
++
++# ifdef USE_AS_WMEMCMP
++	.p2align 4
++L(4):
++	xorl	%eax, %eax
++	movl	(%rdi), %edx
++	cmpl	(%rsi), %edx
++	jne	L(wmemcmp_return)
++	ret
++# else
++	.p2align 4
++L(between_4_7):
++	/* Load as big endian with overlapping movbe to avoid branches.  */
++	movbe	(%rdi), %eax
++	movbe	(%rsi), %ecx
++	shlq	$32, %rax
++	shlq	$32, %rcx
++	movbe	-4(%rdi, %rdx), %edi
++	movbe	-4(%rsi, %rdx), %esi
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	je	L(exit)
++	sbbl	%eax, %eax
++	orl	$1, %eax
++	ret
++
++	.p2align 4
++L(exit):
++	ret
++
++	.p2align 4
++L(between_2_3):
++	/* Load as big endian to avoid branches.  */
++	movzwl	(%rdi), %eax
++	movzwl	(%rsi), %ecx
++	shll	$8, %eax
++	shll	$8, %ecx
++	bswap	%eax
++	bswap	%ecx
++	movb	-1(%rdi, %rdx), %al
++	movb	-1(%rsi, %rdx), %cl
++	/* Subtraction is okay because the upper 8 bits are zero.  */
++	subl	%ecx, %eax
++	ret
++
++	.p2align 4
++L(1):
++	movzbl	(%rdi), %eax
++	movzbl	(%rsi), %ecx
++	subl	%ecx, %eax
++	ret
++# endif
++
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
++	ret
++
++	.p2align 4
++L(less_vec):
++# ifdef USE_AS_WMEMCMP
++	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
++	cmpb	$4, %dl
++	je	L(4)
++	jb	L(zero)
++# else
++	cmpb	$1, %dl
++	je	L(1)
++	jb	L(zero)
++	cmpb	$4, %dl
++	jb	L(between_2_3)
++	cmpb	$8, %dl
++	jb	L(between_4_7)
++# endif
++	cmpb	$16, %dl
++	jae	L(between_16_31)
++	/* It is between 8 and 15 bytes.  */
++	vmovq	(%rdi), %XMM1
++	vmovq	(%rsi), %XMM2
++	VPCMPEQ %XMM1, %XMM2, %k2
++	kmovw	%k2, %eax
++	subl    $XMM_MASK, %eax
++	jnz	L(first_vec)
++	/* Use overlapping loads to avoid branches.  */
++	leaq	-8(%rdi, %rdx), %rdi
++	leaq	-8(%rsi, %rdx), %rsi
++	vmovq	(%rdi), %XMM1
++	vmovq	(%rsi), %XMM2
++	VPCMPEQ %XMM1, %XMM2, %k2
++	kmovw	%k2, %eax
++	subl    $XMM_MASK, %eax
++	jnz	L(first_vec)
++	ret
++
++	.p2align 4
++L(between_16_31):
++	/* From 16 to 31 bytes.  No branch when size == 16.  */
++	VMOVU	(%rsi), %XMM2
++	VPCMPEQ (%rdi), %XMM2, %k2
++	kmovw	%k2, %eax
++	subl    $XMM_MASK, %eax
++	jnz	L(first_vec)
++
++	/* Use overlapping loads to avoid branches.  */
++	leaq	-16(%rdi, %rdx), %rdi
++	leaq	-16(%rsi, %rdx), %rsi
++	VMOVU	(%rsi), %XMM2
++	VPCMPEQ (%rdi), %XMM2, %k2
++	kmovw	%k2, %eax
++	subl    $XMM_MASK, %eax
++	jnz	L(first_vec)
++	ret
++
++	.p2align 4
++L(more_8x_vec):
++	/* More than 8 * VEC.  Check the first VEC.  */
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++	/* Align the first memory area for aligned loads in the loop.
++	   Compute how much the first memory area is misaligned.  */
++	movq	%rdi, %rcx
++	andl	$(VEC_SIZE - 1), %ecx
++	/* Get the negative of offset for alignment.  */
++	subq	$VEC_SIZE, %rcx
++	/* Adjust the second memory area.  */
++	subq	%rcx, %rsi
++	/* Adjust the first memory area which should be aligned now.  */
++	subq	%rcx, %rdi
++	/* Adjust length.  */
++	addq	%rcx, %rdx
++
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	VMOVU	(%rsi), %YMM1
++	VPCMPEQ (%rdi), %YMM1, %k1
++
++	VMOVU	VEC_SIZE(%rsi), %YMM2
++	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
++	kandd	%k2, %k1, %k5
++
++	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
++	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
++	kandd	%k3, %k5, %k5
++
++	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
++	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
++	kandd	%k4, %k5, %k5
++
++	kmovd	%k5, %eax
++	cmpl	$VEC_MASK, %eax
++	jne	L(4x_vec_end)
++
++	addq	$(VEC_SIZE * 4), %rdi
++	addq	$(VEC_SIZE * 4), %rsi
++
++	subq	$(VEC_SIZE * 4), %rdx
++	cmpq	$(VEC_SIZE * 4), %rdx
++	jae	L(loop_4x_vec)
++
++	/* Less than 4 * VEC.  */
++	cmpq	$VEC_SIZE, %rdx
++	jbe	L(last_vec)
++	cmpq	$(VEC_SIZE * 2), %rdx
++	jbe	L(last_2x_vec)
++
++L(last_4x_vec):
++	/* From 2 * VEC to 4 * VEC. */
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++	addq	$VEC_SIZE, %rdi
++	addq	$VEC_SIZE, %rsi
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++	/* Use overlapping loads to avoid branches.  */
++	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
++	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++
++	addq	$VEC_SIZE, %rdi
++	addq	$VEC_SIZE, %rsi
++	VMOVU	(%rsi), %YMM2
++	VPCMPEQ (%rdi), %YMM2, %k2
++	kmovd	%k2, %eax
++	subl    $VEC_MASK, %eax
++	jnz	L(first_vec)
++	ret
++
++	.p2align 4
++L(4x_vec_end):
++	kmovd	%k1, %eax
++	subl	$VEC_MASK, %eax
++	jnz	L(first_vec)
++	kmovd	%k2, %eax
++	subl	$VEC_MASK, %eax
++	jnz	L(first_vec_x1)
++	kmovd	%k3, %eax
++	subl	$VEC_MASK, %eax
++	jnz	L(first_vec_x2)
++	kmovd	%k4, %eax
++	subl	$VEC_MASK, %eax
++	tzcntl	%eax, %ecx
++# ifdef USE_AS_WMEMCMP
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
++	jmp	L(wmemcmp_return)
++# else
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
++	sub	%edx, %eax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x1):
++	tzcntl	%eax, %ecx
++# ifdef USE_AS_WMEMCMP
++	xorl	%eax, %eax
++	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
++	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
++	jmp	L(wmemcmp_return)
++# else
++	movzbl	VEC_SIZE(%rdi, %rcx), %eax
++	movzbl	VEC_SIZE(%rsi, %rcx), %edx
++	sub	%edx, %eax
++# endif
++	ret
++
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %ecx
++# ifdef USE_AS_WMEMCMP
++	xorl	%eax, %eax
++	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
++	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
++	jmp	L(wmemcmp_return)
++# else
++	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
++	sub	%edx, %eax
++# endif
++	ret
++END (MEMCMP)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+new file mode 100644
+index 00000000..4726d74a
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+@@ -0,0 +1,4 @@
++#define MEMCMP __wmemcmp_evex_movbe
++#define USE_AS_WMEMCMP 1
++
++#include "memcmp-evex-movbe.S"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-17.patch b/glibc-RHEL-15696-17.patch
new file mode 100644
index 0000000..3176514
--- /dev/null
+++ b/glibc-RHEL-15696-17.patch
@@ -0,0 +1,2568 @@
+From 7ebba91361badf7531d4e75050627a88d424872f Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 07:26:42 -0800
+Subject: [PATCH] x86-64: Add AVX optimized string/memory functions for RTM
+Content-type: text/plain; charset=UTF-8
+
+Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX
+optimized string/memory functions with
+
+	xtest
+	jz	1f
+	vzeroall
+	ret
+1:
+	vzeroupper
+	ret
+
+at function exit on processors with usable RTM, but without 256-bit EVEX
+instructions to avoid VZEROUPPER inside a transactionally executing RTM
+region.
+---
+ sysdeps/x86_64/multiarch/Makefile             |  27 +++
+ sysdeps/x86_64/multiarch/ifunc-avx2.h         |   4 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 170 ++++++++++++++++++
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h       |   4 +
+ sysdeps/x86_64/multiarch/ifunc-memmove.h      |  12 ++
+ sysdeps/x86_64/multiarch/ifunc-memset.h       |  12 ++
+ sysdeps/x86_64/multiarch/ifunc-strcpy.h       |   4 +
+ sysdeps/x86_64/multiarch/ifunc-wmemset.h      |   5 +
+ sysdeps/x86_64/multiarch/memchr-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/memchr-avx2.S        |  45 +++--
+ .../x86_64/multiarch/memcmp-avx2-movbe-rtm.S  |  12 ++
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S  |  28 ++-
+ .../memmove-avx-unaligned-erms-rtm.S          |  17 ++
+ .../multiarch/memmove-vec-unaligned-erms.S    |  33 ++--
+ sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S   |  12 ++
+ sysdeps/x86_64/multiarch/memrchr-avx2.S       |  53 +++---
+ .../memset-avx2-unaligned-erms-rtm.S          |  10 ++
+ .../multiarch/memset-avx2-unaligned-erms.S    |  12 +-
+ .../multiarch/memset-vec-unaligned-erms.S     |  41 ++---
+ sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S |   4 +
+ sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S    |   3 +
+ sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S   |   4 +
+ sysdeps/x86_64/multiarch/strcat-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/strcat-avx2.S        |   6 +-
+ sysdeps/x86_64/multiarch/strchr-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/strchr-avx2.S        |  22 +--
+ sysdeps/x86_64/multiarch/strchr.c             |   4 +
+ sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S |   3 +
+ sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/strcmp-avx2.S        |  55 +++---
+ sysdeps/x86_64/multiarch/strcmp.c             |   4 +
+ sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/strcpy-avx2.S        |  85 ++++-----
+ sysdeps/x86_64/multiarch/strlen-avx2-rtm.S    |  12 ++
+ sysdeps/x86_64/multiarch/strlen-avx2.S        |  43 ++---
+ sysdeps/x86_64/multiarch/strncat-avx2-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/strncmp.c            |   4 +
+ sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S   |   4 +
+ sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S   |  12 ++
+ sysdeps/x86_64/multiarch/strrchr-avx2.S       |  19 +-
+ sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S    |   3 +
+ sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S    |   4 +
+ sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S    |   4 +
+ sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S   |   5 +
+ sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S   |   5 +
+ sysdeps/x86_64/multiarch/wcsnlen.c            |   4 +
+ sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S   |   4 +
+ .../x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S |   4 +
+ sysdeps/x86_64/sysdep.h                       |  22 +++
+ 52 files changed, 668 insertions(+), 244 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strchr-avx2.S
+	(same fix, different location)
+
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 9d79b138..491c7698 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -40,6 +40,25 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memset-sse2-unaligned-erms \
+ 		   memset-avx2-unaligned-erms \
+ 		   memset-avx512-unaligned-erms \
++		   memchr-avx2-rtm \
++		   memcmp-avx2-movbe-rtm \
++		   memmove-avx-unaligned-erms-rtm \
++		   memrchr-avx2-rtm \
++		   memset-avx2-unaligned-erms-rtm \
++		   rawmemchr-avx2-rtm \
++		   strchr-avx2-rtm \
++		   strcmp-avx2-rtm \
++		   strchrnul-avx2-rtm \
++		   stpcpy-avx2-rtm \
++		   stpncpy-avx2-rtm \
++		   strcat-avx2-rtm \
++		   strcpy-avx2-rtm \
++		   strlen-avx2-rtm \
++		   strncat-avx2-rtm \
++		   strncmp-avx2-rtm \
++		   strncpy-avx2-rtm \
++		   strnlen-avx2-rtm \
++		   strrchr-avx2-rtm \
+ 		   memchr-evex \
+ 		   memcmp-evex-movbe \
+ 		   memmove-evex-unaligned-erms \
+@@ -76,6 +95,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcsrchr-sse2 wcsrchr-avx2 \
+ 		   wcsnlen-sse4_1 wcsnlen-c \
+ 		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
++		   wcschr-avx2-rtm \
++		   wcscmp-avx2-rtm \
++		   wcslen-avx2-rtm \
++		   wcsncmp-avx2-rtm \
++		   wcsnlen-avx2-rtm \
++		   wcsrchr-avx2-rtm \
++		   wmemchr-avx2-rtm \
++		   wmemcmp-avx2-movbe-rtm \
+ 		   wcschr-evex \
+ 		   wcscmp-evex \
+ 		   wcslen-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+index 7081b0c9..e0f30e61 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+@@ -21,6 +21,7 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -36,6 +37,9 @@ IFUNC_SELECTOR (void)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index c8da910e..c1efeec0 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -43,6 +43,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, memchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -56,6 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_avx2_movbe)
++	      IFUNC_IMPL_ADD (array, i, memcmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (MOVBE)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -85,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memmove_chk_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memmove_chk_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_chk_evex_unaligned)
+@@ -113,6 +130,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memmove,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memmove_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, memmove,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memmove_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_evex_unaligned)
+@@ -143,6 +168,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, memrchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memrchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memrchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -165,6 +194,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_chk_avx2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memset_chk,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memset_chk_avx2_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, __memset_chk,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memset_chk_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -198,6 +235,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_avx2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memset,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memset_avx2_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, memset,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memset_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -222,6 +267,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __rawmemchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, rawmemchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __rawmemchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -234,6 +283,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, strlen,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -245,6 +298,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strnlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, strnlen,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -257,6 +314,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __stpncpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __stpncpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, stpncpy,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __stpncpy_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, stpncpy,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -271,6 +332,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __stpcpy_ssse3)
+ 	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __stpcpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, stpcpy,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __stpcpy_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, stpcpy,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -309,6 +374,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strcat,
+ 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2),
+ 			      __strcat_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcat,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strcat_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strcat,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -323,6 +392,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, strchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -336,6 +409,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strchrnul_avx2)
++	      IFUNC_IMPL_ADD (array, i, strchrnul,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strchrnul_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -348,6 +425,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, strrchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strrchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strrchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -359,6 +440,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strcmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strcmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strcmp_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -375,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strcpy,
+ 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __strcpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcpy,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strcpy_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strcpy,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -422,6 +511,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strncat,
+ 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2),
+ 			      __strncat_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncat,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strncat_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strncat,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -436,6 +529,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, strncpy,
+ 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2),
+ 			      __strncpy_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncpy,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strncpy_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strncpy,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+@@ -469,6 +566,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcschr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcschr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcschr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -481,6 +582,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsrchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsrchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcsrchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -493,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcscmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcscmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcscmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcscmp_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcscmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -505,6 +614,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsncmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsncmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsncmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcsncmp_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcsncmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -523,6 +636,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcslen_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcslen,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcslen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -535,6 +652,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wcsnlen_avx2)
++	      IFUNC_IMPL_ADD (array, i, wcsnlen,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wcsnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -550,6 +671,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemchr_avx2)
++	      IFUNC_IMPL_ADD (array, i, wmemchr,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wmemchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -563,6 +688,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_avx2_movbe)
++	      IFUNC_IMPL_ADD (array, i, wmemcmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (MOVBE)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wmemcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+@@ -581,6 +711,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_avx2_unaligned)
++	      IFUNC_IMPL_ADD (array, i, wmemset,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __wmemset_avx2_unaligned_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __wmemset_evex_unaligned)
+@@ -606,6 +740,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memcpy_chk_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memcpy_chk_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_chk_evex_unaligned)
+@@ -634,6 +776,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, memcpy,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memcpy_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, memcpy,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __memcpy_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_evex_unaligned)
+@@ -676,6 +826,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_chk_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __mempcpy_chk_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __mempcpy_chk_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_chk_evex_unaligned)
+@@ -713,6 +871,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_avx_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, mempcpy,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __mempcpy_avx_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, mempcpy,
++			      (CPU_FEATURE_USABLE (AVX)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __mempcpy_avx_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_evex_unaligned)
+@@ -734,6 +900,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, strncmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strncmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strncmp_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strncmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 3ca1f0a6..8043c635 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+ 
+ static inline void *
+@@ -38,6 +39,9 @@ IFUNC_SELECTOR (void)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex_movbe);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_movbe_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2_movbe);
+     }
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+index 6f8bce5f..fa09b9fb 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
+   attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
++  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+@@ -71,6 +75,14 @@ IFUNC_SELECTOR (void)
+ 	  return OPTIMIZE (evex_unaligned);
+ 	}
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx_unaligned_erms_rtm);
++
++	  return OPTIMIZE (avx_unaligned_rtm);
++	}
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 6f31f4dc..6f3375cc 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
+   attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm)
++  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+@@ -69,6 +73,14 @@ IFUNC_SELECTOR (void)
+ 	  return OPTIMIZE (evex_unaligned);
+ 	}
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx2_unaligned_erms_rtm);
++
++	  return OPTIMIZE (avx2_unaligned_rtm);
++	}
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+index deae6348..a924762e 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
++++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -39,6 +40,9 @@ IFUNC_SELECTOR (void)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+index 9290c4bf..bdc94c6c 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+@@ -20,6 +20,8 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
++  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
+ 
+@@ -39,6 +41,9 @@ IFUNC_SELECTOR (void)
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ 	return OPTIMIZE (evex_unaligned);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_unaligned_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2_unaligned);
+     }
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+new file mode 100644
+index 00000000..87b076c7
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef MEMCHR
++# define MEMCHR __memchr_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "memchr-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index c81da19b..cf893e77 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -34,9 +34,13 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE 32
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+@@ -107,8 +111,8 @@ L(cros_page_boundary):
+ # endif
+ 	addq	%rdi, %rax
+ 	addq	%rcx, %rax
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(aligned_more):
+@@ -224,8 +228,7 @@ L(last_4x_vec_or_less):
+ 
+ 	jnz	L(first_vec_x3_check)
+ 	xorl	%eax, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_2x_vec):
+@@ -243,8 +246,7 @@ L(last_2x_vec):
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1_check)
+ 	xorl	%eax, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x0_check):
+@@ -253,8 +255,7 @@ L(first_vec_x0_check):
+ 	cmpq	%rax, %rdx
+ 	jbe	L(zero)
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1_check):
+@@ -264,8 +265,7 @@ L(first_vec_x1_check):
+ 	jbe	L(zero)
+ 	addq	$VEC_SIZE, %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2_check):
+@@ -275,8 +275,7 @@ L(first_vec_x2_check):
+ 	jbe	L(zero)
+ 	addq	$(VEC_SIZE * 2), %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x3_check):
+@@ -286,12 +285,14 @@ L(first_vec_x3_check):
+ 	jbe	L(zero)
+ 	addq	$(VEC_SIZE * 3), %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(zero):
+-	VZEROUPPER
++	xorl	%eax, %eax
++	jmp     L(return_vzeroupper)
++
++	.p2align 4
+ L(null):
+ 	xorl	%eax, %eax
+ 	ret
+@@ -301,24 +302,21 @@ L(null):
+ L(first_vec_x0):
+ 	tzcntl	%eax, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+ 	addq	$VEC_SIZE, %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+ 	addq	$(VEC_SIZE * 2), %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(4x_vec_end):
+@@ -337,8 +335,7 @@ L(first_vec_x3):
+ 	tzcntl	%eax, %eax
+ 	addq	$(VEC_SIZE * 3), %rax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ END (MEMCHR)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
+new file mode 100644
+index 00000000..cf4eff5d
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef MEMCMP
++# define MEMCMP __memcmp_avx2_movbe_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "memcmp-avx2-movbe.S"
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index e3a35b89..9d5c9c72 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -47,6 +47,10 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE 32
+ # define VEC_MASK ((1 << VEC_SIZE) - 1)
+ 
+@@ -55,7 +59,7 @@
+            memcmp has to use UNSIGNED comparison for elemnts.
+ */
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+ 	shl	$2, %RDX_LP
+@@ -123,8 +127,8 @@ ENTRY (MEMCMP)
+ 	vptest	%ymm0, %ymm5
+ 	jnc	L(4x_vec_end)
+ 	xorl	%eax, %eax
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(last_2x_vec):
+@@ -144,8 +148,7 @@ L(last_vec):
+ 	vpmovmskb %ymm2, %eax
+ 	subl    $VEC_MASK, %eax
+ 	jnz	L(first_vec)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec):
+@@ -164,8 +167,7 @@ L(wmemcmp_return):
+ 	movzbl	(%rsi, %rcx), %edx
+ 	sub	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_WMEMCMP
+ 	.p2align 4
+@@ -367,8 +369,7 @@ L(last_4x_vec):
+ 	vpmovmskb %ymm2, %eax
+ 	subl    $VEC_MASK, %eax
+ 	jnz	L(first_vec)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(4x_vec_end):
+@@ -394,8 +395,7 @@ L(4x_vec_end):
+ 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+ 	sub	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+@@ -410,8 +410,7 @@ L(first_vec_x1):
+ 	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+ 	sub	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2):
+@@ -426,7 +425,6 @@ L(first_vec_x2):
+ 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+ 	sub	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ END (MEMCMP)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+new file mode 100644
+index 00000000..1ec1962e
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+@@ -0,0 +1,17 @@
++#if IS_IN (libc)
++# define VEC_SIZE	32
++# define VEC(i)		ymm##i
++# define VMOVNT		vmovntdq
++# define VMOVU		vmovdqu
++# define VMOVA		vmovdqa
++
++# define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++# define VZEROUPPER_RETURN jmp	 L(return)
++
++# define SECTION(p)		p##.avx.rtm
++# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
++
++# include "memmove-vec-unaligned-erms.S"
++#endif
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 08e21692..71f5954d 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -140,11 +140,12 @@ L(last_2x_vec):
+ 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+-	VZEROUPPER
+ #if !defined USE_MULTIARCH || !IS_IN (libc)
+ L(nop):
+-#endif
+ 	ret
++#else
++	VZEROUPPER_RETURN
++#endif
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ END (MEMMOVE_SYMBOL (__memmove, unaligned))
+ 
+@@ -237,8 +238,11 @@ L(last_2x_vec):
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+ L(return):
+-	VZEROUPPER
++#if VEC_SIZE > 16
++	ZERO_UPPER_VEC_REGISTERS_RETURN
++#else
+ 	ret
++#endif
+ 
+ L(movsb):
+ 	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+@@ -289,8 +293,7 @@ L(between_32_63):
+ 	VMOVU	-32(%rsi,%rdx), %YMM1
+ 	VMOVU	%YMM0, (%rdi)
+ 	VMOVU	%YMM1, -32(%rdi,%rdx)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ #endif
+ #if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+@@ -299,7 +302,7 @@ L(between_16_31):
+ 	VMOVU	-16(%rsi,%rdx), %XMM1
+ 	VMOVU	%XMM0, (%rdi)
+ 	VMOVU	%XMM1, -16(%rdi,%rdx)
+-	ret
++	VZEROUPPER_RETURN
+ #endif
+ L(between_8_15):
+ 	/* From 8 to 15.  No branch when size == 8.  */
+@@ -352,8 +355,7 @@ L(more_2x_vec):
+ 	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+ 	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+ 	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ L(last_4x_vec):
+ 	/* Copy from 2 * VEC to 4 * VEC. */
+ 	VMOVU	(%rsi), %VEC(0)
+@@ -364,8 +366,7 @@ L(last_4x_vec):
+ 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+ 	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+ 	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ L(more_8x_vec):
+ 	cmpq	%rsi, %rdi
+@@ -421,8 +422,7 @@ L(loop_4x_vec_forward):
+ 	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+ 	/* Store the first VEC.  */
+ 	VMOVU	%VEC(4), (%r11)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ L(more_8x_vec_backward):
+ 	/* Load the first 4 * VEC and last VEC to support overlapping
+@@ -473,8 +473,7 @@ L(loop_4x_vec_backward):
+ 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+ 	/* Store the last VEC.  */
+ 	VMOVU	%VEC(8), (%r11)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ L(large_forward):
+@@ -509,8 +508,7 @@ L(loop_large_forward):
+ 	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+ 	/* Store the first VEC.  */
+ 	VMOVU	%VEC(4), (%r11)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ L(large_backward):
+ 	/* Don't use non-temporal store if there is overlap between
+@@ -544,8 +542,7 @@ L(loop_large_backward):
+ 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+ 	/* Store the last VEC.  */
+ 	VMOVU	%VEC(8), (%r11)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ #endif
+ END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ 
+diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+new file mode 100644
+index 00000000..cea2d2a7
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef MEMRCHR
++# define MEMRCHR __memrchr_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "memrchr-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+index ce488dd9..20efe7ac 100644
+--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+@@ -20,14 +20,22 @@
+ 
+ # include <sysdep.h>
+ 
++# ifndef MEMRCHR
++#  define MEMRCHR	__memrchr_avx2
++# endif
++
+ # ifndef VZEROUPPER
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE 32
+ 
+-	.section .text.avx,"ax",@progbits
+-ENTRY (__memrchr_avx2)
++	.section SECTION(.text),"ax",@progbits
++ENTRY (MEMRCHR)
+ 	/* Broadcast CHAR to YMM0.  */
+ 	vmovd	%esi, %xmm0
+ 	vpbroadcastb %xmm0, %ymm0
+@@ -134,8 +142,8 @@ L(loop_4x_vec):
+ 	vpmovmskb %ymm1, %eax
+ 	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(last_4x_vec_or_less):
+@@ -169,8 +177,7 @@ L(last_4x_vec_or_less):
+ 	addq	%rax, %rdx
+ 	jl	L(zero)
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_2x_vec):
+@@ -191,31 +198,27 @@ L(last_2x_vec):
+ 	jl	L(zero)
+ 	addl	$(VEC_SIZE * 2), %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_x0):
+ 	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_x1):
+ 	bsrl	%eax, %eax
+ 	addl	$VEC_SIZE, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_x2):
+ 	bsrl	%eax, %eax
+ 	addl	$(VEC_SIZE * 2), %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_x3):
+@@ -232,8 +235,7 @@ L(last_vec_x1_check):
+ 	jl	L(zero)
+ 	addl	$VEC_SIZE, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_x3_check):
+@@ -243,12 +245,14 @@ L(last_vec_x3_check):
+ 	jl	L(zero)
+ 	addl	$(VEC_SIZE * 3), %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(zero):
+-	VZEROUPPER
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
++
++	.p2align 4
+ L(null):
+ 	xorl	%eax, %eax
+ 	ret
+@@ -273,8 +277,7 @@ L(last_vec_or_less_aligned):
+ 
+ 	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_or_less):
+@@ -315,8 +318,7 @@ L(last_vec_or_less):
+ 	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+ 	addq	%r8, %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_vec_2x_aligned):
+@@ -353,7 +355,6 @@ L(last_vec_2x_aligned):
+ 	bsrl	%eax, %eax
+ 	addq	%rdi, %rax
+ 	addq	%r8, %rax
+-	VZEROUPPER
+-	ret
+-END (__memrchr_avx2)
++	VZEROUPPER_RETURN
++END (MEMRCHR)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+new file mode 100644
+index 00000000..8ac3e479
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+@@ -0,0 +1,10 @@
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return)
++
++#define SECTION(p) p##.avx.rtm
++#define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
++#define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
++
++#include "memset-avx2-unaligned-erms.S"
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index 7ab3d898..ae0860f3 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -14,9 +14,15 @@
+   movq r, %rax; \
+   vpbroadcastd %xmm0, %ymm0
+ 
+-# define SECTION(p)		p##.avx
+-# define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+-# define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
++# ifndef SECTION
++#  define SECTION(p)		p##.avx
++# endif
++# ifndef MEMSET_SYMBOL
++#  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
++# endif
++# ifndef WMEMSET_SYMBOL
++#  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
++# endif
+ 
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 71e91a8f..bae5cba4 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -45,17 +45,14 @@
+ #ifndef VZEROUPPER
+ # if VEC_SIZE > 16
+ #  define VZEROUPPER			vzeroupper
++#  define VZEROUPPER_SHORT_RETURN	vzeroupper; ret
+ # else
+ #  define VZEROUPPER
+ # endif
+ #endif
+ 
+ #ifndef VZEROUPPER_SHORT_RETURN
+-# if VEC_SIZE > 16
+-#  define VZEROUPPER_SHORT_RETURN	vzeroupper
+-# else
+-#  define VZEROUPPER_SHORT_RETURN	rep
+-# endif
++# define VZEROUPPER_SHORT_RETURN	rep; ret
+ #endif
+ 
+ #ifndef MOVQ
+@@ -117,8 +114,7 @@ L(entry_from_bzero):
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ END (MEMSET_SYMBOL (__memset, unaligned))
+ 
+@@ -141,14 +137,12 @@ ENTRY (__memset_erms)
+ ENTRY (MEMSET_SYMBOL (__memset, erms))
+ # endif
+ L(stosb):
+-	/* Issue vzeroupper before rep stosb.  */
+-	VZEROUPPER
+ 	mov	%RDX_LP, %RCX_LP
+ 	movzbl	%sil, %eax
+ 	mov	%RDI_LP, %RDX_LP
+ 	rep stosb
+ 	mov	%RDX_LP, %RAX_LP
+-	ret
++	VZEROUPPER_RETURN
+ # if VEC_SIZE == 16
+ END (__memset_erms)
+ # else
+@@ -175,8 +169,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ L(stosb_more_2x_vec):
+ 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+@@ -190,8 +183,11 @@ L(more_2x_vec):
+ 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+ 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+ L(return):
+-	VZEROUPPER
++#if VEC_SIZE > 16
++	ZERO_UPPER_VEC_REGISTERS_RETURN
++#else
+ 	ret
++#endif
+ 
+ L(loop_start):
+ 	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+@@ -217,7 +213,6 @@ L(loop):
+ 	cmpq	%rcx, %rdx
+ 	jne	L(loop)
+ 	VZEROUPPER_SHORT_RETURN
+-	ret
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+@@ -241,40 +236,34 @@ L(less_vec):
+ 	jb	1f
+ 	movb	%cl, (%rdi)
+ 1:
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ # if VEC_SIZE > 32
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+ 	VMOVU	%YMM0, -32(%rdi,%rdx)
+ 	VMOVU	%YMM0, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ # endif
+ # if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+ 	VMOVU	%XMM0, -16(%rdi,%rdx)
+ 	VMOVU	%XMM0, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ # endif
+ 	/* From 8 to 15.  No branch when size == 8.  */
+ L(between_8_15):
+ 	movq	%rcx, -8(%rdi,%rdx)
+ 	movq	%rcx, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+ 	movl	%ecx, -4(%rdi,%rdx)
+ 	movl	%ecx, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+ 	movw	%cx, -2(%rdi,%rdx)
+ 	movw	%cx, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
+new file mode 100644
+index 00000000..acc5f6e2
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define MEMCHR __rawmemchr_avx2_rtm
++#define USE_AS_RAWMEMCHR 1
++
++#include "memchr-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+new file mode 100644
+index 00000000..2b9c07a5
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STPCPY
++#define STRCPY __stpcpy_avx2_rtm
++#include "strcpy-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+new file mode 100644
+index 00000000..60a2ccfe
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define USE_AS_STPCPY
++#define USE_AS_STRNCPY
++#define STRCPY __stpncpy_avx2_rtm
++#include "strcpy-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+new file mode 100644
+index 00000000..637fb557
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRCAT
++# define STRCAT __strcat_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strcat-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
+index b0623564..aa48c058 100644
+--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
+@@ -30,7 +30,11 @@
+ /* Number of bytes in a vector register */
+ # define VEC_SIZE	32
+ 
+-	.section .text.avx,"ax",@progbits
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCAT)
+ 	mov	%rdi, %r9
+ # ifdef USE_AS_STRNCAT
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
+new file mode 100644
+index 00000000..81f20d1d
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRCHR
++# define STRCHR __strchr_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strchr-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index 47bc3c99..da7d2620 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -38,9 +38,13 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE 32
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCHR)
+ 	movl	%edi, %ecx
+ 	/* Broadcast CHAR to YMM0.  */
+@@ -93,8 +97,8 @@ L(cros_page_boundary):
+ 	cmp	(%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(aligned_more):
+@@ -190,8 +194,7 @@ L(first_vec_x0):
+ 	cmp	(%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+@@ -205,8 +208,7 @@ L(first_vec_x1):
+ 	cmp	(%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2):
+@@ -220,8 +222,7 @@ L(first_vec_x2):
+ 	cmp	(%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(4x_vec_end):
+@@ -247,8 +248,7 @@ L(first_vec_x3):
+ 	cmp	(%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ END (STRCHR)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
+index be05e197..7e582f02 100644
+--- a/sysdeps/x86_64/multiarch/strchr.c
++++ b/sysdeps/x86_64/multiarch/strchr.c
+@@ -29,6 +29,7 @@
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
+new file mode 100644
+index 00000000..cdcf818b
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define STRCHR __strchrnul_avx2_rtm
++#define USE_AS_STRCHRNUL 1
++#include "strchr-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
+new file mode 100644
+index 00000000..aecd30d9
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRCMP
++# define STRCMP __strcmp_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strcmp-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 8fb8eedc..5d1c9d90 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -55,6 +55,10 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ /* Warning!
+            wcscmp/wcsncmp have to use SIGNED comparison for elements.
+            strcmp/strncmp have to use UNSIGNED comparison for elements.
+@@ -75,7 +79,7 @@
+    the maximum offset is reached before a difference is found, zero is
+    returned.  */
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCMP)
+ # ifdef USE_AS_STRNCMP
+ 	/* Check for simple cases (0 or 1) in offset.  */
+@@ -137,8 +141,8 @@ L(return):
+ 	movzbl	(%rsi, %rdx), %edx
+ 	subl	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(return_vec_size):
+@@ -171,8 +175,7 @@ L(return_vec_size):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(return_2_vec_size):
+@@ -205,8 +208,7 @@ L(return_2_vec_size):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(return_3_vec_size):
+@@ -239,8 +241,7 @@ L(return_3_vec_size):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(next_3_vectors):
+@@ -366,8 +367,7 @@ L(back_to_loop):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(test_vec):
+@@ -410,8 +410,7 @@ L(test_vec):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(test_2_vec):
+@@ -454,8 +453,7 @@ L(test_2_vec):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(test_3_vec):
+@@ -496,8 +494,7 @@ L(test_3_vec):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(loop_cross_page):
+@@ -566,8 +563,7 @@ L(loop_cross_page):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(loop_cross_page_2_vec):
+@@ -641,8 +637,7 @@ L(loop_cross_page_2_vec):
+ 	subl	%edx, %eax
+ #  endif
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_STRNCMP
+ L(string_nbyte_offset_check):
+@@ -684,8 +679,7 @@ L(cross_page_loop):
+ # ifndef USE_AS_WCSCMP
+ L(different):
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_WCSCMP
+ 	.p2align 4
+@@ -695,16 +689,14 @@ L(different):
+ 	setl	%al
+ 	negl	%eax
+ 	orl	$1, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ # endif
+ 
+ # ifdef USE_AS_STRNCMP
+ 	.p2align 4
+ L(zero):
+ 	xorl	%eax, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(char0):
+@@ -718,8 +710,7 @@ L(char0):
+ 	movzbl	(%rdi), %eax
+ 	subl	%ecx, %eax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ # endif
+ 
+ 	.p2align 4
+@@ -744,8 +735,7 @@ L(last_vector):
+ 	movzbl	(%rsi, %rdx), %edx
+ 	subl	%edx, %eax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	/* Comparing on page boundary region requires special treatment:
+ 	   It must done one vector at the time, starting with the wider
+@@ -866,7 +856,6 @@ L(cross_page_4bytes):
+ 	testl	%eax, %eax
+ 	jne	L(cross_page_loop)
+ 	subl	%ecx, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ END (STRCMP)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
+index c5f38510..11bbea2b 100644
+--- a/sysdeps/x86_64/multiarch/strcmp.c
++++ b/sysdeps/x86_64/multiarch/strcmp.c
+@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
+ 	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+new file mode 100644
+index 00000000..c2c581ec
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRCPY
++# define STRCPY __strcpy_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strcpy-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
+index 81677f90..613c59aa 100644
+--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
+@@ -37,6 +37,10 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ /* zero register */
+ #define xmmZ	xmm0
+ #define ymmZ	ymm0
+@@ -46,7 +50,7 @@
+ 
+ # ifndef USE_AS_STRCAT
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCPY)
+ #  ifdef USE_AS_STRNCPY
+ 	mov	%rdx, %r8
+@@ -369,8 +373,8 @@ L(CopyVecSizeExit):
+ 	lea	1(%rdi), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(CopyTwoVecSize1):
+@@ -553,8 +557,7 @@ L(Exit1):
+ 	lea	2(%rdi), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit2):
+@@ -569,8 +572,7 @@ L(Exit2):
+ 	lea	3(%rdi), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit3):
+@@ -584,8 +586,7 @@ L(Exit3):
+ 	lea	4(%rdi), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit4_7):
+@@ -602,8 +603,7 @@ L(Exit4_7):
+ 	lea	1(%rdi, %rdx), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit8_15):
+@@ -620,8 +620,7 @@ L(Exit8_15):
+ 	lea	1(%rdi, %rdx), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit16_31):
+@@ -638,8 +637,7 @@ L(Exit16_31):
+ 	lea 1(%rdi, %rdx), %rdi
+ 	jnz L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Exit32_63):
+@@ -656,8 +654,7 @@ L(Exit32_63):
+ 	lea	1(%rdi, %rdx), %rdi
+ 	jnz	L(StrncpyFillTailWithZero)
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_STRNCPY
+ 
+@@ -671,8 +668,7 @@ L(StrncpyExit1):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, 1(%rdi)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit2):
+@@ -684,8 +680,7 @@ L(StrncpyExit2):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, 2(%rdi)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit3_4):
+@@ -699,8 +694,7 @@ L(StrncpyExit3_4):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi, %r8)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit5_8):
+@@ -714,8 +708,7 @@ L(StrncpyExit5_8):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi, %r8)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit9_16):
+@@ -729,8 +722,7 @@ L(StrncpyExit9_16):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi, %r8)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit17_32):
+@@ -744,8 +736,7 @@ L(StrncpyExit17_32):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi, %r8)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit33_64):
+@@ -760,8 +751,7 @@ L(StrncpyExit33_64):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi, %r8)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(StrncpyExit65):
+@@ -778,50 +768,43 @@ L(StrncpyExit65):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, 65(%rdi)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ #  ifndef USE_AS_STRCAT
+ 
+ 	.p2align 4
+ L(Fill1):
+ 	mov	%dl, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Fill2):
+ 	mov	%dx, (%rdi)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Fill3_4):
+ 	mov	%dx, (%rdi)
+ 	mov     %dx, -2(%rdi, %r8)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Fill5_8):
+ 	mov	%edx, (%rdi)
+ 	mov     %edx, -4(%rdi, %r8)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Fill9_16):
+ 	mov	%rdx, (%rdi)
+ 	mov	%rdx, -8(%rdi, %r8)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(Fill17_32):
+ 	vmovdqu %xmmZ, (%rdi)
+ 	vmovdqu %xmmZ, -16(%rdi, %r8)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(CopyVecSizeUnalignedVec2):
+@@ -898,8 +881,7 @@ L(Fill):
+ 	cmp	$1, %r8d
+ 	ja	L(Fill2)
+ 	je	L(Fill1)
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ /* end of ifndef USE_AS_STRCAT */
+ #  endif
+@@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (VEC_SIZE * 4)(%rdi)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(UnalignedFourVecSizeLeaveCase2):
+@@ -1001,16 +982,14 @@ L(StrncpyExit):
+ #  ifdef USE_AS_STRCAT
+ 	movb	$0, (%rdi)
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(ExitZero):
+ #  ifndef USE_AS_STRCAT
+ 	mov	%rdi, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ # endif
+ 
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
+new file mode 100644
+index 00000000..75b4b761
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRLEN
++# define STRLEN __strlen_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strlen-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index 645e0446..82826e10 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -36,9 +36,13 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE 32
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+ 	/* Check for zero length.  */
+@@ -111,8 +115,8 @@ L(cros_page_boundary):
+ # ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(aligned_more):
+@@ -231,8 +235,7 @@ L(last_4x_vec_or_less):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(last_2x_vec):
+@@ -253,8 +256,7 @@ L(last_2x_vec):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x0_check):
+@@ -267,8 +269,7 @@ L(first_vec_x0_check):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1_check):
+@@ -282,8 +283,7 @@ L(first_vec_x1_check):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2_check):
+@@ -297,8 +297,7 @@ L(first_vec_x2_check):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x3_check):
+@@ -312,8 +311,7 @@ L(first_vec_x3_check):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(max):
+@@ -321,8 +319,7 @@ L(max):
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(zero):
+@@ -338,8 +335,7 @@ L(first_vec_x0):
+ # ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+@@ -350,8 +346,7 @@ L(first_vec_x1):
+ # ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(first_vec_x2):
+@@ -362,8 +357,7 @@ L(first_vec_x2):
+ # ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(4x_vec_end):
+@@ -389,8 +383,7 @@ L(first_vec_x3):
+ # ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ # endif
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ END (STRLEN)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+new file mode 100644
+index 00000000..0dcea18d
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STRNCAT
++#define STRCAT __strncat_avx2_rtm
++#include "strcat-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+new file mode 100644
+index 00000000..37d1224b
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define STRCMP	__strncmp_avx2_rtm
++#define USE_AS_STRNCMP 1
++#include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
+index 4c15542f..44c85116 100644
+--- a/sysdeps/x86_64/multiarch/strncmp.c
++++ b/sysdeps/x86_64/multiarch/strncmp.c
+@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
+ 	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+new file mode 100644
+index 00000000..79e70832
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define USE_AS_STRNCPY
++#define STRCPY __strncpy_avx2_rtm
++#include "strcpy-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
+new file mode 100644
+index 00000000..04f1626a
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define STRLEN __strnlen_avx2_rtm
++#define USE_AS_STRNLEN 1
++
++#include "strlen-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
+new file mode 100644
+index 00000000..5def14ec
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
+@@ -0,0 +1,12 @@
++#ifndef STRRCHR
++# define STRRCHR __strrchr_avx2_rtm
++#endif
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN \
++  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
++
++#define SECTION(p) p##.avx.rtm
++
++#include "strrchr-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+index 4381e6ab..9f22a15e 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+@@ -36,9 +36,13 @@
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.avx
++# endif
++
+ # define VEC_SIZE	32
+ 
+-	.section .text.avx,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRRCHR)
+ 	movd	%esi, %xmm4
+ 	movl	%edi, %ecx
+@@ -166,8 +170,8 @@ L(return_value):
+ # endif
+ 	bsrl	%eax, %eax
+ 	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-	VZEROUPPER
+-	ret
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+ L(match):
+@@ -198,8 +202,7 @@ L(find_nul):
+ 	jz	L(return_value)
+ 	bsrl	%eax, %eax
+ 	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(char_and_nul):
+@@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec):
+ 	jz	L(return_null)
+ 	bsrl	%eax, %eax
+ 	leaq	-VEC_SIZE(%rdi, %rax), %rax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+ L(return_null):
+ 	xorl	%eax, %eax
+-	VZEROUPPER
+-	ret
++	VZEROUPPER_RETURN
+ 
+ END (STRRCHR)
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
+new file mode 100644
+index 00000000..d49dbbf0
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define STRCHR __wcschr_avx2_rtm
++#define USE_AS_WCSCHR 1
++#include "strchr-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
+new file mode 100644
+index 00000000..d6ca2b80
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define STRCMP __wcscmp_avx2_rtm
++#define USE_AS_WCSCMP 1
++
++#include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
+new file mode 100644
+index 00000000..35658d73
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define STRLEN __wcslen_avx2_rtm
++#define USE_AS_WCSLEN 1
++
++#include "strlen-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+new file mode 100644
+index 00000000..4e88c70c
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+@@ -0,0 +1,5 @@
++#define STRCMP __wcsncmp_avx2_rtm
++#define USE_AS_STRNCMP 1
++#define USE_AS_WCSCMP 1
++
++#include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
+new file mode 100644
+index 00000000..7437ebee
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
+@@ -0,0 +1,5 @@
++#define STRLEN __wcsnlen_avx2_rtm
++#define USE_AS_WCSLEN 1
++#define USE_AS_STRNLEN 1
++
++#include "strlen-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
+index 84254b83..20b731ae 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen.c
++++ b/sysdeps/x86_64/multiarch/wcsnlen.c
+@@ -29,6 +29,7 @@
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
+ 	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	return OPTIMIZE (evex);
+ 
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
+       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2);
+     }
+diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
+new file mode 100644
+index 00000000..9bf76083
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
+@@ -0,0 +1,3 @@
++#define STRRCHR __wcsrchr_avx2_rtm
++#define USE_AS_WCSRCHR 1
++#include "strrchr-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
+new file mode 100644
+index 00000000..58ed21db
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
+@@ -0,0 +1,4 @@
++#define MEMCHR __wmemchr_avx2_rtm
++#define USE_AS_WMEMCHR 1
++
++#include "memchr-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
+new file mode 100644
+index 00000000..31104d12
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
+@@ -0,0 +1,4 @@
++#define MEMCMP __wmemcmp_avx2_movbe_rtm
++#define USE_AS_WMEMCMP 1
++
++#include "memcmp-avx2-movbe-rtm.S"
+diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
+index 1738d7f9..223f1a59 100644
+--- a/sysdeps/x86_64/sysdep.h
++++ b/sysdeps/x86_64/sysdep.h
+@@ -95,6 +95,28 @@ lose:									      \
+ #define R14_LP	r14
+ #define R15_LP	r15
+ 
++/* Zero upper vector registers and return with xtest.  NB: Use VZEROALL
++   to avoid RTM abort triggered by VZEROUPPER inside transactionally.  */
++#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
++	xtest;							\
++	jz	1f;						\
++	vzeroall;						\
++	ret;							\
++1:								\
++	vzeroupper;						\
++	ret
++
++/* Zero upper vector registers and return.  */
++#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
++# define ZERO_UPPER_VEC_REGISTERS_RETURN \
++	VZEROUPPER;						\
++	ret
++#endif
++
++#ifndef VZEROUPPER_RETURN
++# define VZEROUPPER_RETURN	VZEROUPPER; ret
++#endif
++
+ #else	/* __ASSEMBLER__ */
+ 
+ /* Long and pointer size in bytes.  */
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-18.patch b/glibc-RHEL-15696-18.patch
new file mode 100644
index 0000000..2cf0e45
--- /dev/null
+++ b/glibc-RHEL-15696-18.patch
@@ -0,0 +1,735 @@
+From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 23 Feb 2021 06:33:10 -0800
+Subject: [PATCH] x86: Add string/memory function tests in RTM region
+Content-type: text/plain; charset=UTF-8
+
+At function exit, AVX optimized string/memory functions have VZEROUPPER
+which triggers RTM abort.  When such functions are called inside a
+transactionally executing RTM region, RTM abort causes severe performance
+degradation.  Add tests to verify that string/memory functions won't
+cause RTM abort in RTM region.
+---
+ sysdeps/x86/Makefile          | 23 +++++++++++
+ sysdeps/x86/tst-memchr-rtm.c  | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memcmp-rtm.c  | 52 +++++++++++++++++++++++++
+ sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memset-rtm.c  | 45 ++++++++++++++++++++++
+ sysdeps/x86/tst-strchr-rtm.c  | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-strcpy-rtm.c  | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-string-rtm.h  | 72 +++++++++++++++++++++++++++++++++++
+ sysdeps/x86/tst-strlen-rtm.c  | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
+ sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
+ 12 files changed, 618 insertions(+)
+ create mode 100644 sysdeps/x86/tst-memchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
+ create mode 100644 sysdeps/x86/tst-memmove-rtm.c
+ create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-memset-rtm.c
+ create mode 100644 sysdeps/x86/tst-strchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
+ create mode 100644 sysdeps/x86/tst-string-rtm.h
+ create mode 100644 sysdeps/x86/tst-strlen-rtm.c
+ create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
+ create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
+
+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+index 59e928e9..5be71ada 100644
+--- a/sysdeps/x86/Makefile
++++ b/sysdeps/x86/Makefile
+@@ -17,6 +17,29 @@ endif
+ 
+ ifeq ($(subdir),string)
+ sysdep_routines += cacheinfo
++
++tests += \
++  tst-memchr-rtm \
++  tst-memcmp-rtm \
++  tst-memmove-rtm \
++  tst-memrchr-rtm \
++  tst-memset-rtm \
++  tst-strchr-rtm \
++  tst-strcpy-rtm \
++  tst-strlen-rtm \
++  tst-strncmp-rtm \
++  tst-strrchr-rtm
++
++CFLAGS-tst-memchr-rtm.c += -mrtm
++CFLAGS-tst-memcmp-rtm.c += -mrtm
++CFLAGS-tst-memmove-rtm.c += -mrtm
++CFLAGS-tst-memrchr-rtm.c += -mrtm
++CFLAGS-tst-memset-rtm.c += -mrtm
++CFLAGS-tst-strchr-rtm.c += -mrtm
++CFLAGS-tst-strcpy-rtm.c += -mrtm
++CFLAGS-tst-strlen-rtm.c += -mrtm
++CFLAGS-tst-strncmp-rtm.c += -mrtm
++CFLAGS-tst-strrchr-rtm.c += -mrtm
+ endif
+ 
+ ifneq ($(enable-cet),no)
+diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
+new file mode 100644
+index 00000000..e4749401
+--- /dev/null
++++ b/sysdeps/x86/tst-memchr-rtm.c
+@@ -0,0 +1,54 @@
++/* Test case for memchr inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  string1[100] = 'c';
++  string1[STRING_SIZE - 100] = 'c';
++  char *p = memchr (string1, 'c', STRING_SIZE);
++  if (p == &string1[100])
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  char *p = memchr (string1, 'c', STRING_SIZE);
++  if (p == &string1[100])
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("memchr", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
+new file mode 100644
+index 00000000..e4c8a623
+--- /dev/null
++++ b/sysdeps/x86/tst-memcmp-rtm.c
+@@ -0,0 +1,52 @@
++/* Test case for memcmp inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++char string2[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  memset (string2, 'a', STRING_SIZE);
++  if (memcmp (string1, string2, STRING_SIZE) == 0)
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  if (memcmp (string1, string2, STRING_SIZE) == 0)
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("memcmp", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
+new file mode 100644
+index 00000000..4bf97ef1
+--- /dev/null
++++ b/sysdeps/x86/tst-memmove-rtm.c
+@@ -0,0 +1,53 @@
++/* Test case for memmove inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++char string2[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  if (memmove (string2, string1, STRING_SIZE) == string2
++      && memcmp (string2, string1, STRING_SIZE) == 0)
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  if (memmove (string2, string1, STRING_SIZE) == string2
++      && memcmp (string2, string1, STRING_SIZE) == 0)
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("memmove", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
+new file mode 100644
+index 00000000..a57a5a8e
+--- /dev/null
++++ b/sysdeps/x86/tst-memrchr-rtm.c
+@@ -0,0 +1,54 @@
++/* Test case for memrchr inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  string1[100] = 'c';
++  string1[STRING_SIZE - 100] = 'c';
++  char *p = memrchr (string1, 'c', STRING_SIZE);
++  if (p == &string1[STRING_SIZE - 100])
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  char *p = memrchr (string1, 'c', STRING_SIZE);
++  if (p == &string1[STRING_SIZE - 100])
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("memrchr", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
+new file mode 100644
+index 00000000..bf343a4d
+--- /dev/null
++++ b/sysdeps/x86/tst-memset-rtm.c
+@@ -0,0 +1,45 @@
++/* Test case for memset inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  return EXIT_SUCCESS;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  memset (string1, 'a', STRING_SIZE);
++  return 0;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("memset", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
+new file mode 100644
+index 00000000..a82e29c0
+--- /dev/null
++++ b/sysdeps/x86/tst-strchr-rtm.c
+@@ -0,0 +1,54 @@
++/* Test case for strchr inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE - 1);
++  string1[100] = 'c';
++  string1[STRING_SIZE - 100] = 'c';
++  char *p = strchr (string1, 'c');
++  if (p == &string1[100])
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  char *p = strchr (string1, 'c');
++  if (p == &string1[100])
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("strchr", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
+new file mode 100644
+index 00000000..2b2a583f
+--- /dev/null
++++ b/sysdeps/x86/tst-strcpy-rtm.c
+@@ -0,0 +1,53 @@
++/* Test case for strcpy inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++char string2[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE - 1);
++  if (strcpy (string2, string1) == string2
++      && strcmp (string2, string1) == 0)
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  if (strcpy (string2, string1) == string2
++      && strcmp (string2, string1) == 0)
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("strcpy", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
+new file mode 100644
+index 00000000..d2470afa
+--- /dev/null
++++ b/sysdeps/x86/tst-string-rtm.h
+@@ -0,0 +1,72 @@
++/* Test string function in a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <string.h>
++#include <x86intrin.h>
++#include <sys/platform/x86.h>
++#include <support/check.h>
++#include <support/test-driver.h>
++
++static int
++do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
++	   int (*function) (void))
++{
++  if (!CPU_FEATURE_USABLE (RTM))
++    return EXIT_UNSUPPORTED;
++
++  int status = prepare ();
++  if (status != EXIT_SUCCESS)
++    return status;
++
++  unsigned int i;
++  unsigned int naborts = 0;
++  unsigned int failed = 0;
++  for (i = 0; i < loop; i++)
++    {
++      failed |= function ();
++      if (_xbegin() == _XBEGIN_STARTED)
++	{
++	  failed |= function ();
++	  _xend();
++	}
++      else
++	{
++	  failed |= function ();
++	  ++naborts;
++	}
++    }
++
++  if (failed)
++    FAIL_EXIT1 ("%s() failed", name);
++
++  if (naborts)
++    {
++      /* NB: Low single digit (<= 5%) noise-level aborts are normal for
++	 TSX.  */
++      double rate = 100 * ((double) naborts) / ((double) loop);
++      if (rate > 5)
++	FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
++		    rate, naborts, loop);
++    }
++
++  return EXIT_SUCCESS;
++}
++
++static int do_test (void);
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
+new file mode 100644
+index 00000000..0dcf14db
+--- /dev/null
++++ b/sysdeps/x86/tst-strlen-rtm.c
+@@ -0,0 +1,53 @@
++/* Test case for strlen inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE - 1);
++  string1[STRING_SIZE - 100] = '\0';
++  size_t len = strlen (string1);
++  if (len == STRING_SIZE - 100)
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  size_t len = strlen (string1);
++  if (len == STRING_SIZE - 100)
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("strlen", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+new file mode 100644
+index 00000000..236ad951
+--- /dev/null
++++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -0,0 +1,52 @@
++/* Test case for strncmp inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++char string2[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE - 1);
++  memset (string2, 'a', STRING_SIZE - 1);
++  if (strncmp (string1, string2, STRING_SIZE) == 0)
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  if (strncmp (string1, string2, STRING_SIZE) == 0)
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("strncmp", LOOP, prepare, function);
++}
+diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
+new file mode 100644
+index 00000000..e32bfaf5
+--- /dev/null
++++ b/sysdeps/x86/tst-strrchr-rtm.c
+@@ -0,0 +1,53 @@
++/* Test case for strrchr inside a transactionally executing RTM region.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <tst-string-rtm.h>
++
++#define LOOP 3000
++#define STRING_SIZE 1024
++char string1[STRING_SIZE];
++
++__attribute__ ((noinline, noclone))
++static int
++prepare (void)
++{
++  memset (string1, 'a', STRING_SIZE - 1);
++  string1[STRING_SIZE - 100] = 'c';
++  char *p = strrchr (string1, 'c');
++  if (p == &string1[STRING_SIZE - 100])
++    return EXIT_SUCCESS;
++  else
++    return EXIT_FAILURE;
++}
++
++__attribute__ ((noinline, noclone))
++static int
++function (void)
++{
++  char *p = strrchr (string1, 'c');
++  if (p == &string1[STRING_SIZE - 100])
++    return 0;
++  else
++    return 1;
++}
++
++static int
++do_test (void)
++{
++  return do_test_1 ("strrchr", LOOP, prepare, function);
++}
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-19.patch b/glibc-RHEL-15696-19.patch
new file mode 100644
index 0000000..0500875
--- /dev/null
+++ b/glibc-RHEL-15696-19.patch
@@ -0,0 +1,148 @@
+From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sun, 7 Mar 2021 09:44:18 -0800
+Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memset.h/ifunc-wmemset.h to select, when AVX512VL and
+AVX512BW are usable, the functions optimized with AVX512 instructions
+using ZMM16-ZMM31 registers.  These avoid RTM abort since VZEROUPPER
+isn't needed at function exit.
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c       | 14 +++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-memset.h          | 13 ++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-wmemset.h         | 12 ++++++------
+ .../multiarch/memset-avx512-unaligned-erms.S     | 16 ++++++++--------
+ 4 files changed, 31 insertions(+), 24 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index c1efeec0..d969a156 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __wmemset_avx512_unaligned))
+ 
+ #ifdef SHARED
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 6f3375cc..19795938 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_no_vzeroupper);
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx512_unaligned_erms);
+ 
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx512_unaligned_erms);
++	  return OPTIMIZE (avx512_unaligned);
++	}
+ 
+-      return OPTIMIZE (avx512_unaligned);
++      return OPTIMIZE (avx512_no_vzeroupper);
+     }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+index bdc94c6c..98c5d406 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_unaligned);
+-
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+-	return OPTIMIZE (evex_unaligned);
++	{
++	  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
++	    return OPTIMIZE (avx512_unaligned);
++
++	  return OPTIMIZE (evex_unaligned);
++	}
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ 	return OPTIMIZE (avx2_unaligned_rtm);
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 0783979c..22e7b187 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -1,22 +1,22 @@
+ #if IS_IN (libc)
+ # define VEC_SIZE	64
+-# define VEC(i)		zmm##i
++# define XMM0		xmm16
++# define YMM0		ymm16
++# define VEC0		zmm16
++# define VEC(i)		VEC##i
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
++# define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+   movq r, %rax; \
+-  vpbroadcastb %xmm0, %xmm0; \
+-  vpbroadcastq %xmm0, %zmm0
++  vpbroadcastb d, %VEC0
+ 
+ # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+   movq r, %rax; \
+-  vpbroadcastd %xmm0, %xmm0; \
+-  vpbroadcastq %xmm0, %zmm0
++  vpbroadcastd d, %VEC0
+ 
+-# define SECTION(p)		p##.avx512
++# define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
+ 
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-2.patch b/glibc-RHEL-15696-2.patch
new file mode 100644
index 0000000..54f3ac3
--- /dev/null
+++ b/glibc-RHEL-15696-2.patch
@@ -0,0 +1,230 @@
+From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:25:56 -0800
+Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memcmp/wmemcmp for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
+	length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
+	* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
+	tst-size_t-wmemcmp.
+	* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
+---
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S |  7 +-
+ sysdeps/x86_64/multiarch/memcmp-sse4.S       |  9 ++-
+ sysdeps/x86_64/multiarch/memcmp-ssse3.S      |  7 +-
+ sysdeps/x86_64/x32/Makefile                  |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-memcmp.c       | 76 ++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemcmp.c      | 20 ++++++
+ 6 files changed, 114 insertions(+), 9 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 30f764c3..e3a35b89 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -58,9 +58,12 @@
+ 	.section .text.avx,"ax",@progbits
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
++	shl	$2, %RDX_LP
++# elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
+ # endif
+-	cmpq	$VEC_SIZE, %rdx
++	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+index 8e164f2c..302900f5 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+@@ -42,13 +42,16 @@
+ 	.section .text.sse4.1,"ax",@progbits
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
++	shl	$2, %RDX_LP
++# elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
+ # endif
+ 	pxor	%xmm0, %xmm0
+-	cmp	$79, %rdx
++	cmp	$79, %RDX_LP
+ 	ja	L(79bytesormore)
+ # ifndef USE_AS_WMEMCMP
+-	cmp	$1, %rdx
++	cmp	$1, %RDX_LP
+ 	je	L(firstbyte)
+ # endif
+ 	add	%rdx, %rsi
+diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+index 6f76c641..69d030fc 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
++++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+@@ -33,9 +33,12 @@
+ 	atom_text_section
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
+-	test	%rdx, %rdx
++	shl	$2, %RDX_LP
++	test	%RDX_LP, %RDX_LP
+ 	jz	L(equal)
++# elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
+ # endif
+ 	mov	%rdx, %rcx
+ 	mov	%rdi, %rdx
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 7d528889..ddec7f04 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr
++tests += tst-size_t-memchr tst-size_t-memcmp
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr
++tests += tst-size_t-wmemchr tst-size_t-wmemcmp
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
+new file mode 100644
+index 00000000..9bd6fdb4
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
+@@ -0,0 +1,76 @@
++/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_MAIN
++#ifdef WIDE
++# define TEST_NAME "wmemcmp"
++#else
++# define TEST_NAME "memcmp"
++#endif
++
++#include "test-size_t.h"
++
++#ifdef WIDE
++# include <inttypes.h>
++# include <wchar.h>
++
++# define MEMCMP wmemcmp
++# define CHAR wchar_t
++#else
++# define MEMCMP memcmp
++# define CHAR char
++#endif
++
++IMPL (MEMCMP, 1)
++
++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
++
++static int
++__attribute__ ((noinline, noclone))
++do_memcmp (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  memcpy (buf1, buf2, page_size);
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      int res = do_memcmp (dest, src);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %i != 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+new file mode 100644
+index 00000000..e8b5ffd0
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+@@ -0,0 +1,20 @@
++/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include "tst-size_t-memcmp.c"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-20.patch b/glibc-RHEL-15696-20.patch
new file mode 100644
index 0000000..c63b3fb
--- /dev/null
+++ b/glibc-RHEL-15696-20.patch
@@ -0,0 +1,164 @@
+From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sun, 7 Mar 2021 09:45:23 -0800
+Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memmove.h to select the function optimized with AVX512
+instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
+AVX512VL since VZEROUPPER isn't needed at function exit.
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 24 +++++++++---------
+ sysdeps/x86_64/multiarch/ifunc-memmove.h      | 12 +++++----
+ .../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
+ 3 files changed, 42 insertions(+), 19 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index d969a156..fec384f6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
+ 			      __memmove_ssse3_back)
+@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memcpy_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memcpy_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __mempcpy_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __mempcpy_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+index fa09b9fb..014e95c7 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_no_vzeroupper);
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE (avx512_unaligned_erms);
+ 
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx512_unaligned_erms);
++	  return OPTIMIZE (avx512_unaligned);
++	}
+ 
+-      return OPTIMIZE (avx512_unaligned);
++      return OPTIMIZE (avx512_no_vzeroupper);
+     }
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+index aac1515c..848848ab 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+@@ -1,11 +1,32 @@
+ #if IS_IN (libc)
+ # define VEC_SIZE	64
+-# define VEC(i)		zmm##i
++# define XMM0		xmm16
++# define XMM1		xmm17
++# define YMM0		ymm16
++# define YMM1		ymm17
++# define VEC0		zmm16
++# define VEC1		zmm17
++# define VEC2		zmm18
++# define VEC3		zmm19
++# define VEC4		zmm20
++# define VEC5		zmm21
++# define VEC6		zmm22
++# define VEC7		zmm23
++# define VEC8		zmm24
++# define VEC9		zmm25
++# define VEC10		zmm26
++# define VEC11		zmm27
++# define VEC12		zmm28
++# define VEC13		zmm29
++# define VEC14		zmm30
++# define VEC15		zmm31
++# define VEC(i)		VEC##i
+ # define VMOVNT		vmovntdq
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
++# define VZEROUPPER
+ 
+-# define SECTION(p)		p##.avx512
++# define SECTION(p)		p##.evex512
+ # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
+ 
+ # include "memmove-vec-unaligned-erms.S"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-21.patch b/glibc-RHEL-15696-21.patch
new file mode 100644
index 0000000..319c08d
--- /dev/null
+++ b/glibc-RHEL-15696-21.patch
@@ -0,0 +1,71 @@
+From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
+From: Sunil K Pandey <skpgkp2@gmail.com>
+Date: Thu, 1 Apr 2021 15:47:04 -0700
+Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
+Content-type: text/plain; charset=UTF-8
+
+Fix some indentations of ifdef in file strlen-evex.S which are off by 1
+and confusing to read.
+---
+ sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
+index cd022509..05838190 100644
+--- a/sysdeps/x86_64/multiarch/strlen-evex.S
++++ b/sysdeps/x86_64/multiarch/strlen-evex.S
+@@ -276,10 +276,10 @@ L(last_2x_vec):
+ 	.p2align 4
+ L(first_vec_x0_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
++#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -293,10 +293,10 @@ L(first_vec_x0_check):
+ 	.p2align 4
+ L(first_vec_x1_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
++#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -311,10 +311,10 @@ L(first_vec_x1_check):
+ 	.p2align 4
+ L(first_vec_x2_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
++#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -329,10 +329,10 @@ L(first_vec_x2_check):
+ 	.p2align 4
+ L(first_vec_x3_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
++#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-22.patch b/glibc-RHEL-15696-22.patch
new file mode 100644
index 0000000..c20557b
--- /dev/null
+++ b/glibc-RHEL-15696-22.patch
@@ -0,0 +1,51 @@
+From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 19 Apr 2021 07:07:21 -0700
+Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
+Content-type: text/plain; charset=UTF-8
+
+Since __strlen_evex and __strnlen_evex added by
+
+commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Fri Mar 5 06:24:52 2021 -0800
+
+    x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
+
+use sarx:
+
+c4 e2 6a f7 c0       	sarx   %edx,%eax,%eax
+
+require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
+ifunc-avx2.h already requires BMI2 for EVEX implementation.
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index fec384f6..cbfc1a5d 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __strlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+ 
+@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __strnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strnlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+ 
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-23.patch b/glibc-RHEL-15696-23.patch
new file mode 100644
index 0000000..ffde3d7
--- /dev/null
+++ b/glibc-RHEL-15696-23.patch
@@ -0,0 +1,584 @@
+From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 3 May 2021 03:01:58 -0400
+Subject: [PATCH] x86: Optimize memchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memchr-avx2.S. The optimizations include
+replacing some branches with cmovcc, avoiding some branches entirely
+in the less_4x_vec case, making the page cross logic less strict,
+and saving a few instructions in the loop's return path. test-memchr,
+test-rawmemchr, and test-wmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
+ 1 file changed, 247 insertions(+), 178 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index cf893e77..b377f22e 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -26,8 +26,22 @@
+ 
+ # ifdef USE_AS_WMEMCHR
+ #  define VPCMPEQ	vpcmpeqd
++#  define VPBROADCAST	vpbroadcastd
++#  define CHAR_SIZE	4
+ # else
+ #  define VPCMPEQ	vpcmpeqb
++#  define VPBROADCAST	vpbroadcastb
++#  define CHAR_SIZE	1
++# endif
++
++# ifdef USE_AS_RAWMEMCHR
++#  define ERAW_PTR_REG	ecx
++#  define RRAW_PTR_REG	rcx
++#  define ALGN_PTR_REG	rdi
++# else
++#  define ERAW_PTR_REG	edi
++#  define RRAW_PTR_REG	rdi
++#  define ALGN_PTR_REG	rcx
+ # endif
+ 
+ # ifndef VZEROUPPER
+@@ -39,6 +53,7 @@
+ # endif
+ 
+ # define VEC_SIZE 32
++# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(null)
+ # endif
+-	movl	%edi, %ecx
+-	/* Broadcast CHAR to YMM0.  */
+-	vmovd	%esi, %xmm0
+ # ifdef USE_AS_WMEMCHR
+ 	shl	$2, %RDX_LP
+-	vpbroadcastd %xmm0, %ymm0
+ # else
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+ #  endif
+-	vpbroadcastb %xmm0, %ymm0
+ # endif
++	/* Broadcast CHAR to YMMMATCH.  */
++	vmovd	%esi, %xmm0
++	VPBROADCAST %xmm0, %ymm0
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
++	movl	%edi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
++	VPCMPEQ	(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
+ # ifndef USE_AS_RAWMEMCHR
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rdx
+-	jbe	L(zero)
+-# else
+-	jnz	L(first_vec_x0)
++	/* If length < CHAR_PER_VEC handle special.  */
++	cmpq	$VEC_SIZE, %rdx
++	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
++	.p2align 5
++L(first_vec_x0):
++	/* Check if first match was before length.  */
++	tzcntl	%eax, %eax
++	xorl	%ecx, %ecx
++	cmpl	%eax, %edx
++	leaq	(%rdi, %rax), %rax
++	cmovle	%rcx, %rax
++	VZEROUPPER_RETURN
+ 
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
++L(null):
++	xorl	%eax, %eax
++	ret
+ # endif
+-	jmp	L(more_4x_vec)
+-
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
++L(cross_page_boundary):
++	/* Save pointer before aligning as its original value is necessary
++	   for computer return address if byte is found or adjusting length
++	   if it is not and this is memchr.  */
++	movq	%rdi, %rcx
++	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
++	   rdi for rawmemchr.  */
++	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
++	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
++# ifndef USE_AS_RAWMEMCHR
++	/* Calculate length until end of page (length checked for a
++	   match).  */
++	leaq	1(%ALGN_PTR_REG), %rsi
++	subq	%RRAW_PTR_REG, %rsi
++# endif
+ 	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	tzcntl	%eax, %eax
++	sarxl	%ERAW_PTR_REG, %eax, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
++	cmpq	%rsi, %rdx
++	jbe	L(first_vec_x0)
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
++	testl	%eax, %eax
++	jz	L(cross_page_continue)
++	tzcntl	%eax, %eax
++	addq	%RRAW_PTR_REG, %rax
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+-L(aligned_more):
+-# ifndef USE_AS_RAWMEMCHR
+-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+-	   overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++	incq	%rdi
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+-	/* Check the end of data.  */
+-	subq	%rcx, %rdx
+-	jbe	L(zero)
+-# endif
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++	addq	$(VEC_SIZE + 1), %rdi
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++	addq	$(VEC_SIZE * 2 + 1), %rdi
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+-	addq	$VEC_SIZE, %rdi
+ 
+-# ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
++	.p2align 4
++L(first_vec_x4):
++	tzcntl	%eax, %eax
++	addq	$(VEC_SIZE * 3 + 1), %rdi
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+-L(more_4x_vec):
++	.p2align 4
++L(aligned_more):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
++# ifndef USE_AS_RAWMEMCHR
++L(cross_page_continue):
++	/* Align data to VEC_SIZE - 1.  */
++	xorl	%ecx, %ecx
++	subl	%edi, %ecx
++	orq	$(VEC_SIZE - 1), %rdi
++	/* esi is for adjusting length to see if near the end.  */
++	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
++# else
++	orq	$(VEC_SIZE - 1), %rdi
++L(cross_page_continue):
++# endif
++	/* Load first VEC regardless.  */
++	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
++# ifndef USE_AS_RAWMEMCHR
++	/* Adjust length. If near end handle specially.  */
++	subq	%rsi, %rdx
++	jbe	L(last_4x_vec_or_less)
++# endif
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
++	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
++	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x4)
+ 
+ # ifndef USE_AS_RAWMEMCHR
++	/* Check if at last VEC_SIZE * 4 length.  */
+ 	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+-
+-# ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
++	jbe	L(last_4x_vec_or_less_cmpeq)
++	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
++	   length.  */
++	incq	%rdi
++	movl	%edi, %ecx
++	orq	$(VEC_SIZE * 4 - 1), %rdi
++	andl	$(VEC_SIZE * 4 - 1), %ecx
+ 	addq	%rcx, %rdx
++# else
++	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
++	incq	%rdi
++	orq	$(VEC_SIZE * 4 - 1), %rdi
+ # endif
+ 
++	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+-
++	VPCMPEQ	1(%rdi), %ymm0, %ymm1
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
++	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
++	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
+ 	vpor	%ymm1, %ymm2, %ymm5
+ 	vpor	%ymm3, %ymm4, %ymm6
+ 	vpor	%ymm5, %ymm6, %ymm5
+ 
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
++	vpmovmskb %ymm5, %ecx
+ # ifdef USE_AS_RAWMEMCHR
+-	jmp	L(loop_4x_vec)
++	subq	$-(VEC_SIZE * 4), %rdi
++	testl	%ecx, %ecx
++	jz	L(loop_4x_vec)
+ # else
+-	subq	$(VEC_SIZE * 4), %rdx
+-	ja	L(loop_4x_vec)
++	testl	%ecx, %ecx
++	jnz	L(loop_4x_vec_end)
+ 
+-L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %edx
+-	jle	L(last_2x_vec)
++	subq	$-(VEC_SIZE * 4), %rdi
+ 
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	subq	$(VEC_SIZE * 4), %rdx
++	ja	L(loop_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
++	/* Fall through into less than 4 remaining vectors of length case.
++	 */
++	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
++	.p2align 4
++L(last_4x_vec_or_less):
++	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
++	jnz	L(first_vec_x1_check)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
++	/* If remaining length > VEC_SIZE * 2.  */
++	addl	$(VEC_SIZE * 2), %edx
++	jg	L(last_4x_vec)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
++L(last_2x_vec):
++	/* If remaining length < VEC_SIZE.  */
++	addl	$VEC_SIZE, %edx
++	jle	L(zero_end)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
++	/* Check VEC2 and compare any match with remaining length.  */
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
+-	jnz	L(first_vec_x3_check)
+-	xorl	%eax, %eax
++	tzcntl	%eax, %eax
++	cmpl	%eax, %edx
++	jbe	L(set_zero_end)
++	addq	$(VEC_SIZE + 1), %rdi
++	addq	%rdi, %rax
++L(zero_end):
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %edx
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
++L(loop_4x_vec_end):
++# endif
++	/* rawmemchr will fall through into this if match was found in
++	   loop.  */
++
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
++	jnz	L(last_vec_x1_return)
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
+-
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	xorl	%eax, %eax
+-	VZEROUPPER_RETURN
++	jnz	L(last_vec_x2_return)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
+-	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
++	vpmovmskb %ymm3, %eax
++	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
++	salq	$32, %rcx
++	orq	%rcx, %rax
++	tzcntq	%rax, %rax
++# ifdef USE_AS_RAWMEMCHR
++	subq	$(VEC_SIZE * 2 - 1), %rdi
++# else
++	subq	$-(VEC_SIZE * 2 + 1), %rdi
++# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
++# ifndef USE_AS_RAWMEMCHR
+ 
+ 	.p2align 4
+ L(first_vec_x1_check):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$VEC_SIZE, %rax
++	/* Adjust length.  */
++	subl	$-(VEC_SIZE * 4), %edx
++	/* Check if match within remaining length.  */
++	cmpl	%eax, %edx
++	jbe	L(set_zero_end)
++	incq	%rdi
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
++	.p2align 4
++L(set_zero_end):
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
++# endif
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
++L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 2), %rax
++# ifdef USE_AS_RAWMEMCHR
++	subq	$(VEC_SIZE * 4 - 1), %rdi
++# else
++	incq	%rdi
++# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
++L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 3), %rax
++# ifdef USE_AS_RAWMEMCHR
++	subq	$(VEC_SIZE * 3 - 1), %rdi
++# else
++	subq	$-(VEC_SIZE + 1), %rdi
++# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
++# ifndef USE_AS_RAWMEMCHR
+ 	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	jmp     L(return_vzeroupper)
++L(last_4x_vec_or_less_cmpeq):
++	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb %ymm1, %eax
++	subq	$-(VEC_SIZE * 4), %rdi
++	/* Check first VEC regardless.  */
++	testl	%eax, %eax
++	jnz	L(first_vec_x1_check)
+ 
++	/* If remaining length <= CHAR_PER_VEC * 2.  */
++	addl	$(VEC_SIZE * 2), %edx
++	jle	L(last_2x_vec)
+ 	.p2align 4
+-L(null):
+-	xorl	%eax, %eax
+-	ret
+-# endif
++L(last_4x_vec):
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2_return)
+ 
+-	.p2align 4
+-L(first_vec_x0):
+-	tzcntl	%eax, %eax
+-	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
++	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb %ymm1, %eax
+ 
+-	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %eax
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
++	/* Create mask for possible matches within remaining length.  */
++	movq	$-1, %rcx
++	bzhiq	%rdx, %rcx, %rcx
+ 
+-	.p2align 4
+-L(first_vec_x2):
++	/* Test matches in data against length match.  */
++	andl	%ecx, %eax
++	jnz	L(last_vec_x3)
++
++	/* if remaining length <= VEC_SIZE * 3 (Note this is after
++	   remaining length was found to be > VEC_SIZE * 2.  */
++	subl	$VEC_SIZE, %edx
++	jbe	L(zero_end2)
++
++	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb %ymm1, %eax
++	/* Shift remaining length mask for last VEC.  */
++	shrq	$32, %rcx
++	andl	%ecx, %eax
++	jz	L(zero_end2)
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 2), %rax
++	addq	$(VEC_SIZE * 3 + 1), %rdi
+ 	addq	%rdi, %rax
++L(zero_end2):
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(4x_vec_end):
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	vpmovmskb %ymm2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	vpmovmskb %ymm4, %eax
+-	testl	%eax, %eax
+-L(first_vec_x3):
++L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 3), %rax
++	subq	$-(VEC_SIZE * 2 + 1), %rdi
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
++# endif
+ 
+ END (MEMCHR)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-24.patch b/glibc-RHEL-15696-24.patch
new file mode 100644
index 0000000..c4f24ff
--- /dev/null
+++ b/glibc-RHEL-15696-24.patch
@@ -0,0 +1,388 @@
+From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 9 Jun 2021 16:25:32 -0400
+Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
+ #27974]
+Content-type: text/plain; charset=UTF-8
+
+This commit fixes the bug mentioned in the previous commit.
+
+The previous implementations of wmemchr in these files relied
+on n * sizeof(wchar_t) which was not guaranteed by the standard.
+
+The new overflow tests added in the previous commit now
+pass (As well as all the other tests).
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
+ sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
+ 2 files changed, 98 insertions(+), 37 deletions(-)
+
+diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
+index cb320257..24f9a0c5 100644
+--- a/sysdeps/x86_64/memchr.S
++++ b/sysdeps/x86_64/memchr.S
+@@ -21,9 +21,11 @@
+ #ifdef USE_AS_WMEMCHR
+ # define MEMCHR		wmemchr
+ # define PCMPEQ		pcmpeqd
++# define CHAR_PER_VEC	4
+ #else
+ # define MEMCHR		memchr
+ # define PCMPEQ		pcmpeqb
++# define CHAR_PER_VEC	16
+ #endif
+ 
+ /* fast SSE2 version with using pmaxub and 64 byte loop */
+@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
+ 	movd	%esi, %xmm1
+ 	mov	%edi, %ecx
+ 
++#ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++#endif
+ #ifdef USE_AS_WMEMCHR
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+-	shl	$2, %RDX_LP
+ #else
+-# ifdef __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%edx, %edx
+-# endif
+ 	punpcklbw %xmm1, %xmm1
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
+ 	test	%eax, %eax
+ 
+ 	jnz	L(matches_1)
+-	sub	$16, %rdx
++	sub	$CHAR_PER_VEC, %rdx
+ 	jbe	L(return_null)
+ 	add	$16, %rdi
+ 	and	$15, %ecx
+ 	and	$-16, %rdi
++#ifdef USE_AS_WMEMCHR
++	shr	$2, %ecx
++#endif
+ 	add	%rcx, %rdx
+-	sub	$64, %rdx
++	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 	jmp	L(loop_prolog)
+ 
+@@ -77,16 +81,21 @@ L(crosscache):
+ 	movdqa	(%rdi), %xmm0
+ 
+ 	PCMPEQ	%xmm1, %xmm0
+-/* Check if there is a match.  */
++	/* Check if there is a match.  */
+ 	pmovmskb %xmm0, %eax
+-/* Remove the leading bytes.  */
++	/* Remove the leading bytes.  */
+ 	sar	%cl, %eax
+ 	test	%eax, %eax
+ 	je	L(unaligned_no_match)
+-/* Check which byte is a match.  */
++	/* Check which byte is a match.  */
+ 	bsf	%eax, %eax
+-
++#ifdef USE_AS_WMEMCHR
++	mov	%eax, %esi
++	shr	$2, %esi
++	sub	%rsi, %rdx
++#else
+ 	sub	%rax, %rdx
++#endif
+ 	jbe	L(return_null)
+ 	add	%rdi, %rax
+ 	add	%rcx, %rax
+@@ -94,15 +103,18 @@ L(crosscache):
+ 
+ 	.p2align 4
+ L(unaligned_no_match):
+-        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
++	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+ 	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
+ 	   possible addition overflow.  */
+ 	neg	%rcx
+ 	add	$16, %rcx
++#ifdef USE_AS_WMEMCHR
++	shr	$2, %ecx
++#endif
+ 	sub	%rcx, %rdx
+ 	jbe	L(return_null)
+ 	add	$16, %rdi
+-	sub	$64, %rdx
++	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 
+ 	.p2align 4
+@@ -135,7 +147,7 @@ L(loop_prolog):
+ 	test	$0x3f, %rdi
+ 	jz	L(align64_loop)
+ 
+-	sub	$64, %rdx
++	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 
+ 	movdqa	(%rdi), %xmm0
+@@ -167,11 +179,14 @@ L(loop_prolog):
+ 	mov	%rdi, %rcx
+ 	and	$-64, %rdi
+ 	and	$63, %ecx
++#ifdef USE_AS_WMEMCHR
++	shr	$2, %ecx
++#endif
+ 	add	%rcx, %rdx
+ 
+ 	.p2align 4
+ L(align64_loop):
+-	sub	$64, %rdx
++	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 	movdqa	(%rdi), %xmm0
+ 	movdqa	16(%rdi), %xmm2
+@@ -218,7 +233,7 @@ L(align64_loop):
+ 
+ 	.p2align 4
+ L(exit_loop):
+-	add	$32, %edx
++	add	$(CHAR_PER_VEC * 2), %edx
+ 	jle	L(exit_loop_32)
+ 
+ 	movdqa	(%rdi), %xmm0
+@@ -238,7 +253,7 @@ L(exit_loop):
+ 	pmovmskb %xmm3, %eax
+ 	test	%eax, %eax
+ 	jnz	L(matches32_1)
+-	sub	$16, %edx
++	sub	$CHAR_PER_VEC, %edx
+ 	jle	L(return_null)
+ 
+ 	PCMPEQ	48(%rdi), %xmm1
+@@ -250,13 +265,13 @@ L(exit_loop):
+ 
+ 	.p2align 4
+ L(exit_loop_32):
+-	add	$32, %edx
++	add	$(CHAR_PER_VEC * 2), %edx
+ 	movdqa	(%rdi), %xmm0
+ 	PCMPEQ	%xmm1, %xmm0
+ 	pmovmskb %xmm0, %eax
+ 	test	%eax, %eax
+ 	jnz	L(matches_1)
+-	sub	$16, %edx
++	sub	$CHAR_PER_VEC, %edx
+ 	jbe	L(return_null)
+ 
+ 	PCMPEQ	16(%rdi), %xmm1
+@@ -293,7 +308,13 @@ L(matches32):
+ 	.p2align 4
+ L(matches_1):
+ 	bsf	%eax, %eax
++#ifdef USE_AS_WMEMCHR
++	mov	%eax, %esi
++	shr	$2, %esi
++	sub	%rsi, %rdx
++#else
+ 	sub	%rax, %rdx
++#endif
+ 	jbe	L(return_null)
+ 	add	%rdi, %rax
+ 	ret
+@@ -301,7 +322,13 @@ L(matches_1):
+ 	.p2align 4
+ L(matches16_1):
+ 	bsf	%eax, %eax
++#ifdef USE_AS_WMEMCHR
++	mov	%eax, %esi
++	shr	$2, %esi
++	sub	%rsi, %rdx
++#else
+ 	sub	%rax, %rdx
++#endif
+ 	jbe	L(return_null)
+ 	lea	16(%rdi, %rax), %rax
+ 	ret
+@@ -309,7 +336,13 @@ L(matches16_1):
+ 	.p2align 4
+ L(matches32_1):
+ 	bsf	%eax, %eax
++#ifdef USE_AS_WMEMCHR
++	mov	%eax, %esi
++	shr	$2, %esi
++	sub	%rsi, %rdx
++#else
+ 	sub	%rax, %rdx
++#endif
+ 	jbe	L(return_null)
+ 	lea	32(%rdi, %rax), %rax
+ 	ret
+@@ -317,7 +350,13 @@ L(matches32_1):
+ 	.p2align 4
+ L(matches48_1):
+ 	bsf	%eax, %eax
++#ifdef USE_AS_WMEMCHR
++	mov	%eax, %esi
++	shr	$2, %esi
++	sub	%rsi, %rdx
++#else
+ 	sub	%rax, %rdx
++#endif
+ 	jbe	L(return_null)
+ 	lea	48(%rdi, %rax), %rax
+ 	ret
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index b377f22e..16027abb 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -54,21 +54,19 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
++# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+-	test	%RDX_LP, %RDX_LP
+-	jz	L(null)
+-# endif
+-# ifdef USE_AS_WMEMCHR
+-	shl	$2, %RDX_LP
+-# else
+ #  ifdef __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%edx, %edx
++	/* Clear upper bits.  */
++	and	%RDX_LP, %RDX_LP
++#  else
++	test	%RDX_LP, %RDX_LP
+ #  endif
++	jz	L(null)
+ # endif
+ 	/* Broadcast CHAR to YMMMATCH.  */
+ 	vmovd	%esi, %xmm0
+@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
+ 	vpmovmskb %ymm1, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* If length < CHAR_PER_VEC handle special.  */
+-	cmpq	$VEC_SIZE, %rdx
++	cmpq	$CHAR_PER_VEC, %rdx
+ 	jbe	L(first_vec_x0)
+ # endif
+ 	testl	%eax, %eax
+@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
+ L(first_vec_x0):
+ 	/* Check if first match was before length.  */
+ 	tzcntl	%eax, %eax
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %edx
++#  endif
+ 	xorl	%ecx, %ecx
+ 	cmpl	%eax, %edx
+ 	leaq	(%rdi, %rax), %rax
+@@ -110,12 +112,12 @@ L(null):
+ # endif
+ 	.p2align 4
+ L(cross_page_boundary):
+-	/* Save pointer before aligning as its original value is necessary
+-	   for computer return address if byte is found or adjusting length
+-	   if it is not and this is memchr.  */
++	/* Save pointer before aligning as its original value is
++	   necessary for computer return address if byte is found or
++	   adjusting length if it is not and this is memchr.  */
+ 	movq	%rdi, %rcx
+-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+-	   rdi for rawmemchr.  */
++	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
++	   and rdi for rawmemchr.  */
+ 	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+ 	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+@@ -124,6 +126,10 @@ L(cross_page_boundary):
+ 	   match).  */
+ 	leaq	1(%ALGN_PTR_REG), %rsi
+ 	subq	%RRAW_PTR_REG, %rsi
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
++	shrl	$2, %esi
++#  endif
+ # endif
+ 	/* Remove the leading bytes.  */
+ 	sarxl	%ERAW_PTR_REG, %eax, %eax
+@@ -181,6 +187,10 @@ L(cross_page_continue):
+ 	orq	$(VEC_SIZE - 1), %rdi
+ 	/* esi is for adjusting length to see if near the end.  */
+ 	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %esi
++#  endif
+ # else
+ 	orq	$(VEC_SIZE - 1), %rdi
+ L(cross_page_continue):
+@@ -213,7 +223,7 @@ L(cross_page_continue):
+ 
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check if at last VEC_SIZE * 4 length.  */
+-	subq	$(VEC_SIZE * 4), %rdx
++	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(last_4x_vec_or_less_cmpeq)
+ 	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ 	   length.  */
+@@ -221,6 +231,10 @@ L(cross_page_continue):
+ 	movl	%edi, %ecx
+ 	orq	$(VEC_SIZE * 4 - 1), %rdi
+ 	andl	$(VEC_SIZE * 4 - 1), %ecx
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
++#  endif
+ 	addq	%rcx, %rdx
+ # else
+ 	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+@@ -250,15 +264,19 @@ L(loop_4x_vec):
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 
+-	subq	$(VEC_SIZE * 4), %rdx
++	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	ja	L(loop_4x_vec)
+ 
+-	/* Fall through into less than 4 remaining vectors of length case.
+-	 */
++	/* Fall through into less than 4 remaining vectors of length
++	   case.  */
+ 	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	.p2align 4
+ L(last_4x_vec_or_less):
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %edx
++#  endif
+ 	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1_check)
+@@ -355,6 +373,10 @@ L(last_vec_x2_return):
+ L(last_4x_vec_or_less_cmpeq):
+ 	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %edx
++#  endif
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	/* Check first VEC regardless.  */
+ 	testl	%eax, %eax
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-25.patch b/glibc-RHEL-15696-25.patch
new file mode 100644
index 0000000..e0ed8ea
--- /dev/null
+++ b/glibc-RHEL-15696-25.patch
@@ -0,0 +1,767 @@
+From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 19:36:07 -0400
+Subject: [PATCH] x86: Optimize strlen-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strlen-avx2.S. The optimizations are
+mostly small things but they add up to roughly 10-30% performance
+improvement for strlen. The results for strnlen are a bit more
+ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
+are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
+ sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
+ 2 files changed, 334 insertions(+), 214 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index cbfc1a5d..f1a6460a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+   IFUNC_IMPL (i, name, strlen,
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
+   IFUNC_IMPL (i, name, strnlen,
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strnlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
+   IFUNC_IMPL (i, name, wcslen,
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcslen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+   IFUNC_IMPL (i, name, wcsnlen,
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcsnlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcsnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index 82826e10..be8a5db5 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -27,9 +27,11 @@
+ # ifdef USE_AS_WCSLEN
+ #  define VPCMPEQ	vpcmpeqd
+ #  define VPMINU	vpminud
++#  define CHAR_SIZE	4
+ # else
+ #  define VPCMPEQ	vpcmpeqb
+ #  define VPMINU	vpminub
++#  define CHAR_SIZE	1
+ # endif
+ 
+ # ifndef VZEROUPPER
+@@ -41,349 +43,459 @@
+ # endif
+ 
+ # define VEC_SIZE 32
++# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+-	/* Check for zero length.  */
++	/* Check zero length.  */
+ 	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
++	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
++	mov	%RSI_LP, %R8_LP
+ #  ifdef USE_AS_WCSLEN
+ 	shl	$2, %RSI_LP
+ #  elif defined __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%esi, %esi
+ #  endif
+-	mov	%RSI_LP, %R8_LP
+ # endif
+-	movl	%edi, %ecx
++	movl	%edi, %eax
+ 	movq	%rdi, %rdx
+ 	vpxor	%xmm0, %xmm0, %xmm0
+-
++	/* Clear high bits from edi. Only keeping bits relevant to page
++	   cross check.  */
++	andl	$(PAGE_SIZE - 1), %eax
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
++	VPCMPEQ	(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
+ # ifdef USE_AS_STRNLEN
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rsi
+-	jbe	L(max)
+-# else
+-	jnz	L(first_vec_x0)
++	/* If length < VEC_SIZE handle special.  */
++	cmpq	$VEC_SIZE, %rsi
++	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
++	/* If empty continue to aligned_more. Otherwise return bit
++	   position of first match.  */
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++# endif
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+-	addq	%rcx, %rsi
++L(zero):
++	xorl	%eax, %eax
++	ret
+ 
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
++	.p2align 4
++L(first_vec_x0):
++	/* Set bit for max len so that tzcnt will return min of max len
++	   and position of first match.  */
++	btsq	%rsi, %rax
++	tzcntl	%eax, %eax
++#  ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++#  endif
++	VZEROUPPER_RETURN
+ # endif
+-	jmp	L(more_4x_vec)
+ 
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
++L(first_vec_x1):
+ 	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	subl	$(VEC_SIZE * 4 + 1), %ecx
++	addl	%ecx, %eax
++# else
++	subl	%edx, %edi
++	incl	%edi
++	addl	%edi, %eax
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+-	subq	%rdx, %rax
+ # ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	shrl	$2, %eax
+ # endif
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(aligned_more):
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+-	    to void possible addition overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+-
+-	/* Check the end of data.  */
+-	subq	%rcx, %rsi
+-	jbe	L(max)
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	subl	$(VEC_SIZE * 3 + 1), %ecx
++	addl	%ecx, %eax
++# else
++	subl	%edx, %edi
++	addl	$(VEC_SIZE + 1), %edi
++	addl	%edi, %eax
+ # endif
++# ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++# endif
++	VZEROUPPER_RETURN
+ 
+-	addq	$VEC_SIZE, %rdi
++	.p2align 4
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
++# ifdef USE_AS_STRNLEN
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	subl	$(VEC_SIZE * 2 + 1), %ecx
++	addl	%ecx, %eax
++# else
++	subl	%edx, %edi
++	addl	$(VEC_SIZE * 2 + 1), %edi
++	addl	%edi, %eax
++# endif
++# ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++# endif
++	VZEROUPPER_RETURN
+ 
++	.p2align 4
++L(first_vec_x4):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	subl	$(VEC_SIZE + 1), %ecx
++	addl	%ecx, %eax
++# else
++	subl	%edx, %edi
++	addl	$(VEC_SIZE * 3 + 1), %edi
++	addl	%edi, %eax
+ # endif
++# ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++# endif
++	VZEROUPPER_RETURN
+ 
+-L(more_4x_vec):
++	.p2align 5
++L(aligned_more):
++	/* Align data to VEC_SIZE - 1. This is the same number of
++	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
++	   code on the x4 check.  */
++	orq	$(VEC_SIZE - 1), %rdi
++L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++# ifdef USE_AS_STRNLEN
++	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
++	   it simplies the logic in last_4x_vec_or_less.  */
++	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
++	subq	%rdx, %rcx
++# endif
++	/* Load first VEC regardless.  */
++	VPCMPEQ	1(%rdi), %ymm0, %ymm1
++# ifdef USE_AS_STRNLEN
++	/* Adjust length. If near end handle specially.  */
++	subq	%rcx, %rsi
++	jb	L(last_4x_vec_or_less)
++# endif
++	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
++	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x4)
+ 
++	/* Align data to VEC_SIZE * 4 - 1.  */
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
++	/* Before adjusting length check if at last VEC_SIZE * 4.  */
++	cmpq	$(VEC_SIZE * 4 - 1), %rsi
++	jbe	L(last_4x_vec_or_less_load)
++	incq	%rdi
++	movl	%edi, %ecx
++	orq	$(VEC_SIZE * 4 - 1), %rdi
++	andl	$(VEC_SIZE * 4 - 1), %ecx
++	/* Readjust length.  */
+ 	addq	%rcx, %rsi
++# else
++	incq	%rdi
++	orq	$(VEC_SIZE * 4 - 1), %rdi
+ # endif
+-
++	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa (%rdi), %ymm1
+-	vmovdqa	VEC_SIZE(%rdi), %ymm2
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
+-	VPMINU	%ymm1, %ymm2, %ymm5
+-	VPMINU	%ymm3, %ymm4, %ymm6
+-	VPMINU	%ymm5, %ymm6, %ymm5
+-
+-	VPCMPEQ	%ymm5, %ymm0, %ymm5
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifndef USE_AS_STRNLEN
+-	jmp	L(loop_4x_vec)
+-# else
++# ifdef USE_AS_STRNLEN
++	/* Break if at end of length.  */
+ 	subq	$(VEC_SIZE * 4), %rsi
+-	ja	L(loop_4x_vec)
+-
+-L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %esi
+-	jle	L(last_2x_vec)
++	jb	L(last_4x_vec_or_less_cmpeq)
++# endif
++	/* Save some code size by microfusing VPMINU with the load. Since
++	   the matches in ymm2/ymm4 can only be returned if there where no
++	   matches in ymm1/ymm3 respectively there is no issue with overlap.
++	 */
++	vmovdqa	1(%rdi), %ymm1
++	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
++	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
++	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
++
++	VPMINU	%ymm2, %ymm4, %ymm5
++	VPCMPEQ	%ymm5, %ymm0, %ymm5
++	vpmovmskb	%ymm5, %ecx
+ 
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	subq	$-(VEC_SIZE * 4), %rdi
++	testl	%ecx, %ecx
++	jz	L(loop_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	VPCMPEQ	%ymm1, %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	subq	%rdx, %rdi
+ 	testl	%eax, %eax
++	jnz	L(last_vec_return_x0)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
+-
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	VPCMPEQ	%ymm2, %ymm0, %ymm2
++	vpmovmskb	%ymm2, %eax
+ 	testl	%eax, %eax
+-
+-	jnz	L(first_vec_x3_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
++	jnz	L(last_vec_return_x1)
++
++	/* Combine last 2 VEC.  */
++	VPCMPEQ	%ymm3, %ymm0, %ymm3
++	vpmovmskb	%ymm3, %eax
++	/* rcx has combined result from all 4 VEC. It will only be used if
++	   the first 3 other VEC all did not contain a match.  */
++	salq	$32, %rcx
++	orq	%rcx, %rax
++	tzcntq	%rax, %rax
++	subq	$(VEC_SIZE * 2 - 1), %rdi
++	addq	%rdi, %rax
++# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
++# endif
+ 	VZEROUPPER_RETURN
+ 
++
++# ifdef USE_AS_STRNLEN
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %esi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
++L(last_4x_vec_or_less_load):
++	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
++	subq	$-(VEC_SIZE * 4), %rdi
++L(last_4x_vec_or_less_cmpeq):
++	VPCMPEQ	1(%rdi), %ymm0, %ymm1
++L(last_4x_vec_or_less):
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
++	vpmovmskb	%ymm1, %eax
++	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
++	   VEC_SIZE * 4.  */
++	testl	$(VEC_SIZE * 2), %esi
++	jnz	L(last_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
++	/* length may have been negative or positive by an offset of
++	   VEC_SIZE * 4 depending on where this was called from. This fixes
++	   that.  */
++	andl	$(VEC_SIZE * 4 - 1), %esi
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	VZEROUPPER_RETURN
++	jnz	L(last_vec_x1_check)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
++	subl	$VEC_SIZE, %esi
++	jb	L(max)
++
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
++	cmpl	%eax, %esi
++	jb	L(max)
++	subq	%rdx, %rdi
++	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
++# endif
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
++L(last_vec_return_x0):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$VEC_SIZE, %rax
++	subq	$(VEC_SIZE * 4 - 1), %rdi
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
++# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
++# endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
++L(last_vec_return_x1):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 2), %rax
++	subq	$(VEC_SIZE * 3 - 1), %rdi
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
++# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
++# endif
+ 	VZEROUPPER_RETURN
+ 
++# ifdef USE_AS_STRNLEN
+ 	.p2align 4
+-L(first_vec_x3_check):
++L(last_vec_x1_check):
++
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 3), %rax
++	cmpl	%eax, %esi
++	jb	L(max)
++	subq	%rdx, %rdi
++	incl	%eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+ L(max):
+ 	movq	%r8, %rax
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(last_4x_vec):
++	/* Test first 2x VEC normally.  */
++	testl	%eax, %eax
++	jnz	L(last_vec_x1)
++
++	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2)
++
++	/* Normalize length.  */
++	andl	$(VEC_SIZE * 4 - 1), %esi
++	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x3)
++
++	subl	$(VEC_SIZE * 3), %esi
++	jb	L(max)
++
++	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	tzcntl	%eax, %eax
++	/* Check the end of data.  */
++	cmpl	%eax, %esi
++	jb	L(max)
++	subq	%rdx, %rdi
++	addl	$(VEC_SIZE * 3 + 1), %eax
++	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+ 
+ 	.p2align 4
+-L(first_vec_x0):
++L(last_vec_x1):
++	/* essentially duplicates of first_vec_x1 but use 64 bit
++	   instructions.  */
+ 	tzcntl	%eax, %eax
++	subq	%rdx, %rdi
++	incl	%eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
++#  endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x1):
++L(last_vec_x2):
++	/* essentially duplicates of first_vec_x1 but use 64 bit
++	   instructions.  */
+ 	tzcntl	%eax, %eax
+-	addq	$VEC_SIZE, %rax
++	subq	%rdx, %rdi
++	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
++#  endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
++L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 2), %rax
++	subl	$(VEC_SIZE * 2), %esi
++	/* Check the end of data.  */
++	cmpl	%eax, %esi
++	jb	L(max_end)
++	subq	%rdx, %rdi
++	addl	$(VEC_SIZE * 2 + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
++#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
++#  endif
++	VZEROUPPER_RETURN
++L(max_end):
++	movq	%r8, %rax
+ 	VZEROUPPER_RETURN
++# endif
+ 
++	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+-L(4x_vec_end):
+-	VPCMPEQ	%ymm1, %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	VPCMPEQ %ymm2, %ymm0, %ymm2
+-	vpmovmskb %ymm2, %eax
++L(cross_page_boundary):
++	/* Align data to VEC_SIZE - 1.  */
++	orq	$(VEC_SIZE - 1), %rdi
++	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
++	vpmovmskb	%ymm1, %eax
++	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
++	   so no need to manually mod rdx.  */
++	sarxl	%edx, %eax, %eax
++# ifdef USE_AS_STRNLEN
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	VPCMPEQ %ymm3, %ymm0, %ymm3
+-	vpmovmskb %ymm3, %eax
++	jnz	L(cross_page_less_vec)
++	leaq	1(%rdi), %rcx
++	subq	%rdx, %rcx
++	/* Check length.  */
++	cmpq	%rsi, %rcx
++	jb	L(cross_page_continue)
++	movq	%r8, %rax
++# else
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	VPCMPEQ %ymm4, %ymm0, %ymm4
+-	vpmovmskb %ymm4, %eax
+-L(first_vec_x3):
++	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++#  ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++#  endif
+ # endif
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
++
++# ifdef USE_AS_STRNLEN
++	.p2align 4
++L(cross_page_less_vec):
++	tzcntl	%eax, %eax
++	cmpq	%rax, %rsi
++	cmovb	%esi, %eax
++#  ifdef USE_AS_WCSLEN
++	shrl	$2, %eax
++#  endif
+ 	VZEROUPPER_RETURN
++# endif
+ 
+ END (STRLEN)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-26.patch b/glibc-RHEL-15696-26.patch
new file mode 100644
index 0000000..d46fe6e
--- /dev/null
+++ b/glibc-RHEL-15696-26.patch
@@ -0,0 +1,701 @@
+From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 3 May 2021 03:03:19 -0400
+Subject: [PATCH] x86: Optimize memchr-evex.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memchr-evex.S. The optimizations include
+replacing some branches with cmovcc, avoiding some branches entirely
+in the less_4x_vec case, making the page cross logic less strict,
+saving some ALU in the alignment process, and most importantly
+increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
+test-wmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
+ 1 file changed, 322 insertions(+), 225 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index 6dd5d67b..81d5cd64 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
++++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -26,14 +26,28 @@
+ 
+ # ifdef USE_AS_WMEMCHR
+ #  define VPBROADCAST	vpbroadcastd
+-#  define VPCMP		vpcmpd
+-#  define SHIFT_REG	r8d
++#  define VPMINU	vpminud
++#  define VPCMP	vpcmpd
++#  define VPCMPEQ	vpcmpeqd
++#  define CHAR_SIZE	4
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+-#  define VPCMP		vpcmpb
+-#  define SHIFT_REG	ecx
++#  define VPMINU	vpminub
++#  define VPCMP	vpcmpb
++#  define VPCMPEQ	vpcmpeqb
++#  define CHAR_SIZE	1
+ # endif
+ 
++# ifdef USE_AS_RAWMEMCHR
++#  define RAW_PTR_REG	rcx
++#  define ALGN_PTR_REG	rdi
++# else
++#  define RAW_PTR_REG	rdi
++#  define ALGN_PTR_REG	rcx
++# endif
++
++# define XMMZERO	xmm23
++# define YMMZERO	ymm23
+ # define XMMMATCH	xmm16
+ # define YMMMATCH	ymm16
+ # define YMM1		ymm17
+@@ -44,6 +58,8 @@
+ # define YMM6		ymm22
+ 
+ # define VEC_SIZE 32
++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
++# define PAGE_SIZE 4096
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (MEMCHR)
+@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
+ 	/* Check for zero length.  */
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(zero)
+-# endif
+-	movl	%edi, %ecx
+-# ifdef USE_AS_WMEMCHR
+-	shl	$2, %RDX_LP
+-# else
++
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
+ 	/* Broadcast CHAR to YMMMATCH.  */
+ 	VPBROADCAST %esi, %YMMMATCH
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
++	movl	%edi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-
++	VPCMP	$0, (%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
+ # ifndef USE_AS_RAWMEMCHR
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rdx
+-	jbe	L(zero)
++	/* If length < CHAR_PER_VEC handle special.  */
++	cmpq	$CHAR_PER_VEC, %rdx
++	jbe	L(first_vec_x0)
++# endif
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	jnz	L(first_vec_x0)
++	addq	%rdi, %rax
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
++	ret
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-	jmp	L(more_4x_vec)
++L(zero):
++	xorl	%eax, %eax
++	ret
+ 
++	.p2align 5
++L(first_vec_x0):
++	/* Check if first match was before length.  */
++	tzcntl	%eax, %eax
++	xorl	%ecx, %ecx
++	cmpl	%eax, %edx
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
++	cmovle	%rcx, %rax
++	ret
++# else
++	/* NB: first_vec_x0 is 17 bytes which will leave
++	   cross_page_boundary (which is relatively cold) close enough
++	   to ideal alignment. So only realign L(cross_page_boundary) if
++	   rawmemchr.  */
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
++# endif
++L(cross_page_boundary):
++	/* Save pointer before aligning as its original value is
++	   necessary for computer return address if byte is found or
++	   adjusting length if it is not and this is memchr.  */
++	movq	%rdi, %rcx
++	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
++	   for rawmemchr.  */
++	andq	$-VEC_SIZE, %ALGN_PTR_REG
++	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
++	kmovd	%k0, %r8d
+ # ifdef USE_AS_WMEMCHR
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
++	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+ 	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl	$2, %SHIFT_REG
++	sarl	$2, %eax
++# endif
++# ifndef USE_AS_RAWMEMCHR
++	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
++	subl	%eax, %esi
+ # endif
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	/* Remove the leading bytes.  */
+-	sarxl	%SHIFT_REG, %eax, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	tzcntl	%eax, %eax
+ # ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
++	andl	$(CHAR_PER_VEC - 1), %eax
+ # endif
++	/* Remove the leading bytes.  */
++	sarxl	%eax, %r8d, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
++	cmpq	%rsi, %rdx
++	jbe	L(first_vec_x0)
++# endif
++	testl	%eax, %eax
++	jz	L(cross_page_continue)
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
++# else
++	addq	%RAW_PTR_REG, %rax
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(aligned_more):
+-# ifndef USE_AS_RAWMEMCHR
+-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+-	   overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-	/* Check the end of data.  */
+-	subq	%rcx, %rdx
+-	jbe	L(zero)
+-# endif
++	.p2align 4
++L(first_vec_x2):
++	tzcntl	%eax, %eax
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-	addq	$VEC_SIZE, %rdi
++	.p2align 4
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-# ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
++	.p2align 4
++L(first_vec_x4):
++	tzcntl	%eax, %eax
++	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-L(more_4x_vec):
++	.p2align 5
++L(aligned_more):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
++# ifndef USE_AS_RAWMEMCHR
++	/* Align data to VEC_SIZE.  */
++L(cross_page_continue):
++	xorl	%ecx, %ecx
++	subl	%edi, %ecx
++	andq	$-VEC_SIZE, %rdi
++	/* esi is for adjusting length to see if near the end.  */
++	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %esi
++#  endif
++# else
++	andq	$-VEC_SIZE, %rdi
++L(cross_page_continue):
++# endif
++	/* Load first VEC regardless.  */
++	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++# ifndef USE_AS_RAWMEMCHR
++	/* Adjust length. If near end handle specially.  */
++	subq	%rsi, %rdx
++	jbe	L(last_4x_vec_or_less)
++# endif
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
++	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x4)
++
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
++	/* Check if at last CHAR_PER_VEC * 4 length.  */
++	subq	$(CHAR_PER_VEC * 4), %rdx
++	jbe	L(last_4x_vec_or_less_cmpeq)
++	addq	$VEC_SIZE, %rdi
+ 
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
++	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
++	 */
++#  ifdef USE_AS_WMEMCHR
++	movl	%edi, %ecx
+ 	andq	$-(4 * VEC_SIZE), %rdi
+-
+-# ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
++	andl	$(VEC_SIZE * 4 - 1), %ecx
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
+ 	addq	%rcx, %rdx
++#  else
++	addq	%rdi, %rdx
++	andq	$-(4 * VEC_SIZE), %rdi
++	subq	%rdi, %rdx
++#  endif
++# else
++	addq	$VEC_SIZE, %rdi
++	andq	$-(4 * VEC_SIZE), %rdi
+ # endif
+ 
++	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++
++	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
+-	kord	%k1, %k2, %k5
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
+-
+-	kord	%k3, %k4, %k6
+-	kortestd %k5, %k6
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
++	/* It would be possible to save some instructions using 4x VPCMP
++	   but bottleneck on port 5 makes it not woth it.  */
++	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
++	/* xor will set bytes match esi to zero.  */
++	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
++	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
++	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
++	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
++	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
++	VPCMP	$0, %YMM3, %YMMZERO, %k2
+ # ifdef USE_AS_RAWMEMCHR
+-	jmp	L(loop_4x_vec)
++	subq	$-(VEC_SIZE * 4), %rdi
++	kortestd %k2, %k3
++	jz	L(loop_4x_vec)
+ # else
+-	subq	$(VEC_SIZE * 4), %rdx
++	kortestd %k2, %k3
++	jnz	L(loop_4x_vec_end)
++
++	subq	$-(VEC_SIZE * 4), %rdi
++
++	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	ja	L(loop_4x_vec)
+ 
++	/* Fall through into less than 4 remaining vectors of length case.
++	 */
++	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	addq	$(VEC_SIZE * 3), %rdi
++	.p2align 4
+ L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %edx
+-	jle	L(last_2x_vec)
+-
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
++	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	jnz	L(first_vec_x1_check)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
++	/* If remaining length > CHAR_PER_VEC * 2.  */
++	addl	$(CHAR_PER_VEC * 2), %edx
++	jg	L(last_4x_vec)
+ 
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
++L(last_2x_vec):
++	/* If remaining length < CHAR_PER_VEC.  */
++	addl	$CHAR_PER_VEC, %edx
++	jle	L(zero_end)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
++	/* Check VEC2 and compare any match with remaining length.  */
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
++	cmpl	%eax, %edx
++	jbe	L(set_zero_end)
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
++L(zero_end):
++	ret
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+ 
+-	jnz	L(first_vec_x3_check)
++	.p2align 4
++L(first_vec_x1_check):
++	tzcntl	%eax, %eax
++	/* Adjust length.  */
++	subl	$-(CHAR_PER_VEC * 4), %edx
++	/* Check if match within remaining length.  */
++	cmpl	%eax, %edx
++	jbe	L(set_zero_end)
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
++	ret
++L(set_zero_end):
+ 	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %edx
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++L(loop_4x_vec_end):
++# endif
++	/* rawmemchr will fall through into this if match was found in
++	   loop.  */
++
++	/* k1 has not of matches with VEC1.  */
+ 	kmovd	%k1, %eax
+-	testl	%eax, %eax
++# ifdef USE_AS_WMEMCHR
++	subl	$((1 << CHAR_PER_VEC) - 1), %eax
++# else
++	incl	%eax
++# endif
++	jnz	L(last_vec_x1_return)
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
++	VPCMP	$0, %YMM2, %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2_return)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
++	kmovd	%k2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	xorl	%eax, %eax
+-	ret
++	jnz	L(last_vec_x3_return)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
++	kmovd	%k3, %eax
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
++# ifdef USE_AS_RAWMEMCHR
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
++# else
++	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
++L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$VEC_SIZE, %rax
++# ifdef USE_AS_RAWMEMCHR
++#  ifdef USE_AS_WMEMCHR
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
++#  else
+ 	addq	%rdi, %rax
+-	ret
+-
+-	.p2align 4
+-L(first_vec_x2_check):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
++#  endif
++# else
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
++L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
++# ifdef USE_AS_RAWMEMCHR
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
++# else
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(first_vec_x0):
++L(last_vec_x3_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
++# ifdef USE_AS_RAWMEMCHR
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	addq	%rdi, %rax
++	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
++	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+ 	ret
+ 
++
++# ifndef USE_AS_RAWMEMCHR
++L(last_4x_vec_or_less_cmpeq):
++	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	subq	$-(VEC_SIZE * 4), %rdi
++	/* Check first VEC regardless.  */
++	testl	%eax, %eax
++	jnz	L(first_vec_x1_check)
++
++	/* If remaining length <= CHAR_PER_VEC * 2.  */
++	addl	$(CHAR_PER_VEC * 2), %edx
++	jle	L(last_2x_vec)
++
+ 	.p2align 4
+-L(first_vec_x1):
++L(last_4x_vec):
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(last_vec_x2)
++
++
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	/* Create mask for possible matches within remaining length.  */
++#  ifdef USE_AS_WMEMCHR
++	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
++	bzhil	%edx, %ecx, %ecx
++#  else
++	movq	$-1, %rcx
++	bzhiq	%rdx, %rcx, %rcx
++#  endif
++	/* Test matches in data against length match.  */
++	andl	%ecx, %eax
++	jnz	L(last_vec_x3)
++
++	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
++	   remaining length was found to be > CHAR_PER_VEC * 2.  */
++	subl	$CHAR_PER_VEC, %edx
++	jbe	L(zero_end2)
++
++
++	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
++	kmovd	%k0, %eax
++	/* Shift remaining length mask for last VEC.  */
++#  ifdef USE_AS_WMEMCHR
++	shrl	$CHAR_PER_VEC, %ecx
++#  else
++	shrq	$CHAR_PER_VEC, %rcx
++#  endif
++	andl	%ecx, %eax
++	jz	L(zero_end2)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-# endif
++	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
++L(zero_end2):
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec_x2):
++L(last_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+-# else
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-# endif
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(4x_vec_end):
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	kmovd	%k2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	kmovd	%k3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	kmovd	%k4, %eax
+-	testl	%eax, %eax
+-L(first_vec_x3):
++L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+-# else
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-# endif
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
++# endif
+ 
+ END (MEMCHR)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-27.patch b/glibc-RHEL-15696-27.patch
new file mode 100644
index 0000000..9dcf16d
--- /dev/null
+++ b/glibc-RHEL-15696-27.patch
@@ -0,0 +1,30 @@
+From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
+From: Alice Xu <alice.d.xu@gmail.com>
+Date: Fri, 7 May 2021 19:03:21 -0700
+Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
+Content-type: text/plain; charset=UTF-8
+
+An unknown vector operation occurred in commit 2a76821c308. Fix it
+by using "ymm{k1}{z}" instead of "ymm {k1} {z}".
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index 81d5cd64..f3fdad4f 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
++++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -271,7 +271,7 @@ L(loop_4x_vec):
+ 	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ 	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+-	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
++	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
+ 	VPCMP	$0, %YMM3, %YMMZERO, %k2
+ # ifdef USE_AS_RAWMEMCHR
+ 	subq	$-(VEC_SIZE * 4), %rdi
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-28.patch b/glibc-RHEL-15696-28.patch
new file mode 100644
index 0000000..3063d4d
--- /dev/null
+++ b/glibc-RHEL-15696-28.patch
@@ -0,0 +1,566 @@
+From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 22 Jun 2021 20:42:10 -0700
+Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
+Content-type: text/plain; charset=UTF-8
+
+Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
+version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
+and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
+This also removes the unused symbols, __GI___strlen_sse2 and
+__GI___wcsnlen_sse4_1.
+---
+ sysdeps/x86_64/multiarch/strlen-sse2.S    |   2 +-
+ sysdeps/x86_64/multiarch/strlen-vec.S     | 257 ++++++++++++++++++++++
+ sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S |   2 +-
+ sysdeps/x86_64/strlen.S                   | 243 +-------------------
+ 4 files changed, 262 insertions(+), 242 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
+
+Conflicts:
+	sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
+	(Copyright dates, URL)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
+index 7bc57b8d..449c8a7f 100644
+--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
++++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
+@@ -20,4 +20,4 @@
+ # define strlen __strlen_sse2
+ #endif
+ 
+-#include "../strlen.S"
++#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+new file mode 100644
+index 00000000..8f660bb9
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -0,0 +1,257 @@
++/* SSE2 version of strlen and SSE4.1 version of wcslen.
++   Copyright (C) 2012-2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++#ifdef AS_WCSLEN
++# define PMINU		pminud
++# define PCMPEQ		pcmpeqd
++# define SHIFT_RETURN	shrq $2, %rax
++#else
++# define PMINU		pminub
++# define PCMPEQ		pcmpeqb
++# define SHIFT_RETURN
++#endif
++
++/* Long lived register in strlen(s), strnlen(s, n) are:
++
++	%xmm3 - zero
++	%rdi   - s
++	%r10  (s+n) & (~(64-1))
++	%r11   s+n
++*/
++
++
++.text
++ENTRY(strlen)
++
++/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
++#define FIND_ZERO	\
++	PCMPEQ	(%rax), %xmm0;	\
++	PCMPEQ	16(%rax), %xmm1;	\
++	PCMPEQ	32(%rax), %xmm2;	\
++	PCMPEQ	48(%rax), %xmm3;	\
++	pmovmskb	%xmm0, %esi;	\
++	pmovmskb	%xmm1, %edx;	\
++	pmovmskb	%xmm2, %r8d;	\
++	pmovmskb	%xmm3, %ecx;	\
++	salq	$16, %rdx;	\
++	salq	$16, %rcx;	\
++	orq	%rsi, %rdx;	\
++	orq	%r8, %rcx;	\
++	salq	$32, %rcx;	\
++	orq	%rcx, %rdx;
++
++#ifdef AS_STRNLEN
++/* Do not read anything when n==0.  */
++	test	%RSI_LP, %RSI_LP
++	jne	L(n_nonzero)
++	xor	%rax, %rax
++	ret
++L(n_nonzero):
++# ifdef AS_WCSLEN
++	shl	$2, %RSI_LP
++# endif
++
++/* Initialize long lived registers.  */
++
++	add	%RDI_LP, %RSI_LP
++	mov	%RSI_LP, %R10_LP
++	and	$-64, %R10_LP
++	mov	%RSI_LP, %R11_LP
++#endif
++
++	pxor	%xmm0, %xmm0
++	pxor	%xmm1, %xmm1
++	pxor	%xmm2, %xmm2
++	pxor	%xmm3, %xmm3
++	movq	%rdi, %rax
++	movq	%rdi, %rcx
++	andq	$4095, %rcx
++/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
++	cmpq	$4047, %rcx
++/* We cannot unify this branching as it would be ~6 cycles slower.  */
++	ja	L(cross_page)
++
++#ifdef AS_STRNLEN
++/* Test if end is among first 64 bytes.  */
++# define STRNLEN_PROLOG	\
++	mov	%r11, %rsi;	\
++	subq	%rax, %rsi;	\
++	andq	$-64, %rax;	\
++	testq	$-64, %rsi;	\
++	je	L(strnlen_ret)
++#else
++# define STRNLEN_PROLOG  andq $-64, %rax;
++#endif
++
++/* Ignore bits in mask that come before start of string.  */
++#define PROLOG(lab)	\
++	movq	%rdi, %rcx;	\
++	xorq	%rax, %rcx;	\
++	STRNLEN_PROLOG;	\
++	sarq	%cl, %rdx;	\
++	test	%rdx, %rdx;	\
++	je	L(lab);	\
++	bsfq	%rdx, %rax;	\
++	SHIFT_RETURN;		\
++	ret
++
++#ifdef AS_STRNLEN
++	andq	$-16, %rax
++	FIND_ZERO
++#else
++	/* Test first 16 bytes unaligned.  */
++	movdqu	(%rax), %xmm4
++	PCMPEQ	%xmm0, %xmm4
++	pmovmskb	%xmm4, %edx
++	test	%edx, %edx
++	je 	L(next48_bytes)
++	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
++	SHIFT_RETURN
++	ret
++
++L(next48_bytes):
++/* Same as FIND_ZERO except we do not check first 16 bytes.  */
++	andq	$-16, %rax
++	PCMPEQ 16(%rax), %xmm1
++	PCMPEQ 32(%rax), %xmm2
++	PCMPEQ 48(%rax), %xmm3
++	pmovmskb	%xmm1, %edx
++	pmovmskb	%xmm2, %r8d
++	pmovmskb	%xmm3, %ecx
++	salq	$16, %rdx
++	salq	$16, %rcx
++	orq	%r8, %rcx
++	salq	$32, %rcx
++	orq	%rcx, %rdx
++#endif
++
++	/* When no zero byte is found xmm1-3 are zero so we do not have to
++	   zero them.  */
++	PROLOG(loop)
++
++	.p2align 4
++L(cross_page):
++	andq	$-64, %rax
++	FIND_ZERO
++	PROLOG(loop_init)
++
++#ifdef AS_STRNLEN
++/* We must do this check to correctly handle strnlen (s, -1).  */
++L(strnlen_ret):
++	bts	%rsi, %rdx
++	sarq	%cl, %rdx
++	test	%rdx, %rdx
++	je	L(loop_init)
++	bsfq	%rdx, %rax
++	SHIFT_RETURN
++	ret
++#endif
++	.p2align 4
++L(loop_init):
++	pxor	%xmm1, %xmm1
++	pxor	%xmm2, %xmm2
++	pxor	%xmm3, %xmm3
++#ifdef AS_STRNLEN
++	.p2align 4
++L(loop):
++
++	addq	$64, %rax
++	cmpq	%rax, %r10
++	je	L(exit_end)
++
++	movdqa	(%rax), %xmm0
++	PMINU	16(%rax), %xmm0
++	PMINU	32(%rax), %xmm0
++	PMINU	48(%rax), %xmm0
++	PCMPEQ	%xmm3, %xmm0
++	pmovmskb	%xmm0, %edx
++	testl	%edx, %edx
++	jne	L(exit)
++	jmp	L(loop)
++
++	.p2align 4
++L(exit_end):
++	cmp	%rax, %r11
++	je	L(first) /* Do not read when end is at page boundary.  */
++	pxor	%xmm0, %xmm0
++	FIND_ZERO
++
++L(first):
++	bts	%r11, %rdx
++	bsfq	%rdx, %rdx
++	addq	%rdx, %rax
++	subq	%rdi, %rax
++	SHIFT_RETURN
++	ret
++
++	.p2align 4
++L(exit):
++	pxor	%xmm0, %xmm0
++	FIND_ZERO
++
++	bsfq	%rdx, %rdx
++	addq	%rdx, %rax
++	subq	%rdi, %rax
++	SHIFT_RETURN
++	ret
++
++#else
++
++	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
++	.p2align 4
++L(loop):
++
++	movdqa	64(%rax), %xmm0
++	PMINU	80(%rax), %xmm0
++	PMINU	96(%rax), %xmm0
++	PMINU	112(%rax), %xmm0
++	PCMPEQ	%xmm3, %xmm0
++	pmovmskb	%xmm0, %edx
++	testl	%edx, %edx
++	jne	L(exit64)
++
++	subq	$-128, %rax
++
++	movdqa	(%rax), %xmm0
++	PMINU	16(%rax), %xmm0
++	PMINU	32(%rax), %xmm0
++	PMINU	48(%rax), %xmm0
++	PCMPEQ	%xmm3, %xmm0
++	pmovmskb	%xmm0, %edx
++	testl	%edx, %edx
++	jne	L(exit0)
++	jmp	L(loop)
++
++	.p2align 4
++L(exit64):
++	addq	$64, %rax
++L(exit0):
++	pxor	%xmm0, %xmm0
++	FIND_ZERO
++
++	bsfq	%rdx, %rdx
++	addq	%rdx, %rax
++	subq	%rdi, %rax
++	SHIFT_RETURN
++	ret
++
++#endif
++
++END(strlen)
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+index a8cab0cb..5fa51fe0 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
++++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+@@ -2,4 +2,4 @@
+ #define AS_STRNLEN
+ #define strlen	__wcsnlen_sse4_1
+ 
+-#include "../strlen.S"
++#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
+index f845f3d4..ad047d84 100644
+--- a/sysdeps/x86_64/strlen.S
++++ b/sysdeps/x86_64/strlen.S
+@@ -1,5 +1,5 @@
+-/* SSE2 version of strlen/wcslen.
+-   Copyright (C) 2012-2018 Free Software Foundation, Inc.
++/* SSE2 version of strlen.
++   Copyright (C) 2021 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+@@ -16,243 +16,6 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+-#include <sysdep.h>
++#include "multiarch/strlen-vec.S"
+ 
+-#ifdef AS_WCSLEN
+-# define PMINU		pminud
+-# define PCMPEQ		pcmpeqd
+-# define SHIFT_RETURN	shrq $2, %rax
+-#else
+-# define PMINU		pminub
+-# define PCMPEQ		pcmpeqb
+-# define SHIFT_RETURN
+-#endif
+-
+-/* Long lived register in strlen(s), strnlen(s, n) are:
+-
+-	%xmm3 - zero
+-	%rdi   - s
+-	%r10  (s+n) & (~(64-1))
+-	%r11   s+n
+-*/
+-
+-
+-.text
+-ENTRY(strlen)
+-
+-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
+-#define FIND_ZERO	\
+-	PCMPEQ	(%rax), %xmm0;	\
+-	PCMPEQ	16(%rax), %xmm1;	\
+-	PCMPEQ	32(%rax), %xmm2;	\
+-	PCMPEQ	48(%rax), %xmm3;	\
+-	pmovmskb	%xmm0, %esi;	\
+-	pmovmskb	%xmm1, %edx;	\
+-	pmovmskb	%xmm2, %r8d;	\
+-	pmovmskb	%xmm3, %ecx;	\
+-	salq	$16, %rdx;	\
+-	salq	$16, %rcx;	\
+-	orq	%rsi, %rdx;	\
+-	orq	%r8, %rcx;	\
+-	salq	$32, %rcx;	\
+-	orq	%rcx, %rdx;
+-
+-#ifdef AS_STRNLEN
+-/* Do not read anything when n==0.  */
+-	test	%RSI_LP, %RSI_LP
+-	jne	L(n_nonzero)
+-	xor	%rax, %rax
+-	ret
+-L(n_nonzero):
+-# ifdef AS_WCSLEN
+-	shl	$2, %RSI_LP
+-# endif
+-
+-/* Initialize long lived registers.  */
+-
+-	add	%RDI_LP, %RSI_LP
+-	mov	%RSI_LP, %R10_LP
+-	and	$-64, %R10_LP
+-	mov	%RSI_LP, %R11_LP
+-#endif
+-
+-	pxor	%xmm0, %xmm0
+-	pxor	%xmm1, %xmm1
+-	pxor	%xmm2, %xmm2
+-	pxor	%xmm3, %xmm3
+-	movq	%rdi, %rax
+-	movq	%rdi, %rcx
+-	andq	$4095, %rcx
+-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
+-	cmpq	$4047, %rcx
+-/* We cannot unify this branching as it would be ~6 cycles slower.  */
+-	ja	L(cross_page)
+-
+-#ifdef AS_STRNLEN
+-/* Test if end is among first 64 bytes.  */
+-# define STRNLEN_PROLOG	\
+-	mov	%r11, %rsi;	\
+-	subq	%rax, %rsi;	\
+-	andq	$-64, %rax;	\
+-	testq	$-64, %rsi;	\
+-	je	L(strnlen_ret)
+-#else
+-# define STRNLEN_PROLOG  andq $-64, %rax;
+-#endif
+-
+-/* Ignore bits in mask that come before start of string.  */
+-#define PROLOG(lab)	\
+-	movq	%rdi, %rcx;	\
+-	xorq	%rax, %rcx;	\
+-	STRNLEN_PROLOG;	\
+-	sarq	%cl, %rdx;	\
+-	test	%rdx, %rdx;	\
+-	je	L(lab);	\
+-	bsfq	%rdx, %rax;	\
+-	SHIFT_RETURN;		\
+-	ret
+-
+-#ifdef AS_STRNLEN
+-	andq	$-16, %rax
+-	FIND_ZERO
+-#else
+-	/* Test first 16 bytes unaligned.  */
+-	movdqu	(%rax), %xmm4
+-	PCMPEQ	%xmm0, %xmm4
+-	pmovmskb	%xmm4, %edx
+-	test	%edx, %edx
+-	je 	L(next48_bytes)
+-	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+-	SHIFT_RETURN
+-	ret
+-
+-L(next48_bytes):
+-/* Same as FIND_ZERO except we do not check first 16 bytes.  */
+-	andq	$-16, %rax
+-	PCMPEQ 16(%rax), %xmm1
+-	PCMPEQ 32(%rax), %xmm2
+-	PCMPEQ 48(%rax), %xmm3
+-	pmovmskb	%xmm1, %edx
+-	pmovmskb	%xmm2, %r8d
+-	pmovmskb	%xmm3, %ecx
+-	salq	$16, %rdx
+-	salq	$16, %rcx
+-	orq	%r8, %rcx
+-	salq	$32, %rcx
+-	orq	%rcx, %rdx
+-#endif
+-
+-	/* When no zero byte is found xmm1-3 are zero so we do not have to
+-	   zero them.  */
+-	PROLOG(loop)
+-
+-	.p2align 4
+-L(cross_page):
+-	andq	$-64, %rax
+-	FIND_ZERO
+-	PROLOG(loop_init)
+-
+-#ifdef AS_STRNLEN
+-/* We must do this check to correctly handle strnlen (s, -1).  */
+-L(strnlen_ret):
+-	bts	%rsi, %rdx
+-	sarq	%cl, %rdx
+-	test	%rdx, %rdx
+-	je	L(loop_init)
+-	bsfq	%rdx, %rax
+-	SHIFT_RETURN
+-	ret
+-#endif
+-	.p2align 4
+-L(loop_init):
+-	pxor	%xmm1, %xmm1
+-	pxor	%xmm2, %xmm2
+-	pxor	%xmm3, %xmm3
+-#ifdef AS_STRNLEN
+-	.p2align 4
+-L(loop):
+-
+-	addq	$64, %rax
+-	cmpq	%rax, %r10
+-	je	L(exit_end)
+-
+-	movdqa	(%rax), %xmm0
+-	PMINU	16(%rax), %xmm0
+-	PMINU	32(%rax), %xmm0
+-	PMINU	48(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit)
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(exit_end):
+-	cmp	%rax, %r11
+-	je	L(first) /* Do not read when end is at page boundary.  */
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-L(first):
+-	bts	%r11, %rdx
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-	.p2align 4
+-L(exit):
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-#else
+-
+-	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
+-	.p2align 4
+-L(loop):
+-
+-	movdqa	64(%rax), %xmm0
+-	PMINU	80(%rax), %xmm0
+-	PMINU	96(%rax), %xmm0
+-	PMINU	112(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit64)
+-
+-	subq	$-128, %rax
+-
+-	movdqa	(%rax), %xmm0
+-	PMINU	16(%rax), %xmm0
+-	PMINU	32(%rax), %xmm0
+-	PMINU	48(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit0)
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(exit64):
+-	addq	$64, %rax
+-L(exit0):
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-#endif
+-
+-END(strlen)
+ libc_hidden_builtin_def (strlen)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-29.patch b/glibc-RHEL-15696-29.patch
new file mode 100644
index 0000000..112821a
--- /dev/null
+++ b/glibc-RHEL-15696-29.patch
@@ -0,0 +1,181 @@
+From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 01:19:34 -0400
+Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit adds the ifunc / build infrastructure
+necessary for wcslen to prefer the sse4.1 implementation
+in strlen-vec.S. test-wcslen.c is passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile          |  4 +-
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |  3 ++
+ sysdeps/x86_64/multiarch/ifunc-wcslen.h    | 52 ++++++++++++++++++++++
+ sysdeps/x86_64/multiarch/wcslen-sse4_1.S   |  4 ++
+ sysdeps/x86_64/multiarch/wcslen.c          |  2 +-
+ sysdeps/x86_64/multiarch/wcsnlen.c         | 34 +-------------
+ 6 files changed, 63 insertions(+), 36 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
+ create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 491c7698..65fde4eb 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcscpy-ssse3 wcscpy-c \
+ 		   wcschr-sse2 wcschr-avx2 \
+ 		   wcsrchr-sse2 wcsrchr-avx2 \
+-		   wcsnlen-sse4_1 wcsnlen-c \
+-		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
++		   wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
++		   wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
+ 		   wcschr-avx2-rtm \
+ 		   wcscmp-avx2-rtm \
+ 		   wcslen-avx2-rtm \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index f1a6460a..580913ca 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_evex)
++	      IFUNC_IMPL_ADD (array, i, wcsnlen,
++			      CPU_FEATURE_USABLE (SSE4_1),
++			      __wcsnlen_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+new file mode 100644
+index 00000000..39e33473
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+@@ -0,0 +1,52 @@
++/* Common definition for ifunc selections for wcslen and wcsnlen
++   All versions must be listed in ifunc-impl-list.c.
++   Copyright (C) 2017-2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <init-arch.h>
++
++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
++
++static inline void *
++IFUNC_SELECTOR (void)
++{
++  const struct cpu_features* cpu_features = __get_cpu_features ();
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
++      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	return OPTIMIZE (evex);
++
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
++    return OPTIMIZE (sse4_1);
++
++  return OPTIMIZE (sse2);
++}
+diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+new file mode 100644
+index 00000000..7e62621a
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+@@ -0,0 +1,4 @@
++#define AS_WCSLEN
++#define strlen	__wcslen_sse4_1
++
++#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
+index 6d06e47c..3b04b75b 100644
+--- a/sysdeps/x86_64/multiarch/wcslen.c
++++ b/sysdeps/x86_64/multiarch/wcslen.c
+@@ -24,7 +24,7 @@
+ # undef __wcslen
+ 
+ # define SYMBOL_NAME wcslen
+-# include "ifunc-avx2.h"
++# include "ifunc-wcslen.h"
+ 
+ libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
+ weak_alias (__wcslen, wcslen);
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
+index 20b731ae..06736410 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen.c
++++ b/sysdeps/x86_64/multiarch/wcsnlen.c
+@@ -24,39 +24,7 @@
+ # undef __wcsnlen
+ 
+ # define SYMBOL_NAME wcsnlen
+-# include <init-arch.h>
+-
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+-
+-static inline void *
+-IFUNC_SELECTOR (void)
+-{
+-  const struct cpu_features* cpu_features = __get_cpu_features ();
+-
+-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+-      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+-	return OPTIMIZE (evex);
+-
+-      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+-	return OPTIMIZE (avx2_rtm);
+-
+-      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx2);
+-    }
+-
+-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+-    return OPTIMIZE (sse4_1);
+-
+-  return OPTIMIZE (sse2);
+-}
++# include "ifunc-wcslen.h"
+ 
+ libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
+ weak_alias (__wcsnlen, wcsnlen);
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-3.patch b/glibc-RHEL-15696-3.patch
new file mode 100644
index 0000000..8f5093c
--- /dev/null
+++ b/glibc-RHEL-15696-3.patch
@@ -0,0 +1,396 @@
+From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:27:25 -0800
+Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
+ 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memcpy for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
+	length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
+	* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
+	Likewise.
+	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
+	Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
+	tst-size_t-wmemchr.
+	* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
+---
+ sysdeps/x86_64/multiarch/memcpy-ssse3-back.S  | 17 ++++--
+ sysdeps/x86_64/multiarch/memcpy-ssse3.S       | 17 ++++--
+ .../multiarch/memmove-avx512-no-vzeroupper.S  | 16 +++--
+ .../multiarch/memmove-vec-unaligned-erms.S    | 54 +++++++++--------
+ sysdeps/x86_64/x32/Makefile                   |  2 +-
+ sysdeps/x86_64/x32/tst-size_t-memcpy.c        | 58 +++++++++++++++++++
+ 6 files changed, 122 insertions(+), 42 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+index 3cd11233..568eebd3 100644
+--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
++++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+@@ -45,28 +45,33 @@
+ 	.section .text.ssse3,"ax",@progbits
+ #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ ENTRY (MEMPCPY_CHK)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMPCPY_CHK)
+ 
+ ENTRY (MEMPCPY)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
++	mov	%RDI_LP, %RAX_LP
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY)
+ #endif
+ 
+ #if !defined USE_AS_BCOPY
+ ENTRY (MEMCPY_CHK)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMCPY_CHK)
+ #endif
+ 
+ ENTRY (MEMCPY)
+-	mov	%rdi, %rax
++	mov	%RDI_LP, %RAX_LP
+ #ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
++	add	%RDX_LP, %RAX_LP
++#endif
++
++#ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
+ #endif
+ 
+ #ifdef USE_AS_MEMMOVE
+diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+index 0240bfa3..0bd5ee99 100644
+--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
++++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+@@ -45,28 +45,33 @@
+ 	.section .text.ssse3,"ax",@progbits
+ #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ ENTRY (MEMPCPY_CHK)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMPCPY_CHK)
+ 
+ ENTRY (MEMPCPY)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
++	mov	%RDI_LP, %RAX_LP
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY)
+ #endif
+ 
+ #if !defined USE_AS_BCOPY
+ ENTRY (MEMCPY_CHK)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMCPY_CHK)
+ #endif
+ 
+ ENTRY (MEMCPY)
+-	mov	%rdi, %rax
++	mov	%RDI_LP, %RAX_LP
+ #ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
++	add	%RDX_LP, %RAX_LP
++#endif
++
++#ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
+ #endif
+ 
+ #ifdef USE_AS_MEMMOVE
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+index effc3ac2..6ca2bbc9 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
++++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+@@ -24,27 +24,31 @@
+ 
+ 	.section .text.avx512,"ax",@progbits
+ ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__mempcpy_chk_avx512_no_vzeroupper)
+ 
+ ENTRY (__mempcpy_avx512_no_vzeroupper)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
++	mov	%RDI_LP, %RAX_LP
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (__mempcpy_avx512_no_vzeroupper)
+ 
+ ENTRY (__memmove_chk_avx512_no_vzeroupper)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memmove_chk_avx512_no_vzeroupper)
+ 
+ ENTRY (__memmove_avx512_no_vzeroupper)
+-	mov	%rdi, %rax
++	mov	%RDI_LP, %RAX_LP
+ # ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
++	add	%RDX_LP, %RAX_LP
+ # endif
+ L(start):
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
++# endif
+ 	lea	(%rsi, %rdx), %rcx
+ 	lea	(%rdi, %rdx), %r9
+ 	cmp	$512, %rdx
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index c952576c..274aa1c7 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -95,20 +95,20 @@
+ 	.section SECTION(.text),"ax",@progbits
+ #if defined SHARED && IS_IN (libc)
+ ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
++	mov	%RDI_LP, %RAX_LP
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
+ 
+ #if defined SHARED && IS_IN (libc)
+ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+ #endif
+@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
+ 	movq	%rdi, %rax
+ L(start):
+-	cmpq	$VEC_SIZE, %rdx
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++# endif
++	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
++	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(more_2x_vec)
+ #if !defined USE_MULTIARCH || !IS_IN (libc)
+ L(last_2x_vec):
+@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
+ 
+ # if VEC_SIZE == 16
+ ENTRY (__mempcpy_chk_erms)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__mempcpy_chk_erms)
+ 
+ /* Only used to measure performance of REP MOVSB.  */
+ ENTRY (__mempcpy_erms)
+-	movq	%rdi, %rax
++	mov	%RDI_LP, %RAX_LP
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jz	2f
+-	addq	%rdx, %rax
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start_movsb)
+ END (__mempcpy_erms)
+ 
+ ENTRY (__memmove_chk_erms)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memmove_chk_erms)
+ 
+ ENTRY (__memmove_erms)
+ 	movq	%rdi, %rax
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jz	2f
+ L(start_movsb):
+-	movq	%rdx, %rcx
+-	cmpq	%rsi, %rdi
++	mov	%RDX_LP, %RCX_LP
++	cmp	%RSI_LP, %RDI_LP
+ 	jb	1f
+ 	/* Source == destination is less common.  */
+ 	je	2f
+-	leaq	(%rsi,%rcx), %rdx
+-	cmpq	%rdx, %rdi
++	lea	(%rsi,%rcx), %RDX_LP
++	cmp	%RDX_LP, %RDI_LP
+ 	jb	L(movsb_backward)
+ 1:
+ 	rep movsb
+@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
+ 
+ # ifdef SHARED
+ ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
++	mov	%RDI_LP, %RAX_LP
++	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start_erms)
+ END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+ 
+ # ifdef SHARED
+ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ # endif
+@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ 	movq	%rdi, %rax
+ L(start_erms):
+-	cmpq	$VEC_SIZE, %rdx
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++# endif
++	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
++	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(movsb_more_2x_vec)
+ L(last_2x_vec):
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+@@ -236,7 +244,7 @@ L(movsb):
+ 	/* Avoid slow backward REP MOVSB.  */
+ 	jb	L(more_8x_vec_backward)
+ 1:
+-	movq	%rdx, %rcx
++	mov	%RDX_LP, %RCX_LP
+ 	rep movsb
+ L(nop):
+ 	ret
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index ddec7f04..2fe1e5ac 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr tst-size_t-memcmp
++tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
+new file mode 100644
+index 00000000..66b71e17
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
+@@ -0,0 +1,58 @@
++/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_NAME "memcpy"
++#include "test-size_t.h"
++
++IMPL (memcpy, 1)
++
++typedef void *(*proto_t) (void *, const void *, size_t);
++
++static void *
++__attribute__ ((noinline, noclone))
++do_memcpy (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t dest = { { page_size }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      do_memcpy (dest, src);
++      int res = memcmp (dest.p, src.p, dest.len);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %i != 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-30.patch b/glibc-RHEL-15696-30.patch
new file mode 100644
index 0000000..0b16f0f
--- /dev/null
+++ b/glibc-RHEL-15696-30.patch
@@ -0,0 +1,497 @@
+From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 01:56:29 -0400
+Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
+ #27974]
+Content-type: text/plain; charset=UTF-8
+
+This commit fixes the bug mentioned in the previous commit.
+
+The previous implementations of wcsnlen in these files relied on
+maxlen * sizeof(wchar_t) not overflowing, which is not guaranteed by the standard.
+
+The new overflow tests added in the previous commit now
+pass (As well as all the other tests).
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
+ sysdeps/x86_64/multiarch/strlen-vec.S  |  15 ++-
+ 2 files changed, 107 insertions(+), 38 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index be8a5db5..37688966 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -44,21 +44,21 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
++# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+ 	/* Check zero length.  */
++#  ifdef __ILP32__
++	/* Clear upper bits.  */
++	and	%RSI_LP, %RSI_LP
++#  else
+ 	test	%RSI_LP, %RSI_LP
++#  endif
+ 	jz	L(zero)
+ 	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
+ 	mov	%RSI_LP, %R8_LP
+-#  ifdef USE_AS_WCSLEN
+-	shl	$2, %RSI_LP
+-#  elif defined __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%esi, %esi
+-#  endif
+ # endif
+ 	movl	%edi, %eax
+ 	movq	%rdi, %rdx
+@@ -72,10 +72,10 @@ ENTRY (STRLEN)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+ 	VPCMPEQ	(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ # ifdef USE_AS_STRNLEN
+ 	/* If length < VEC_SIZE handle special.  */
+-	cmpq	$VEC_SIZE, %rsi
++	cmpq	$CHAR_PER_VEC, %rsi
+ 	jbe	L(first_vec_x0)
+ # endif
+ 	/* If empty continue to aligned_more. Otherwise return bit
+@@ -84,6 +84,7 @@ ENTRY (STRLEN)
+ 	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -97,9 +98,14 @@ L(zero):
+ L(first_vec_x0):
+ 	/* Set bit for max len so that tzcnt will return min of max len
+ 	   and position of first match.  */
++#  ifdef USE_AS_WCSLEN
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %esi
++#  endif
+ 	btsq	%rsi, %rax
+ 	tzcntl	%eax, %eax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -113,14 +119,19 @@ L(first_vec_x1):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
++#  ifdef USE_AS_WCSLEN
++	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
++#  else
+ 	subl	$(VEC_SIZE * 4 + 1), %ecx
+ 	addl	%ecx, %eax
++#  endif
+ # else
+ 	subl	%edx, %edi
+ 	incl	%edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -133,14 +144,19 @@ L(first_vec_x2):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
++#  ifdef USE_AS_WCSLEN
++	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
++#  else
+ 	subl	$(VEC_SIZE * 3 + 1), %ecx
+ 	addl	%ecx, %eax
++#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -153,14 +169,19 @@ L(first_vec_x3):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
++#  ifdef USE_AS_WCSLEN
++	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
++#  else
+ 	subl	$(VEC_SIZE * 2 + 1), %ecx
+ 	addl	%ecx, %eax
++#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE * 2 + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -173,14 +194,19 @@ L(first_vec_x4):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
++#  ifdef USE_AS_WCSLEN
++	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
++#  else
+ 	subl	$(VEC_SIZE + 1), %ecx
+ 	addl	%ecx, %eax
++#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE * 3 + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -195,10 +221,14 @@ L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+ # ifdef USE_AS_STRNLEN
+-	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+-	   it simplies the logic in last_4x_vec_or_less.  */
++	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
++	   because it simplies the logic in last_4x_vec_or_less.  */
+ 	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+ 	subq	%rdx, %rcx
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
++#  endif
+ # endif
+ 	/* Load first VEC regardless.  */
+ 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+@@ -207,34 +237,38 @@ L(cross_page_continue):
+ 	subq	%rcx, %rsi
+ 	jb	L(last_4x_vec_or_less)
+ # endif
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x4)
+ 
+ 	/* Align data to VEC_SIZE * 4 - 1.  */
+ # ifdef USE_AS_STRNLEN
+ 	/* Before adjusting length check if at last VEC_SIZE * 4.  */
+-	cmpq	$(VEC_SIZE * 4 - 1), %rsi
++	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
+ 	jbe	L(last_4x_vec_or_less_load)
+ 	incq	%rdi
+ 	movl	%edi, %ecx
+ 	orq	$(VEC_SIZE * 4 - 1), %rdi
+ 	andl	$(VEC_SIZE * 4 - 1), %ecx
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
++#  endif
+ 	/* Readjust length.  */
+ 	addq	%rcx, %rsi
+ # else
+@@ -246,13 +280,13 @@ L(cross_page_continue):
+ L(loop_4x_vec):
+ # ifdef USE_AS_STRNLEN
+ 	/* Break if at end of length.  */
+-	subq	$(VEC_SIZE * 4), %rsi
++	subq	$(CHAR_PER_VEC * 4), %rsi
+ 	jb	L(last_4x_vec_or_less_cmpeq)
+ # endif
+-	/* Save some code size by microfusing VPMINU with the load. Since
+-	   the matches in ymm2/ymm4 can only be returned if there where no
+-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+-	 */
++	/* Save some code size by microfusing VPMINU with the load.
++	   Since the matches in ymm2/ymm4 can only be returned if there
++	   where no matches in ymm1/ymm3 respectively there is no issue
++	   with overlap.  */
+ 	vmovdqa	1(%rdi), %ymm1
+ 	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+ 	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
+@@ -260,7 +294,7 @@ L(loop_4x_vec):
+ 
+ 	VPMINU	%ymm2, %ymm4, %ymm5
+ 	VPCMPEQ	%ymm5, %ymm0, %ymm5
+-	vpmovmskb	%ymm5, %ecx
++	vpmovmskb %ymm5, %ecx
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	testl	%ecx, %ecx
+@@ -268,27 +302,28 @@ L(loop_4x_vec):
+ 
+ 
+ 	VPCMPEQ	%ymm1, %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	subq	%rdx, %rdi
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_return_x0)
+ 
+ 	VPCMPEQ	%ymm2, %ymm0, %ymm2
+-	vpmovmskb	%ymm2, %eax
++	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_return_x1)
+ 
+ 	/* Combine last 2 VEC.  */
+ 	VPCMPEQ	%ymm3, %ymm0, %ymm3
+-	vpmovmskb	%ymm3, %eax
+-	/* rcx has combined result from all 4 VEC. It will only be used if
+-	   the first 3 other VEC all did not contain a match.  */
++	vpmovmskb %ymm3, %eax
++	/* rcx has combined result from all 4 VEC. It will only be used
++	   if the first 3 other VEC all did not contain a match.  */
+ 	salq	$32, %rcx
+ 	orq	%rcx, %rax
+ 	tzcntq	%rax, %rax
+ 	subq	$(VEC_SIZE * 2 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -297,15 +332,19 @@ L(loop_4x_vec):
+ # ifdef USE_AS_STRNLEN
+ 	.p2align 4
+ L(last_4x_vec_or_less_load):
+-	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
++	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
++	 */
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ L(last_4x_vec_or_less_cmpeq):
+ 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+ L(last_4x_vec_or_less):
+-
+-	vpmovmskb	%ymm1, %eax
+-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+-	   VEC_SIZE * 4.  */
++#  ifdef USE_AS_WCSLEN
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %esi
++#  endif
++	vpmovmskb %ymm1, %eax
++	/* If remaining length > VEC_SIZE * 2. This works if esi is off
++	   by VEC_SIZE * 4.  */
+ 	testl	$(VEC_SIZE * 2), %esi
+ 	jnz	L(last_4x_vec)
+ 
+@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
+ 	jb	L(max)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+ 	cmpl	%eax, %esi
+@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
+ 	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -340,6 +380,7 @@ L(last_vec_return_x0):
+ 	subq	$(VEC_SIZE * 4 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -350,6 +391,7 @@ L(last_vec_return_x1):
+ 	subq	$(VEC_SIZE * 3 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -366,6 +408,7 @@ L(last_vec_x1_check):
+ 	incl	%eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -381,14 +424,14 @@ L(last_4x_vec):
+ 	jnz	L(last_vec_x1)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x2)
+ 
+ 	/* Normalize length.  */
+ 	andl	$(VEC_SIZE * 4 - 1), %esi
+ 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x3)
+ 
+@@ -396,7 +439,7 @@ L(last_4x_vec):
+ 	jb	L(max)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+ 	cmpl	%eax, %esi
+@@ -405,6 +448,7 @@ L(last_4x_vec):
+ 	addl	$(VEC_SIZE * 3 + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -419,6 +463,7 @@ L(last_vec_x1):
+ 	incl	%eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -432,6 +477,7 @@ L(last_vec_x2):
+ 	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -447,6 +493,7 @@ L(last_vec_x3):
+ 	addl	$(VEC_SIZE * 2 + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -455,13 +502,13 @@ L(max_end):
+ 	VZEROUPPER_RETURN
+ # endif
+ 
+-	/* Cold case for crossing page with first load.	 */
++	/* Cold case for crossing page with first load.  */
+ 	.p2align 4
+ L(cross_page_boundary):
+ 	/* Align data to VEC_SIZE - 1.  */
+ 	orq	$(VEC_SIZE - 1), %rdi
+ 	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
++	vpmovmskb %ymm1, %eax
+ 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ 	   so no need to manually mod rdx.  */
+ 	sarxl	%edx, %eax, %eax
+@@ -470,6 +517,10 @@ L(cross_page_boundary):
+ 	jnz	L(cross_page_less_vec)
+ 	leaq	1(%rdi), %rcx
+ 	subq	%rdx, %rcx
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get wchar_t count.  */
++	shrl	$2, %ecx
++#  endif
+ 	/* Check length.  */
+ 	cmpq	%rsi, %rcx
+ 	jb	L(cross_page_continue)
+@@ -479,6 +530,7 @@ L(cross_page_boundary):
+ 	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+ #  ifdef USE_AS_WCSLEN
++	/* NB: Divide length by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ #  endif
+ # endif
+@@ -489,6 +541,10 @@ L(return_vzeroupper):
+ 	.p2align 4
+ L(cross_page_less_vec):
+ 	tzcntl	%eax, %eax
++#  ifdef USE_AS_WCSLEN
++	/* NB: Multiply length by 4 to get byte count.  */
++	sall	$2, %esi
++#  endif
+ 	cmpq	%rax, %rsi
+ 	cmovb	%esi, %eax
+ #  ifdef USE_AS_WCSLEN
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+index 8f660bb9..439e486a 100644
+--- a/sysdeps/x86_64/multiarch/strlen-vec.S
++++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -65,12 +65,25 @@ ENTRY(strlen)
+ 	ret
+ L(n_nonzero):
+ # ifdef AS_WCSLEN
+-	shl	$2, %RSI_LP
++/* Check for overflow from maxlen * sizeof(wchar_t). If it would
++   overflow the only way this program doesn't have undefined behavior 
++   is if there is a null terminator in valid memory so wcslen will 
++   suffice.  */
++	mov	%RSI_LP, %R10_LP
++	sar	$62, %R10_LP
++	test	%R10_LP, %R10_LP
++	jnz	__wcslen_sse4_1
++	sal	$2, %RSI_LP
+ # endif
+ 
++
+ /* Initialize long lived registers.  */
+ 
+ 	add	%RDI_LP, %RSI_LP
++# ifdef AS_WCSLEN
++/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
++	jbe	__wcslen_sse4_1
++# endif
+ 	mov	%RSI_LP, %R10_LP
+ 	and	$-64, %R10_LP
+ 	mov	%RSI_LP, %R11_LP
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-31.patch b/glibc-RHEL-15696-31.patch
new file mode 100644
index 0000000..4ef6911
--- /dev/null
+++ b/glibc-RHEL-15696-31.patch
@@ -0,0 +1,745 @@
+From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 19:36:06 -0400
+Subject: [PATCH] x86: Optimize strlen-evex.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strlen-evex.S. The
+optimizations are mostly small things but they add up to roughly
+10-30% performance improvement for strlen. The results for strnlen are
+a bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
+test-wcsnlen are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
+ 1 file changed, 317 insertions(+), 264 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
+index 05838190..4bf6874b 100644
+--- a/sysdeps/x86_64/multiarch/strlen-evex.S
++++ b/sysdeps/x86_64/multiarch/strlen-evex.S
+@@ -29,11 +29,13 @@
+ # ifdef USE_AS_WCSLEN
+ #  define VPCMP		vpcmpd
+ #  define VPMINU	vpminud
+-#  define SHIFT_REG	r9d
++#  define SHIFT_REG ecx
++#  define CHAR_SIZE	4
+ # else
+ #  define VPCMP		vpcmpb
+ #  define VPMINU	vpminub
+-#  define SHIFT_REG	ecx
++#  define SHIFT_REG edx
++#  define CHAR_SIZE	1
+ # endif
+ 
+ # define XMMZERO	xmm16
+@@ -46,132 +48,165 @@
+ # define YMM6		ymm22
+ 
+ # define VEC_SIZE 32
++# define PAGE_SIZE 4096
++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+-	/* Check for zero length.  */
++	/* Check zero length.  */
+ 	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
+-#  ifdef USE_AS_WCSLEN
+-	shl	$2, %RSI_LP
+-#  elif defined __ILP32__
++#  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%esi, %esi
+ #  endif
+ 	mov	%RSI_LP, %R8_LP
+ # endif
+-	movl	%edi, %ecx
+-	movq	%rdi, %rdx
++	movl	%edi, %eax
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+-
++	/* Clear high bits from edi. Only keeping bits relevant to page
++	   cross check.  */
++	andl	$(PAGE_SIZE - 1), %eax
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+ 	   null byte.  */
+ 	VPCMP	$0, (%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-
+ # ifdef USE_AS_STRNLEN
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rsi
+-	jbe	L(max)
+-# else
+-	jnz	L(first_vec_x0)
++	/* If length < CHAR_PER_VEC handle special.  */
++	cmpq	$CHAR_PER_VEC, %rsi
++	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
++	testl	%eax, %eax
++	jz	L(aligned_more)
++	tzcntl	%eax, %eax
++	ret
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+-	addq	%rcx, %rsi
++L(zero):
++	xorl	%eax, %eax
++	ret
+ 
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
++	.p2align 4
++L(first_vec_x0):
++	/* Set bit for max len so that tzcnt will return min of max len
++	   and position of first match.  */
++	btsq	%rsi, %rax
++	tzcntl	%eax, %eax
++	ret
+ # endif
+-	jmp	L(more_4x_vec)
+ 
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+-	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl	$2, %SHIFT_REG
++L(first_vec_x1):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
++# ifdef USE_AS_STRNLEN
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
++# else
++	subl	%edx, %edi
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %edi
++#  endif
++	leal	CHAR_PER_VEC(%rdi, %rax), %eax
+ # endif
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
++	ret
+ 
+-	/* Remove the leading bytes.  */
+-	sarxl	%SHIFT_REG, %eax, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
++	.p2align 4
++L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-# endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
++# else
++	subl	%edx, %edi
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %edi
++#  endif
++	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
+ # endif
+ 	ret
+ 
+ 	.p2align 4
+-L(aligned_more):
++L(first_vec_x3):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+-	    to void possible addition overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+-
+-	/* Check the end of data.  */
+-	subq	%rcx, %rsi
+-	jbe	L(max)
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
++# else
++	subl	%edx, %edi
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %edi
++#  endif
++	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
+ # endif
++	ret
+ 
+-	addq	$VEC_SIZE, %rdi
+-
++	.p2align 4
++L(first_vec_x4):
++	tzcntl	%eax, %eax
++	/* Safe to use 32 bit instructions as these are only called for
++	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
++	/* Use ecx which was computed earlier to compute correct value.
++	 */
++	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
++# else
++	subl	%edx, %edi
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %edi
++#  endif
++	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
+ # endif
++	ret
+ 
+-L(more_4x_vec):
++	.p2align 5
++L(aligned_more):
++	movq	%rdi, %rdx
++	/* Align data to VEC_SIZE.  */
++	andq	$-(VEC_SIZE), %rdi
++L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
++# ifdef USE_AS_STRNLEN
++	/* + CHAR_SIZE because it simplies the logic in
++	   last_4x_vec_or_less.  */
++	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
++	subq	%rdx, %rcx
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
++#  endif
++# endif
++	/* Load first VEC regardless.  */
+ 	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
++# ifdef USE_AS_STRNLEN
++	/* Adjust length. If near end handle specially.  */
++	subq	%rcx, %rsi
++	jb	L(last_4x_vec_or_less)
++# endif
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
++	test	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+@@ -179,258 +214,276 @@ L(more_4x_vec):
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
++	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x4)
+ 
++	addq	$VEC_SIZE, %rdi
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
++	/* Check if at last VEC_SIZE * 4 length.  */
++	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
++	jbe	L(last_4x_vec_or_less_load)
++	movl	%edi, %ecx
++	andl	$(VEC_SIZE * 4 - 1), %ecx
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarl	$2, %ecx
++#  endif
++	/* Readjust length.  */
+ 	addq	%rcx, %rsi
+ # endif
++	/* Align data to VEC_SIZE * 4.  */
++	andq	$-(VEC_SIZE * 4), %rdi
+ 
++	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VMOVA	(%rdi), %YMM1
+-	VMOVA	VEC_SIZE(%rdi), %YMM2
+-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
+-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
+-
+-	VPMINU	%YMM1, %YMM2, %YMM5
+-	VPMINU	%YMM3, %YMM4, %YMM6
++	/* Load first VEC regardless.  */
++	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
++# ifdef USE_AS_STRNLEN
++	/* Break if at end of length.  */
++	subq	$(CHAR_PER_VEC * 4), %rsi
++	jb	L(last_4x_vec_or_less_cmpeq)
++# endif
++	/* Save some code size by microfusing VPMINU with the load. Since
++	   the matches in ymm2/ymm4 can only be returned if there where no
++	   matches in ymm1/ymm3 respectively there is no issue with overlap.
++	 */
++	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
++	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
++	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
++
++	VPCMP	$0, %YMM2, %YMMZERO, %k0
++	VPCMP	$0, %YMM4, %YMMZERO, %k1
++	subq	$-(VEC_SIZE * 4), %rdi
++	kortestd	%k0, %k1
++	jz	L(loop_4x_vec)
++
++	/* Check if end was in first half.  */
++	kmovd	%k0, %eax
++	subq	%rdx, %rdi
++# ifdef USE_AS_WCSLEN
++	shrq	$2, %rdi
++# endif
++	testl	%eax, %eax
++	jz	L(second_vec_return)
+ 
+-	VPMINU	%YMM5, %YMM6, %YMM5
+-	VPCMP	$0, %YMM5, %YMMZERO, %k0
+-	ktestd	%k0, %k0
+-	jnz	L(4x_vec_end)
++	VPCMP	$0, %YMM1, %YMMZERO, %k2
++	kmovd	%k2, %edx
++	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
++# ifdef USE_AS_WCSLEN
++	sall	$CHAR_PER_VEC, %eax
++	orl	%edx, %eax
++	tzcntl	%eax, %eax
++# else
++	salq	$CHAR_PER_VEC, %rax
++	orq	%rdx, %rax
++	tzcntq	%rax, %rax
++# endif
++	addq	%rdi, %rax
++	ret
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+ 
+-# ifndef USE_AS_STRNLEN
+-	jmp	L(loop_4x_vec)
+-# else
+-	subq	$(VEC_SIZE * 4), %rsi
+-	ja	L(loop_4x_vec)
++# ifdef USE_AS_STRNLEN
+ 
++L(last_4x_vec_or_less_load):
++	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
++	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
++L(last_4x_vec_or_less_cmpeq):
++	VPCMP	$0, %YMM1, %YMMZERO, %k0
++	addq	$(VEC_SIZE * 3), %rdi
+ L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %esi
+-	jle	L(last_2x_vec)
+-
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
++	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
++	   VEC_SIZE * 4.  */
++	testl	$(CHAR_PER_VEC * 2), %esi
++	jnz	L(last_4x_vec)
++
++	/* length may have been negative or positive by an offset of
++	   CHAR_PER_VEC * 4 depending on where this was called from. This
++	   fixes that.  */
++	andl	$(CHAR_PER_VEC * 4 - 1), %esi
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	jnz	L(last_vec_x1_check)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
++	/* Check the end of data.  */
++	subl	$CHAR_PER_VEC, %esi
++	jb	L(max)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
++	tzcntl	%eax, %eax
++	/* Check the end of data.  */
++	cmpl	%eax, %esi
++	jb	L(max)
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x3_check)
++	subq	%rdx, %rdi
++#  ifdef USE_AS_WCSLEN
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
++#  endif
++	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
++	ret
++L(max):
+ 	movq	%r8, %rax
++	ret
++# endif
++
++	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
++	   in the 4x VEC loop can use 2 byte encoding.  */
++	.p2align 4
++L(second_vec_return):
++	VPCMP	$0, %YMM3, %YMMZERO, %k0
++	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
++# ifdef USE_AS_WCSLEN
++	kunpckbw	%k0, %k1, %k0
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
++# else
++	kunpckdq	%k0, %k1, %k0
++	kmovq	%k0, %rax
++	tzcntq	%rax, %rax
++# endif
++	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
++	ret
++
++
++# ifdef USE_AS_STRNLEN
++L(last_vec_x1_check):
++	tzcntl	%eax, %eax
++	/* Check the end of data.  */
++	cmpl	%eax, %esi
++	jb	L(max)
++	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
+ #  endif
++	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %esi
++L(last_4x_vec):
++	/* Test first 2x VEC normally.  */
++	testl	%eax, %eax
++	jnz	L(last_vec_x1)
+ 
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
++	jnz	L(last_vec_x2)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
++	/* Normalize length.  */
++	andl	$(CHAR_PER_VEC * 4 - 1), %esi
++	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	ret
++	jnz	L(last_vec_x3)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
++	/* Check the end of data.  */
++	subl	$(CHAR_PER_VEC * 3), %esi
++	jb	L(max)
++
++	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-#  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
++	cmpl	%eax, %esi
++	jb	L(max_end)
++
++	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
+ #  endif
++	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
++L(last_vec_x1):
+ 	tzcntl	%eax, %eax
++	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
+ #  endif
++	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
++L(last_vec_x2):
+ 	tzcntl	%eax, %eax
++	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
+ #  endif
++	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
++L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-#  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
++	subl	$(CHAR_PER_VEC * 2), %esi
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
++	cmpl	%eax, %esi
++	jb	L(max_end)
++	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide bytes by 4 to get the wchar_t count.  */
++	sarq	$2, %rdi
+ #  endif
++	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
+ 	ret
+-
+-	.p2align 4
+-L(max):
++L(max_end):
+ 	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	ret
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+ 	ret
+ # endif
+ 
++	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+-L(first_vec_x0):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
++L(cross_page_boundary):
++	movq	%rdi, %rdx
++	/* Align data to VEC_SIZE.  */
++	andq	$-VEC_SIZE, %rdi
++	VPCMP	$0, (%rdi), %YMMZERO, %k0
++	kmovd	%k0, %eax
++	/* Remove the leading bytes.  */
+ # ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
++	/* NB: Divide shift count by 4 since each bit in K0 represent 4
++	   bytes.  */
++	movl	%edx, %ecx
++	shrl	$2, %ecx
++	andl	$(CHAR_PER_VEC - 1), %ecx
+ # endif
+-	ret
+-
+-	.p2align 4
+-L(first_vec_x1):
++	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
++	sarxl	%SHIFT_REG, %eax, %eax
++	testl	%eax, %eax
++# ifndef USE_AS_STRNLEN
++	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
+ 	ret
+-
+-	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
++# else
++	jnz	L(cross_page_less_vec)
++#  ifndef USE_AS_WCSLEN
++	movl	%edx, %ecx
++	andl	$(CHAR_PER_VEC - 1), %ecx
++#  endif
++	movl	$CHAR_PER_VEC, %eax
++	subl	%ecx, %eax
++	/* Check the end of data.  */
++	cmpq	%rax, %rsi
++	ja	L(cross_page_continue)
++	movl	%esi, %eax
+ 	ret
+-
+-	.p2align 4
+-L(4x_vec_end):
+-	VPCMP	$0, %YMM1, %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	VPCMP	$0, %YMM2, %YMMZERO, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	VPCMP	$0, %YMM3, %YMMZERO, %k2
+-	kmovd	%k2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	VPCMP	$0, %YMM4, %YMMZERO, %k3
+-	kmovd	%k3, %eax
+-L(first_vec_x3):
++L(cross_page_less_vec):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
++	/* Select min of length and position of first null.  */
++	cmpq	%rax, %rsi
++	cmovb	%esi, %eax
+ 	ret
++# endif
+ 
+ END (STRLEN)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-32.patch b/glibc-RHEL-15696-32.patch
new file mode 100644
index 0000000..8f1a94a
--- /dev/null
+++ b/glibc-RHEL-15696-32.patch
@@ -0,0 +1,158 @@
+From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 30 Jun 2021 10:47:06 -0700
+Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033]
+Content-type: text/plain; charset=UTF-8
+
+From
+
+https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+
+* Intel TSX will be disabled by default.
+* The processor will force abort all Restricted Transactional Memory (RTM)
+  transactions by default.
+* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated,
+  which is set to indicate to updated software that the loaded microcode is
+  forcing RTM abort.
+* On processors that enumerate support for RTM, the CPUID enumeration bits
+  for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to
+  be set by default after microcode update.
+* Workloads that were benefited from Intel TSX might experience a change
+  in performance.
+* System software may use a new bit in Model-Specific Register (MSR) 0x10F
+  TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock
+  Elision (HLE) and RTM bits to indicate to software that Intel TSX is
+  disabled.
+
+1. Add RTM_ALWAYS_ABORT to CPUID features.
+2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set.  This skips the
+string/tst-memchr-rtm etc. testcases on the affected processors, which
+always fail after a microcode update.
+3. Check RTM feature, instead of usability, against /proc/cpuinfo.
+
+This fixes BZ #28033.
+---
+ manual/platform.texi                    | 3 +++
+ sysdeps/x86/cpu-features.c              | 5 ++++-
+ sysdeps/x86/sys/platform/x86.h          | 6 +++---
+ sysdeps/x86/tst-cpu-features-supports.c | 2 +-
+ sysdeps/x86/tst-get-cpu-features.c      | 2 ++
+ 5 files changed, 13 insertions(+), 5 deletions(-)
+
+Conflicts:
+	sysdeps/x86/bits/platform/x86.h
+	(doesn't exist)
+	sysdeps/x86/bits/platform/x86.h
+	(account for lack of upstream renames)
+
+diff --git a/manual/platform.texi b/manual/platform.texi
+index 8fec2933..b7e8aef7 100644
+--- a/manual/platform.texi
++++ b/manual/platform.texi
+@@ -510,6 +510,9 @@ capability.
+ @item
+ @code{RTM} -- RTM instruction extensions.
+ 
++@item
++@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable.
++
+ @item
+ @code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug.
+ 
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 3610ee5c..4889f062 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
+   CPU_FEATURE_SET_USABLE (cpu_features, HLE);
+   CPU_FEATURE_SET_USABLE (cpu_features, BMI2);
+   CPU_FEATURE_SET_USABLE (cpu_features, ERMS);
+-  CPU_FEATURE_SET_USABLE (cpu_features, RTM);
+   CPU_FEATURE_SET_USABLE (cpu_features, RDSEED);
+   CPU_FEATURE_SET_USABLE (cpu_features, ADX);
+   CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT);
+@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features)
+   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI);
+   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B);
+   CPU_FEATURE_SET_USABLE (cpu_features, FSRM);
++  CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
+   CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE);
+   CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK);
+   CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64);
+@@ -779,6 +779,9 @@ no_cpuid:
+     GLRO(dl_platform) = "i586";
+ #endif
+ 
++  if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
++    CPU_FEATURE_SET_USABLE (cpu_features, RTM);
++
+ #if CET_ENABLED
+ # if HAVE_TUNABLES
+   TUNABLE_GET (x86_ibt, tunable_val_t *,
+diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
+index e5cc7c68..7a434926 100644
+--- a/sysdeps/x86/sys/platform/x86.h
++++ b/sysdeps/x86/sys/platform/x86.h
+@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
+ #define bit_cpu_INDEX_7_EDX_9	(1u << 9)
+ #define bit_cpu_MD_CLEAR	(1u << 10)
+-#define bit_cpu_INDEX_7_EDX_11	(1u << 11)
++#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
+ #define bit_cpu_INDEX_7_EDX_12	(1u << 12)
+ #define bit_cpu_INDEX_7_EDX_13	(1u << 13)
+ #define bit_cpu_SERIALIZE	(1u << 14)
+@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_9	COMMON_CPUID_INDEX_7
+ #define index_cpu_MD_CLEAR	COMMON_CPUID_INDEX_7
+-#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
++#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7
+ #define index_cpu_SERIALIZE	COMMON_CPUID_INDEX_7
+@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define reg_AVX512_VP2INTERSECT	edx
+ #define reg_INDEX_7_EDX_9	edx
+ #define reg_MD_CLEAR		edx
+-#define reg_INDEX_7_EDX_11	edx
++#define reg_RTM_ALWAYS_ABORT	edx
+ #define reg_INDEX_7_EDX_12	edx
+ #define reg_INDEX_7_EDX_13	edx
+ #define reg_SERIALIZE		edx
+diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
+index 287cf01f..8100a319 100644
+--- a/sysdeps/x86/tst-cpu-features-supports.c
++++ b/sysdeps/x86/tst-cpu-features-supports.c
+@@ -152,7 +152,7 @@ do_test (int argc, char **argv)
+   fails += CHECK_SUPPORTS (rdpid, RDPID);
+   fails += CHECK_SUPPORTS (rdrnd, RDRAND);
+   fails += CHECK_SUPPORTS (rdseed, RDSEED);
+-  fails += CHECK_SUPPORTS (rtm, RTM);
++  fails += CHECK_CPU_SUPPORTS (rtm, RTM);
+   fails += CHECK_SUPPORTS (serialize, SERIALIZE);
+   fails += CHECK_SUPPORTS (sha, SHA);
+   fails += CHECK_CPU_SUPPORTS (shstk, SHSTK);
+diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
+index 2763deb6..0717e5d8 100644
+--- a/sysdeps/x86/tst-get-cpu-features.c
++++ b/sysdeps/x86/tst-get-cpu-features.c
+@@ -183,6 +183,7 @@ do_test (void)
+   CHECK_CPU_FEATURE (UINTR);
+   CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
+   CHECK_CPU_FEATURE (MD_CLEAR);
++  CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
+   CHECK_CPU_FEATURE (SERIALIZE);
+   CHECK_CPU_FEATURE (HYBRID);
+   CHECK_CPU_FEATURE (TSXLDTRK);
+@@ -344,6 +345,7 @@ do_test (void)
+   CHECK_CPU_FEATURE_USABLE (FSRM);
+   CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
+   CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
++  CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
+   CHECK_CPU_FEATURE_USABLE (SERIALIZE);
+   CHECK_CPU_FEATURE_USABLE (HYBRID);
+   CHECK_CPU_FEATURE_USABLE (TSXLDTRK);
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-33.patch b/glibc-RHEL-15696-33.patch
new file mode 100644
index 0000000..1196471
--- /dev/null
+++ b/glibc-RHEL-15696-33.patch
@@ -0,0 +1,51 @@
+From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 8 Jul 2021 16:13:19 -0400
+Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ
+ #28064]
+Content-type: text/plain; charset=UTF-8
+
+The following commit
+
+commit 6f573a27b6c8b4236445810a44660612323f5a73
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Wed Jun 23 01:19:34 2021 -0400
+
+    x86-64: Add wcslen optimize for sse4.1
+
+Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did
+not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit
+fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc
+implementation list and adding wcslen-sse4.1 to the ifunc
+implementation list.
+
+Testing:
+test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as
+well as all other tests in wcsmbs and string.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 580913ca..695cdba6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_evex)
+-	      IFUNC_IMPL_ADD (array, i, wcsnlen,
++	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      CPU_FEATURE_USABLE (SSE4_1),
+-			      __wcsnlen_sse4_1)
++			      __wcslen_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-34.patch b/glibc-RHEL-15696-34.patch
new file mode 100644
index 0000000..f7c9a56
--- /dev/null
+++ b/glibc-RHEL-15696-34.patch
@@ -0,0 +1,135 @@
+From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 15 Feb 2022 08:18:15 -0600
+Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
+ #28896]
+Content-type: text/plain; charset=UTF-8
+
+In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
+call strcmp-avx2 and wcscmp-avx2 respectively. This would have
+no checks around vzeroupper and would trigger spurious
+aborts. This commit fixes that.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
+AVX2 machines with and without RTM.
+
+Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/Makefile                        |  2 +-
+ sysdeps/x86/tst-strncmp-rtm.c               | 17 ++++++++++++++++-
+ sysdeps/x86_64/multiarch/strcmp-avx2.S      |  2 +-
+ sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S |  1 +
+ sysdeps/x86_64/multiarch/strncmp-avx2.S     |  1 +
+ sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S |  2 +-
+ sysdeps/x86_64/multiarch/wcsncmp-avx2.S     |  2 +-
+ 7 files changed, 22 insertions(+), 5 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strcmp-avx2.S
+	(split into two patches due to upstream bug differences)
+
+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+index 5be71ada..2d814915 100644
+--- a/sysdeps/x86/Makefile
++++ b/sysdeps/x86/Makefile
+@@ -38,7 +38,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm
+ CFLAGS-tst-strchr-rtm.c += -mrtm
+ CFLAGS-tst-strcpy-rtm.c += -mrtm
+ CFLAGS-tst-strlen-rtm.c += -mrtm
+-CFLAGS-tst-strncmp-rtm.c += -mrtm
++CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
+ CFLAGS-tst-strrchr-rtm.c += -mrtm
+ endif
+ 
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index 236ad951..4d0004b5 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
++++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -16,6 +16,7 @@
+    License along with the GNU C Library; if not, see
+    <https://www.gnu.org/licenses/>.  */
+ 
++#include <stdint.h>
+ #include <tst-string-rtm.h>
+ 
+ #define LOOP 3000
+@@ -45,8 +46,22 @@ function (void)
+     return 1;
+ }
+ 
++__attribute__ ((noinline, noclone))
++static int
++function_overflow (void)
++{
++  if (strncmp (string1, string2, SIZE_MAX) == 0)
++    return 0;
++  else
++    return 1;
++}
++
+ static int
+ do_test (void)
+ {
+-  return do_test_1 ("strncmp", LOOP, prepare, function);
++  int status = do_test_1 ("strncmp", LOOP, prepare, function);
++  if (status != EXIT_SUCCESS)
++    return status;
++  status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
++  return status;
+ }
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 5d1c9d90..433ae047 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -95,7 +95,7 @@ ENTRY (STRCMP)
+ 	   length to bound a valid memory region. In these cases just use
+ 	   'wcscmp'.  */
+ 	shrq	$56, %rcx
+-	jnz	__wcscmp_avx2
++	jnz	OVERFLOW_STRCMP
+ #  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+index 37d1224b..68bad365 100644
+--- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
++++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+@@ -1,3 +1,4 @@
+ #define STRCMP	__strncmp_avx2_rtm
+ #define USE_AS_STRNCMP 1
++#define OVERFLOW_STRCMP	__strcmp_avx2_rtm
+ #include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
+index 1678bcc2..f138e9f1 100644
+--- a/sysdeps/x86_64/multiarch/strncmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
+@@ -1,3 +1,4 @@
+ #define STRCMP	__strncmp_avx2
+ #define USE_AS_STRNCMP 1
++#define OVERFLOW_STRCMP __strcmp_avx2
+ #include "strcmp-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+index 4e88c70c..f467582c 100644
+--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+@@ -1,5 +1,5 @@
+ #define STRCMP __wcsncmp_avx2_rtm
+ #define USE_AS_STRNCMP 1
+ #define USE_AS_WCSCMP 1
+-
++#define OVERFLOW_STRCMP	__wcscmp_avx2_rtm
+ #include "strcmp-avx2-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
+index 4fa1de4d..e9ede522 100644
+--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
+@@ -1,5 +1,5 @@
+ #define STRCMP __wcsncmp_avx2
+ #define USE_AS_STRNCMP 1
+ #define USE_AS_WCSCMP 1
+-
++#define OVERFLOW_STRCMP	__wcscmp_avx2
+ #include "strcmp-avx2.S"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-35.patch b/glibc-RHEL-15696-35.patch
new file mode 100644
index 0000000..5e4fbdd
--- /dev/null
+++ b/glibc-RHEL-15696-35.patch
@@ -0,0 +1,51 @@
+From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 9 May 2020 12:04:23 -0700
+Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ
+ #25966]
+Content-type: text/plain; charset=UTF-8
+
+Since __x86_shared_non_temporal_threshold is defined as
+
+long int __x86_shared_non_temporal_threshold;
+
+and long int is 4 bytes for x32, use RDX_LP to compare against
+__x86_shared_non_temporal_threshold in assembly code.
+---
+ sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 71f5954d..673b73aa 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -245,7 +245,7 @@ L(return):
+ #endif
+ 
+ L(movsb):
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
++	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	jae	L(more_8x_vec)
+ 	cmpq	%rsi, %rdi
+ 	jb	1f
+@@ -397,7 +397,7 @@ L(more_8x_vec):
+ 	addq	%r8, %rdx
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ 	/* Check non-temporal store threshold.  */
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
++	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	ja	L(large_forward)
+ #endif
+ L(loop_4x_vec_forward):
+@@ -448,7 +448,7 @@ L(more_8x_vec_backward):
+ 	subq	%r8, %rdx
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ 	/* Check non-temporal store threshold.  */
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
++	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	ja	L(large_backward)
+ #endif
+ L(loop_4x_vec_backward):
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-36.patch b/glibc-RHEL-15696-36.patch
new file mode 100644
index 0000000..e00b96e
--- /dev/null
+++ b/glibc-RHEL-15696-36.patch
@@ -0,0 +1,44 @@
+From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Jun 2020 12:41:18 -0700
+Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register
+Content-type: text/plain; charset=UTF-8
+
+Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use
+%xmmN, instead of %ymmN, with vpxor to clear a vector register.
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S  | 4 ++--
+ sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 433ae047..70d8499b 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -105,8 +105,8 @@ ENTRY (STRCMP)
+ # endif
+ 	movl	%edi, %eax
+ 	xorl	%edx, %edx
+-	/* Make %ymm7 all zeros in this function.  */
+-	vpxor	%ymm7, %ymm7, %ymm7
++	/* Make %xmm7 (%ymm7) all zeros in this function.  */
++	vpxor	%xmm7, %xmm7, %xmm7
+ 	orl	%esi, %eax
+ 	andl	$(PAGE_SIZE - 1), %eax
+ 	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
+diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+index 9f22a15e..c949410b 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+@@ -48,7 +48,7 @@ ENTRY (STRRCHR)
+ 	movl	%edi, %ecx
+ 	/* Broadcast CHAR to YMM4.  */
+ 	VPBROADCAST %xmm4, %ymm4
+-	vpxor	%ymm0, %ymm0, %ymm0
++	vpxor	%xmm0, %xmm0, %xmm0
+ 
+ 	/* Check if we may cross page boundary with one vector load.  */
+ 	andl	$(2 * VEC_SIZE - 1), %ecx
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-37.patch b/glibc-RHEL-15696-37.patch
new file mode 100644
index 0000000..10b0cc4
--- /dev/null
+++ b/glibc-RHEL-15696-37.patch
@@ -0,0 +1,359 @@
+From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001
+From: noah <goldstein.w.n@gmail.com>
+Date: Wed, 3 Feb 2021 00:38:59 -0500
+Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. Just seemed the performance could be improved a bit. Observed
+and expected behavior are unchanged. Optimized body of main
+loop. Updated page cross logic and optimized accordingly. Made a few
+minor instruction selection modifications. No regressions in test
+suite. Both test-strchrnul and test-strchr passed.
+---
+ sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
+ sysdeps/x86_64/multiarch/strchr.c      |   4 +-
+ 2 files changed, 114 insertions(+), 115 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strchr.c
+	(account for missing upstream macros)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index da7d2620..919d256c 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -27,10 +27,12 @@
+ # ifdef USE_AS_WCSCHR
+ #  define VPBROADCAST	vpbroadcastd
+ #  define VPCMPEQ	vpcmpeqd
++#  define VPMINU	vpminud
+ #  define CHAR_REG	esi
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMPEQ	vpcmpeqb
++#  define VPMINU	vpminub
+ #  define CHAR_REG	sil
+ # endif
+ 
+@@ -43,71 +45,54 @@
+ # endif
+ 
+ # define VEC_SIZE 32
++# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCHR)
+ 	movl	%edi, %ecx
+-	/* Broadcast CHAR to YMM0.  */
++# ifndef USE_AS_STRCHRNUL
++	xorl	%edx, %edx
++# endif
++
++	/* Broadcast CHAR to YMM0.	*/
+ 	vmovd	%esi, %xmm0
+ 	vpxor	%xmm9, %xmm9, %xmm9
+ 	VPBROADCAST %xmm0, %ymm0
+-	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+ 
+-	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
+-	   null byte.  */
+-	vmovdqu	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	/* Check if we cross page boundary with one vector load.  */
++	andl	$(PAGE_SIZE - 1), %ecx
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
++	ja  L(cross_page_boundary)
+ 
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
+-	jmp	L(more_4x_vec)
+-
+-	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
++	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
++	   null byte.  */
+ 	vmovdqu	(%rdi), %ymm8
+ 	VPCMPEQ %ymm8, %ymm0, %ymm1
+ 	VPCMPEQ %ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+ 	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	/* Found CHAR or the null byte.  */
++	jz	L(more_vecs)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rax
+-# ifdef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
++L(more_vecs):
++	/* Align data for aligned loads in the loop.  */
++	andq	$-VEC_SIZE, %rdi
+ L(aligned_more):
+-	addq	$VEC_SIZE, %rdi
+ 
+-L(more_4x_vec):
+-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.  */
+-	vmovdqa	(%rdi), %ymm8
++	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.	*/
++	vmovdqa	VEC_SIZE(%rdi), %ymm8
++	addq	$VEC_SIZE, %rdi
+ 	VPCMPEQ %ymm8, %ymm0, %ymm1
+ 	VPCMPEQ %ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+@@ -137,61 +122,24 @@ L(more_4x_vec):
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x3)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+-
+-	.p2align 4
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa	(%rdi), %ymm5
+-	vmovdqa	VEC_SIZE(%rdi), %ymm6
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+-
+-	VPCMPEQ %ymm5, %ymm0, %ymm1
+-	VPCMPEQ %ymm6, %ymm0, %ymm2
+-	VPCMPEQ %ymm7, %ymm0, %ymm3
+-	VPCMPEQ %ymm8, %ymm0, %ymm4
+-
+-	VPCMPEQ %ymm5, %ymm9, %ymm5
+-	VPCMPEQ %ymm6, %ymm9, %ymm6
+-	VPCMPEQ %ymm7, %ymm9, %ymm7
+-	VPCMPEQ %ymm8, %ymm9, %ymm8
+-
+-	vpor	%ymm1, %ymm5, %ymm1
+-	vpor	%ymm2, %ymm6, %ymm2
+-	vpor	%ymm3, %ymm7, %ymm3
+-	vpor	%ymm4, %ymm8, %ymm4
+-
+-	vpor	%ymm1, %ymm2, %ymm5
+-	vpor	%ymm3, %ymm4, %ymm6
+-
+-	vpor	%ymm5, %ymm6, %ymm5
+-
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
++	jz	L(prep_loop_4x)
+ 
+-	jmp	L(loop_4x_vec)
++	tzcntl	%eax, %eax
++	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	VZEROUPPER
++	ret
+ 
+ 	.p2align 4
+ L(first_vec_x0):
+-	/* Found CHAR or the null byte.  */
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -199,13 +147,9 @@ L(first_vec_x0):
+ 	.p2align 4
+ L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+ 	leaq	VEC_SIZE(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -213,42 +157,97 @@ L(first_vec_x1):
+ 	.p2align 4
+ L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
++	/* Found CHAR or the null byte.	 */
+ 	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
++L(prep_loop_4x):
++	/* Align data to 4 * VEC_SIZE.	*/
++	andq	$-(VEC_SIZE * 4), %rdi
++
+ 	.p2align 4
+-L(4x_vec_end):
++L(loop_4x_vec):
++	/* Compare 4 * VEC at a time forward.  */
++	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
++	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
++	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
++	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
++
++	/* Leaves only CHARS matching esi as 0.	 */
++	vpxor	%ymm5, %ymm0, %ymm1
++	vpxor	%ymm6, %ymm0, %ymm2
++	vpxor	%ymm7, %ymm0, %ymm3
++	vpxor	%ymm8, %ymm0, %ymm4
++
++	VPMINU	%ymm1, %ymm5, %ymm1
++	VPMINU	%ymm2, %ymm6, %ymm2
++	VPMINU	%ymm3, %ymm7, %ymm3
++	VPMINU	%ymm4, %ymm8, %ymm4
++
++	VPMINU	%ymm1, %ymm2, %ymm5
++	VPMINU	%ymm3, %ymm4, %ymm6
++
++	VPMINU	%ymm5, %ymm6, %ymm5
++
++	VPCMPEQ %ymm5, %ymm9, %ymm5
++	vpmovmskb %ymm5, %eax
++
++	addq	$(VEC_SIZE * 4), %rdi
++	testl	%eax, %eax
++	jz  L(loop_4x_vec)
++
++	VPCMPEQ %ymm1, %ymm9, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x0)
++
++	VPCMPEQ %ymm2, %ymm9, %ymm2
+ 	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
++
++	VPCMPEQ %ymm3, %ymm9, %ymm3
++	VPCMPEQ %ymm4, %ymm9, %ymm4
++	vpmovmskb %ymm3, %ecx
+ 	vpmovmskb %ymm4, %eax
++	salq	$32, %rax
++	orq %rcx, %rax
++	tzcntq  %rax, %rax
++	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
++	cmovne	%rdx, %rax
++# endif
++	VZEROUPPER
++	ret
++
++	/* Cold case for crossing page with first load.	 */
++	.p2align 4
++L(cross_page_boundary):
++	andq	$-VEC_SIZE, %rdi
++	andl	$(VEC_SIZE - 1), %ecx
++
++	vmovdqa	(%rdi), %ymm8
++	VPCMPEQ %ymm8, %ymm0, %ymm1
++	VPCMPEQ %ymm8, %ymm9, %ymm2
++	vpor	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %eax
++	/* Remove the leading bits.	 */
++	sarxl	%ecx, %eax, %eax
+ 	testl	%eax, %eax
+-L(first_vec_x3):
++	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$(VEC_SIZE * 3), %rax
++	addq	%rcx, %rdi
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
++# ifndef USE_AS_STRCHRNUL
++	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
+ END (STRCHR)
+-#endif
++# endif
+diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
+index 7e582f02..5225bd4f 100644
+--- a/sysdeps/x86_64/multiarch/strchr.c
++++ b/sysdeps/x86_64/multiarch/strchr.c
+@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-38.patch b/glibc-RHEL-15696-38.patch
new file mode 100644
index 0000000..f97ab23
--- /dev/null
+++ b/glibc-RHEL-15696-38.patch
@@ -0,0 +1,67 @@
+From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 25 Jan 2020 14:19:40 -0800
+Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130]
+Content-type: text/plain; charset=UTF-8
+
+When copying with "rep movsb", if the distance between source and
+destination is N*4GB + [1..63] with N >= 0, performance may be very
+slow.  This patch updates memmove-vec-unaligned-erms.S for AVX and
+AVX512 versions with the distance in RCX:
+
+	cmpl	$63, %ecx
+	// Don't use "rep movsb" if ECX <= 63
+	jbe	L(Don't use "rep movsb")
+	Use "rep movsb"
+
+Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random
+and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its
+performance impact is within noise range as "rep movsb" is only used for
+data size >= 4KB.
+---
+ .../multiarch/memmove-vec-unaligned-erms.S    | 21 +++++++++++++++++++
+ 1 file changed, 21 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 673b73aa..c475fed4 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -64,6 +64,13 @@
+ # endif
+ #endif
+ 
++/* Avoid short distance rep movsb only with non-SSE vector.  */
++#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
++# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
++#else
++# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
++#endif
++
+ #ifndef PREFETCH
+ # define PREFETCH(addr) prefetcht0 addr
+ #endif
+@@ -255,7 +262,21 @@ L(movsb):
+ 	cmpq	%r9, %rdi
+ 	/* Avoid slow backward REP MOVSB.  */
+ 	jb	L(more_8x_vec_backward)
++# if AVOID_SHORT_DISTANCE_REP_MOVSB
++	movq	%rdi, %rcx
++	subq	%rsi, %rcx
++	jmp	2f
++# endif
+ 1:
++# if AVOID_SHORT_DISTANCE_REP_MOVSB
++	movq	%rsi, %rcx
++	subq	%rdi, %rcx
++2:
++/* Avoid "rep movsb" if RCX, the distance between source and destination,
++   is N*4GB + [1..63] with N >= 0.  */
++	cmpl	$63, %ecx
++	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
++# endif
+ 	mov	%RDX_LP, %RCX_LP
+ 	rep movsb
+ L(nop):
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-39.patch b/glibc-RHEL-15696-39.patch
new file mode 100644
index 0000000..8343ba9
--- /dev/null
+++ b/glibc-RHEL-15696-39.patch
@@ -0,0 +1,449 @@
+From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001
+From: noah <goldstein.w.n@gmail.com>
+Date: Sat, 3 Apr 2021 04:12:15 -0400
+Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No Bug. This commit updates the large memcpy case (no overlap). The
+update is to perform memcpy on either 2 or 4 contiguous pages at
+once. This 1) helps to alleviate the effects of false memory aliasing
+when destination and source have a close 4k alignment and 2) In most
+cases and for most DRAM units is a modestly more efficient access
+pattern. These changes are a clear performance improvement for
+VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
+test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
+pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
+ 1 file changed, 265 insertions(+), 73 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+	(different number of sections)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index c475fed4..3e2dd6bc 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -32,7 +32,16 @@
+       overlapping addresses.
+    6. If size >= __x86_shared_non_temporal_threshold and there is no
+       overlap between destination and source, use non-temporal store
+-      instead of aligned store.  */
++      instead of aligned store copying from either 2 or 4 pages at
++      once.
++   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
++      and source and destination do not page alias, copy from 2 pages
++      at once using non-temporal stores. Page aliasing in this case is
++      considered true if destination's page alignment - sources' page
++      alignment is less than 8 * VEC_SIZE.
++   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
++      and destination do page alias copy from 4 pages at once using
++      non-temporal stores.  */
+ 
+ #include <sysdep.h>
+ 
+@@ -64,6 +73,34 @@
+ # endif
+ #endif
+ 
++#ifndef PAGE_SIZE
++# define PAGE_SIZE 4096
++#endif
++
++#if PAGE_SIZE != 4096
++# error Unsupported PAGE_SIZE
++#endif
++
++#ifndef LOG_PAGE_SIZE
++# define LOG_PAGE_SIZE 12
++#endif
++
++#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
++# error Invalid LOG_PAGE_SIZE
++#endif
++
++/* Byte per page for large_memcpy inner loop.  */
++#if VEC_SIZE == 64
++# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
++#else
++# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
++#endif
++
++/* Amount to shift rdx by to compare for memcpy_large_4x.  */
++#ifndef LOG_4X_MEMCPY_THRESH
++# define LOG_4X_MEMCPY_THRESH 4
++#endif
++
+ /* Avoid short distance rep movsb only with non-SSE vector.  */
+ #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+ # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+@@ -103,6 +140,28 @@
+ # error Unsupported PREFETCH_SIZE!
+ #endif
+ 
++#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
++# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
++	VMOVU	(offset)base, vec0; \
++	VMOVU	((offset) + VEC_SIZE)base, vec1;
++# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
++	VMOVNT  vec0, (offset)base; \
++	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
++#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
++# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
++	VMOVU	(offset)base, vec0; \
++	VMOVU	((offset) + VEC_SIZE)base, vec1; \
++	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
++	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
++# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
++	VMOVNT	vec0, (offset)base; \
++	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
++	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
++	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
++#else
++# error Invalid LARGE_LOAD_SIZE
++#endif
++
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -390,6 +449,15 @@ L(last_4x_vec):
+ 	VZEROUPPER_RETURN
+ 
+ L(more_8x_vec):
++	/* Check if non-temporal move candidate.  */
++#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
++	/* Check non-temporal store threshold.  */
++	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
++	ja	L(large_memcpy_2x)
++#endif
++	/* Entry if rdx is greater than non-temporal threshold but there
++       is overlap.  */
++L(more_8x_vec_check):
+ 	cmpq	%rsi, %rdi
+ 	ja	L(more_8x_vec_backward)
+ 	/* Source == destination is less common.  */
+@@ -416,24 +484,21 @@ L(more_8x_vec):
+ 	subq	%r8, %rdi
+ 	/* Adjust length.  */
+ 	addq	%r8, %rdx
+-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-	/* Check non-temporal store threshold.  */
+-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+-	ja	L(large_forward)
+-#endif
++
++	.p2align 4
+ L(loop_4x_vec_forward):
+ 	/* Copy 4 * VEC a time forward.  */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+-	addq	$(VEC_SIZE * 4), %rsi
+-	subq	$(VEC_SIZE * 4), %rdx
++	subq	$-(VEC_SIZE * 4), %rsi
++	addq	$-(VEC_SIZE * 4), %rdx
+ 	VMOVA	%VEC(0), (%rdi)
+ 	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+ 	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+ 	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+-	addq	$(VEC_SIZE * 4), %rdi
++	subq	$-(VEC_SIZE * 4), %rdi
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	ja	L(loop_4x_vec_forward)
+ 	/* Store the last 4 * VEC.  */
+@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
+ 	subq	%r8, %r9
+ 	/* Adjust length.  */
+ 	subq	%r8, %rdx
+-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-	/* Check non-temporal store threshold.  */
+-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+-	ja	L(large_backward)
+-#endif
++
++	.p2align 4
+ L(loop_4x_vec_backward):
+ 	/* Copy 4 * VEC a time backward.  */
+ 	VMOVU	(%rcx), %VEC(0)
+ 	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+ 	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+ 	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+-	subq	$(VEC_SIZE * 4), %rcx
+-	subq	$(VEC_SIZE * 4), %rdx
++	addq	$-(VEC_SIZE * 4), %rcx
++	addq	$-(VEC_SIZE * 4), %rdx
+ 	VMOVA	%VEC(0), (%r9)
+ 	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+ 	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+ 	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
+-	subq	$(VEC_SIZE * 4), %r9
++	addq	$-(VEC_SIZE * 4), %r9
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	ja	L(loop_4x_vec_backward)
+ 	/* Store the first 4 * VEC.  */
+@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
+ 	VZEROUPPER_RETURN
+ 
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-L(large_forward):
++	.p2align 4
++L(large_memcpy_2x):
++	/* Compute absolute value of difference between source and
++	   destination.  */
++	movq	%rdi, %r9
++	subq	%rsi, %r9
++	movq	%r9, %r8
++	leaq	-1(%r9), %rcx
++	sarq	$63, %r8
++	xorq	%r8, %r9
++	subq	%r8, %r9
+ 	/* Don't use non-temporal store if there is overlap between
+-	   destination and source since destination may be in cache
+-	   when source is loaded.  */
+-	leaq    (%rdi, %rdx), %r10
+-	cmpq    %r10, %rsi
+-	jb	L(loop_4x_vec_forward)
+-L(loop_large_forward):
++	   destination and source since destination may be in cache when
++	   source is loaded.  */
++	cmpq	%r9, %rdx
++	ja	L(more_8x_vec_check)
++
++	/* Cache align destination. First store the first 64 bytes then
++	   adjust alignments.  */
++	VMOVU	(%rsi), %VEC(8)
++#if VEC_SIZE < 64
++	VMOVU	VEC_SIZE(%rsi), %VEC(9)
++#if VEC_SIZE < 32
++	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
++	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
++#endif
++#endif
++	VMOVU	%VEC(8), (%rdi)
++#if VEC_SIZE < 64
++	VMOVU	%VEC(9), VEC_SIZE(%rdi)
++#if VEC_SIZE < 32
++	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
++	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
++#endif
++#endif
++	/* Adjust source, destination, and size.  */
++	movq	%rdi, %r8
++	andq	$63, %r8
++	/* Get the negative of offset for alignment.  */
++	subq	$64, %r8
++	/* Adjust source.  */
++	subq	%r8, %rsi
++	/* Adjust destination which should be aligned now.  */
++	subq	%r8, %rdi
++	/* Adjust length.  */
++	addq	%r8, %rdx
++
++	/* Test if source and destination addresses will alias. If they do
++	   the larger pipeline in large_memcpy_4x alleviated the
++	   performance drop.  */
++	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
++	jz	L(large_memcpy_4x)
++
++	movq	%rdx, %r10
++	shrq	$LOG_4X_MEMCPY_THRESH, %r10
++	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
++	jae	L(large_memcpy_4x)
++
++	/* edx will store remainder size for copying tail.  */
++	andl	$(PAGE_SIZE * 2 - 1), %edx
++	/* r10 stores outer loop counter.  */
++	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
++	/* Copy 4x VEC at a time from 2 pages.  */
++	.p2align 4
++L(loop_large_memcpy_2x_outer):
++	/* ecx stores inner loop counter.  */
++	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
++L(loop_large_memcpy_2x_inner):
++	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
++	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
++	/* Load vectors from rsi.  */
++	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
++	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
++	subq	$-LARGE_LOAD_SIZE, %rsi
++	/* Non-temporal store vectors to rdi.  */
++	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
++	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
++	subq	$-LARGE_LOAD_SIZE, %rdi
++	decl	%ecx
++	jnz	L(loop_large_memcpy_2x_inner)
++	addq	$PAGE_SIZE, %rdi
++	addq	$PAGE_SIZE, %rsi
++	decq	%r10
++	jne	L(loop_large_memcpy_2x_outer)
++	sfence
++
++	/* Check if only last 4 loads are needed.  */
++	cmpl	$(VEC_SIZE * 4), %edx
++	jbe	L(large_memcpy_2x_end)
++
++	/* Handle the last 2 * PAGE_SIZE bytes.  */
++L(loop_large_memcpy_2x_tail):
+ 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
++	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+-	addq	$PREFETCHED_LOAD_SIZE, %rsi
+-	subq	$PREFETCHED_LOAD_SIZE, %rdx
+-	VMOVNT	%VEC(0), (%rdi)
+-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
+-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
+-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
+-	addq	$PREFETCHED_LOAD_SIZE, %rdi
+-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+-	ja	L(loop_large_forward)
+-	sfence
++	subq	$-(VEC_SIZE * 4), %rsi
++	addl	$-(VEC_SIZE * 4), %edx
++	VMOVA	%VEC(0), (%rdi)
++	VMOVA	%VEC(1), VEC_SIZE(%rdi)
++	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
++	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
++	subq	$-(VEC_SIZE * 4), %rdi
++	cmpl	$(VEC_SIZE * 4), %edx
++	ja	L(loop_large_memcpy_2x_tail)
++
++L(large_memcpy_2x_end):
+ 	/* Store the last 4 * VEC.  */
+-	VMOVU	%VEC(5), (%rcx)
+-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+-	/* Store the first VEC.  */
+-	VMOVU	%VEC(4), (%r11)
++	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
++	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
++	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
++	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
++
++	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
++	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
++	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
++	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ 
+-L(large_backward):
+-	/* Don't use non-temporal store if there is overlap between
+-	   destination and source since destination may be in cache
+-	   when source is loaded.  */
+-	leaq    (%rcx, %rdx), %r10
+-	cmpq    %r10, %r9
+-	jb	L(loop_4x_vec_backward)
+-L(loop_large_backward):
+-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
+-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
+-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
+-	VMOVU	(%rcx), %VEC(0)
+-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+-	subq	$PREFETCHED_LOAD_SIZE, %rcx
+-	subq	$PREFETCHED_LOAD_SIZE, %rdx
+-	VMOVNT	%VEC(0), (%r9)
+-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
+-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
+-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
+-	subq	$PREFETCHED_LOAD_SIZE, %r9
+-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+-	ja	L(loop_large_backward)
++	.p2align 4
++L(large_memcpy_4x):
++	movq	%rdx, %r10
++	/* edx will store remainder size for copying tail.  */
++	andl	$(PAGE_SIZE * 4 - 1), %edx
++	/* r10 stores outer loop counter.  */
++	shrq	$(LOG_PAGE_SIZE + 2), %r10
++	/* Copy 4x VEC at a time from 4 pages.  */
++	.p2align 4
++L(loop_large_memcpy_4x_outer):
++	/* ecx stores inner loop counter.  */
++	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
++L(loop_large_memcpy_4x_inner):
++	/* Only one prefetch set per page as doing 4 pages give more time
++	   for prefetcher to keep up.  */
++	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
++	/* Load vectors from rsi.  */
++	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
++	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
++	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
++	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
++	subq	$-LARGE_LOAD_SIZE, %rsi
++	/* Non-temporal store vectors to rdi.  */
++	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
++	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
++	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
++	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
++	subq	$-LARGE_LOAD_SIZE, %rdi
++	decl	%ecx
++	jnz	L(loop_large_memcpy_4x_inner)
++	addq	$(PAGE_SIZE * 3), %rdi
++	addq	$(PAGE_SIZE * 3), %rsi
++	decq	%r10
++	jne	L(loop_large_memcpy_4x_outer)
+ 	sfence
+-	/* Store the first 4 * VEC.  */
+-	VMOVU	%VEC(4), (%rdi)
+-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+-	/* Store the last VEC.  */
+-	VMOVU	%VEC(8), (%r11)
++	/* Check if only last 4 loads are needed.  */
++	cmpl	$(VEC_SIZE * 4), %edx
++	jbe	L(large_memcpy_4x_end)
++
++	/* Handle the last 4  * PAGE_SIZE bytes.  */
++L(loop_large_memcpy_4x_tail):
++	/* Copy 4 * VEC a time forward with non-temporal stores.  */
++	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
++	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
++	VMOVU	(%rsi), %VEC(0)
++	VMOVU	VEC_SIZE(%rsi), %VEC(1)
++	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
++	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
++	subq	$-(VEC_SIZE * 4), %rsi
++	addl	$-(VEC_SIZE * 4), %edx
++	VMOVA	%VEC(0), (%rdi)
++	VMOVA	%VEC(1), VEC_SIZE(%rdi)
++	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
++	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
++	subq	$-(VEC_SIZE * 4), %rdi
++	cmpl	$(VEC_SIZE * 4), %edx
++	ja	L(loop_large_memcpy_4x_tail)
++
++L(large_memcpy_4x_end):
++	/* Store the last 4 * VEC.  */
++	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
++	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
++	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
++	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
++
++	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
++	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
++	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
++	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+ END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-4.patch b/glibc-RHEL-15696-4.patch
new file mode 100644
index 0000000..531c171
--- /dev/null
+++ b/glibc-RHEL-15696-4.patch
@@ -0,0 +1,151 @@
+From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:29:58 -0800
+Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ#
+ 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memrchr for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/memrchr.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr.
+	* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file.
+---
+ sysdeps/x86_64/memrchr.S                |  4 +-
+ sysdeps/x86_64/multiarch/memrchr-avx2.S |  4 +-
+ sysdeps/x86_64/x32/Makefile             |  3 +-
+ sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++
+ 4 files changed, 63 insertions(+), 5 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
+index b8e3fa1d..dc82f8f7 100644
+--- a/sysdeps/x86_64/memrchr.S
++++ b/sysdeps/x86_64/memrchr.S
+@@ -24,13 +24,13 @@
+ ENTRY (__memrchr)
+ 	movd	%esi, %xmm1
+ 
+-	sub	$16, %rdx
++	sub	$16, %RDX_LP
+ 	jbe	L(length_less16)
+ 
+ 	punpcklbw	%xmm1, %xmm1
+ 	punpcklbw	%xmm1, %xmm1
+ 
+-	add	%rdx, %rdi
++	add	%RDX_LP, %RDI_LP
+ 	pshufd	$0, %xmm1, %xmm1
+ 
+ 	movdqu	(%rdi), %xmm0
+diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+index b41a58bc..ce488dd9 100644
+--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
+ 	vmovd	%esi, %xmm0
+ 	vpbroadcastb %xmm0, %ymm0
+ 
+-	subq	$VEC_SIZE, %rdx
++	sub	$VEC_SIZE, %RDX_LP
+ 	jbe	L(last_vec_or_less)
+ 
+-	addq	%rdx, %rdi
++	add	%RDX_LP, %RDI_LP
+ 
+ 	/* Check the last VEC_SIZE bytes.  */
+ 	vpcmpeqb (%rdi), %ymm0, %ymm1
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 2fe1e5ac..e99dbd7c 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
++tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
++	 tst-size_t-memrchr
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
+new file mode 100644
+index 00000000..c83699c0
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
+@@ -0,0 +1,57 @@
++/* Test memrchr with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_NAME "memrchr"
++#include "test-size_t.h"
++
++IMPL (memrchr, 1)
++
++typedef void * (*proto_t) (const void *, int, size_t);
++
++static void *
++__attribute__ ((noinline, noclone))
++do_memrchr (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t src = { { page_size }, buf2 };
++  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      c.fn = impl->fn;
++      void * res = do_memrchr (src, c);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %p != NULL",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-40.patch b/glibc-RHEL-15696-40.patch
new file mode 100644
index 0000000..7b7c07b
--- /dev/null
+++ b/glibc-RHEL-15696-40.patch
@@ -0,0 +1,92 @@
+From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 19 Apr 2021 10:45:07 -0700
+Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+Since strchr-avx2.S updated by
+
+commit 1f745ecc2109890886b161d4791e1406fdfc29b8
+Author: noah <goldstein.w.n@gmail.com>
+Date:   Wed Feb 3 00:38:59 2021 -0500
+
+    x86-64: Refactor and improve performance of strchr-avx2.S
+
+uses sarx:
+
+c4 e2 72 f7 c0       	sarx   %ecx,%eax,%eax
+
+for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
+ifunc-avx2.h.
+---
+ sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
+ 2 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+index e0f30e61..ef72b73f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 695cdba6..85b8863a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strchr.c.  */
+   IFUNC_IMPL (i, name, strchr,
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strchr_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */
+   IFUNC_IMPL (i, name, strchrnul,
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strchrnul_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strchrnul_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
+   IFUNC_IMPL (i, name, wcschr,
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+-			      CPU_FEATURE_USABLE (AVX2),
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcschr_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcschr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-41.patch b/glibc-RHEL-15696-41.patch
new file mode 100644
index 0000000..aa8fc69
--- /dev/null
+++ b/glibc-RHEL-15696-41.patch
@@ -0,0 +1,265 @@
+From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 17:48:10 -0400
+Subject: [PATCH] x86: Optimize less_vec evex and avx512
+ memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit adds an optimized path for the less_vec memset case
+that uses the avx512vl/avx512bw mask store, avoiding the excessive
+branches. test-memset and test-wmemset are passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 40 ++++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-memset.h       |  6 ++-
+ .../multiarch/memset-avx512-unaligned-erms.S  |  2 +-
+ .../multiarch/memset-evex-unaligned-erms.S    |  2 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 51 +++++++++++++++----
+ 5 files changed, 74 insertions(+), 27 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 85b8863a..d59d65f8 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_chk_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wmemset_avx2_unaligned_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512VL),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512VL),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_avx512_unaligned))
+ 
+ #ifdef SHARED
+@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_chk_avx2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+-			      CPU_FEATURE_USABLE (AVX512VL),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_chk_avx512_unaligned))
+ #endif
+ 
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 19795938..100e3707 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ 	    return OPTIMIZE (avx512_unaligned_erms);
+@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ 	    return OPTIMIZE (evex_unaligned_erms);
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 22e7b187..8ad842fc 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -19,6 +19,6 @@
+ # define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
+-
++# define USE_LESS_VEC_MASK_STORE	1
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index ae0a4d6e..640f0929 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -19,6 +19,6 @@
+ # define SECTION(p)		p##.evex
+ # define MEMSET_SYMBOL(p,s)	p##_evex_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_evex_##s
+-
++# define USE_LESS_VEC_MASK_STORE	1
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index bae5cba4..f877ac9d 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -63,6 +63,8 @@
+ # endif
+ #endif
+ 
++#define PAGE_SIZE 4096
++
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -213,11 +215,38 @@ L(loop):
+ 	cmpq	%rcx, %rdx
+ 	jne	L(loop)
+ 	VZEROUPPER_SHORT_RETURN
++
++	.p2align 4
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+ # endif
++# ifdef USE_LESS_VEC_MASK_STORE
++	/* Clear high bits from edi. Only keeping bits relevant to page
++	   cross check. Note that we are using rax which is set in
++	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
++	 */
++	andl	$(PAGE_SIZE - 1), %edi
++	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
++	   performance degradation when it has to fault supress.  */
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
++	ja	L(cross_page)
++# if VEC_SIZE > 32
++	movq	$-1, %rcx
++	bzhiq	%rdx, %rcx, %rcx
++	kmovq	%rcx, %k1
++# else
++	movl	$-1, %ecx
++	bzhil	%edx, %ecx, %ecx
++	kmovd	%ecx, %k1
++# endif
++	vmovdqu8	%VEC(0), (%rax) {%k1}
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(cross_page):
++# endif
+ # if VEC_SIZE > 32
+ 	cmpb	$32, %dl
+ 	jae	L(between_32_63)
+@@ -234,36 +263,36 @@ L(less_vec):
+ 	cmpb	$1, %dl
+ 	ja	L(between_2_3)
+ 	jb	1f
+-	movb	%cl, (%rdi)
++	movb	%cl, (%rax)
+ 1:
+ 	VZEROUPPER_RETURN
+ # if VEC_SIZE > 32
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, -32(%rdi,%rdx)
+-	VMOVU	%YMM0, (%rdi)
++	VMOVU	%YMM0, -32(%rax,%rdx)
++	VMOVU	%YMM0, (%rax)
+ 	VZEROUPPER_RETURN
+ # endif
+ # if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	VMOVU	%XMM0, -16(%rdi,%rdx)
+-	VMOVU	%XMM0, (%rdi)
++	VMOVU	%XMM0, -16(%rax,%rdx)
++	VMOVU	%XMM0, (%rax)
+ 	VZEROUPPER_RETURN
+ # endif
+ 	/* From 8 to 15.  No branch when size == 8.  */
+ L(between_8_15):
+-	movq	%rcx, -8(%rdi,%rdx)
+-	movq	%rcx, (%rdi)
++	movq	%rcx, -8(%rax,%rdx)
++	movq	%rcx, (%rax)
+ 	VZEROUPPER_RETURN
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%ecx, -4(%rdi,%rdx)
+-	movl	%ecx, (%rdi)
++	movl	%ecx, -4(%rax,%rdx)
++	movl	%ecx, (%rax)
+ 	VZEROUPPER_RETURN
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%cx, -2(%rdi,%rdx)
+-	movw	%cx, (%rdi)
++	movw	%cx, -2(%rax,%rdx)
++	movw	%cx, (%rax)
+ 	VZEROUPPER_RETURN
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-42.patch b/glibc-RHEL-15696-42.patch
new file mode 100644
index 0000000..e2ca245
--- /dev/null
+++ b/glibc-RHEL-15696-42.patch
@@ -0,0 +1,396 @@
+From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 23 Apr 2021 15:56:24 -0400
+Subject: [PATCH] x86: Optimize strchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strchr-avx2.S. The optimizations are all
+small things such as save an ALU in the alignment process, saving a
+few instructions in the loop return, saving some bytes in the main
+loop, and increasing the ILP in the return cases. test-strchr,
+test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++----------
+ 1 file changed, 170 insertions(+), 120 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strchr-avx2.S
+	(rearranged to account for branch changes)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index 919d256c..5884726b 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -49,133 +49,144 @@
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCHR)
+-	movl	%edi, %ecx
+-# ifndef USE_AS_STRCHRNUL
+-	xorl	%edx, %edx
+-# endif
+-
+ 	/* Broadcast CHAR to YMM0.	*/
+ 	vmovd	%esi, %xmm0
++	movl	%edi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	VPBROADCAST	%xmm0, %ymm0
+ 	vpxor	%xmm9, %xmm9, %xmm9
+-	VPBROADCAST %xmm0, %ymm0
+ 
+ 	/* Check if we cross page boundary with one vector load.  */
+-	andl	$(PAGE_SIZE - 1), %ecx
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+-	ja  L(cross_page_boundary)
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
+ 	   null byte.  */
+ 	vmovdqu	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jz	L(more_vecs)
++	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
++# ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
++# endif
+ 	addq	%rdi, %rax
++	VZEROUPPER_RETURN
++
++	/* .p2align 5 helps keep performance more consistent if ENTRY()
++	   alignment % 32 was either 16 or 0. As well this makes the
++	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
++	   easier.  */
++	.p2align 5
++L(first_vec_x4):
++	tzcntl	%eax, %eax
++	addq	$(VEC_SIZE * 3 + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
+ # endif
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
+-
+-	.p2align 4
+-L(more_vecs):
+-	/* Align data for aligned loads in the loop.  */
+-	andq	$-VEC_SIZE, %rdi
+-L(aligned_more):
+-
+-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.	*/
+-	vmovdqa	VEC_SIZE(%rdi), %ymm8
+-	addq	$VEC_SIZE, %rdi
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	vmovdqa	VEC_SIZE(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jz	L(prep_loop_4x)
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+-	tzcntl	%eax, %eax
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++L(zero):
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
+ # endif
+-	VZEROUPPER
+-	ret
++
+ 
+ 	.p2align 4
+-L(first_vec_x0):
++L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-	addq	%rdi, %rax
++	incq	%rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
+ # endif
++	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x1):
++L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-	leaq	VEC_SIZE(%rdi, %rax), %rax
++	addq	$(VEC_SIZE + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
+ # endif
++	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
++L(first_vec_x3):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++	addq	$(VEC_SIZE * 2 + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
+ # endif
++	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+-L(prep_loop_4x):
+-	/* Align data to 4 * VEC_SIZE.	*/
+-	andq	$-(VEC_SIZE * 4), %rdi
++	.p2align 4
++L(aligned_more):
++	/* Align data to VEC_SIZE - 1. This is the same number of
++	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
++	   on x4 check.  */
++	orq	$(VEC_SIZE - 1), %rdi
++L(cross_page_continue):
++	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
++	   since data is only aligned to VEC_SIZE.  */
++	vmovdqa	1(%rdi), %ymm8
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
++	vpor	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
++	vpor	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x2)
++
++	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
++	vpor	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x3)
+ 
++	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
++	vpor	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x4)
++	/* Align data to VEC_SIZE * 4 - 1.	*/
++	addq	$(VEC_SIZE * 4 + 1), %rdi
++	andq	$-(VEC_SIZE * 4), %rdi
+ 	.p2align 4
+ L(loop_4x_vec):
+ 	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+-	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+-	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+-	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
++	vmovdqa	(%rdi), %ymm5
++	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
++	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
++	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+ 
+ 	/* Leaves only CHARS matching esi as 0.	 */
+ 	vpxor	%ymm5, %ymm0, %ymm1
+@@ -191,63 +202,102 @@ L(loop_4x_vec):
+ 	VPMINU	%ymm1, %ymm2, %ymm5
+ 	VPMINU	%ymm3, %ymm4, %ymm6
+ 
+-	VPMINU	%ymm5, %ymm6, %ymm5
++	VPMINU	%ymm5, %ymm6, %ymm6
+ 
+-	VPCMPEQ %ymm5, %ymm9, %ymm5
+-	vpmovmskb %ymm5, %eax
++	VPCMPEQ	%ymm6, %ymm9, %ymm6
++	vpmovmskb %ymm6, %ecx
++	subq	$-(VEC_SIZE * 4), %rdi
++	testl	%ecx, %ecx
++	jz	L(loop_4x_vec)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-	testl	%eax, %eax
+-	jz  L(loop_4x_vec)
+ 
+-	VPCMPEQ %ymm1, %ymm9, %ymm1
++	VPCMPEQ	%ymm1, %ymm9, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	jnz	L(last_vec_x0)
++
+ 
+-	VPCMPEQ %ymm2, %ymm9, %ymm2
++	VPCMPEQ	%ymm5, %ymm9, %ymm2
+ 	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
++	jnz	L(last_vec_x1)
++
++	VPCMPEQ	%ymm3, %ymm9, %ymm3
++	vpmovmskb %ymm3, %eax
++	/* rcx has combined result from all 4 VEC. It will only be used
++	   if the first 3 other VEC all did not contain a match.  */
++	salq	$32, %rcx
++	orq	%rcx, %rax
++	tzcntq	%rax, %rax
++	subq	$(VEC_SIZE * 2), %rdi
++# ifndef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero_end)
++# endif
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
++
++
++	.p2align 4
++L(last_vec_x0):
++	tzcntl	%eax, %eax
++	addq	$-(VEC_SIZE * 4), %rdi
++# ifndef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero_end)
++# endif
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
+ 
+-	VPCMPEQ %ymm3, %ymm9, %ymm3
+-	VPCMPEQ %ymm4, %ymm9, %ymm4
+-	vpmovmskb %ymm3, %ecx
+-	vpmovmskb %ymm4, %eax
+-	salq	$32, %rax
+-	orq %rcx, %rax
+-	tzcntq  %rax, %rax
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++L(zero_end):
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
+ # endif
+-	VZEROUPPER
+-	ret
++
++	.p2align 4
++L(last_vec_x1):
++	tzcntl	%eax, %eax
++	subq	$(VEC_SIZE * 3), %rdi
++# ifndef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero_end)
++# endif
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
++
+ 
+ 	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+ L(cross_page_boundary):
+-	andq	$-VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-
+-	vmovdqa	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
++	movq	%rdi, %rdx
++	/* Align rdi to VEC_SIZE - 1.  */
++	orq	$(VEC_SIZE - 1), %rdi
++	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
++	VPCMPEQ	%ymm8, %ymm0, %ymm1
++	VPCMPEQ	%ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bits.	 */
+-	sarxl	%ecx, %eax, %eax
++	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
++	   so no need to manually mod edx.  */
++	sarxl	%edx, %eax, %eax
+ 	testl	%eax, %eax
+-	jz	L(aligned_more)
++	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rdi
+-	addq	%rdi, %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	xorl	%ecx, %ecx
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdx, %rax), %CHAR_REG
++	leaq	(%rdx, %rax), %rax
++	cmovne	%rcx, %rax
++# else
++	addq	%rdx, %rax
+ # endif
+-	VZEROUPPER_RETURN
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ END (STRCHR)
+ # endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-43.patch b/glibc-RHEL-15696-43.patch
new file mode 100644
index 0000000..9f76b11
--- /dev/null
+++ b/glibc-RHEL-15696-43.patch
@@ -0,0 +1,532 @@
+From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 23 Apr 2021 15:56:25 -0400
+Subject: [PATCH] x86: Optimize strchr-evex.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strchr-evex.S. The optimizations are
+mostly small things such as save an ALU in the alignment process,
+saving a few instructions in the loop return. The one significant
+change is saving 2 instructions in the 4x loop. test-strchr,
+test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
+ 1 file changed, 218 insertions(+), 174 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
+index ddc86a70..7f9d4ee4 100644
+--- a/sysdeps/x86_64/multiarch/strchr-evex.S
++++ b/sysdeps/x86_64/multiarch/strchr-evex.S
+@@ -32,13 +32,15 @@
+ #  define VPCMP		vpcmpd
+ #  define VPMINU	vpminud
+ #  define CHAR_REG	esi
+-#  define SHIFT_REG	r8d
++#  define SHIFT_REG	ecx
++#  define CHAR_SIZE	4
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMP		vpcmpb
+ #  define VPMINU	vpminub
+ #  define CHAR_REG	sil
+-#  define SHIFT_REG	ecx
++#  define SHIFT_REG	edx
++#  define CHAR_SIZE	1
+ # endif
+ 
+ # define XMMZERO	xmm16
+@@ -56,23 +58,20 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (STRCHR)
+-	movl	%edi, %ecx
+-# ifndef USE_AS_STRCHRNUL
+-	xorl	%edx, %edx
+-# endif
+-
+ 	/* Broadcast CHAR to YMM0.	*/
+-	VPBROADCAST %esi, %YMM0
+-
++	VPBROADCAST	%esi, %YMM0
++	movl	%edi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+ 
+-	/* Check if we cross page boundary with one vector load.  */
+-	andl	$(PAGE_SIZE - 1), %ecx
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+-	ja  L(cross_page_boundary)
++	/* Check if we cross page boundary with one vector load.
++	   Otherwise it is safe to use an unaligned load.  */
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ 	   null bytes.  */
+@@ -83,251 +82,296 @@ ENTRY (STRCHR)
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ 	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	ktestd	%k0, %k0
+-	jz	L(more_vecs)
+ 	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
++	 */
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+ 	addq	%rdi, %rax
+ # endif
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rax), %CHAR_REG
++	jne	L(zero)
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(more_vecs):
+-	/* Align data for aligned loads in the loop.  */
+-	andq	$-VEC_SIZE, %rdi
+-L(aligned_more):
+-
+-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.	*/
+-	VMOVA	VEC_SIZE(%rdi), %YMM1
+-	addq	$VEC_SIZE, %rdi
+-
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	VMOVA	VEC_SIZE(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-
+-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	ktestd	%k0, %k0
+-	jz	L(prep_loop_4x)
+-
+-	kmovd	%k0, %eax
++	/* .p2align 5 helps keep performance more consistent if ENTRY()
++	   alignment % 32 was either 16 or 0. As well this makes the
++	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
++	   easier.  */
++	.p2align 5
++L(first_vec_x3):
+ 	tzcntl	%eax, %eax
++# ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+-# else
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
++	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero)
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
++
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+-# endif
++L(zero):
++	xorl	%eax, %eax
+ 	ret
++# endif
+ 
+ 	.p2align 4
+-L(first_vec_x0):
++L(first_vec_x4):
++# ifndef USE_AS_STRCHRNUL
++	/* Check to see if first match was CHAR (k0) or null (k1).  */
++	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
++	kmovd	%k1, %ecx
++	/* bzhil will not be 0 if first match was null.  */
++	bzhil	%eax, %ecx, %ecx
++	jne	L(zero)
+ # else
+-	addq	%rdi, %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Combine CHAR and null matches.  */
++	kord	%k0, %k1, %k0
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	leaq	VEC_SIZE(%rdi, %rax), %rax
+-# endif
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Found CHAR or the null byte.	 */
++	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero)
++
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+ L(first_vec_x2):
++# ifndef USE_AS_STRCHRNUL
++	/* Check to see if first match was CHAR (k0) or null (k1).  */
++	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
++	kmovd	%k1, %ecx
++	/* bzhil will not be 0 if first match was null.  */
++	bzhil	%eax, %ecx, %ecx
++	jne	L(zero)
+ # else
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Combine CHAR and null matches.  */
++	kord	%k0, %k1, %k0
++	kmovd	%k0, %eax
++	tzcntl	%eax, %eax
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+-L(prep_loop_4x):
+-	/* Align data to 4 * VEC_SIZE.	*/
++	.p2align 4
++L(aligned_more):
++	/* Align data to VEC_SIZE.  */
++	andq	$-VEC_SIZE, %rdi
++L(cross_page_continue):
++	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
++	   data is only aligned to VEC_SIZE. Use two alternating methods
++	   for checking VEC to balance latency and port contention.  */
++
++	/* This method has higher latency but has better port
++	   distribution.  */
++	VMOVA	(VEC_SIZE)(%rdi), %YMM1
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x1)
++
++	/* This method has higher latency but has better port
++	   distribution.  */
++	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
++	/* Each bit in K0 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMM1, %YMM0, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMM1, %YMMZERO, %k1
++	kortestd	%k0, %k1
++	jnz	L(first_vec_x2)
++
++	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
++	/* Leaves only CHARS matching esi as 0.  */
++	vpxorq	%YMM1, %YMM0, %YMM2
++	VPMINU	%YMM2, %YMM1, %YMM2
++	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(first_vec_x3)
++
++	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
++	/* Each bit in K0 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMM1, %YMM0, %k0
++	/* Each bit in K1 represents a CHAR in YMM1.  */
++	VPCMP	$0, %YMM1, %YMMZERO, %k1
++	kortestd	%k0, %k1
++	jnz	L(first_vec_x4)
++
++	/* Align data to VEC_SIZE * 4 for the loop.  */
++	addq	$VEC_SIZE, %rdi
+ 	andq	$-(VEC_SIZE * 4), %rdi
+ 
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
++	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
++	   encoding.  */
+ 	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+ 	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
+ 	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
+ 	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
+ 
+-	/* Leaves only CHARS matching esi as 0.  */
++	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
++	   zero.  */
+ 	vpxorq	%YMM1, %YMM0, %YMM5
+-	vpxorq	%YMM2, %YMM0, %YMM6
++	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
++	   k register. Its possible to save either 1 or 2 instructions
++	   using cmp no equals method for either YMM1 or YMM1 and YMM3
++	   respectively but bottleneck on p5 makes it not worth it.  */
++	VPCMP	$4, %YMM0, %YMM2, %k2
+ 	vpxorq	%YMM3, %YMM0, %YMM7
+-	vpxorq	%YMM4, %YMM0, %YMM8
+-
+-	VPMINU	%YMM5, %YMM1, %YMM5
+-	VPMINU	%YMM6, %YMM2, %YMM6
+-	VPMINU	%YMM7, %YMM3, %YMM7
+-	VPMINU	%YMM8, %YMM4, %YMM8
+-
+-	VPMINU	%YMM5, %YMM6, %YMM1
+-	VPMINU	%YMM7, %YMM8, %YMM2
+-
+-	VPMINU	%YMM1, %YMM2, %YMM1
+-
+-	/* Each bit in K0 represents a CHAR or a null byte.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-	ktestd	%k0, %k0
++	VPCMP	$4, %YMM0, %YMM4, %k4
++
++	/* Use min to select all zeros from either xor or end of string).
++	 */
++	VPMINU	%YMM1, %YMM5, %YMM1
++	VPMINU	%YMM3, %YMM7, %YMM3
++
++	/* Use min + zeromask to select for zeros. Since k2 and k4 will
++	   have 0 as positions that matched with CHAR which will set
++	   zero in the corresponding destination bytes in YMM2 / YMM4.
++	 */
++	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
++	VPMINU	%YMM3, %YMM4, %YMM4
++	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
++
++	VPCMP	$0, %YMMZERO, %YMM4, %k1
++	kmovd	%k1, %ecx
++	subq	$-(VEC_SIZE * 4), %rdi
++	testl	%ecx, %ecx
+ 	jz	L(loop_4x_vec)
+ 
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM5, %k0
++	VPCMP	$0, %YMMZERO, %YMM1, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
++	jnz	L(last_vec_x1)
+ 
+-	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
+-	VPCMP	$0, %YMMZERO, %YMM6, %k1
+-	kmovd	%k1, %eax
++	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
+-	VPCMP	$0, %YMMZERO, %YMM7, %k2
+-	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
+-	VPCMP	$0, %YMMZERO, %YMM8, %k3
++	jnz	L(last_vec_x2)
+ 
++	VPCMP	$0, %YMMZERO, %YMM3, %k0
++	kmovd	%k0, %eax
++	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Each bit in K2/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k1
++	sall	$8, %ecx
++	orl	%ecx, %eax
++	tzcntl	%eax, %eax
+ # else
+-	kshiftlq $32, %k3, %k1
++	salq	$32, %rcx
++	orq	%rcx, %rax
++	tzcntq	%rax, %rax
+ # endif
++# ifndef USE_AS_STRCHRNUL
++	/* Check if match was CHAR or null.  */
++	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero_end)
++# endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rax
++# ifndef USE_AS_STRCHRNUL
++L(zero_end):
++	xorl	%eax, %eax
++	ret
++# endif
+ 
+-	tzcntq  %rax, %rax
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+-# else
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
++	.p2align 4
++L(last_vec_x1):
++	tzcntl	%eax, %eax
++# ifndef USE_AS_STRCHRNUL
++	/* Check if match was null.  */
++	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero_end)
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
++	ret
++
++	.p2align 4
++L(last_vec_x2):
++	tzcntl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	/* Check if match was null.  */
++	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero_end)
+ # endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+ L(cross_page_boundary):
++	movq	%rdi, %rdx
++	/* Align rdi.  */
+ 	andq	$-VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-
+ 	VMOVA	(%rdi), %YMM1
+-
+ 	/* Leaves only CHARS matching esi as 0.  */
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ 	VPCMP	$0, %YMMZERO, %YMM2, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-
++	/* Remove the leading bits.	 */
+ # ifdef USE_AS_WCSCHR
++	movl	%edx, %SHIFT_REG
+ 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+ 	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl    $2, %SHIFT_REG
++	sarl	$2, %SHIFT_REG
++	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
+ # endif
+-
+-	/* Remove the leading bits.	 */
+ 	sarxl	%SHIFT_REG, %eax, %eax
++	/* If eax is zero continue.  */
+ 	testl	%eax, %eax
+-
+-	jz	L(aligned_more)
++	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rdi
++# ifndef USE_AS_STRCHRNUL
++	/* Check to see if match was CHAR or null.  */
++	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero_end)
++# endif
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
++	/* NB: Multiply wchar_t count by 4 to get the number of
++	   bytes.  */
++	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+ # else
+-	addq	%rdi, %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
++	addq	%rdx, %rax
+ # endif
+ 	ret
+ 
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-44.patch b/glibc-RHEL-15696-44.patch
new file mode 100644
index 0000000..52fec88
--- /dev/null
+++ b/glibc-RHEL-15696-44.patch
@@ -0,0 +1,536 @@
+From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 4 May 2021 19:02:40 -0400
+Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+This commit adds a new implementation for EVEX memchr that is not safe
+for RTM because it uses vzeroupper. The benefit is that by using
+ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
+faster than the RTM safe version which cannot use vpcmpeq because
+there is no EVEX encoding for the instruction. All parts of the
+implementation aside from the 4x loop are the same for the two
+versions and the optimization is only relevant for large sizes.
+
+Tigerlake:
+size  , algn  , Pos   , Cur T , New T , Win     , Dif
+512   , 6     , 192   , 9.2   , 9.04  , no-RTM  , 0.16
+512   , 7     , 224   , 9.19  , 8.98  , no-RTM  , 0.21
+2048  , 0     , 256   , 10.74 , 10.54 , no-RTM  , 0.2
+2048  , 0     , 512   , 14.81 , 14.87 , RTM     , 0.06
+2048  , 0     , 1024  , 22.97 , 22.57 , no-RTM  , 0.4
+2048  , 0     , 2048  , 37.49 , 34.51 , no-RTM  , 2.98   <--
+
+Icelake:
+size  , algn  , Pos   , Cur T , New T , Win     , Dif
+512   , 6     , 192   , 7.6   , 7.3   , no-RTM  , 0.3
+512   , 7     , 224   , 7.63  , 7.27  , no-RTM  , 0.36
+2048  , 0     , 256   , 8.48  , 8.38  , no-RTM  , 0.1
+2048  , 0     , 512   , 11.57 , 11.42 , no-RTM  , 0.15
+2048  , 0     , 1024  , 17.92 , 17.38 , no-RTM  , 0.54
+2048  , 0     , 2048  , 30.37 , 27.34 , no-RTM  , 3.03   <--
+
+test-memchr, test-wmemchr, and test-rawmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile             |   7 +-
+ sysdeps/x86_64/multiarch/ifunc-evex.h         |  55 ++++++
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  15 ++
+ sysdeps/x86_64/multiarch/memchr-evex-rtm.S    |   8 +
+ sysdeps/x86_64/multiarch/memchr-evex.S        | 161 ++++++++++++++----
+ sysdeps/x86_64/multiarch/memchr.c             |   2 +-
+ sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   3 +
+ sysdeps/x86_64/multiarch/rawmemchr.c          |   2 +-
+ sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/wmemchr.c            |   2 +-
+ 10 files changed, 217 insertions(+), 41 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
+ create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 65fde4eb..26be4095 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   strncmp-evex \
+ 		   strncpy-evex \
+ 		   strnlen-evex \
+-		   strrchr-evex
++		   strrchr-evex \
++		   memchr-evex-rtm \
++		   rawmemchr-evex-rtm
+ CFLAGS-varshift.c += -msse4
+ CFLAGS-strcspn-c.c += -msse4
+ CFLAGS-strpbrk-c.c += -msse4
+@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcsnlen-evex \
+ 		   wcsrchr-evex \
+ 		   wmemchr-evex \
+-		   wmemcmp-evex-movbe
++		   wmemcmp-evex-movbe \
++		   wmemchr-evex-rtm
+ endif
+ 
+ ifeq ($(subdir),debug)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
+new file mode 100644
+index 00000000..fc391edb
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
+@@ -0,0 +1,55 @@
++/* Common definition for ifunc selection optimized with EVEX.
++   All versions must be listed in ifunc-impl-list.c.
++   Copyright (C) 2017-2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <init-arch.h>
++
++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
++
++
++static inline void *
++IFUNC_SELECTOR (void)
++{
++  const struct cpu_features* cpu_features = __get_cpu_features ();
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
++      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	    return OPTIMIZE (evex_rtm);
++
++	  return OPTIMIZE (evex);
++	}
++
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	return OPTIMIZE (avx2_rtm);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	return OPTIMIZE (avx2);
++    }
++
++  return OPTIMIZE (sse2);
++}
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index d59d65f8..ac097e8d 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memchr_evex)
++	      IFUNC_IMPL_ADD (array, i, memchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __memchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
+@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __rawmemchr_evex)
++	      IFUNC_IMPL_ADD (array, i, rawmemchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __rawmemchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemchr_evex)
++	      IFUNC_IMPL_ADD (array, i, wmemchr,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __wmemchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+new file mode 100644
+index 00000000..19871882
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+@@ -0,0 +1,8 @@
++#ifndef MEMCHR
++# define MEMCHR __memchr_evex_rtm
++#endif
++
++#define USE_IN_RTM 1
++#define SECTION(p) p##.evex.rtm
++
++#include "memchr-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index f3fdad4f..4d0ed6d1 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
++++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -38,10 +38,32 @@
+ #  define CHAR_SIZE	1
+ # endif
+ 
++	/* In the 4x loop the RTM and non-RTM versions have data pointer
++	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
++	   This is represented by BASE_OFFSET. As well because the RTM
++	   version uses vpcmp which stores a bit per element compared where
++	   the non-RTM version uses vpcmpeq which stores a bit per byte
++	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
++	   version.  */
++# ifdef USE_IN_RTM
++#  define VZEROUPPER
++#  define BASE_OFFSET	(VEC_SIZE * 4)
++#  define RET_SCALE	CHAR_SIZE
++# else
++#  define VZEROUPPER	vzeroupper
++#  define BASE_OFFSET	0
++#  define RET_SCALE	1
++# endif
++
++	/* In the return from 4x loop memchr and rawmemchr versions have
++	   data pointers off by VEC_SIZE * 4 with memchr version being
++	   VEC_SIZE * 4 greater.  */
+ # ifdef USE_AS_RAWMEMCHR
++#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
+ #  define RAW_PTR_REG	rcx
+ #  define ALGN_PTR_REG	rdi
+ # else
++#  define RET_OFFSET	BASE_OFFSET
+ #  define RAW_PTR_REG	rdi
+ #  define ALGN_PTR_REG	rcx
+ # endif
+@@ -57,11 +79,15 @@
+ # define YMM5		ymm21
+ # define YMM6		ymm22
+ 
++# ifndef SECTION
++#  define SECTION(p)	p##.evex
++# endif
++
+ # define VEC_SIZE 32
+ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ # define PAGE_SIZE 4096
+ 
+-	.section .text.evex,"ax",@progbits
++	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+@@ -237,14 +263,15 @@ L(cross_page_continue):
+ 	/* Check if at last CHAR_PER_VEC * 4 length.  */
+ 	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(last_4x_vec_or_less_cmpeq)
+-	addq	$VEC_SIZE, %rdi
++	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
++	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
+ 
+ 	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ 	 */
+ #  ifdef USE_AS_WMEMCHR
+ 	movl	%edi, %ecx
+ 	andq	$-(4 * VEC_SIZE), %rdi
+-	andl	$(VEC_SIZE * 4 - 1), %ecx
++	subl	%edi, %ecx
+ 	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+ 	sarl	$2, %ecx
+ 	addq	%rcx, %rdx
+@@ -254,15 +281,28 @@ L(cross_page_continue):
+ 	subq	%rdi, %rdx
+ #  endif
+ # else
+-	addq	$VEC_SIZE, %rdi
++	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
+ 	andq	$-(4 * VEC_SIZE), %rdi
+ # endif
+-
++# ifdef USE_IN_RTM
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
++# else
++	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
++	   encodable with EVEX registers (ymm16-ymm31).  */
++	vmovdqa64 %YMMMATCH, %ymm0
++# endif
+ 
+ 	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
++	/* Two versions of the loop. One that does not require
++	   vzeroupper by not using ymm0-ymm15 and another does that require
++	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
++	   is used at all is because there is no EVEX encoding vpcmpeq and
++	   with vpcmpeq this loop can be performed more efficiently. The
++	   non-vzeroupper version is safe for RTM while the vzeroupper
++	   version should be prefered if RTM are not supported.  */
++# ifdef USE_IN_RTM
+ 	/* It would be possible to save some instructions using 4x VPCMP
+ 	   but bottleneck on port 5 makes it not woth it.  */
+ 	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+@@ -273,12 +313,55 @@ L(loop_4x_vec):
+ 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+ 	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
+ 	VPCMP	$0, %YMM3, %YMMZERO, %k2
++# else
++	/* Since vptern can only take 3x vectors fastest to do 1 vec
++	   seperately with EVEX vpcmp.  */
++#  ifdef USE_AS_WMEMCHR
++	/* vptern can only accept masks for epi32/epi64 so can only save
++	   instruction using not equals mask on vptern with wmemchr.  */
++	VPCMP	$4, (%rdi), %YMMMATCH, %k1
++#  else
++	VPCMP	$0, (%rdi), %YMMMATCH, %k1
++#  endif
++	/* Compare 3x with vpcmpeq and or them all together with vptern.
++	 */
++	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
++	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
++	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
++#  ifdef USE_AS_WMEMCHR
++	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
++	   combines result from VEC0 with zero mask.  */
++	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
++	vpmovmskb %ymm4, %ecx
++#  else
++	/* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
++	vpternlogd $254, %ymm2, %ymm3, %ymm4
++	vpmovmskb %ymm4, %ecx
++	kmovd	%k1, %eax
++#  endif
++# endif
++
+ # ifdef USE_AS_RAWMEMCHR
+ 	subq	$-(VEC_SIZE * 4), %rdi
++# endif
++# ifdef USE_IN_RTM
+ 	kortestd %k2, %k3
++# else
++#  ifdef USE_AS_WMEMCHR
++	/* ecx contains not of matches. All 1s means no matches. incl will
++	   overflow and set zeroflag if that is the case.  */
++	incl	%ecx
++#  else
++	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
++	   to ecx is not an issue because if eax is non-zero it will be
++	   used for returning the match. If it is zero the add does
++	   nothing.  */
++	addq	%rax, %rcx
++#  endif
++# endif
++# ifdef USE_AS_RAWMEMCHR
+ 	jz	L(loop_4x_vec)
+ # else
+-	kortestd %k2, %k3
+ 	jnz	L(loop_4x_vec_end)
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+@@ -288,10 +371,11 @@ L(loop_4x_vec):
+ 
+ 	/* Fall through into less than 4 remaining vectors of length case.
+ 	 */
+-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
++	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
++	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
+ 	kmovd	%k0, %eax
+-	addq	$(VEC_SIZE * 3), %rdi
+-	.p2align 4
++	VZEROUPPER
++
+ L(last_4x_vec_or_less):
+ 	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
+ 	/* rawmemchr will fall through into this if match was found in
+ 	   loop.  */
+ 
++# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
+ 	/* k1 has not of matches with VEC1.  */
+ 	kmovd	%k1, %eax
+-# ifdef USE_AS_WMEMCHR
++#  ifdef USE_AS_WMEMCHR
+ 	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+-# else
++#  else
+ 	incl	%eax
++#  endif
++# else
++	/* eax already has matches for VEC1.  */
++	testl	%eax, %eax
+ # endif
+ 	jnz	L(last_vec_x1_return)
+ 
++# ifdef USE_IN_RTM
+ 	VPCMP	$0, %YMM2, %YMMZERO, %k0
+ 	kmovd	%k0, %eax
++# else
++	vpmovmskb %ymm2, %eax
++# endif
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x2_return)
+ 
++# ifdef USE_IN_RTM
+ 	kmovd	%k2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x3_return)
+ 
+ 	kmovd	%k3, %eax
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
++	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
++	vpmovmskb %ymm3, %eax
++	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
++	salq	$VEC_SIZE, %rcx
++	orq	%rcx, %rax
++	tzcntq	%rax, %rax
++	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
++	VZEROUPPER
+ # endif
+ 	ret
+ 
+ 	.p2align 4
+ L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-#  ifdef USE_AS_WMEMCHR
++# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
+ 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+-#  else
+-	addq	%rdi, %rax
+-#  endif
++	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
++	addq	%rdi, %rax
+ # endif
++	VZEROUPPER
+ 	ret
+ 
+ 	.p2align 4
+ L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+-# else
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
+-# endif
++	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
++	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
++	   USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
++	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
++	VZEROUPPER
+ 	ret
+ 
++# ifdef USE_IN_RTM
+ 	.p2align 4
+ L(last_vec_x3_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+-# else
+ 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
+-# endif
++	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+-
++# endif
+ 
+ # ifndef USE_AS_RAWMEMCHR
+ L(last_4x_vec_or_less_cmpeq):
+diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
+index 016f5784..f28aea77 100644
+--- a/sysdeps/x86_64/multiarch/memchr.c
++++ b/sysdeps/x86_64/multiarch/memchr.c
+@@ -24,7 +24,7 @@
+ # undef memchr
+ 
+ # define SYMBOL_NAME memchr
+-# include "ifunc-avx2.h"
++# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
+ strong_alias (memchr, __memchr)
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+new file mode 100644
+index 00000000..deda1ca3
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+@@ -0,0 +1,3 @@
++#define MEMCHR __rawmemchr_evex_rtm
++#define USE_AS_RAWMEMCHR 1
++#include "memchr-evex-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
+index 8a0bc313..1f764f35 100644
+--- a/sysdeps/x86_64/multiarch/rawmemchr.c
++++ b/sysdeps/x86_64/multiarch/rawmemchr.c
+@@ -26,7 +26,7 @@
+ # undef __rawmemchr
+ 
+ # define SYMBOL_NAME rawmemchr
+-# include "ifunc-avx2.h"
++# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
+ 		       IFUNC_SELECTOR ());
+diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+new file mode 100644
+index 00000000..a346cd35
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+@@ -0,0 +1,3 @@
++#define MEMCHR __wmemchr_evex_rtm
++#define USE_AS_WMEMCHR 1
++#include "memchr-evex-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
+index 6d833702..f9c91915 100644
+--- a/sysdeps/x86_64/multiarch/wmemchr.c
++++ b/sysdeps/x86_64/multiarch/wmemchr.c
+@@ -26,7 +26,7 @@
+ # undef __wmemchr
+ 
+ # define SYMBOL_NAME wmemchr
+-# include "ifunc-avx2.h"
++# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
+ weak_alias (__wmemchr, wmemchr)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-45.patch b/glibc-RHEL-15696-45.patch
new file mode 100644
index 0000000..380217e
--- /dev/null
+++ b/glibc-RHEL-15696-45.patch
@@ -0,0 +1,873 @@
+From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 17 May 2021 13:56:52 -0400
+Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memcmp-avx2.S. The optimizations include
+adding a new vec compare path for small sizes, reorganizing the entry
+control flow, and removing some unnecessary ALU instructions from the
+main loop. test-memcmp and test-wmemcmp are both passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c   |   6 +
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h      |   1 +
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++--------
+ 3 files changed, 402 insertions(+), 281 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index ac097e8d..8be0d78a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, memcmp,
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_avx2_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __memcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
+@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, wmemcmp,
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_avx2_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wmemcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 8043c635..690dffe8 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
++      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 9d5c9c72..16fc673e 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -19,17 +19,23 @@
+ #if IS_IN (libc)
+ 
+ /* memcmp/wmemcmp is implemented as:
+-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+-      to avoid branches.
+-   2. Use overlapping compare to avoid branch.
+-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+-      bytes for wmemcmp.
+-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
++   1. Use ymm vector compares when possible. The only case where
++      a vector compare is not possible is when size < VEC_SIZE
++      and loading from either s1 or s2 would cause a page cross.
++   2. For size from 2 to 7 bytes on page cross, load as big endian
++      with movbe and bswap to avoid branches.
++   3. Use xmm vector compare when size >= 4 bytes for memcmp or
++      size >= 8 bytes for wmemcmp.
++   4. Optimistically compare up to first 4 * VEC_SIZE one at a
++      time to check for early mismatches. Only do this if it's
++      guaranteed the work is not wasted.
++   5. If size is 8 * VEC_SIZE or less, unroll the loop.
++   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
+       area.
+-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
++   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
++   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
++   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
++
+ 
+ # include <sysdep.h>
+ 
+@@ -38,8 +44,10 @@
+ # endif
+ 
+ # ifdef USE_AS_WMEMCMP
++#  define CHAR_SIZE	4
+ #  define VPCMPEQ	vpcmpeqd
+ # else
++#  define CHAR_SIZE	1
+ #  define VPCMPEQ	vpcmpeqb
+ # endif
+ 
+@@ -52,7 +60,7 @@
+ # endif
+ 
+ # define VEC_SIZE 32
+-# define VEC_MASK ((1 << VEC_SIZE) - 1)
++# define PAGE_SIZE	4096
+ 
+ /* Warning!
+            wmemcmp has to use SIGNED comparison for elements.
+@@ -71,136 +79,359 @@ ENTRY (MEMCMP)
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++	vmovdqu	(%rsi), %ymm1
++	VPCMPEQ	(%rdi), %ymm1, %ymm1
++	vpmovmskb %ymm1, %eax
++	/* NB: eax must be destination register if going to
++	   L(return_vec_[0,2]). For L(return_vec_3) destination register
++	   must be ecx.  */
++	incl	%eax
++	jnz	L(return_vec_0)
+ 
+ 	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_vec)
+-
+-	VPCMPEQ	%ymm0, %ymm0, %ymm0
+-	/* More than 2 * VEC.  */
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	ja	L(more_8x_vec)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
+-
+-	/* From 4 * VEC to 8 * VEC, inclusively. */
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
++	jbe	L(last_1x_vec)
+ 
++	/* Check second VEC no matter what.  */
+ 	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
++	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
++	vpmovmskb %ymm2, %eax
++	/* If the VEC compared equal eax will be all 1s so incl will
++	   overflow and set zero flag.  */
++	incl	%eax
++	jnz	L(return_vec_1)
+ 
+-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
++	/* Less than 4 * VEC.  */
++	cmpq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_2x_vec)
+ 
++	/* Check third and fourth VEC no matter what.  */
++	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
++	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
++	vpmovmskb %ymm3, %eax
++	incl	%eax
++	jnz	L(return_vec_2)
+ 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
++	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
++	vpmovmskb %ymm4, %ecx
++	incl	%ecx
++	jnz	L(return_vec_3)
+ 
+-	vpand	%ymm1, %ymm2, %ymm5
+-	vpand	%ymm3, %ymm4, %ymm6
+-	vpand	%ymm5, %ymm6, %ymm5
++	/* Go to 4x VEC loop.  */
++	cmpq	$(VEC_SIZE * 8), %rdx
++	ja	L(more_8x_vec)
+ 
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
++	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
++	   branches.  */
+ 
++	/* Load first two VEC from s2 before adjusting addresses.  */
++	vmovdqu	-(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
++	vmovdqu	-(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
+ 	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ 	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
+ 
+-	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+-	vpand	%ymm2, %ymm1, %ymm5
++	/* Wait to load from s1 until addresses are adjusted due to
++	   unlamination of microfusion with complex address mode.  */
++	VPCMPEQ	(%rdi), %ymm1, %ymm1
++	VPCMPEQ	(VEC_SIZE)(%rdi), %ymm2, %ymm2
+ 
+ 	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+-	vpand	%ymm3, %ymm5, %ymm5
+-
++	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+-	vpand	%ymm4, %ymm5, %ymm5
++	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ 
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
+-	xorl	%eax, %eax
++	/* Reduce VEC0 - VEC3.  */
++	vpand	%ymm1, %ymm2, %ymm5
++	vpand	%ymm3, %ymm4, %ymm6
++	vpand	%ymm5, %ymm6, %ymm7
++	vpmovmskb %ymm7, %ecx
++	incl	%ecx
++	jnz	L(return_vec_0_1_2_3)
++	/* NB: eax must be zero to reach here.  */
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(return_vec_0):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi, %rax), %ecx
++	/* NB: no partial register stall here because xorl zero idiom
++	   above.  */
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(%rsi, %rax), %ecx
++	movzbl	(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++L(return_vec_1):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	VEC_SIZE(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	VEC_SIZE(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	VEC_SIZE(%rsi, %rax), %ecx
++	movzbl	VEC_SIZE(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(return_vec_2):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	VZEROUPPER_RETURN
++
++	/* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
++	.p2align 5
++L(8x_return_vec_0_1_2_3):
++	/* Returning from L(more_8x_vec) requires restoring rsi.  */
++	addq	%rdi, %rsi
++L(return_vec_0_1_2_3):
++	vpmovmskb %ymm1, %eax
++	incl	%eax
++	jnz	L(return_vec_0)
+ 
+-L(last_vec):
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+ 	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++	incl	%eax
++	jnz	L(return_vec_1)
++
++	vpmovmskb %ymm3, %eax
++	incl	%eax
++	jnz	L(return_vec_2)
++L(return_vec_3):
++	tzcntl	%ecx, %ecx
++# ifdef USE_AS_WMEMCMP
++	movl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %eax
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++# endif
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(more_8x_vec):
++	/* Set end of s1 in rdx.  */
++	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
++	/* rsi stores s2 - s1. This allows loop to only update one
++	   pointer.  */
++	subq	%rdi, %rsi
++	/* Align s1 pointer.  */
++	andq	$-VEC_SIZE, %rdi
++	/* Adjust because first 4x vec were checked already.  */
++	subq	$-(VEC_SIZE * 4), %rdi
++	.p2align 4
++L(loop_4x_vec):
++	/* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
++	 */
++	vmovdqu	(%rsi, %rdi), %ymm1
++	VPCMPEQ	(%rdi), %ymm1, %ymm1
++
++	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
++	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
++
++	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
++	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
++
++	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
++	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
++
++	vpand	%ymm1, %ymm2, %ymm5
++	vpand	%ymm3, %ymm4, %ymm6
++	vpand	%ymm5, %ymm6, %ymm7
++	vpmovmskb %ymm7, %ecx
++	incl	%ecx
++	jnz	L(8x_return_vec_0_1_2_3)
++	subq	$-(VEC_SIZE * 4), %rdi
++	/* Check if s1 pointer at end.  */
++	cmpq	%rdx, %rdi
++	jb	L(loop_4x_vec)
++
++	subq	%rdx, %rdi
++	/* rdi has 4 * VEC_SIZE - remaining length.  */
++	cmpl	$(VEC_SIZE * 3), %edi
++	jae	L(8x_last_1x_vec)
++	/* Load regardless of branch.  */
++	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
++	cmpl	$(VEC_SIZE * 2), %edi
++	jae	L(8x_last_2x_vec)
++
++	/* Check last 4 VEC.  */
++	vmovdqu	(%rsi, %rdx), %ymm1
++	VPCMPEQ	(%rdx), %ymm1, %ymm1
++
++	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm2
++	VPCMPEQ	VEC_SIZE(%rdx), %ymm2, %ymm2
++
++	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
++
++	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
++	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
++
++	vpand	%ymm1, %ymm2, %ymm5
++	vpand	%ymm3, %ymm4, %ymm6
++	vpand	%ymm5, %ymm6, %ymm7
++	vpmovmskb %ymm7, %ecx
++	/* Restore s1 pointer to rdi.  */
++	movq	%rdx, %rdi
++	incl	%ecx
++	jnz	L(8x_return_vec_0_1_2_3)
++	/* NB: eax must be zero to reach here.  */
++	VZEROUPPER_RETURN
++
++	/* Only entry is from L(more_8x_vec).  */
++	.p2align 4
++L(8x_last_2x_vec):
++	/* Check second to last VEC. rdx stores the end pointer of s1 and
++	   ymm3 has already been loaded with second to last VEC from s2.
++	 */
++	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
++	vpmovmskb %ymm3, %eax
++	incl	%eax
++	jnz	L(8x_return_vec_2)
++	/* Check last VEC.  */
++	.p2align 4
++L(8x_last_1x_vec):
++	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
++	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
++	vpmovmskb %ymm4, %eax
++	incl	%eax
++	jnz	L(8x_return_vec_3)
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec):
+-	/* A byte or int32 is different within 16 or 32 bytes.  */
+-	tzcntl	%eax, %ecx
++L(last_2x_vec):
++	/* Check second to last VEC.  */
++	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
++	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
++	vpmovmskb %ymm1, %eax
++	incl	%eax
++	jnz	L(return_vec_1_end)
++	/* Check last VEC.  */
++L(last_1x_vec):
++	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
++	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
++	vpmovmskb %ymm1, %eax
++	incl	%eax
++	jnz	L(return_vec_0_end)
++	VZEROUPPER_RETURN
++
++	.p2align 4
++L(8x_return_vec_2):
++	subq	$VEC_SIZE, %rdx
++L(8x_return_vec_3):
++	tzcntl	%eax, %eax
++	addq	%rdx, %rax
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rcx), %edx
+-	cmpl	(%rsi, %rcx), %edx
+-L(wmemcmp_return):
+-	setl	%al
+-	negl	%eax
+-	orl	$1, %eax
++	movl	(VEC_SIZE * 3)(%rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	(%rdi, %rcx), %eax
+-	movzbl	(%rsi, %rcx), %edx
+-	sub	%edx, %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	movzbl	(VEC_SIZE * 3)(%rax), %eax
++	subl	%ecx, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
+-# ifdef USE_AS_WMEMCMP
+ 	.p2align 4
+-L(4):
+-	xorl	%eax, %eax
+-	movl	(%rdi), %edx
+-	cmpl	(%rsi), %edx
+-	jne	L(wmemcmp_return)
+-	ret
++L(return_vec_1_end):
++	tzcntl	%eax, %eax
++	addl	%edx, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ # else
++	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	VZEROUPPER_RETURN
++
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.  */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	je	L(exit)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-	ret
++L(return_vec_0_end):
++	tzcntl	%eax, %eax
++	addl	%edx, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	-VEC_SIZE(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
++	movzbl	-VEC_SIZE(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(exit):
+-	ret
++L(less_vec):
++	/* Check if one or less CHAR. This is necessary for size = 0 but
++	   is also faster for size = CHAR_SIZE.  */
++	cmpl	$CHAR_SIZE, %edx
++	jbe	L(one_or_less)
++
++	/* Check if loading one VEC from either s1 or s2 could cause a
++	   page cross. This can have false positives but is by far the
++	   fastest method.  */
++	movl	%edi, %eax
++	orl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jg	L(page_cross_less_vec)
++
++	/* No page cross possible.  */
++	vmovdqu	(%rsi), %ymm2
++	VPCMPEQ	(%rdi), %ymm2, %ymm2
++	vpmovmskb %ymm2, %eax
++	incl	%eax
++	/* Result will be zero if s1 and s2 match. Otherwise first set
++	   bit will be first mismatch.  */
++	bzhil	%edx, %eax, %edx
++	jnz	L(return_vec_0)
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(between_2_3):
++L(page_cross_less_vec):
++	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
++	   bytes.  */
++	cmpl	$16, %edx
++	jae	L(between_16_31)
++# ifndef USE_AS_WMEMCMP
++	cmpl	$8, %edx
++	jae	L(between_8_15)
++	cmpl	$4, %edx
++	jae	L(between_4_7)
++
+ 	/* Load as big endian to avoid branches.  */
+ 	movzwl	(%rdi), %eax
+ 	movzwl	(%rsi), %ecx
+@@ -208,223 +439,106 @@ L(between_2_3):
+ 	shll	$8, %ecx
+ 	bswap	%eax
+ 	bswap	%ecx
+-	movb	-1(%rdi, %rdx), %al
+-	movb	-1(%rsi, %rdx), %cl
++	movzbl	-1(%rdi, %rdx), %edi
++	movzbl	-1(%rsi, %rdx), %esi
++	orl	%edi, %eax
++	orl	%esi, %ecx
+ 	/* Subtraction is okay because the upper 8 bits are zero.  */
+ 	subl	%ecx, %eax
++	/* No ymm register was touched.  */
+ 	ret
+ 
+ 	.p2align 4
+-L(1):
+-	movzbl	(%rdi), %eax
++L(one_or_less):
++	jb	L(zero)
+ 	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
+ 	subl	%ecx, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
++	/* No ymm register was touched.  */
+ 	ret
+ 
+ 	.p2align 4
+-L(less_vec):
+-# ifdef USE_AS_WMEMCMP
+-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+-	cmpb	$4, %dl
+-	je	L(4)
+-	jb	L(zero)
+-# else
+-	cmpb	$1, %dl
+-	je	L(1)
+-	jb	L(zero)
+-	cmpb	$4, %dl
+-	jb	L(between_2_3)
+-	cmpb	$8, %dl
+-	jb	L(between_4_7)
++L(between_8_15):
+ # endif
+-	cmpb	$16, %dl
+-	jae	L(between_16_31)
+-	/* It is between 8 and 15 bytes.  */
++	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+-	VPCMPEQ %xmm1, %xmm2, %xmm2
++	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
++	subl	$0xffff, %eax
++	jnz	L(return_vec_0)
+ 	/* Use overlapping loads to avoid branches.  */
+ 	leaq	-8(%rdi, %rdx), %rdi
+ 	leaq	-8(%rsi, %rdx), %rsi
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+-	VPCMPEQ %xmm1, %xmm2, %xmm2
++	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
++	subl	$0xffff, %eax
++	jnz	L(return_vec_0)
++	/* No ymm register was touched.  */
++	ret
++
++	.p2align 4
++L(zero):
++	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 	vmovdqu	(%rsi), %xmm2
+-	VPCMPEQ (%rdi), %xmm2, %xmm2
++	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
++	subl	$0xffff, %eax
++	jnz	L(return_vec_0)
+ 
+ 	/* Use overlapping loads to avoid branches.  */
++
++	vmovdqu	-16(%rsi, %rdx), %xmm2
+ 	leaq	-16(%rdi, %rdx), %rdi
+ 	leaq	-16(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %xmm2
+-	VPCMPEQ (%rdi), %xmm2, %xmm2
++	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
++	subl	$0xffff, %eax
++	jnz	L(return_vec_0)
++	/* No ymm register was touched.  */
+ 	ret
+ 
+-	.p2align 4
+-L(more_8x_vec):
+-	/* More than 8 * VEC.  Check the first VEC.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Align the first memory area for aligned loads in the loop.
+-	   Compute how much the first memory area is misaligned.  */
+-	movq	%rdi, %rcx
+-	andl	$(VEC_SIZE - 1), %ecx
+-	/* Get the negative of offset for alignment.  */
+-	subq	$VEC_SIZE, %rcx
+-	/* Adjust the second memory area.  */
+-	subq	%rcx, %rsi
+-	/* Adjust the first memory area which should be aligned now.  */
+-	subq	%rcx, %rdi
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
+-
+-	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+-	vpand	%ymm2, %ymm1, %ymm5
+-
+-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+-	vpand	%ymm3, %ymm5, %ymm5
+-
+-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+-	vpand	%ymm4, %ymm5, %ymm5
+-
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-	addq	$(VEC_SIZE * 4), %rsi
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jae	L(loop_4x_vec)
+-
+-	/* Less than 4 * VEC.  */
+-	cmpq	$VEC_SIZE, %rdx
+-	jbe	L(last_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_2x_vec)
+-
+-L(last_4x_vec):
+-	/* From 2 * VEC to 4 * VEC. */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	VZEROUPPER_RETURN
+-
+-	.p2align 4
+-L(4x_vec_end):
+-	vpmovmskb %ymm1, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	vpmovmskb %ymm2, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x2)
+-	vpmovmskb %ymm4, %eax
+-	subl	$VEC_MASK, %eax
+-	tzcntl	%eax, %ecx
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+-	VZEROUPPER_RETURN
+-
+ 	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rcx), %edx
+-	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
++L(one_or_less):
++	jb	L(zero)
++	movl	(%rdi), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi), %ecx
++	je	L(zero)
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++	/* No ymm register was touched.  */
++	ret
+ # else
+-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+-	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
++L(between_4_7):
++	/* Load as big endian with overlapping movbe to avoid branches.
++	 */
++	movbe	(%rdi), %eax
++	movbe	(%rsi), %ecx
++	shlq	$32, %rax
++	shlq	$32, %rcx
++	movbe	-4(%rdi, %rdx), %edi
++	movbe	-4(%rsi, %rdx), %esi
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	jz	L(zero_4_7)
++	sbbl	%eax, %eax
++	orl	$1, %eax
++L(zero_4_7):
++	/* No ymm register was touched.  */
++	ret
+ # endif
+-	VZEROUPPER_RETURN
++
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-46.patch b/glibc-RHEL-15696-46.patch
new file mode 100644
index 0000000..881fe81
--- /dev/null
+++ b/glibc-RHEL-15696-46.patch
@@ -0,0 +1,851 @@
+From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 17 May 2021 13:57:24 -0400
+Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memcmp-evex.S. The optimizations include
+adding a new vec compare path for small sizes, reorganizing the entry
+control flow, removing some unnecessary ALU instructions from the main
+loop, and most importantly replacing the heavy use of vpcmp + kand
+logic with vpxor + vptern. test-memcmp and test-wmemcmp are both
+passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
+ 1 file changed, 408 insertions(+), 302 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 9c093972..654dc7ac 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -19,17 +19,22 @@
+ #if IS_IN (libc)
+ 
+ /* memcmp/wmemcmp is implemented as:
+-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+-      to avoid branches.
+-   2. Use overlapping compare to avoid branch.
+-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+-      bytes for wmemcmp.
+-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
++   1. Use ymm vector compares when possible. The only case where
++      a vector compare is not possible is when size < CHAR_PER_VEC
++      and loading from either s1 or s2 would cause a page cross.
++   2. For size from 2 to 7 bytes on page cross, load as big endian
++      with movbe and bswap to avoid branches.
++   3. Use xmm vector compare when size >= 4 bytes for memcmp or
++      size >= 8 bytes for wmemcmp.
++   4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
++      time to check for early mismatches. Only do this if it's
++      guaranteed the work is not wasted.
++   5. If size is 8 * VEC_SIZE or less, unroll the loop.
++   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
+       area.
+-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
++   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
++   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
++   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+ 
+ # include <sysdep.h>
+ 
+@@ -40,11 +45,21 @@
+ # define VMOVU		vmovdqu64
+ 
+ # ifdef USE_AS_WMEMCMP
+-#  define VPCMPEQ	vpcmpeqd
++#  define CHAR_SIZE	4
++#  define VPCMP	vpcmpd
+ # else
+-#  define VPCMPEQ	vpcmpeqb
++#  define CHAR_SIZE	1
++#  define VPCMP	vpcmpub
+ # endif
+ 
++# define VEC_SIZE	32
++# define PAGE_SIZE	4096
++# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
++
++# define XMM0		xmm16
++# define XMM1		xmm17
++# define XMM2		xmm18
++# define YMM0		ymm16
+ # define XMM1		xmm17
+ # define XMM2		xmm18
+ # define YMM1		ymm17
+@@ -54,15 +69,6 @@
+ # define YMM5		ymm21
+ # define YMM6		ymm22
+ 
+-# define VEC_SIZE 32
+-# ifdef USE_AS_WMEMCMP
+-#  define VEC_MASK 0xff
+-#  define XMM_MASK 0xf
+-# else
+-#  define VEC_MASK 0xffffffff
+-#  define XMM_MASK 0xffff
+-# endif
+-
+ /* Warning!
+            wmemcmp has to use SIGNED comparison for elements.
+            memcmp has to use UNSIGNED comparison for elemnts.
+@@ -70,145 +76,370 @@
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (MEMCMP)
+-# ifdef USE_AS_WMEMCMP
+-	shl	$2, %RDX_LP
+-# elif defined __ILP32__
++# ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+ # endif
+-	cmp	$VEC_SIZE, %RDX_LP
++	cmp	$CHAR_PER_VEC, %RDX_LP
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k1
++	VMOVU	(%rsi), %YMM1
++	/* Use compare not equals to directly check for mismatch.  */
++	VPCMP	$4, (%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_vec)
+-
+-	/* More than 2 * VEC.  */
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	ja	L(more_8x_vec)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
++	/* NB: eax must be destination register if going to
++	   L(return_vec_[0,2]). For L(return_vec_3) destination register
++	   must be ecx.  */
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 
+-	/* From 4 * VEC to 8 * VEC, inclusively. */
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
++	cmpq	$(CHAR_PER_VEC * 2), %rdx
++	jbe	L(last_1x_vec)
+ 
++	/* Check second VEC no matter what.  */
+ 	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
++	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_1)
++
++	/* Less than 4 * VEC.  */
++	cmpq	$(CHAR_PER_VEC * 4), %rdx
++	jbe	L(last_2x_vec)
+ 
++	/* Check third and fourth VEC no matter what.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
++	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
++	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
++	kmovd	%k1, %ecx
++	testl	%ecx, %ecx
++	jnz	L(return_vec_3)
+ 
+-	kandd	%k1, %k2, %k5
+-	kandd	%k3, %k4, %k6
+-	kandd	%k5, %k6, %k6
++	/* Zero YMM0. 4x VEC reduction is done with vpxor + vpternlog so
++	   a compare with zero is needed to get a mask.  */
++	vpxorq	%XMM0, %XMM0, %XMM0
+ 
+-	kmovd	%k6, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
++	/* Go to 4x VEC loop.  */
++	cmpq	$(CHAR_PER_VEC * 8), %rdx
++	ja	L(more_8x_vec)
+ 
+-	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
++	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
++	   branches.  */
+ 
+-	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+-	kandd	%k1, %k2, %k5
++	/* Load first two VEC from s2 before adjusting addresses.  */
++	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
++	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
++	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
++	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
++
++	/* Wait to load from s1 until addresses are adjusted due to
++	   unlamination of microfusion with complex address mode.  */
++
++	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
++	   will have some 1s.  */
++	vpxorq	(%rdi), %YMM1, %YMM1
++	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+-	kandd	%k3, %k5, %k5
++	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
++	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
++	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+-	kandd	%k4, %k5, %k5
++	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
++	   oring with YMM3. Result is stored in YMM4.  */
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
++	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
++	VPCMP	$4, %YMM4, %YMM0, %k1
++	kmovd	%k1, %ecx
++	testl	%ecx, %ecx
++	jnz	L(return_vec_0_1_2_3)
++	/* NB: eax must be zero to reach here.  */
++	ret
+ 
+-	kmovd	%k5, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
+-	xorl	%eax, %eax
++	/* NB: aligning 32 here allows for the rest of the jump targets
++	   to be tuned for 32 byte alignment. Most important this ensures
++	   the L(more_8x_vec) loop is 32 byte aligned.  */
++	.p2align 5
++L(less_vec):
++	/* Check if one or less CHAR. This is necessary for size = 0 but
++	   is also faster for size = CHAR_SIZE.  */
++	cmpl	$1, %edx
++	jbe	L(one_or_less)
++
++	/* Check if loading one VEC from either s1 or s2 could cause a
++	   page cross. This can have false positives but is by far the
++	   fastest method.  */
++	movl	%edi, %eax
++	orl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jg	L(page_cross_less_vec)
++
++	/* No page cross possible.  */
++	VMOVU	(%rsi), %YMM2
++	VPCMP	$4, (%rdi), %YMM2, %k1
++	kmovd	%k1, %eax
++	/* Create mask in ecx for potentially in bound matches.  */
++	bzhil	%edx, %eax, %eax
++	jnz	L(return_vec_0)
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++L(return_vec_0):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
++	/* NB: no partial register stall here because xorl zero idiom
++	   above.  */
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(%rsi, %rax), %ecx
++	movzbl	(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
+ 
+-L(last_vec):
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
++	   which is good enough for a target not in a loop.  */
++L(return_vec_1):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	VEC_SIZE(%rsi, %rax), %ecx
++	movzbl	VEC_SIZE(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec):
+-	/* A byte or int32 is different within 16 or 32 bytes.  */
+-	tzcntl	%eax, %ecx
++	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
++	   which is good enough for a target not in a loop.  */
++L(return_vec_2):
++	tzcntl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rcx, 4), %edx
+-	cmpl	(%rsi, %rcx, 4), %edx
+-L(wmemcmp_return):
+-	setl	%al
+-	negl	%eax
+-	orl	$1, %eax
++	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	(%rdi, %rcx), %eax
+-	movzbl	(%rsi, %rcx), %edx
+-	sub	%edx, %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
++	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
++	.p2align 4
++L(8x_return_vec_0_1_2_3):
++	/* Returning from L(more_8x_vec) requires restoring rsi.  */
++	addq	%rdi, %rsi
++L(return_vec_0_1_2_3):
++	VPCMP	$4, %YMM1, %YMM0, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
++
++	VPCMP	$4, %YMM2, %YMM0, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_1)
++
++	VPCMP	$4, %YMM3, %YMM0, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_2)
++L(return_vec_3):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WMEMCMP
++	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++# endif
++	ret
++
+ 	.p2align 4
+-L(4):
+-	xorl	%eax, %eax
+-	movl	(%rdi), %edx
+-	cmpl	(%rsi), %edx
+-	jne	L(wmemcmp_return)
++L(more_8x_vec):
++	/* Set end of s1 in rdx.  */
++	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
++	/* rsi stores s2 - s1. This allows loop to only update one
++	   pointer.  */
++	subq	%rdi, %rsi
++	/* Align s1 pointer.  */
++	andq	$-VEC_SIZE, %rdi
++	/* Adjust because first 4x vec where check already.  */
++	subq	$-(VEC_SIZE * 4), %rdi
++	.p2align 4
++L(loop_4x_vec):
++	VMOVU	(%rsi, %rdi), %YMM1
++	vpxorq	(%rdi), %YMM1, %YMM1
++
++	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
++	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
++
++	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
++	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
++	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
++
++	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
++	VPCMP	$4, %YMM4, %YMM0, %k1
++	kmovd	%k1, %ecx
++	testl	%ecx, %ecx
++	jnz	L(8x_return_vec_0_1_2_3)
++	subq	$-(VEC_SIZE * 4), %rdi
++	cmpq	%rdx, %rdi
++	jb	L(loop_4x_vec)
++
++	subq	%rdx, %rdi
++	/* rdi has 4 * VEC_SIZE - remaining length.  */
++	cmpl	$(VEC_SIZE * 3), %edi
++	jae	L(8x_last_1x_vec)
++	/* Load regardless of branch.  */
++	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
++	cmpl	$(VEC_SIZE * 2), %edi
++	jae	L(8x_last_2x_vec)
++
++	VMOVU	(%rsi, %rdx), %YMM1
++	vpxorq	(%rdx), %YMM1, %YMM1
++
++	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
++	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
++
++	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
++	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
++
++	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
++	VPCMP	$4, %YMM4, %YMM0, %k1
++	kmovd	%k1, %ecx
++	/* Restore s1 pointer to rdi.  */
++	movq	%rdx, %rdi
++	testl	%ecx, %ecx
++	jnz	L(8x_return_vec_0_1_2_3)
++	/* NB: eax must be zero to reach here.  */
++	ret
++
++	/* Only entry is from L(more_8x_vec).  */
++	.p2align 4
++L(8x_last_2x_vec):
++	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(8x_return_vec_2)
++	/* Naturally aligned to 16 bytes.  */
++L(8x_last_1x_vec):
++	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
++	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(8x_return_vec_3)
++	ret
++
++	.p2align 4
++L(last_2x_vec):
++	/* Check second to last VEC.  */
++	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
++	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_1_end)
++
++	/* Check last VEC.  */
++	.p2align 4
++L(last_1x_vec):
++	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
++	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0_end)
+ 	ret
++
++	.p2align 4
++L(8x_return_vec_2):
++	subq	$VEC_SIZE, %rdx
++L(8x_return_vec_3):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	leaq	(%rdx, %rax, CHAR_SIZE), %rax
++	movl	(VEC_SIZE * 3)(%rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ # else
++	addq	%rdx, %rax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	movzbl	(VEC_SIZE * 3)(%rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
++
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.  */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	je	L(exit)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
++L(return_vec_0_end):
++	tzcntl	%eax, %eax
++	addl	%edx, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
++	movzbl	-VEC_SIZE(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
+ 	ret
+ 
+ 	.p2align 4
+-L(exit):
++L(return_vec_1_end):
++	tzcntl	%eax, %eax
++	addl	%edx, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
+ 	ret
+ 
++
+ 	.p2align 4
++L(page_cross_less_vec):
++	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
++	   bytes.  */
++	cmpl	$(16 / CHAR_SIZE), %edx
++	jae	L(between_16_31)
++# ifndef USE_AS_WMEMCMP
++	cmpl	$8, %edx
++	jae	L(between_8_15)
++	cmpl	$4, %edx
++	jae	L(between_4_7)
+ L(between_2_3):
+ 	/* Load as big endian to avoid branches.  */
+ 	movzwl	(%rdi), %eax
+@@ -217,224 +448,99 @@ L(between_2_3):
+ 	shll	$8, %ecx
+ 	bswap	%eax
+ 	bswap	%ecx
+-	movb	-1(%rdi, %rdx), %al
+-	movb	-1(%rsi, %rdx), %cl
++	movzbl	-1(%rdi, %rdx), %edi
++	movzbl	-1(%rsi, %rdx), %esi
++	orl	%edi, %eax
++	orl	%esi, %ecx
+ 	/* Subtraction is okay because the upper 8 bits are zero.  */
+ 	subl	%ecx, %eax
+ 	ret
+-
+ 	.p2align 4
+-L(1):
+-	movzbl	(%rdi), %eax
++L(one_or_less):
++	jb	L(zero)
+ 	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
+ 	subl	%ecx, %eax
+ 	ret
+-# endif
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+ 
+ 	.p2align 4
+-L(less_vec):
+-# ifdef USE_AS_WMEMCMP
+-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+-	cmpb	$4, %dl
+-	je	L(4)
+-	jb	L(zero)
+-# else
+-	cmpb	$1, %dl
+-	je	L(1)
+-	jb	L(zero)
+-	cmpb	$4, %dl
+-	jb	L(between_2_3)
+-	cmpb	$8, %dl
+-	jb	L(between_4_7)
++L(between_8_15):
+ # endif
+-	cmpb	$16, %dl
+-	jae	L(between_16_31)
+-	/* It is between 8 and 15 bytes.  */
++	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %XMM1
+ 	vmovq	(%rsi), %XMM2
+-	VPCMPEQ %XMM1, %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
++	VPCMP	$4, %XMM1, %XMM2, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-8(%rdi, %rdx), %rdi
+-	leaq	-8(%rsi, %rdx), %rsi
++	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
++	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
+ 	vmovq	(%rdi), %XMM1
+ 	vmovq	(%rsi), %XMM2
+-	VPCMPEQ %XMM1, %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
++	VPCMP	$4, %XMM1, %XMM2, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 	ret
+ 
+ 	.p2align 4
+-L(between_16_31):
+-	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-	VMOVU	(%rsi), %XMM2
+-	VPCMPEQ (%rdi), %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-16(%rdi, %rdx), %rdi
+-	leaq	-16(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %XMM2
+-	VPCMPEQ (%rdi), %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
++L(zero):
++	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+-L(more_8x_vec):
+-	/* More than 8 * VEC.  Check the first VEC.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Align the first memory area for aligned loads in the loop.
+-	   Compute how much the first memory area is misaligned.  */
+-	movq	%rdi, %rcx
+-	andl	$(VEC_SIZE - 1), %ecx
+-	/* Get the negative of offset for alignment.  */
+-	subq	$VEC_SIZE, %rcx
+-	/* Adjust the second memory area.  */
+-	subq	%rcx, %rsi
+-	/* Adjust the first memory area which should be aligned now.  */
+-	subq	%rcx, %rdi
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
+-
+-	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+-	kandd	%k2, %k1, %k5
+-
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+-	kandd	%k3, %k5, %k5
+-
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+-	kandd	%k4, %k5, %k5
+-
+-	kmovd	%k5, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-	addq	$(VEC_SIZE * 4), %rsi
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jae	L(loop_4x_vec)
+-
+-	/* Less than 4 * VEC.  */
+-	cmpq	$VEC_SIZE, %rdx
+-	jbe	L(last_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_2x_vec)
+-
+-L(last_4x_vec):
+-	/* From 2 * VEC to 4 * VEC. */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
++L(between_16_31):
++	/* From 16 to 31 bytes.  No branch when size == 16.  */
++	VMOVU	(%rsi), %XMM2
++	VPCMP	$4, (%rdi), %XMM2, %k1
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+ 
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	ret
+-
+-	.p2align 4
+-L(4x_vec_end):
++	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
++	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
++	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
++	VPCMP	$4, (%rdi), %XMM2, %k1
+ 	kmovd	%k1, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	kmovd	%k2, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x1)
+-	kmovd	%k3, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x2)
+-	kmovd	%k4, %eax
+-	subl	$VEC_MASK, %eax
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %ecx
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
+-	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
++	.p2align 4
++L(one_or_less):
++	jb	L(zero)
++	movl	(%rdi), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi), %ecx
++	je	L(zero)
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ 	ret
++# else
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
++L(between_4_7):
++	/* Load as big endian with overlapping movbe to avoid branches.
++	 */
++	movbe	(%rdi), %eax
++	movbe	(%rsi), %ecx
++	shlq	$32, %rax
++	shlq	$32, %rcx
++	movbe	-4(%rdi, %rdx), %edi
++	movbe	-4(%rsi, %rdx), %esi
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	jz	L(zero_4_7)
++	sbbl	%eax, %eax
++	orl	$1, %eax
++L(zero_4_7):
+ 	ret
++# endif
++
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-47.patch b/glibc-RHEL-15696-47.patch
new file mode 100644
index 0000000..70c3171
--- /dev/null
+++ b/glibc-RHEL-15696-47.patch
@@ -0,0 +1,104 @@
+From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 20 May 2021 13:13:51 -0400
+Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit makes a few small improvements to
+memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
+instead of 128. Either alignment will perform equally well in a loop
+and 128 just increases the odds of having to do an extra iteration
+which can be significant overhead for small values. 2) Align some
+targets and the loop. 3) Remove an ALU from the alignment process. 4)
+Reorder the last 4x VEC so that they are stored after the loop. 5)
+Move the condition for leq 8x VEC to before the alignment
+process. test-memset and test-wmemset are both passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ .../multiarch/memset-vec-unaligned-erms.S     | 50 +++++++++++--------
+ 1 file changed, 28 insertions(+), 22 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index f877ac9d..909c33f6 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VZEROUPPER_RETURN
+ 
++	.p2align 4
+ L(stosb_more_2x_vec):
+ 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+ 	ja	L(stosb)
++#else
++	.p2align 4
+ #endif
+ L(more_2x_vec):
+-	cmpq  $(VEC_SIZE * 4), %rdx
+-	ja	L(loop_start)
++	/* Stores to first 2x VEC before cmp as any path forward will
++	   require it.  */
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
++	cmpq	$(VEC_SIZE * 4), %rdx
++	ja	L(loop_start)
+ 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
++	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+ L(return):
+ #if VEC_SIZE > 16
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+@@ -192,28 +197,29 @@ L(return):
+ #endif
+ 
+ L(loop_start):
+-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+-	VMOVU	%VEC(0), (%rdi)
+-	andq	$-(VEC_SIZE * 4), %rcx
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+-	addq	%rdi, %rdx
+-	andq	$-(VEC_SIZE * 4), %rdx
+-	cmpq	%rdx, %rcx
+-	je	L(return)
++	cmpq	$(VEC_SIZE * 8), %rdx
++	jbe	L(loop_end)
++	andq	$-(VEC_SIZE * 2), %rdi
++	subq	$-(VEC_SIZE * 4), %rdi
++	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
++	.p2align 4
+ L(loop):
+-	VMOVA	%VEC(0), (%rcx)
+-	VMOVA	%VEC(0), VEC_SIZE(%rcx)
+-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
+-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
+-	addq	$(VEC_SIZE * 4), %rcx
+-	cmpq	%rcx, %rdx
+-	jne	L(loop)
++	VMOVA	%VEC(0), (%rdi)
++	VMOVA	%VEC(0), VEC_SIZE(%rdi)
++	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
++	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
++	subq	$-(VEC_SIZE * 4), %rdi
++	cmpq	%rcx, %rdi
++	jb	L(loop)
++L(loop_end):
++	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
++	       rdx as length is also unchanged.  */
++	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
++	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
++	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
++	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+ 	VZEROUPPER_SHORT_RETURN
+ 
+ 	.p2align 4
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-48.patch b/glibc-RHEL-15696-48.patch
new file mode 100644
index 0000000..645536e
--- /dev/null
+++ b/glibc-RHEL-15696-48.patch
@@ -0,0 +1,84 @@
+From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 23 May 2021 19:43:24 -0400
+Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+This patch changes the condition for copy 4x VEC so that if length is
+exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of
+8x VEC case.
+
+Results For Skylake memcpy-avx2-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 9.137   , 6.873   , New , 75.22
+128 , 7   , 0   , 12.933  , 7.732   , New , 59.79
+128 , 0   , 7   , 11.852  , 6.76    , New , 57.04
+128 , 7   , 7   , 12.587  , 6.808   , New , 54.09
+
+Results For Icelake memcpy-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 9.963   , 5.416   , New , 54.36
+128 , 7   , 0   , 16.467  , 8.061   , New , 48.95
+128 , 0   , 7   , 14.388  , 7.644   , New , 53.13
+128 , 7   , 7   , 14.546  , 7.642   , New , 52.54
+
+Results For Tigerlake memcpy-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 8.979   , 4.95    , New , 55.13
+128 , 7   , 0   , 14.245  , 7.122   , New , 50.0
+128 , 0   , 7   , 12.668  , 6.675   , New , 52.69
+128 , 7   , 7   , 13.042  , 6.802   , New , 52.15
+
+Results For Skylake memmove-avx2-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 6.181   , 5.691   , New , 92.07
+128 , 32  , 0   , 6.165   , 5.752   , New , 93.3
+128 , 0   , 7   , 13.923  , 9.37    , New , 67.3
+128 , 7   , 0   , 12.049  , 10.182  , New , 84.5
+
+Results For Icelake memmove-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 5.479   , 4.889   , New , 89.23
+128 , 32  , 0   , 5.127   , 4.911   , New , 95.79
+128 , 0   , 7   , 18.885  , 13.547  , New , 71.73
+128 , 7   , 0   , 15.565  , 14.436  , New , 92.75
+
+Results For Tigerlake memmove-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 5.275   , 4.815   , New , 91.28
+128 , 32  , 0   , 5.376   , 4.565   , New , 84.91
+128 , 0   , 7   , 19.426  , 14.273  , New , 73.47
+128 , 7   , 0   , 15.924  , 14.951  , New , 93.89
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 3e2dd6bc..572cef04 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -417,8 +417,8 @@ L(more_2x_vec):
+ 	cmpq	$(VEC_SIZE * 8), %rdx
+ 	ja	L(more_8x_vec)
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
+-	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
++	jbe	L(last_4x_vec)
++	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+@@ -437,7 +437,7 @@ L(more_2x_vec):
+ 	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+ 	VZEROUPPER_RETURN
+ L(last_4x_vec):
+-	/* Copy from 2 * VEC to 4 * VEC. */
++	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-49.patch b/glibc-RHEL-15696-49.patch
new file mode 100644
index 0000000..b59f582
--- /dev/null
+++ b/glibc-RHEL-15696-49.patch
@@ -0,0 +1,55 @@
+From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 19:19:34 -0400
+Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. The way wcsnlen will check if near the end of maxlen
+is the following macro:
+
+	mov	%r11, %rsi;	\
+	subq	%rax, %rsi;	\
+	andq	$-64, %rax;	\
+	testq	$-64, %rsi;	\
+	je	L(strnlen_ret)
+
+Which works independently of s + maxlen overflowing. So the
+second overflow check is unnecessary for correctness and
+just extra overhead in the common no overflow case.
+
+test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are
+all passing
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strlen-vec.S | 7 -------
+ 1 file changed, 7 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+index 439e486a..b7657282 100644
+--- a/sysdeps/x86_64/multiarch/strlen-vec.S
++++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -71,19 +71,12 @@ L(n_nonzero):
+    suffice.  */
+ 	mov	%RSI_LP, %R10_LP
+ 	sar	$62, %R10_LP
+-	test	%R10_LP, %R10_LP
+ 	jnz	__wcslen_sse4_1
+ 	sal	$2, %RSI_LP
+ # endif
+ 
+-
+ /* Initialize long lived registers.  */
+-
+ 	add	%RDI_LP, %RSI_LP
+-# ifdef AS_WCSLEN
+-/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+-	jbe	__wcslen_sse4_1
+-# endif
+ 	mov	%RSI_LP, %R10_LP
+ 	and	$-64, %R10_LP
+ 	mov	%RSI_LP, %R11_LP
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-5.patch b/glibc-RHEL-15696-5.patch
new file mode 100644
index 0000000..75d3978
--- /dev/null
+++ b/glibc-RHEL-15696-5.patch
@@ -0,0 +1,290 @@
+From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:32:24 -0800
+Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memset/wmemset for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use
+	RDX_LP for length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset.
+	* sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise.
+---
+ .../multiarch/memset-avx512-no-vzeroupper.S   |  6 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 34 +++++----
+ sysdeps/x86_64/x32/Makefile                   |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-memset.c        | 73 +++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemset.c       | 20 +++++
+ 5 files changed, 121 insertions(+), 16 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+index 689cc119..99e25519 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+@@ -29,12 +29,16 @@
+ 	.section .text.avx512,"ax",@progbits
+ #if defined PIC
+ ENTRY (MEMSET_CHK)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMSET_CHK)
+ #endif
+ 
+ ENTRY (MEMSET)
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
++# endif
+ 	vpxor	%xmm0, %xmm0, %xmm0
+ 	vmovd	%esi, %xmm1
+ 	lea	(%rdi, %rdx), %rsi
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 270a1d49..9a0fd818 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -65,8 +65,8 @@
+ 	.section SECTION(.text),"ax",@progbits
+ #if VEC_SIZE == 16 && IS_IN (libc)
+ ENTRY (__bzero)
+-	movq	%rdi, %rax /* Set return value.  */
+-	movq	%rsi, %rdx /* Set n.  */
++	mov	%RDI_LP, %RAX_LP /* Set return value.  */
++	mov	%RSI_LP, %RDX_LP /* Set n.  */
+ 	pxor	%xmm0, %xmm0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero)
+ #if IS_IN (libc)
+ # if defined SHARED
+ ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+ # endif
+ 
+ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+-	shlq	$2, %rdx
++	shl	$2, %RDX_LP
+ 	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ 	jmp	L(entry_from_bzero)
+ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+ 
+ #if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
++# endif
+ L(entry_from_bzero):
+ 	cmpq	$VEC_SIZE, %rdx
+ 	jb	L(less_vec)
+@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned))
+ 
+ # if VEC_SIZE == 16
+ ENTRY (__memset_chk_erms)
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memset_chk_erms)
+ 
+ /* Only used to measure performance of REP STOSB.  */
+ ENTRY (__memset_erms)
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	jnz	 L(stosb)
+ 	movq	%rdi, %rax
+ 	ret
+@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
+ L(stosb):
+ 	/* Issue vzeroupper before rep stosb.  */
+ 	VZEROUPPER
+-	movq	%rdx, %rcx
++	mov	%RDX_LP, %RCX_LP
+ 	movzbl	%sil, %eax
+-	movq	%rdi, %rdx
++	mov	%RDI_LP, %RDX_LP
+ 	rep stosb
+-	movq	%rdx, %rax
++	mov	%RDX_LP, %RAX_LP
+ 	ret
+ # if VEC_SIZE == 16
+ END (__memset_erms)
+@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms))
+ 
+ # if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
++	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+-	cmpq	$VEC_SIZE, %rdx
++# ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	mov	%edx, %edx
++# endif
++	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
++	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index e99dbd7c..98bd9ae9 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -7,9 +7,9 @@ endif
+ 
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+-	 tst-size_t-memrchr
++	 tst-size_t-memrchr tst-size_t-memset
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr tst-size_t-wmemcmp
++tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
+new file mode 100644
+index 00000000..2c367af6
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
+@@ -0,0 +1,73 @@
++/* Test memset with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#ifdef WIDE
++# define TEST_NAME "wmemset"
++#else
++# define TEST_NAME "memset"
++#endif /* WIDE */
++
++#include "test-size_t.h"
++
++#ifdef WIDE
++# include <wchar.h>
++# define MEMSET wmemset
++# define CHAR wchar_t
++#else
++# define MEMSET memset
++# define CHAR char
++#endif /* WIDE */
++
++IMPL (MEMSET, 1)
++
++typedef CHAR *(*proto_t) (CHAR *, int, size_t);
++
++static void *
++__attribute__ ((noinline, noclone))
++do_memset (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  CHAR ch = 0x23;
++  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
++  parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      c.fn = impl->fn;
++      CHAR *p = (CHAR *) do_memset (src, c);
++      size_t i;
++      for (i = 0; i < src.len; i++)
++	if (p[i] != ch)
++	  {
++	    error (0, 0, "Wrong result in function %s", impl->name);
++	    ret = 1;
++	  }
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
+new file mode 100644
+index 00000000..955eb488
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
+@@ -0,0 +1,20 @@
++/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include "tst-size_t-memset.c"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-50.patch b/glibc-RHEL-15696-50.patch
new file mode 100644
index 0000000..e896698
--- /dev/null
+++ b/glibc-RHEL-15696-50.patch
@@ -0,0 +1,43 @@
+From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
+Author: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>  2021-05-23 21:43:10
+Committer: H.J. Lu <hjl.tools@gmail.com>  2021-06-27 10:56:57
+Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc)
+Child:  1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support)
+Branches: master, remotes/origin/master and many more (41)
+Follows: glibc-2.33.9000
+Precedes: glibc-2.34
+
+    math: redirect roundeven function
+    
+    This patch redirects the roundeven function for further changes.
+    
+    Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+
+Conflicts:
+	*
+	(rewritten for older branch)
+
+diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+index 7bbbb2dc..8728d0f2 100644
+--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
++++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-double.h>
+@@ -67,5 +68,6 @@ __roundeven (double x)
+   INSERT_WORDS64 (x, ix);
+   return x;
+ }
+-hidden_def (__roundeven)
++#ifndef __roundeven
+ libm_alias_double (__roundeven, roundeven)
++#endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-51.patch b/glibc-RHEL-15696-51.patch
new file mode 100644
index 0000000..105843d
--- /dev/null
+++ b/glibc-RHEL-15696-51.patch
@@ -0,0 +1,118 @@
+From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
+From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Date: Mon, 24 May 2021 09:43:10 +0800
+Subject: [PATCH] math: redirect roundeven function
+Content-type: text/plain; charset=UTF-8
+
+This patch redirects the roundeven function for further changes.
+
+Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ include/math.h                             | 3 ++-
+ sysdeps/ieee754/dbl-64/s_roundeven.c       | 4 +++-
+ sysdeps/ieee754/float128/s_roundevenf128.c | 1 +
+ sysdeps/ieee754/flt-32/s_roundevenf.c      | 3 +++
+ sysdeps/ieee754/ldbl-128/s_roundevenl.c    | 1 +
+ sysdeps/ieee754/ldbl-96/s_roundevenl.c     | 1 +
+ 6 files changed, 11 insertions(+), 2 deletions(-)
+
+Conflicts:
+	include/math.h
+	(missing MATH_REDIRECT macros)
+
+diff --git a/include/math.h b/include/math.h
+index e21d34b8..1f9f9a54 100644
+--- a/include/math.h
++++ b/include/math.h
+@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling)
+ libm_hidden_proto (__issignalingf)
+ libm_hidden_proto (__exp)
+ libm_hidden_proto (__expf)
+-libm_hidden_proto (__roundeven)
+ 
+ # ifndef __NO_LONG_DOUBLE_MATH
+ libm_hidden_proto (__fpclassifyl)
+@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128)
+ 
+ # if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0)
+ #  ifndef NO_MATH_REDIRECT
++float (roundevenf) (float) asm ("__roundevenf");
++double (roundeven) (double) asm ("__roundeven");
+ /* Declare sqrt for use within GLIBC.  Compilers typically inline sqrt as a
+    single instruction.  Use an asm to avoid use of PLTs if it doesn't.  */
+ float (sqrtf) (float) asm ("__ieee754_sqrtf");
+diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c
+index 1438e81d..61962184 100644
+--- a/sysdeps/ieee754/dbl-64/s_roundeven.c
++++ b/sysdeps/ieee754/dbl-64/s_roundeven.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-double.h>
+@@ -101,5 +102,6 @@ __roundeven (double x)
+   INSERT_WORDS (x, hx, lx);
+   return x;
+ }
+-hidden_def (__roundeven)
++#ifndef __roundeven
+ libm_alias_double (__roundeven, roundeven)
++#endif
+diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c
+index 5a9b3f39..e0faf727 100644
+--- a/sysdeps/ieee754/float128/s_roundevenf128.c
++++ b/sysdeps/ieee754/float128/s_roundevenf128.c
+@@ -1,2 +1,3 @@
++#define NO_MATH_REDIRECT
+ #include <float128_private.h>
+ #include "../ldbl-128/s_roundevenl.c"
+diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c
+index 90f991d5..a661875e 100644
+--- a/sysdeps/ieee754/flt-32/s_roundevenf.c
++++ b/sysdeps/ieee754/flt-32/s_roundevenf.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-float.h>
+@@ -67,4 +68,6 @@ __roundevenf (float x)
+   SET_FLOAT_WORD (x, ix);
+   return x;
+ }
++#ifndef __roundevenf
+ libm_alias_float (__roundeven, roundeven)
++#endif
+diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+index 5fc59af4..b9375b6c 100644
+--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c
++++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-ldouble.h>
+diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+index be2e4fa4..65031ab7 100644
+--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c
++++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
++#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-ldouble.h>
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-52.patch b/glibc-RHEL-15696-52.patch
new file mode 100644
index 0000000..4602f51
--- /dev/null
+++ b/glibc-RHEL-15696-52.patch
@@ -0,0 +1,242 @@
+From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001
+From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Date: Mon, 24 May 2021 09:43:11 +0800
+Subject: [PATCH] x86_64: roundeven with sse4.1 support
+Content-type: text/plain; charset=UTF-8
+
+This patch adds support for the sse4.1 hardware floating point
+roundeven.
+
+Here are some benchmark results on my systems:
+
+=AMD Ryzen 9 3900X 12-Core Processor=
+
+* benchmark result before this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  3.75587e+09 |  3.75114e+09 |
+| iterations |  3.93053e+08 |  4.35402e+08 |
+| max        | 52.592       | 58.71        |
+| min        |  7.98        |  7.22        |
+| mean       |  9.55563     |  8.61535     |
+
+* benchmark result after this commit
+|            |     roundeven |   roundevenf |
+|------------|---------------|--------------|
+| duration   |   3.73815e+09 |  3.73738e+09 |
+| iterations |   5.82692e+08 |  5.91498e+08 |
+| max        |  56.468       | 51.642       |
+| min        |   6.27        |  6.156       |
+| mean       |   6.41532     |  6.3185      |
+
+=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz=
+
+* benchmark result before this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  2.18208e+09 |  2.18258e+09 |
+| iterations |  2.39932e+08 |  2.46924e+08 |
+| max        | 96.378       | 98.035       |
+| min        |  6.776       |  5.94        |
+| mean       |  9.09456     |  8.83907     |
+
+* benchmark result after this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  2.17415e+09 |  2.17005e+09 |
+| iterations |  3.56193e+08 |  4.09824e+08 |
+| max        | 51.693       | 97.192       |
+| min        |  5.926       |  5.093       |
+| mean       |  6.10385     |  5.29507     |
+
+Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/fpu/multiarch/Makefile         |  5 +--
+ sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c  |  2 ++
+ .../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundeven.c    | 31 +++++++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c |  3 ++
+ .../fpu/multiarch/s_roundevenf-sse4_1.S       | 24 ++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundevenf.c   | 31 +++++++++++++++++++
+ 7 files changed, 118 insertions(+), 2 deletions(-)
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+
+diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
+index 9f387248..6ddd1c01 100644
+--- a/sysdeps/x86_64/fpu/multiarch/Makefile
++++ b/sysdeps/x86_64/fpu/multiarch/Makefile
+@@ -1,11 +1,12 @@
+ ifeq ($(subdir),math)
+ libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
+ 			s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
+-			s_trunc-c s_truncf-c
++			s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
+ 
+ libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
+ 			s_floorf-sse4_1 s_nearbyint-sse4_1 \
+-			s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
++			s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
++			s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
+ 			s_trunc-sse4_1 s_truncf-sse4_1
+ 
+ libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+new file mode 100644
+index 00000000..c7be43cb
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+@@ -0,0 +1,2 @@
++#define __roundeven __roundeven_c
++#include <sysdeps/ieee754/dbl-64/s_roundeven.c>
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+new file mode 100644
+index 00000000..6ae8f6b1
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+@@ -0,0 +1,24 @@
++/* Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++	.section .text.sse4.1,"ax",@progbits
++ENTRY(__roundeven_sse41)
++	roundsd	$8, %xmm0, %xmm0
++	ret
++END(__roundeven_sse41)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+new file mode 100644
+index 00000000..d92eda65
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+@@ -0,0 +1,31 @@
++/* Multiple versions of __roundeven.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <libm-alias-double.h>
++
++#define roundeven __redirect_roundeven
++#define __roundeven __redirect___roundeven
++#include <math.h>
++#undef roundeven
++#undef __roundeven
++
++#define SYMBOL_NAME roundeven
++#include "ifunc-sse4_1.h"
++
++libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ());
++libm_alias_double (__roundeven, roundeven)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+new file mode 100644
+index 00000000..72a6e7d1
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+@@ -0,0 +1,3 @@
++#undef __roundevenf
++#define __roundevenf __roundevenf_c
++#include <sysdeps/ieee754/flt-32/s_roundevenf.c>
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+new file mode 100644
+index 00000000..a76e1080
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+@@ -0,0 +1,24 @@
++/* Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++	.section .text.sse4.1,"ax",@progbits
++ENTRY(__roundevenf_sse41)
++	roundss	$8, %xmm0, %xmm0
++	ret
++END(__roundevenf_sse41)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+new file mode 100644
+index 00000000..2ee196e6
+--- /dev/null
++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+@@ -0,0 +1,31 @@
++/* Multiple versions of __roundevenf.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <libm-alias-float.h>
++
++#define roundevenf __redirect_roundevenf
++#define __roundevenf __redirect___roundevenf
++#include <math.h>
++#undef roundevenf
++#undef __roundevenf
++
++#define SYMBOL_NAME roundevenf
++#include "ifunc-sse4_1.h"
++
++libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ());
++libm_alias_float (__roundeven, roundeven)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-53.patch b/glibc-RHEL-15696-53.patch
new file mode 100644
index 0000000..7221d38
--- /dev/null
+++ b/glibc-RHEL-15696-53.patch
@@ -0,0 +1,41 @@
+From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 9 Jan 2022 16:02:28 -0600
+Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755]
+Content-type: text/plain; charset=UTF-8
+
+Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
+__wcscmp_evex. For x86_64 this covers the entire address range so any
+length larger could not possibly be used to bound `s1` or `s2`.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 459eeed0..d5aa6daa 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -97,6 +97,16 @@ ENTRY (STRCMP)
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
++#  ifndef __ILP32__
++	movq	%rdx, %rcx
++	/* Check if length could overflow when multiplied by
++	   sizeof(wchar_t). Checking top 8 bits will cover all potential
++	   overflow cases as well as redirect cases where its impossible to
++	   length to bound a valid memory region. In these cases just use
++	   'wcscmp'.  */
++	shrq	$56, %rcx
++	jnz	__wcscmp_evex
++#  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+ #  endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-54.patch b/glibc-RHEL-15696-54.patch
new file mode 100644
index 0000000..b2aaaa1
--- /dev/null
+++ b/glibc-RHEL-15696-54.patch
@@ -0,0 +1,268 @@
+From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 20 Aug 2021 06:42:24 -0700
+Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ
+ #28252]
+Content-type: text/plain; charset=UTF-8
+
+Optimize loads of all bits set into ZMM register in AVX512 SVML codes
+by replacing
+
+	vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
+
+and
+
+	vmovups   .L_2il0floatpacket.13(%rip), %zmmX
+
+with
+	vpternlogd $0xff, %zmmX, %zmmX, %zmmX
+
+This fixes BZ #28252.
+---
+ .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------
+ .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------
+ .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------
+ .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
+ .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------
+ 10 files changed, 11 insertions(+), 64 deletions(-)
+
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+index 24e3b363..07dfed85 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+         vmovaps   %zmm0, %zmm8
+ 
+ /* Check for large arguments path */
+-        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
++        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
+ 
+ /*
+   ARGUMENT RANGE REDUCTION:
+@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_cos_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.16:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.16,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+index ae8af8d8..ddb60e5b 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+ 
+ /* preserve mantissa, set input exponent to 2^(-10) */
+         vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
+-        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
++        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
+         vpsrlq    $32, %zmm4, %zmm6
+ 
+ /* reciprocal approximation good to at least 11 bits */
+@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_log_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.12:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.12,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+index 2d4b14fd..529c454a 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+         andq      $-64, %rsp
+         subq      $1280, %rsp
+         movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
+-        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
++        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
+         vmovups __dAbsMask(%rax), %zmm7
+         vmovups __dInvPI(%rax), %zmm2
+         vmovups __dRShifter(%rax), %zmm1
+@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN8v_sin_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.14:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.14,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+index 2df626c0..e501a53a 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
+ 
+ /* SinPoly = SinR*SinPoly */
+         vfmadd213pd %zmm5, %zmm5, %zmm4
+-        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
++        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
+ 
+ /* Update Cos result's sign */
+         vxorpd    %zmm2, %zmm1, %zmm1
+@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
+ ENTRY (_ZGVeN8vvv_sincos_skx)
+ WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
+ END (_ZGVeN8vvv_sincos_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.15:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.15,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+index 6ea1137b..377af394 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+   X = X - Y*PI1 - Y*PI2 - Y*PI3
+  */
+         vmovaps   %zmm0, %zmm6
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
++        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
+         vmovups __sRShifter(%rax), %zmm3
+         vmovups __sPI1_FMA(%rax), %zmm5
+         vmovups __sA9_FMA(%rax), %zmm9
+@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16v_cosf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+index 89ba0df2..46f33d46 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+         vmovaps   %zmm0, %zmm7
+ 
+ /* compare against threshold */
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
++        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
+         vmovups __sInvLn2(%rax), %zmm4
+         vmovups __sShifter(%rax), %zmm1
+         vmovups __sLn2hi(%rax), %zmm6
+@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
+ 
+ #endif
+ END (_ZGVeN16v_expf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+index 4cf0a96f..9e254956 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+         andq      $-64, %rsp
+         subq      $1280, %rsp
+         movq      __svml_slog_data@GOTPCREL(%rip), %rax
+-        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
++        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
+         vmovups _iBrkValue(%rax), %zmm4
+         vmovups _sPoly_7(%rax), %zmm8
+ 
+@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
+ 
+ #endif
+ END (_ZGVeN16v_logf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.7:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.7,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+index bdcd50af..e8331ba1 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         vpsrlq    $32, %zmm3, %zmm2
+         vpmovqd   %zmm2, %ymm11
+         vcvtps2pd %ymm14, %zmm13
+-        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
++        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+         vmovaps   %zmm14, %zmm26
+         vpandd _ABSMASK(%rax), %zmm1, %zmm8
+         vpcmpd    $1, _INF(%rax), %zmm8, %k2
+@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         vpmovqd   %zmm11, %ymm5
+         vpxord    %zmm10, %zmm10, %zmm10
+         vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
+-        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
++        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
+         vpxord    %zmm11, %zmm11, %zmm11
+         vcvtdq2pd %ymm7, %zmm7
+         vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
+@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16vv_powf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.23:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.23,@object
+-.L_2il0floatpacket.24:
+-	.long	0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.24,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+index 5fa4bc41..1f46f334 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+ 
+ /* Result sign calculations */
+         vpternlogd $150, %zmm0, %zmm14, %zmm1
+-        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
++        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+ 
+ /* Add correction term 0.5 for cos() part */
+         vaddps    %zmm8, %zmm5, %zmm15
+@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
+ ENTRY (_ZGVeN16vvv_sincosf_skx)
+ WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
+ END (_ZGVeN16vvv_sincosf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.13:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.13,@object
+diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+index 141f747e..1fc9308a 100644
+--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+         movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
+ 
+ /* Check for large and special values */
+-        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
++        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
+         vmovups __sAbsMask(%rax), %zmm5
+         vmovups __sInvPI(%rax), %zmm1
+         vmovups __sRShifter(%rax), %zmm2
+@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
+         jmp       .LBL_2_7
+ #endif
+ END (_ZGVeN16v_sinf_skx)
+-
+-	.section .rodata, "a"
+-.L_2il0floatpacket.11:
+-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+-	.type	.L_2il0floatpacket.11,@object
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-55.patch b/glibc-RHEL-15696-55.patch
new file mode 100644
index 0000000..d44eef1
--- /dev/null
+++ b/glibc-RHEL-15696-55.patch
@@ -0,0 +1,48 @@
+From fc5bd179ef3a953dff8d1655bd530d0e230ffe71 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 21 Sep 2021 18:31:49 -0500
+Subject: [PATCH] x86: Modify ENTRY in sysdep.h so that p2align can be
+ specified
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+This change adds a new macro ENTRY_P2ALIGN which takes a second
+argument, log2 of the desired function alignment.
+
+The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
+doesn't affect any existing functionality.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86/sysdep.h | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index 01bac0f6..a70bb3a2 100644
+--- a/sysdeps/x86/sysdep.h
++++ b/sysdeps/x86/sysdep.h
+@@ -78,15 +78,18 @@ enum cf_protection_level
+ #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
+ 
+ /* Define an entry point visible from C.  */
+-#define	ENTRY(name)							      \
++#define	ENTRY_P2ALIGN(name, alignment)					      \
+   .globl C_SYMBOL_NAME(name);						      \
+   .type C_SYMBOL_NAME(name),@function;					      \
+-  .align ALIGNARG(4);							      \
++  .align ALIGNARG(alignment);						      \
+   C_LABEL(name)								      \
+   cfi_startproc;							      \
+   _CET_ENDBR;								      \
+   CALL_MCOUNT
+ 
++/* Common entry 16 byte aligns.  */
++#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
++
+ #undef	END
+ #define END(name)							      \
+   cfi_endproc;								      \
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-56.patch b/glibc-RHEL-15696-56.patch
new file mode 100644
index 0000000..45b9975
--- /dev/null
+++ b/glibc-RHEL-15696-56.patch
@@ -0,0 +1,658 @@
+From 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 21 Sep 2021 18:45:03 -0500
+Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and
+ size
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+The frontend optimizations are to:
+1. Reorganize logically connected basic blocks so they are either in
+   the same cache line or adjacent cache lines.
+2. Avoid cases when basic blocks unnecessarily cross cache lines.
+3. Try and 32 byte align any basic blocks possible without sacrificing
+   code size. Smaller / Less hot basic blocks are used for this.
+
+Overall code size shrunk by 168 bytes. This should make up for any
+extra costs due to aligning to 64 bytes.
+
+In general, performance before deviated a great deal depending on
+whether entry alignment % 64 was 0, 16, 32, or 48. These changes
+essentially make it so that the current implementation is at least
+equal to the best alignment of the original for any arguments.
+
+The only additional optimization is in the page cross case. The branch
+on equals was removed from the size == [4, 7] case. As well, the [4,
+7] and [2, 3] cases were swapped, as [4, 7] is likely a hotter
+argument size.
+
+test-memcmp and test-wmemcmp are both passing.
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++--------
+ 1 file changed, 242 insertions(+), 192 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 654dc7ac..2761b54f 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -34,7 +34,24 @@
+       area.
+    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+-   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
++   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
++
++When possible the implementation tries to optimize for frontend in the
++following ways:
++Throughput:
++    1. All code sections that fit are able to run optimally out of the
++       LSD.
++    2. All code sections that fit are able to run optimally out of the
++       DSB
++    3. Basic blocks are contained in minimum number of fetch blocks
++       necessary.
++
++Latency:
++    1. Logically connected basic blocks are put in the same
++       cache-line.
++    2. Logically connected basic blocks that do not fit in the same
++       cache-line are put in adjacent lines. This can get beneficial
++       L2 spatial prefetching and L1 next-line prefetching.  */
+ 
+ # include <sysdep.h>
+ 
+@@ -47,9 +64,11 @@
+ # ifdef USE_AS_WMEMCMP
+ #  define CHAR_SIZE	4
+ #  define VPCMP	vpcmpd
++#  define VPTEST	vptestmd
+ # else
+ #  define CHAR_SIZE	1
+ #  define VPCMP	vpcmpub
++#  define VPTEST	vptestmb
+ # endif
+ 
+ # define VEC_SIZE	32
+@@ -75,7 +94,9 @@
+ */
+ 
+ 	.section .text.evex,"ax",@progbits
+-ENTRY (MEMCMP)
++/* Cache align memcmp entry. This allows for much more thorough
++   frontend optimization.  */
++ENTRY_P2ALIGN (MEMCMP, 6)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
+ 	VPCMP	$4, (%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	/* NB: eax must be destination register if going to
+-	   L(return_vec_[0,2]). For L(return_vec_3 destination register
++	   L(return_vec_[0,2]). For L(return_vec_3) destination register
+ 	   must be ecx.  */
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_0)
+@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_3)
+ 
+-	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
+-	   compare with zero to get a mask is needed.  */
+-	vpxorq	%XMM0, %XMM0, %XMM0
+-
+ 	/* Go to 4x VEC loop.  */
+ 	cmpq	$(CHAR_PER_VEC * 8), %rdx
+ 	ja	L(more_8x_vec)
+@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+ 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+-	   oring with YMM3. Result is stored in YMM4.  */
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
+-	VPCMP	$4, %YMM4, %YMM0, %k1
++	   oring with YMM1. Result is stored in YMM4.  */
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
++
++	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
++	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
++
++	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
++	 */
++	VPTEST	%YMM4, %YMM4, %k1
++	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_0_1_2_3)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+-	/* NB: aligning 32 here allows for the rest of the jump targets
+-	   to be tuned for 32 byte alignment. Most important this ensures
+-	   the L(more_8x_vec) loop is 32 byte aligned.  */
+-	.p2align 5
+-L(less_vec):
+-	/* Check if one or less CHAR. This is necessary for size = 0 but
+-	   is also faster for size = CHAR_SIZE.  */
+-	cmpl	$1, %edx
+-	jbe	L(one_or_less)
++	.p2align 4
++L(8x_end_return_vec_0_1_2_3):
++	movq	%rdx, %rdi
++L(8x_return_vec_0_1_2_3):
++	addq	%rdi, %rsi
++L(return_vec_0_1_2_3):
++	VPTEST	%YMM1, %YMM1, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
+ 
+-	/* Check if loading one VEC from either s1 or s2 could cause a
+-	   page cross. This can have false positives but is by far the
+-	   fastest method.  */
+-	movl	%edi, %eax
+-	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(page_cross_less_vec)
++	VPTEST	%YMM2, %YMM2, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_1)
+ 
+-	/* No page cross possible.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMP	$4, (%rdi), %YMM2, %k1
+-	kmovd	%k1, %eax
+-	/* Create mask in ecx for potentially in bound matches.  */
+-	bzhil	%edx, %eax, %eax
+-	jnz	L(return_vec_0)
++	VPTEST	%YMM3, %YMM3, %k0
++	kmovd	%k0, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_2)
++L(return_vec_3):
++	/* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
++	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
++	   line.  */
++	bsfl	%ecx, %ecx
++# ifdef USE_AS_WMEMCMP
++	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++# endif
+ 	ret
+ 
+ 	.p2align 4
+@@ -209,10 +240,11 @@ L(return_vec_0):
+ # endif
+ 	ret
+ 
+-	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
+-	   which is good enough for a target not in a loop.  */
++	.p2align 4
+ L(return_vec_1):
+-	tzcntl	%eax, %eax
++	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
++	   fetch block.  */
++	bsfl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+ 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+@@ -226,10 +258,11 @@ L(return_vec_1):
+ # endif
+ 	ret
+ 
+-	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
+-	   which is good enough for a target not in a loop.  */
++	.p2align 4,, 10
+ L(return_vec_2):
+-	tzcntl	%eax, %eax
++	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
++	   fetch block.  */
++	bsfl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+ 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+@@ -243,40 +276,6 @@ L(return_vec_2):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(8x_return_vec_0_1_2_3):
+-	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+-	addq	%rdi, %rsi
+-L(return_vec_0_1_2_3):
+-	VPCMP	$4, %YMM1, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-
+-	VPCMP	$4, %YMM2, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_1)
+-
+-	VPCMP	$4, %YMM3, %YMM0, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_2)
+-L(return_vec_3):
+-	tzcntl	%ecx, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+-	xorl	%edx, %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+ 	.p2align 4
+ L(more_8x_vec):
+ 	/* Set end of s1 in rdx.  */
+@@ -288,21 +287,19 @@ L(more_8x_vec):
+ 	andq	$-VEC_SIZE, %rdi
+ 	/* Adjust because first 4x vec where check already.  */
+ 	subq	$-(VEC_SIZE * 4), %rdi
++
+ 	.p2align 4
+ L(loop_4x_vec):
+ 	VMOVU	(%rsi, %rdi), %YMM1
+ 	vpxorq	(%rdi), %YMM1, %YMM1
+-
+ 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
+ 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
+-
+ 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+-
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+-	VPCMP	$4, %YMM4, %YMM0, %k1
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
++	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
++	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(8x_return_vec_0_1_2_3)
+@@ -319,28 +316,25 @@ L(loop_4x_vec):
+ 	cmpl	$(VEC_SIZE * 2), %edi
+ 	jae	L(8x_last_2x_vec)
+ 
++	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
++
+ 	VMOVU	(%rsi, %rdx), %YMM1
+ 	vpxorq	(%rdx), %YMM1, %YMM1
+ 
+ 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+ 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+-
+-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+-
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
+-	VPCMP	$4, %YMM4, %YMM0, %k1
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
++	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
++	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+-	/* Restore s1 pointer to rdi.  */
+-	movq	%rdx, %rdi
+ 	testl	%ecx, %ecx
+-	jnz	L(8x_return_vec_0_1_2_3)
++	jnz	L(8x_end_return_vec_0_1_2_3)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+ 	/* Only entry is from L(more_8x_vec).  */
+-	.p2align 4
++	.p2align 4,, 10
+ L(8x_last_2x_vec):
+ 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ 	kmovd	%k1, %eax
+@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
+ 	jnz	L(8x_return_vec_3)
+ 	ret
+ 
+-	.p2align 4
++	/* Not ideally aligned (at offset +9 bytes in fetch block) but
++	   not aligning keeps it in the same cache line as
++	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
++	   size.  */
++	.p2align 4,, 4
++L(8x_return_vec_2):
++	subq	$VEC_SIZE, %rdx
++L(8x_return_vec_3):
++	bsfl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	leaq	(%rdx, %rax, CHAR_SIZE), %rax
++	movl	(VEC_SIZE * 3)(%rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	addq	%rdx, %rax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	movzbl	(VEC_SIZE * 3)(%rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
++
++	.p2align 4,, 10
+ L(last_2x_vec):
+ 	/* Check second to last VEC.  */
+ 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+@@ -374,26 +392,49 @@ L(last_1x_vec):
+ 	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4
+-L(8x_return_vec_2):
+-	subq	$VEC_SIZE, %rdx
+-L(8x_return_vec_3):
+-	tzcntl	%eax, %eax
++	.p2align 4,, 10
++L(return_vec_1_end):
++	/* Use bsf to save code size. This is necessary to have
++	   L(one_or_less) fit in aligning bytes between.  */
++	bsfl	%eax, %eax
++	addl	%edx, %eax
+ # ifdef USE_AS_WMEMCMP
+-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+-	movl	(VEC_SIZE * 3)(%rax), %ecx
++	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
++	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+ 	setg	%dl
+ 	leal	-1(%rdx, %rdx), %eax
+ # else
+-	addq	%rdx, %rax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+-	movzbl	(VEC_SIZE * 3)(%rax), %eax
++	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
++	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
++	/* NB: L(one_or_less) fits in alignment padding between
++	   L(return_vec_1_end) and L(return_vec_0_end).  */
++# ifdef USE_AS_WMEMCMP
++L(one_or_less):
++	jb	L(zero)
++	movl	(%rdi), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi), %ecx
++	je	L(zero)
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++	ret
++# else
++L(one_or_less):
++	jb	L(zero)
++	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
++	subl	%ecx, %eax
++	ret
++# endif
++L(zero):
++	xorl	%eax, %eax
++	ret
++
+ 	.p2align 4
+ L(return_vec_0_end):
+ 	tzcntl	%eax, %eax
+@@ -412,23 +453,56 @@ L(return_vec_0_end):
+ 	ret
+ 
+ 	.p2align 4
+-L(return_vec_1_end):
++L(less_vec):
++	/* Check if one or less CHAR. This is necessary for size == 0
++	   but is also faster for size == CHAR_SIZE.  */
++	cmpl	$1, %edx
++	jbe	L(one_or_less)
++
++	/* Check if loading one VEC from either s1 or s2 could cause a
++	   page cross. This can have false positives but is by far the
++	   fastest method.  */
++	movl	%edi, %eax
++	orl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
++	jg	L(page_cross_less_vec)
++
++	/* No page cross possible.  */
++	VMOVU	(%rsi), %YMM2
++	VPCMP	$4, (%rdi), %YMM2, %k1
++	kmovd	%k1, %eax
++	/* Check if any matches where in bounds. Intentionally not
++	   storing result in eax to limit dependency chain if it goes to
++	   L(return_vec_0_lv).  */
++	bzhil	%edx, %eax, %edx
++	jnz	L(return_vec_0_lv)
++	xorl	%eax, %eax
++	ret
++
++	/* Essentially duplicate of L(return_vec_0). Ends up not costing
++	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
++	   the jump and ends up fitting in aligning bytes. As well fits on
++	   same cache line as L(less_vec) so also saves a line from having
++	   to be fetched on cold calls to memcmp.  */
++	.p2align 4,, 4
++L(return_vec_0_lv):
+ 	tzcntl	%eax, %eax
+-	addl	%edx, %eax
+ # ifdef USE_AS_WMEMCMP
+-	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
++	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+ 	xorl	%edx, %edx
+-	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
++	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
++	/* NB: no partial register stall here because xorl zero idiom
++	   above.  */
+ 	setg	%dl
+ 	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+-	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
++	movzbl	(%rsi, %rax), %ecx
++	movzbl	(%rdi, %rax), %eax
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
+-
+ 	.p2align 4
+ L(page_cross_less_vec):
+ 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+@@ -439,108 +513,84 @@ L(page_cross_less_vec):
+ 	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+ 	cmpl	$4, %edx
+-	jae	L(between_4_7)
+-L(between_2_3):
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
+-	ret
+-	.p2align 4
+-L(one_or_less):
+-	jb	L(zero)
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
++	jb	L(between_2_3)
++
++	/* Load as big endian with overlapping movbe to avoid branches.
++	 */
++	movbe	(%rdi), %eax
++	movbe	(%rsi), %ecx
++	shlq	$32, %rax
++	shlq	$32, %rcx
++	movbe	-4(%rdi, %rdx), %edi
++	movbe	-4(%rsi, %rdx), %esi
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	/* edx is guranteed to be positive int32 in range [4, 7].  */
++	cmovne	%edx, %eax
++	/* ecx is -1 if rcx > rax. Otherwise 0.  */
++	sbbl	%ecx, %ecx
++	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
++	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
++	   eax doesn't matter.  */
++	orl	%ecx, %eax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 4,, 8
+ L(between_8_15):
+ # endif
+ 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+-	vmovq	(%rdi), %XMM1
+-	vmovq	(%rsi), %XMM2
+-	VPCMP	$4, %XMM1, %XMM2, %k1
++	vmovq	(%rdi), %xmm1
++	vmovq	(%rsi), %xmm2
++	VPCMP	$4, %xmm1, %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
++	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
+-	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
+-	vmovq	(%rdi), %XMM1
+-	vmovq	(%rsi), %XMM2
+-	VPCMP	$4, %XMM1, %XMM2, %k1
++	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
++	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
++	VPCMP	$4, %xmm1, %xmm2, %k1
++	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-	ret
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
++	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4
++	.p2align 4,, 8
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-	VMOVU	(%rsi), %XMM2
+-	VPCMP	$4, (%rdi), %XMM2, %k1
++
++	/* Use movups to save code size.  */
++	movups	(%rsi), %xmm2
++	VPCMP	$4, (%rdi), %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-
++	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-
+-	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
+-	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
+-	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
+-	VPCMP	$4, (%rdi), %XMM2, %k1
++	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
++	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
++	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(return_vec_0)
+-	ret
+-
+-# ifdef USE_AS_WMEMCMP
+-	.p2align 4
+-L(one_or_less):
+-	jb	L(zero)
+-	movl	(%rdi), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi), %ecx
+-	je	L(zero)
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
++	jnz	L(return_vec_0_end)
+ 	ret
+-# else
+ 
+-	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	jz	L(zero_4_7)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-L(zero_4_7):
++# ifndef USE_AS_WMEMCMP
++L(between_2_3):
++	/* Load as big endian to avoid branches.  */
++	movzwl	(%rdi), %eax
++	movzwl	(%rsi), %ecx
++	shll	$8, %eax
++	shll	$8, %ecx
++	bswap	%eax
++	bswap	%ecx
++	movzbl	-1(%rdi, %rdx), %edi
++	movzbl	-1(%rsi, %rdx), %esi
++	orl	%edi, %eax
++	orl	%esi, %ecx
++	/* Subtraction is okay because the upper 8 bits are zero.  */
++	subl	%ecx, %eax
+ 	ret
+ # endif
+-
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-57.patch b/glibc-RHEL-15696-57.patch
new file mode 100644
index 0000000..51d5dd0
--- /dev/null
+++ b/glibc-RHEL-15696-57.patch
@@ -0,0 +1,510 @@
+From e59ced238482fd71f3e493717f14f6507346741e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 20 Sep 2021 16:20:15 -0500
+Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+Optimizations are:
+
+1. change control flow for L(more_2x_vec) to fall through to loop and
+   jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
+   size and saves jumps for length > 4x VEC_SIZE.
+
+2. For EVEX/AVX512 move L(less_vec) closer to entry.
+
+3. Avoid complex address mode for length > 2x VEC_SIZE
+
+4. Slightly better aligning code for the loop from the perspective of
+   code size and uops.
+
+5. Align targets so they make full use of their fetch block and if
+   possible cache line.
+
+6. Try and reduce total number of icache lines that will need to be
+   pulled in for a given length.
+
+7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
+   jumping to the stosb target in the sse2 code section will almost
+   certainly be to a new page. The new version does increase code size
+   marginally by duplicating the target but should get better iTLB
+   behavior as a result.
+
+test-memset, test-wmemset, and test-bzero are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/memset.S                       |  10 +-
+ .../multiarch/memset-avx2-unaligned-erms.S    |  10 +-
+ .../multiarch/memset-avx512-unaligned-erms.S  |  11 +-
+ .../multiarch/memset-evex-unaligned-erms.S    |  11 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 285 ++++++++++++------
+ 5 files changed, 232 insertions(+), 95 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/memset.S
+	(GNU URL)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index b3426795..8672b030 100644
+--- a/sysdeps/x86_64/memset.S
++++ b/sysdeps/x86_64/memset.S
+@@ -18,13 +18,15 @@
+    <http://www.gnu.org/licenses/>.  */
+ 
+ #include <sysdep.h>
++#define USE_WITH_SSE2	1
+ 
+ #define VEC_SIZE	16
++#define MOV_SIZE	3
++#define RET_SIZE	1
++
+ #define VEC(i)		xmm##i
+-/* Don't use movups and movaps since it will get larger nop paddings for
+-   alignment.  */
+-#define VMOVU		movdqu
+-#define VMOVA		movdqa
++#define VMOVU     movups
++#define VMOVA     movaps
+ 
+ #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index ae0860f3..1af668af 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -1,8 +1,14 @@
+ #if IS_IN (libc)
++# define USE_WITH_AVX2	1
++
+ # define VEC_SIZE	32
++# define MOV_SIZE	4
++# define RET_SIZE	4
++
+ # define VEC(i)		ymm##i
+-# define VMOVU		vmovdqu
+-# define VMOVA		vmovdqa
++
++# define VMOVU     vmovdqu
++# define VMOVA     vmovdqa
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+   vmovd d, %xmm0; \
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 8ad842fc..f14d6f84 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -1,11 +1,18 @@
+ #if IS_IN (libc)
++# define USE_WITH_AVX512	1
++
+ # define VEC_SIZE	64
++# define MOV_SIZE	6
++# define RET_SIZE	1
++
+ # define XMM0		xmm16
+ # define YMM0		ymm16
+ # define VEC0		zmm16
+ # define VEC(i)		VEC##i
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
++
++# define VMOVU     vmovdqu64
++# define VMOVA     vmovdqa64
++
+ # define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index 640f0929..64b09e77 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -1,11 +1,18 @@
+ #if IS_IN (libc)
++# define USE_WITH_EVEX	1
++
+ # define VEC_SIZE	32
++# define MOV_SIZE	6
++# define RET_SIZE	1
++
+ # define XMM0		xmm16
+ # define YMM0		ymm16
+ # define VEC0		ymm16
+ # define VEC(i)		VEC##i
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
++
++# define VMOVU     vmovdqu64
++# define VMOVA     vmovdqa64
++
+ # define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 909c33f6..f08b7323 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -63,8 +63,27 @@
+ # endif
+ #endif
+ 
++#if VEC_SIZE == 64
++# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
++#else
++# define LOOP_4X_OFFSET	(0)
++#endif
++
++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
++# define END_REG	rcx
++# define LOOP_REG	rdi
++#else
++# define END_REG	rdi
++# define LOOP_REG	rdx
++#endif
++
+ #define PAGE_SIZE 4096
+ 
++/* Macro to calculate size of small memset block for aligning
++   purposes.  */
++#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
++
++
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -74,6 +93,7 @@
+ ENTRY (__bzero)
+ 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+ 	mov	%RSI_LP, %RDX_LP /* Set n.  */
++	xorl	%esi, %esi
+ 	pxor	%XMM0, %XMM0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
++ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	jb	L(less_vec)
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(0), (%rdi)
++	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
++	 */
++	VMOVU	%VEC(0), (%rax)
++	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-
+-	.p2align 4
+-L(stosb_more_2x_vec):
+-	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+-	ja	L(stosb)
+-#else
+-	.p2align 4
+ #endif
+-L(more_2x_vec):
+-	/* Stores to first 2x VEC before cmp as any path forward will
+-	   require it.  */
+-	VMOVU	%VEC(0), (%rdi)
+-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	ja	L(loop_start)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-L(return):
+-#if VEC_SIZE > 16
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
++
++	.p2align 4,, 10
++L(last_2x_vec):
++#ifdef USE_LESS_VEC_MASK_STORE
++	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
++	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+ #else
+-	ret
++	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
++	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+ #endif
++	VZEROUPPER_RETURN
+ 
+-L(loop_start):
+-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	jbe	L(loop_end)
+-	andq	$-(VEC_SIZE * 2), %rdi
+-	subq	$-(VEC_SIZE * 4), %rdi
+-	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
+-	.p2align 4
+-L(loop):
+-	VMOVA	%VEC(0), (%rdi)
+-	VMOVA	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	subq	$-(VEC_SIZE * 4), %rdi
+-	cmpq	%rcx, %rdi
+-	jb	L(loop)
+-L(loop_end):
+-	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+-	       rdx as length is also unchanged.  */
+-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+-	VZEROUPPER_SHORT_RETURN
+-
+-	.p2align 4
++	/* If have AVX512 mask instructions put L(less_vec) close to
++	   entry as it doesn't take much space and is likely a hot target.
++	 */
++#ifdef USE_LESS_VEC_MASK_STORE
++	.p2align 4,, 10
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+ # endif
+-# ifdef USE_LESS_VEC_MASK_STORE
+ 	/* Clear high bits from edi. Only keeping bits relevant to page
+ 	   cross check. Note that we are using rax which is set in
+-	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
+-	 */
++	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
+ 	andl	$(PAGE_SIZE - 1), %edi
+-	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
+-	   performance degradation when it has to fault supress.  */
++	/* Check if VEC_SIZE store cross page. Mask stores suffer
++	   serious performance degradation when it has to fault supress.
++	 */
+ 	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
++	/* This is generally considered a cold target.  */
+ 	ja	L(cross_page)
+ # if VEC_SIZE > 32
+ 	movq	$-1, %rcx
+@@ -247,58 +235,185 @@ L(less_vec):
+ 	bzhil	%edx, %ecx, %ecx
+ 	kmovd	%ecx, %k1
+ # endif
+-	vmovdqu8	%VEC(0), (%rax) {%k1}
++	vmovdqu8 %VEC(0), (%rax){%k1}
+ 	VZEROUPPER_RETURN
+ 
++# if defined USE_MULTIARCH && IS_IN (libc)
++	/* Include L(stosb_local) here if including L(less_vec) between
++	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
++	   L(stosb_more_2x_vec) target.  */
++	.p2align 4,, 10
++L(stosb_local):
++	movzbl	%sil, %eax
++	mov	%RDX_LP, %RCX_LP
++	mov	%RDI_LP, %RDX_LP
++	rep	stosb
++	mov	%RDX_LP, %RAX_LP
++	VZEROUPPER_RETURN
++# endif
++#endif
++
++#if defined USE_MULTIARCH && IS_IN (libc)
+ 	.p2align 4
+-L(cross_page):
++L(stosb_more_2x_vec):
++	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
++	ja	L(stosb_local)
++#endif
++	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
++	   and (4x, 8x] jump to target.  */
++L(more_2x_vec):
++
++	/* Two different methods of setting up pointers / compare. The
++	   two methods are based on the fact that EVEX/AVX512 mov
++	   instructions take more bytes then AVX2/SSE2 mov instructions. As
++	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
++	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
++	   this saves code size and keeps a few targets in one fetch block.
++	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
++	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
++	   LOOP_4X_OFFSET) with LEA_BID.  */
++
++	/* END_REG is rcx for EVEX/AVX512.  */
++	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
++#endif
++
++	/* Stores to first 2x VEC before cmp as any path forward will
++	   require it.  */
++	VMOVU	%VEC(0), (%rax)
++	VMOVU	%VEC(0), VEC_SIZE(%rax)
++
++
++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
++	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
++	addq	%rdx, %END_REG
++#endif
++
++	cmpq	$(VEC_SIZE * 4), %rdx
++	jbe	L(last_2x_vec)
++
++	/* Store next 2x vec regardless.  */
++	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
++	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
++
++
++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
++	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
++	   extra offset to addresses in loop. Used for AVX512 to save space
++	   as no way to get (VEC_SIZE * 4) in imm8.  */
++# if LOOP_4X_OFFSET == 0
++	subq	$-(VEC_SIZE * 4), %LOOP_REG
+ # endif
+-# if VEC_SIZE > 32
+-	cmpb	$32, %dl
+-	jae	L(between_32_63)
++	/* Avoid imm32 compare here to save code size.  */
++	cmpq	%rdi, %rcx
++#else
++	addq	$-(VEC_SIZE * 4), %END_REG
++	cmpq	$(VEC_SIZE * 8), %rdx
++#endif
++	jbe	L(last_4x_vec)
++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
++	/* Set LOOP_REG (rdx).  */
++	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
++#endif
++	/* Align dst for loop.  */
++	andq	$(VEC_SIZE * -2), %LOOP_REG
++	.p2align 4
++L(loop):
++	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
++	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
++	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
++	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
++	subq	$-(VEC_SIZE * 4), %LOOP_REG
++	cmpq	%END_REG, %LOOP_REG
++	jb	L(loop)
++	.p2align 4,, MOV_SIZE
++L(last_4x_vec):
++	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
++	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
++	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
++	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
++L(return):
++#if VEC_SIZE > 16
++	ZERO_UPPER_VEC_REGISTERS_RETURN
++#else
++	ret
++#endif
++
++	.p2align 4,, 10
++#ifndef USE_LESS_VEC_MASK_STORE
++# if defined USE_MULTIARCH && IS_IN (libc)
++	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
++	   range for 2-byte jump encoding.  */
++L(stosb_local):
++	movzbl	%sil, %eax
++	mov	%RDX_LP, %RCX_LP
++	mov	%RDI_LP, %RDX_LP
++	rep	stosb
++	mov	%RDX_LP, %RAX_LP
++	VZEROUPPER_RETURN
+ # endif
+-# if VEC_SIZE > 16
+-	cmpb	$16, %dl
++	/* Define L(less_vec) only if not otherwise defined.  */
++	.p2align 4
++L(less_vec):
++#endif
++L(cross_page):
++#if VEC_SIZE > 32
++	cmpl	$32, %edx
++	jae	L(between_32_63)
++#endif
++#if VEC_SIZE > 16
++	cmpl	$16, %edx
+ 	jae	L(between_16_31)
+-# endif
+-	MOVQ	%XMM0, %rcx
+-	cmpb	$8, %dl
++#endif
++	MOVQ	%XMM0, %rdi
++	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+-	cmpb	$4, %dl
++	cmpl	$4, %edx
+ 	jae	L(between_4_7)
+-	cmpb	$1, %dl
++	cmpl	$1, %edx
+ 	ja	L(between_2_3)
+-	jb	1f
+-	movb	%cl, (%rax)
+-1:
++	jb	L(return)
++	movb	%sil, (%rax)
+ 	VZEROUPPER_RETURN
+-# if VEC_SIZE > 32
++
++	/* Align small targets only if not doing so would cross a fetch
++	   line.  */
++#if VEC_SIZE > 32
++	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, -32(%rax,%rdx)
+ 	VMOVU	%YMM0, (%rax)
++	VMOVU	%YMM0, -32(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-# endif
+-# if VEC_SIZE > 16
+-	/* From 16 to 31.  No branch when size == 16.  */
++#endif
++
++#if VEC_SIZE >= 32
++	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ L(between_16_31):
+-	VMOVU	%XMM0, -16(%rax,%rdx)
++	/* From 16 to 31.  No branch when size == 16.  */
+ 	VMOVU	%XMM0, (%rax)
++	VMOVU	%XMM0, -16(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+-# endif
+-	/* From 8 to 15.  No branch when size == 8.  */
++#endif
++
++	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ L(between_8_15):
+-	movq	%rcx, -8(%rax,%rdx)
+-	movq	%rcx, (%rax)
++	/* From 8 to 15.  No branch when size == 8.  */
++	movq	%rdi, (%rax)
++	movq	%rdi, -8(%rax, %rdx)
+ 	VZEROUPPER_RETURN
++
++	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%ecx, -4(%rax,%rdx)
+-	movl	%ecx, (%rax)
++	movl	%edi, (%rax)
++	movl	%edi, -4(%rax, %rdx)
+ 	VZEROUPPER_RETURN
++
++	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%cx, -2(%rax,%rdx)
+-	movw	%cx, (%rax)
++	movw	%di, (%rax)
++	movb	%dil, -1(%rax, %rdx)
+ 	VZEROUPPER_RETURN
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-58.patch b/glibc-RHEL-15696-58.patch
new file mode 100644
index 0000000..cec0788
--- /dev/null
+++ b/glibc-RHEL-15696-58.patch
@@ -0,0 +1,45 @@
+From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sat, 23 Oct 2021 01:26:47 -0400
+Subject: [PATCH] x86: Replace sse2 instructions with avx in
+ memcmp-evex-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
+
+It could potentially be dangerous to use SSE2 if this function is ever
+called without using 'vzeroupper' beforehand. While compilers appear
+to use 'vzeroupper' before function calls if AVX2 has been used, using
+SSE2 here is more brittle. Since it is not absolutely necessary it
+should be avoided.
+
+It costs 2 extra bytes but the extra bytes should only eat into
+alignment padding.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 2761b54f..640f6757 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -561,13 +561,13 @@ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 
+ 	/* Use movups to save code size.  */
+-	movups	(%rsi), %xmm2
++	vmovdqu	(%rsi), %xmm2
+ 	VPCMP	$4, (%rdi), %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
++	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+ 	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+ 	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-59.patch b/glibc-RHEL-15696-59.patch
new file mode 100644
index 0000000..efc618c
--- /dev/null
+++ b/glibc-RHEL-15696-59.patch
@@ -0,0 +1,695 @@
+From c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 29 Oct 2021 12:40:20 -0700
+Subject: [PATCH] x86-64: Improve EVEX strcmp with masked load
+Content-type: text/plain; charset=UTF-8
+
+In strcmp-evex.S, to compare 2 32-byte strings, replace
+
+        VMOVU   (%rdi, %rdx), %YMM0
+        VMOVU   (%rsi, %rdx), %YMM1
+        /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+        VPCMP   $4, %YMM0, %YMM1, %k0
+        VPCMP   $0, %YMMZERO, %YMM0, %k1
+        VPCMP   $0, %YMMZERO, %YMM1, %k2
+        /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+        kord    %k1, %k2, %k1
+        /* Each bit in K1 represents a NULL or a mismatch.  */
+        kord    %k0, %k1, %k1
+        kmovd   %k1, %ecx
+        testl   %ecx, %ecx
+        jne     L(last_vector)
+
+with
+
+        VMOVU   (%rdi, %rdx), %YMM0
+        VPTESTM %YMM0, %YMM0, %k2
+        /* Each bit cleared in K1 represents a mismatch or a null CHAR
+           in YMM0 and 32 bytes at (%rsi, %rdx).  */
+        VPCMP   $0, (%rsi, %rdx), %YMM0, %k1{%k2}
+        kmovd   %k1, %ecx
+        incl    %ecx
+        jne     L(last_vector)
+
+It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
+and Ice Lake.
+
+Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 461 +++++++++++++------------
+ 1 file changed, 243 insertions(+), 218 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index d5aa6daa..82f12ac8 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -41,6 +41,8 @@
+ # ifdef USE_AS_WCSCMP
+ /* Compare packed dwords.  */
+ #  define VPCMP		vpcmpd
++#  define VPMINU	vpminud
++#  define VPTESTM	vptestmd
+ #  define SHIFT_REG32	r8d
+ #  define SHIFT_REG64	r8
+ /* 1 dword char == 4 bytes.  */
+@@ -48,6 +50,8 @@
+ # else
+ /* Compare packed bytes.  */
+ #  define VPCMP		vpcmpb
++#  define VPMINU	vpminub
++#  define VPTESTM	vptestmb
+ #  define SHIFT_REG32	ecx
+ #  define SHIFT_REG64	rcx
+ /* 1 byte char == 1 byte.  */
+@@ -67,6 +71,9 @@
+ # define YMM5		ymm22
+ # define YMM6		ymm23
+ # define YMM7		ymm24
++# define YMM8		ymm25
++# define YMM9		ymm26
++# define YMM10		ymm27
+ 
+ /* Warning!
+            wcscmp/wcsncmp have to use SIGNED comparison for elements.
+@@ -76,7 +83,7 @@
+ /* The main idea of the string comparison (byte or dword) using 256-bit
+    EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
+    latter can be on either packed bytes or dwords depending on
+-   USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
++   USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
+    matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
+    KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
+    are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
+@@ -123,27 +130,21 @@ ENTRY (STRCMP)
+ 	jg	L(cross_page)
+ 	/* Start comparing 4 vectors.  */
+ 	VMOVU	(%rdi), %YMM0
+-	VMOVU	(%rsi), %YMM1
+ 
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
++	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
++	VPTESTM	%YMM0, %YMM0, %k2
+ 
+-	/* Check for NULL in YMM0.  */
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	/* Check for NULL in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at (%rsi).  */
++	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+ 
+-	/* Each bit in K1 represents:
+-	   1. A mismatch in YMM0 and YMM1.  Or
+-	   2. A NULL in YMM0 or YMM1.
+-	 */
+-	kord	%k0, %k1, %k1
+-
+-	ktestd	%k1, %k1
+-	je	L(next_3_vectors)
+ 	kmovd	%k1, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
++	je	L(next_3_vectors)
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -172,9 +173,7 @@ L(return):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -210,9 +209,7 @@ L(return_vec_size):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_2_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -248,9 +245,7 @@ L(return_2_vec_size):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+ L(return_3_vec_size):
+-	kmovd	%k1, %ecx
+ 	tzcntl	%ecx, %edx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -289,43 +284,45 @@ L(return_3_vec_size):
+ 	.p2align 4
+ L(next_3_vectors):
+ 	VMOVU	VEC_SIZE(%rdi), %YMM0
+-	VMOVU	VEC_SIZE(%rsi), %YMM1
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
++	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
++	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	jne	L(return_vec_size)
+ 
+-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM2
+-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM3
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM4
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM5
+-
+-	/* Each bit in K0 represents a mismatch in YMM2 and YMM4.  */
+-	VPCMP	$4, %YMM2, %YMM4, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM4, %k2
+-	/* Each bit in K1 represents a NULL in YMM2 or YMM4.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
++	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
++	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
++	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	jne	L(return_2_vec_size)
+ 
+-	/* Each bit in K0 represents a mismatch in YMM3 and YMM5.  */
+-	VPCMP	$4, %YMM3, %YMM5, %k0
+-	VPCMP	$0, %YMMZERO, %YMM3, %k1
+-	VPCMP	$0, %YMMZERO, %YMM5, %k2
+-	/* Each bit in K1 represents a NULL in YMM3 or YMM5.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	ktestd	%k1, %k1
++	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
++	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
++	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	jne	L(return_3_vec_size)
+ L(main_loop_header):
+ 	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+@@ -375,56 +372,51 @@ L(back_to_loop):
+ 	VMOVA	VEC_SIZE(%rax), %YMM2
+ 	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
+ 	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
+-	VMOVU	(%rdx), %YMM1
+-	VMOVU	VEC_SIZE(%rdx), %YMM3
+-	VMOVU	(VEC_SIZE * 2)(%rdx), %YMM5
+-	VMOVU	(VEC_SIZE * 3)(%rdx), %YMM7
+-
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K4 represents a NULL or a mismatch in YMM0 and
+-	   YMM1.  */
+-	kord	%k0, %k1, %k4
+-
+-	VPCMP	$4, %YMM2, %YMM3, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM3, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K5 represents a NULL or a mismatch in YMM2 and
+-	   YMM3.  */
+-	kord	%k0, %k1, %k5
+-
+-	VPCMP	$4, %YMM4, %YMM5, %k0
+-	VPCMP	$0, %YMMZERO, %YMM4, %k1
+-	VPCMP	$0, %YMMZERO, %YMM5, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K6 represents a NULL or a mismatch in YMM4 and
+-	   YMM5.  */
+-	kord	%k0, %k1, %k6
+-
+-	VPCMP	$4, %YMM6, %YMM7, %k0
+-	VPCMP	$0, %YMMZERO, %YMM6, %k1
+-	VPCMP	$0, %YMMZERO, %YMM7, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K7 represents a NULL or a mismatch in YMM6 and
+-	   YMM7.  */
+-	kord	%k0, %k1, %k7
+-
+-	kord	%k4, %k5, %k0
+-	kord	%k6, %k7, %k1
+-
+-	/* Test each mask (32 bits) individually because for VEC_SIZE
+-	   == 32 is not possible to OR the four masks and keep all bits
+-	   in a 64-bit integer register, differing from SSE2 strcmp
+-	   where ORing is possible.  */
+-	kortestd %k0, %k1
+-	je	L(loop)
+-	ktestd	%k4, %k4
++
++	VPMINU	%YMM0, %YMM2, %YMM8
++	VPMINU	%YMM4, %YMM6, %YMM9
++
++	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
++	VPMINU	%YMM8, %YMM9, %YMM8
++
++	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
++	VPTESTM	%YMM8, %YMM8, %k1
++
++	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
++	vpxorq	(%rdx), %YMM0, %YMM1
++	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
++	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
++	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
++
++	vporq	%YMM1, %YMM3, %YMM9
++	vporq	%YMM5, %YMM7, %YMM10
++
++	/* A non-zero CHAR in YMM9 represents a mismatch.  */
++	vporq	%YMM9, %YMM10, %YMM9
++
++	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
++	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
++	kmovd   %k0, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
++	je	 L(loop)
++
++	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
++	VPTESTM	%YMM0, %YMM0, %k1
++	/* Each bit cleared in K0 represents a mismatch or a null CHAR
++	   in YMM0 and (%rdx).  */
++	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
++	kmovd	%k0, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	je	L(test_vec)
+-	kmovd	%k4, %edi
+-	tzcntl	%edi, %ecx
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %ecx
+@@ -466,9 +458,18 @@ L(test_vec):
+ 	cmpq	$VEC_SIZE, %r11
+ 	jbe	L(zero)
+ # endif
+-	ktestd	%k5, %k5
++	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
++	VPTESTM	%YMM2, %YMM2, %k1
++	/* Each bit cleared in K0 represents a mismatch or a null CHAR
++	   in YMM2 and VEC_SIZE(%rdx).  */
++	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
++	kmovd	%k0, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	je	L(test_2_vec)
+-	kmovd	%k5, %ecx
+ 	tzcntl	%ecx, %edi
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -512,9 +513,18 @@ L(test_2_vec):
+ 	cmpq	$(VEC_SIZE * 2), %r11
+ 	jbe	L(zero)
+ # endif
+-	ktestd	%k6, %k6
++	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
++	VPTESTM	%YMM4, %YMM4, %k1
++	/* Each bit cleared in K0 represents a mismatch or a null CHAR
++	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
++	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
++	kmovd	%k0, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	je	L(test_3_vec)
+-	kmovd	%k6, %ecx
+ 	tzcntl	%ecx, %edi
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+@@ -558,8 +568,18 @@ L(test_3_vec):
+ 	cmpq	$(VEC_SIZE * 3), %r11
+ 	jbe	L(zero)
+ # endif
+-	kmovd	%k7, %esi
+-	tzcntl	%esi, %ecx
++	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
++	VPTESTM	%YMM6, %YMM6, %k1
++	/* Each bit cleared in K0 represents a mismatch or a null CHAR
++	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
++	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
++	kmovd	%k0, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %ecx
+@@ -615,39 +635,51 @@ L(loop_cross_page):
+ 
+ 	VMOVU	(%rax, %r10), %YMM2
+ 	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
+-	VMOVU	(%rdx, %r10), %YMM4
+-	VMOVU	VEC_SIZE(%rdx, %r10), %YMM5
+-
+-	VPCMP	$4, %YMM4, %YMM2, %k0
+-	VPCMP	$0, %YMMZERO, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM4, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch in YMM2 and
+-	   YMM4.  */
+-	kord	%k0, %k1, %k1
+-
+-	VPCMP	$4, %YMM5, %YMM3, %k3
+-	VPCMP	$0, %YMMZERO, %YMM3, %k4
+-	VPCMP	$0, %YMMZERO, %YMM5, %k5
+-	kord	%k4, %k5, %k4
+-	/* Each bit in K3 represents a NULL or a mismatch in YMM3 and
+-	   YMM5.  */
+-	kord	%k3, %k4, %k3
++
++	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
++	VPTESTM	%YMM2, %YMM2, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM2 and 32 bytes at (%rdx, %r10).  */
++	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
++	kmovd	%k1, %r9d
++	/* Don't use subl since it is the lower 16/32 bits of RDI
++	   below.  */
++	notl	%r9d
++# ifdef USE_AS_WCSCMP
++	/* Only last 8 bits are valid.  */
++	andl	$0xff, %r9d
++# endif
++
++	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
++	VPTESTM	%YMM3, %YMM3, %k4
++	/* Each bit cleared in K3 represents a mismatch or a null CHAR
++	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
++	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
++	kmovd	%k3, %edi
++# ifdef USE_AS_WCSCMP
++	/* Don't use subl since it is the upper 8 bits of EDI below.  */
++	notl	%edi
++	andl	$0xff, %edi
++# else
++	incl	%edi
++# endif
+ 
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in K1/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k2
++	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
++	sall	$8, %edi
+ 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+ 	   bytes.  */
+ 	movl	%ecx, %SHIFT_REG32
+ 	sarl	$2, %SHIFT_REG32
++
++	/* Each bit in EDI represents a null CHAR or a mismatch.  */
++	orl	%r9d, %edi
+ # else
+-	kshiftlq $32, %k3, %k2
+-# endif
++	salq	$32, %rdi
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rdi
++	/* Each bit in RDI represents a null CHAR or a mismatch.  */
++	orq	%r9, %rdi
++# endif
+ 
+ 	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
+ 	shrxq	%SHIFT_REG64, %rdi, %rdi
+@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec):
+ 	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
+ 	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
+-	VMOVU	(VEC_SIZE * 2)(%rdx, %r10), %YMM2
+-	VMOVU	(VEC_SIZE * 3)(%rdx, %r10), %YMM3
+-
+-	VPCMP	$4, %YMM0, %YMM2, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM2, %k2
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch in YMM0 and
+-	   YMM2.  */
+-	kord	%k0, %k1, %k1
+-
+-	VPCMP	$4, %YMM1, %YMM3, %k3
+-	VPCMP	$0, %YMMZERO, %YMM1, %k4
+-	VPCMP	$0, %YMMZERO, %YMM3, %k5
+-	kord	%k4, %k5, %k4
+-	/* Each bit in K3 represents a NULL or a mismatch in YMM1 and
+-	   YMM3.  */
+-	kord	%k3, %k4, %k3
+ 
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
++	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
++	kmovd	%k1, %r9d
++	/* Don't use subl since it is the lower 16/32 bits of RDI
++	   below.  */
++	notl	%r9d
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in K1/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k2
++	/* Only last 8 bits are valid.  */
++	andl	$0xff, %r9d
++# endif
++
++	VPTESTM	%YMM1, %YMM1, %k4
++	/* Each bit cleared in K3 represents a mismatch or a null CHAR
++	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
++	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
++	kmovd	%k3, %edi
++# ifdef USE_AS_WCSCMP
++	/* Don't use subl since it is the upper 8 bits of EDI below.  */
++	notl	%edi
++	andl	$0xff, %edi
+ # else
+-	kshiftlq $32, %k3, %k2
++	incl	%edi
+ # endif
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rdi
++# ifdef USE_AS_WCSCMP
++	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
++	sall	$8, %edi
++
++	/* Each bit in EDI represents a null CHAR or a mismatch.  */
++	orl	%r9d, %edi
++# else
++	salq	$32, %rdi
++
++	/* Each bit in RDI represents a null CHAR or a mismatch.  */
++	orq	%r9, %rdi
++# endif
+ 
+ 	xorl	%r8d, %r8d
+ 	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
+@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec):
+ 	/* R8 has number of bytes skipped.  */
+ 	movl	%ecx, %r8d
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
++	/* NB: Divide shift count by 4 since each bit in RDI represent 4
+ 	   bytes.  */
+ 	sarl	$2, %ecx
+-# endif
++	/* Skip ECX bytes.  */
++	shrl	%cl, %edi
++# else
+ 	/* Skip ECX bytes.  */
+ 	shrq	%cl, %rdi
++# endif
+ 1:
+ 	/* Before jumping back to the loop, set ESI to the number of
+ 	   VEC_SIZE * 4 blocks before page crossing.  */
+@@ -818,7 +863,7 @@ L(cross_page_loop):
+ 	movzbl	(%rdi, %rdx), %eax
+ 	movzbl	(%rsi, %rdx), %ecx
+ # endif
+-	/* Check null char.  */
++	/* Check null CHAR.  */
+ 	testl	%eax, %eax
+ 	jne	L(cross_page_loop)
+ 	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
+@@ -901,18 +946,17 @@ L(cross_page):
+ 	jg	L(cross_page_1_vector)
+ L(loop_1_vector):
+ 	VMOVU	(%rdi, %rdx), %YMM0
+-	VMOVU	(%rsi, %rdx), %YMM1
+-
+-	/* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+-	VPCMP	$4, %YMM0, %YMM1, %k0
+-	VPCMP	$0, %YMMZERO, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k2
+-	/* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
++
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
++	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-	testl	%ecx, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xff, %ecx
++# else
++	incl	%ecx
++# endif
+ 	jne	L(last_vector)
+ 
+ 	addl	$VEC_SIZE, %edx
+@@ -931,18 +975,17 @@ L(cross_page_1_vector):
+ 	cmpl	$(PAGE_SIZE - 16), %eax
+ 	jg	L(cross_page_1_xmm)
+ 	VMOVU	(%rdi, %rdx), %XMM0
+-	VMOVU	(%rsi, %rdx), %XMM1
+-
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	korw	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korw	%k0, %k1, %k1
+-	kmovw	%k1, %ecx
+-	testl	%ecx, %ecx
++
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
++	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++# ifdef USE_AS_WCSCMP
++	subl	$0xf, %ecx
++# else
++	subl	$0xffff, %ecx
++# endif
+ 	jne	L(last_vector)
+ 
+ 	addl	$16, %edx
+@@ -965,25 +1008,16 @@ L(cross_page_1_xmm):
+ 	vmovq	(%rdi, %rdx), %XMM0
+ 	vmovq	(%rsi, %rdx), %XMM1
+ 
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
+-	kmovd	%k1, %ecx
+-
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in XMM0 and XMM1.  */
++	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
++	kmovb	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	/* Only last 2 bits are valid.  */
+-	andl	$0x3, %ecx
++	subl	$0x3, %ecx
+ # else
+-	/* Only last 8 bits are valid.  */
+-	andl	$0xff, %ecx
++	subl	$0xff, %ecx
+ # endif
+-
+-	testl	%ecx, %ecx
+ 	jne	L(last_vector)
+ 
+ 	addl	$8, %edx
+@@ -1002,25 +1036,16 @@ L(cross_page_8bytes):
+ 	vmovd	(%rdi, %rdx), %XMM0
+ 	vmovd	(%rsi, %rdx), %XMM1
+ 
+-	/* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
+-	VPCMP	$4, %XMM0, %XMM1, %k0
+-	VPCMP	$0, %XMMZERO, %XMM0, %k1
+-	VPCMP	$0, %XMMZERO, %XMM1, %k2
+-	/* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
+-	kord	%k1, %k2, %k1
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	kord	%k0, %k1, %k1
++	VPTESTM	%YMM0, %YMM0, %k2
++	/* Each bit cleared in K1 represents a mismatch or a null CHAR
++	   in XMM0 and XMM1.  */
++	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-
+ # ifdef USE_AS_WCSCMP
+-	/* Only the last bit is valid.  */
+-	andl	$0x1, %ecx
++	subl	$0x1, %ecx
+ # else
+-	/* Only last 4 bits are valid.  */
+-	andl	$0xf, %ecx
++	subl	$0xf, %ecx
+ # endif
+-
+-	testl	%ecx, %ecx
+ 	jne	L(last_vector)
+ 
+ 	addl	$4, %edx
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-6.patch b/glibc-RHEL-15696-6.patch
new file mode 100644
index 0000000..f6725a6
--- /dev/null
+++ b/glibc-RHEL-15696-6.patch
@@ -0,0 +1,300 @@
+From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:33:52 -0800
+Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes the strncmp family for x32.  Tested on x86-64 and x32.
+On x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise.
+	* sysdeps/x86_64/strcmp.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp,
+	tst-size_t-strncmp and tst-size_t-wcsncmp.
+	* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise.
+	* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise.
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S      |  6 +-
+ sysdeps/x86_64/multiarch/strcmp-sse42.S     |  6 +-
+ sysdeps/x86_64/strcmp.S                     |  6 +-
+ sysdeps/x86_64/x32/Makefile                 |  6 +-
+ sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-strncmp.c     | 78 +++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wcsncmp.c     | 20 ++++++
+ 7 files changed, 170 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 327e3d87..156c1949 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -79,15 +79,15 @@
+ ENTRY (STRCMP)
+ # ifdef USE_AS_STRNCMP
+ 	/* Check for simple cases (0 or 1) in offset.  */
+-	cmp	$1, %rdx
++	cmp	$1, %RDX_LP
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
+ 	/* Convert units: from wide to byte char.  */
+-	shl	$2, %rdx
++	shl	$2, %RDX_LP
+ #  endif
+ 	/* Register %r11 tracks the maximum offset.  */
+-	movq	%rdx, %r11
++	mov	%RDX_LP, %R11_LP
+ # endif
+ 	movl	%edi, %eax
+ 	xorl	%edx, %edx
+diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+index d3c07bd2..a1ebea46 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+@@ -156,11 +156,11 @@ STRCMP_SSE42:
+ #endif
+ 
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+-	test	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	je	LABEL(strcmp_exitz)
+-	cmp	$1, %rdx
++	cmp	$1, %RDX_LP
+ 	je	LABEL(Byte0)
+-	mov	%rdx, %r11
++	mov	%RDX_LP, %R11_LP
+ #endif
+ 	mov	%esi, %ecx
+ 	mov	%edi, %eax
+diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
+index e16945b9..f47c8ad4 100644
+--- a/sysdeps/x86_64/strcmp.S
++++ b/sysdeps/x86_64/strcmp.S
+@@ -135,11 +135,11 @@ ENTRY (STRCMP)
+  * This implementation uses SSE to compare up to 16 bytes at a time.
+  */
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+-	test	%rdx, %rdx
++	test	%RDX_LP, %RDX_LP
+ 	je	LABEL(strcmp_exitz)
+-	cmp	$1, %rdx
++	cmp	$1, %RDX_LP
+ 	je	LABEL(Byte0)
+-	mov	%rdx, %r11
++	mov	%RDX_LP, %R11_LP
+ #endif
+ 	mov	%esi, %ecx
+ 	mov	%edi, %eax
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 98bd9ae9..db302839 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -7,9 +7,11 @@ endif
+ 
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+-	 tst-size_t-memrchr tst-size_t-memset
++	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
++	 tst-size_t-strncmp
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
++tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
++	 tst-size_t-wcsncmp
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+new file mode 100644
+index 00000000..86233593
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+@@ -0,0 +1,59 @@
++/* Test strncasecmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_NAME "strncasecmp"
++#include "test-size_t.h"
++
++IMPL (strncasecmp, 1)
++
++typedef int (*proto_t) (const char *, const char *, size_t);
++
++static int
++__attribute__ ((noinline, noclone))
++do_strncasecmp (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t dest = { { page_size }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  strncpy ((char *) buf1, (const char *) buf2, page_size);
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      int res = do_strncasecmp (dest, src);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %i != 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
+new file mode 100644
+index 00000000..54e6bd83
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
+@@ -0,0 +1,78 @@
++/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#ifdef WIDE
++# define TEST_NAME "wcsncmp"
++#else
++# define TEST_NAME "strncmp"
++#endif
++
++#include "test-size_t.h"
++
++#ifdef WIDE
++# include <wchar.h>
++
++# define STRNCMP wcsncmp
++# define STRNCPY wcsncpy
++# define CHAR wchar_t
++#else
++# define STRNCMP strncmp
++# define STRNCPY strncpy
++# define CHAR char
++#endif
++
++IMPL (STRNCMP, 1)
++
++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
++
++
++static int
++__attribute__ ((noinline, noclone))
++do_strncmp (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  size_t size = page_size / sizeof (CHAR);
++  parameter_t dest = { { size }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      int res = do_strncmp (dest, src);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %i != 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+new file mode 100644
+index 00000000..4829647c
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+@@ -0,0 +1,20 @@
++/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include "tst-size_t-strncmp.c"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-60.patch b/glibc-RHEL-15696-60.patch
new file mode 100644
index 0000000..a3739eb
--- /dev/null
+++ b/glibc-RHEL-15696-60.patch
@@ -0,0 +1,54 @@
+From 6720d36b6623c5e48c070d86acf61198b33e144e Mon Sep 17 00:00:00 2001
+From: Fangrui Song <maskray@google.com>
+Date: Tue, 2 Nov 2021 20:59:52 -0700
+Subject: [PATCH] x86-64: Replace movzx with movzbl
+Content-type: text/plain; charset=UTF-8
+
+Clang cannot assemble movzx in the AT&T dialect mode.
+
+../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
+ movzx (%rsi), %ecx
+               ^~~~
+
+Change movzx to movzbl, which follows the AT&T dialect and is used
+elsewhere in the file.
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-sse42.S | 4 ++--
+ sysdeps/x86_64/strcmp.S                 | 4 ++--
+ 2 files changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+index a1ebea46..d8fdeb3a 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
+ 	.p2align 4
+ 	// XXX Same as code above
+ LABEL(Byte0):
+-	movzx	(%rsi), %ecx
+-	movzx	(%rdi), %eax
++	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
+ 
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
+index f47c8ad4..aa6df898 100644
+--- a/sysdeps/x86_64/strcmp.S
++++ b/sysdeps/x86_64/strcmp.S
+@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):
+ 
+ 	.p2align 4
+ LABEL(Byte0):
+-	movzx	(%rsi), %ecx
+-	movzx	(%rdi), %eax
++	movzbl	(%rsi), %ecx
++	movzbl	(%rdi), %eax
+ 
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-61.patch b/glibc-RHEL-15696-61.patch
new file mode 100644
index 0000000..d6dbe81
--- /dev/null
+++ b/glibc-RHEL-15696-61.patch
@@ -0,0 +1,56 @@
+From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 30 Apr 2021 05:58:59 -0700
+Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM
+Content-type: text/plain; charset=UTF-8
+
+The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed
+that REP MOVSB became faster after 2112 bytes:
+
+                                      Vector Move       REP MOVSB
+length=2112, align1=0, align2=0:        24.20             24.40
+length=2112, align1=1, align2=0:        26.07             23.13
+length=2112, align1=0, align2=1:        27.18             28.13
+length=2112, align1=1, align2=1:        26.23             25.16
+length=2176, align1=0, align2=0:        23.18             22.52
+length=2176, align1=2, align2=0:        25.45             22.52
+length=2176, align1=0, align2=2:        27.14             27.82
+length=2176, align1=2, align2=2:        22.73             25.56
+length=2240, align1=0, align2=0:        24.62             24.25
+length=2240, align1=3, align2=0:        29.77             27.15
+length=2240, align1=0, align2=3:        35.55             29.93
+length=2240, align1=3, align2=3:        34.49             25.15
+length=2304, align1=0, align2=0:        34.75             26.64
+length=2304, align1=4, align2=0:        32.09             22.63
+length=2304, align1=0, align2=4:        28.43             31.24
+
+Use REP MOVSB for data size > 2112 bytes in memcpy on processors with
+fast short REP MOVSB (FSRM).
+
+	* sysdeps/x86/cacheinfo.h (init_cacheinfo): Set
+	rep_movsb_threshold to 2112 on processors with fast short REP
+	MOVSB (FSRM).
+---
+ sysdeps/x86/cacheinfo.h | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index f72f634a..cc3941d3 100644
+--- a/sysdeps/x86/cacheinfo.h
++++ b/sysdeps/x86/cacheinfo.h
+@@ -430,6 +430,12 @@ init_cacheinfo (void)
+       rep_movsb_threshold = 2048 * (16 / 16);
+       minimum_rep_movsb_threshold = 16 * 8;
+     }
++
++  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
++     short REP MOVSB (FSRM).  */
++  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
++    rep_movsb_threshold = 2112;
++
+   if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
+     __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
+   else
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-62.patch b/glibc-RHEL-15696-62.patch
new file mode 100644
index 0000000..a7a9286
--- /dev/null
+++ b/glibc-RHEL-15696-62.patch
@@ -0,0 +1,136 @@
+From 475b63702ef38b69558fc3d31a0b66776a70f1d3 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 1 Nov 2021 00:49:52 -0500
+Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in
+ dl-cacheinfo.h
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+This patch doubles the rep_movsb_threshold when using ERMS. Based on
+benchmarks the vector copy loop, especially now that it handles 4k
+aliasing, is better for these medium-range sizes.
+
+On Skylake with ERMS:
+
+Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
+4096,   0,      0,      0,      0.975
+4096,   0,      0,      1,      0.953
+4096,   12,     0,      0,      0.969
+4096,   12,     0,      1,      0.872
+4096,   44,     0,      0,      0.979
+4096,   44,     0,      1,      0.83
+4096,   0,      12,     0,      1.006
+4096,   0,      12,     1,      0.989
+4096,   0,      44,     0,      0.739
+4096,   0,      44,     1,      0.942
+4096,   12,     12,     0,      1.009
+4096,   12,     12,     1,      0.973
+4096,   44,     44,     0,      0.791
+4096,   44,     44,     1,      0.961
+4096,   2048,   0,      0,      0.978
+4096,   2048,   0,      1,      0.951
+4096,   2060,   0,      0,      0.986
+4096,   2060,   0,      1,      0.963
+4096,   2048,   12,     0,      0.971
+4096,   2048,   12,     1,      0.941
+4096,   2060,   12,     0,      0.977
+4096,   2060,   12,     1,      0.949
+8192,   0,      0,      0,      0.85
+8192,   0,      0,      1,      0.845
+8192,   13,     0,      0,      0.937
+8192,   13,     0,      1,      0.939
+8192,   45,     0,      0,      0.932
+8192,   45,     0,      1,      0.927
+8192,   0,      13,     0,      0.621
+8192,   0,      13,     1,      0.62
+8192,   0,      45,     0,      0.53
+8192,   0,      45,     1,      0.516
+8192,   13,     13,     0,      0.664
+8192,   13,     13,     1,      0.659
+8192,   45,     45,     0,      0.593
+8192,   45,     45,     1,      0.575
+8192,   2048,   0,      0,      0.854
+8192,   2048,   0,      1,      0.834
+8192,   2061,   0,      0,      0.863
+8192,   2061,   0,      1,      0.857
+8192,   2048,   13,     0,      0.63
+8192,   2048,   13,     1,      0.629
+8192,   2061,   13,     0,      0.627
+8192,   2061,   13,     1,      0.62
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/cacheinfo.h      |  8 +++++---
+ sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
+ 2 files changed, 20 insertions(+), 14 deletions(-)
+
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index cc3941d3..ac025e08 100644
+--- a/sysdeps/x86/cacheinfo.h
++++ b/sysdeps/x86/cacheinfo.h
+@@ -411,18 +411,20 @@ init_cacheinfo (void)
+ 
+   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
+   unsigned int minimum_rep_movsb_threshold;
+-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
++  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
++     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
++     threshold is 2048 * (VEC_SIZE / 16).  */
+   unsigned int rep_movsb_threshold;
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
+     {
+-      rep_movsb_threshold = 2048 * (64 / 16);
++      rep_movsb_threshold = 4096 * (64 / 16);
+       minimum_rep_movsb_threshold = 64 * 8;
+     }
+   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
+ 				    AVX_Fast_Unaligned_Load))
+     {
+-      rep_movsb_threshold = 2048 * (32 / 16);
++      rep_movsb_threshold = 4096 * (32 / 16);
+       minimum_rep_movsb_threshold = 32 * 8;
+     }
+   else
+diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
+index 89bf2966..56c6834a 100644
+--- a/sysdeps/x86/dl-tunables.list
++++ b/sysdeps/x86/dl-tunables.list
+@@ -32,17 +32,21 @@ glibc {
+     }
+     x86_rep_movsb_threshold {
+       type: SIZE_T
+-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
+-      # isn't faster on short data.  The memcpy micro benchmark in glibc
+-      # shows that 2KB is the approximate value above which REP MOVSB
+-      # becomes faster than SSE2 optimization on processors with Enhanced
+-      # REP MOVSB.  Since larger register size can move more data with a
+-      # single load and store, the threshold is higher with larger register
+-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
+-      # times of vector size and the default value is 2048 * (vector size
+-      # / 16), the default value and the minimum value must be updated at
+-      # run-time.  NB: Don't set the default value since we can't tell if
+-      # the tunable value is set by user or not [BZ #27069].
++      # Since there is overhead to set up REP MOVSB operation, REP
++      # MOVSB isn't faster on short data.  The memcpy micro benchmark
++      # in glibc shows that 2KB is the approximate value above which
++      # REP MOVSB becomes faster than SSE2 optimization on processors
++      # with Enhanced REP MOVSB.  Since larger register size can move
++      # more data with a single load and store, the threshold is
++      # higher with larger register size.  Micro benchmarks show AVX
++      # REP MOVSB becomes faster apprximately at 8KB.  The AVX512
++      # threshold is extrapolated to 16KB.  For machines with FSRM the
++      # threshold is universally set at 2112 bytes.  Note: Since the
++      # REP MOVSB threshold must be greater than 8 times of vector
++      # size and the default value is 4096 * (vector size / 16), the
++      # default value and the minimum value must be updated at
++      # run-time.  NB: Don't set the default value since we can't tell
++      # if the tunable value is set by user or not [BZ #27069].
+       minval: 1
+     }
+     x86_rep_stosb_threshold {
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-63.patch b/glibc-RHEL-15696-63.patch
new file mode 100644
index 0000000..c14e8b3
--- /dev/null
+++ b/glibc-RHEL-15696-63.patch
@@ -0,0 +1,2428 @@
+From 2f9062d7171850451e6044ef78d91ff8c017b9c0 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 10 Nov 2021 16:18:56 -0600
+Subject: [PATCH] x86: Shrink memcmp-sse4.S code size
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+This implementation refactors memcmp-sse4.S primarily with minimizing
+code size in mind. It does this by removing the lookup table logic and
+removing the unrolled check from (256, 512] bytes.
+
+memcmp-sse4 code size reduction : -3487 bytes
+wmemcmp-sse4 code size reduction: -1472 bytes
+
+The current memcmp-sse4.S implementation has a large code size
+cost. This has serious adverse effects on the ICache / ITLB. While
+in micro-benchmarks the implementation appears fast, traces of
+real-world code have shown that the speed in micro-benchmarks does not
+translate when the ICache/ITLB are not primed, and that the cost
+of the code size has measurable negative effects on overall
+application performance.
+
+See https://research.google/pubs/pub48320/ for more details.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-sse4.S | 2267 +++++++-----------------
+ 1 file changed, 646 insertions(+), 1621 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+index 302900f5..50060006 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+@@ -25,14 +25,14 @@
+ #  define MEMCMP	__memcmp_sse4_1
+ # endif
+ 
+-# define JMPTBL(I, B)	(I - B)
++#ifdef USE_AS_WMEMCMP
++# define CMPEQ	pcmpeqd
++# define CHAR_SIZE	4
++#else
++# define CMPEQ	pcmpeqb
++# define CHAR_SIZE	1
++#endif
+ 
+-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+-  lea		TABLE(%rip), %r11;				\
+-  movslq	(%r11, INDEX, SCALE), %rcx;			\
+-  add		%r11, %rcx;					\
+-  _CET_NOTRACK jmp *%rcx;					\
+-  ud2
+ 
+ /* Warning!
+            wmemcmp has to use SIGNED comparison for elements.
+@@ -47,33 +47,253 @@ ENTRY (MEMCMP)
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+ # endif
+-	pxor	%xmm0, %xmm0
+ 	cmp	$79, %RDX_LP
+ 	ja	L(79bytesormore)
++
++	cmp	$CHAR_SIZE, %RDX_LP
++	jbe	L(firstbyte)
++
++	/* N in (CHAR_SIZE, 79) bytes.  */
++	cmpl	$32, %edx
++	ja	L(more_32_bytes)
++
++	cmpl	$16, %edx
++	jae	L(16_to_32_bytes)
++
+ # ifndef USE_AS_WMEMCMP
+-	cmp	$1, %RDX_LP
+-	je	L(firstbyte)
++	cmpl	$8, %edx
++	jae	L(8_to_16_bytes)
++
++	cmpl	$4, %edx
++	jb	L(2_to_3_bytes)
++
++	movl	(%rdi), %eax
++	movl	(%rsi), %ecx
++
++	bswap	%eax
++	bswap	%ecx
++
++	shlq	$32, %rax
++	shlq	$32, %rcx
++
++	movl	-4(%rdi, %rdx), %edi
++	movl	-4(%rsi, %rdx), %esi
++
++	bswap	%edi
++	bswap	%esi
++
++	orq	%rdi, %rax
++	orq	%rsi, %rcx
++	subq	%rcx, %rax
++	cmovne	%edx, %eax
++	sbbl	%ecx, %ecx
++	orl	%ecx, %eax
++	ret
++
++	.p2align 4,, 8
++L(2_to_3_bytes):
++	movzwl	(%rdi), %eax
++	movzwl	(%rsi), %ecx
++	shll	$8, %eax
++	shll	$8, %ecx
++	bswap	%eax
++	bswap	%ecx
++	movzbl	-1(%rdi, %rdx), %edi
++	movzbl	-1(%rsi, %rdx), %esi
++	orl	%edi, %eax
++	orl	%esi, %ecx
++	subl	%ecx, %eax
++	ret
++
++	.p2align 4,, 8
++L(8_to_16_bytes):
++	movq	(%rdi), %rax
++	movq	(%rsi), %rcx
++
++	bswap	%rax
++	bswap	%rcx
++
++	subq	%rcx, %rax
++	jne	L(8_to_16_bytes_done)
++
++	movq	-8(%rdi, %rdx), %rax
++	movq	-8(%rsi, %rdx), %rcx
++
++	bswap	%rax
++	bswap	%rcx
++
++	subq	%rcx, %rax
++
++L(8_to_16_bytes_done):
++	cmovne	%edx, %eax
++	sbbl	%ecx, %ecx
++	orl	%ecx, %eax
++	ret
++# else
++	xorl	%eax, %eax
++	movl	(%rdi), %ecx
++	cmpl	(%rsi), %ecx
++	jne	L(8_to_16_bytes_done)
++	movl	4(%rdi), %ecx
++	cmpl	4(%rsi), %ecx
++	jne	L(8_to_16_bytes_done)
++	movl	-4(%rdi, %rdx), %ecx
++	cmpl	-4(%rsi, %rdx), %ecx
++	jne	L(8_to_16_bytes_done)
++	ret
+ # endif
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+ 
+-# ifndef USE_AS_WMEMCMP
+-	.p2align 4
++	.p2align 4,, 3
++L(ret_zero):
++	xorl	%eax, %eax
++L(zero):
++	ret
++
++	.p2align 4,, 8
+ L(firstbyte):
++	jb	L(ret_zero)
++# ifdef USE_AS_WMEMCMP
++	xorl	%eax, %eax
++	movl	(%rdi), %ecx
++	cmpl	(%rsi), %ecx
++	je	L(zero)
++L(8_to_16_bytes_done):
++	setg	%al
++	leal	-1(%rax, %rax), %eax
++# else
+ 	movzbl	(%rdi), %eax
+ 	movzbl	(%rsi), %ecx
+ 	sub	%ecx, %eax
++# endif
+ 	ret
++
++	.p2align 4
++L(vec_return_begin_48):
++	addq	$16, %rdi
++	addq	$16, %rsi
++L(vec_return_begin_32):
++	bsfl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	32(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	32(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	32(%rsi, %rax), %ecx
++	movzbl	32(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
++
++	.p2align 4
++L(vec_return_begin_16):
++	addq	$16, %rdi
++	addq	$16, %rsi
++L(vec_return_begin):
++	bsfl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(%rsi, %rax), %ecx
++	movzbl	(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
++
++	.p2align 4
++L(vec_return_end_16):
++	subl	$16, %edx
++L(vec_return_end):
++	bsfl	%eax, %eax
++	addl	%edx, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	-16(%rdi, %rax), %ecx
++	xorl	%edx, %edx
++	cmpl	-16(%rsi, %rax), %ecx
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	-16(%rsi, %rax), %ecx
++	movzbl	-16(%rdi, %rax), %eax
++	subl	%ecx, %eax
+ # endif
++	ret
++
++	.p2align 4,, 8
++L(more_32_bytes):
++	movdqu	(%rdi), %xmm0
++	movdqu	(%rsi), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	16(%rdi), %xmm0
++	movdqu	16(%rsi), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	cmpl	$64, %edx
++	jbe	L(32_to_64_bytes)
++	movdqu	32(%rdi), %xmm0
++	movdqu	32(%rsi), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	.p2align 4,, 6
++L(32_to_64_bytes):
++	movdqu	-32(%rdi, %rdx), %xmm0
++	movdqu	-32(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end_16)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
++
++	.p2align 4
++L(16_to_32_bytes):
++	movdqu	(%rdi), %xmm0
++	movdqu	(%rsi), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
++
+ 
+ 	.p2align 4
+ L(79bytesormore):
++	movdqu	(%rdi), %xmm0
+ 	movdqu	(%rsi), %xmm1
+-	movdqu	(%rdi), %xmm2
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++
+ 	mov	%rsi, %rcx
+ 	and	$-16, %rsi
+ 	add	$16, %rsi
+@@ -86,1694 +306,499 @@ L(79bytesormore):
+ 
+ 	cmp	$128, %rdx
+ 	ja	L(128bytesormore)
+-L(less128bytes):
+-	sub	$64, %rdx
+-
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+ 
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqu	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqu	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-	cmp	$32, %rdx
+-	jb	L(less32bytesin64)
+-
+-	movdqu	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqu	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin64):
+-	add	$64, %rdi
+-	add	$64, %rsi
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	.p2align 4,, 6
++L(less128bytes):
++	movdqu	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqu	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqu	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	cmp	$96, %rdx
++	jb	L(32_to_64_bytes)
++
++	addq	$64, %rdi
++	addq	$64, %rsi
++	subq	$64, %rdx
++
++	.p2align 4,, 6
++L(last_64_bytes):
++	movdqu	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqu	-32(%rdi, %rdx), %xmm0
++	movdqu	-32(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end_16)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
+ 
++	.p2align 4
+ L(128bytesormore):
+-	cmp	$512, %rdx
+-	ja	L(512bytesormore)
+ 	cmp	$256, %rdx
+-	ja	L(less512bytes)
++	ja	L(unaligned_loop)
+ L(less256bytes):
+-	sub	$128, %rdx
+-
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqu	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqu	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-
+-	movdqu	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqu	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-
+-	movdqu	96(%rdi), %xmm2
+-	pxor	96(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(112bytesin256)
+-
+-	movdqu	112(%rdi), %xmm2
+-	pxor	112(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(128bytesin256)
+-
+-	add	$128, %rsi
+-	add	$128, %rdi
+-
+-	cmp	$64, %rdx
+-	jae	L(less128bytes)
+-
+-	cmp	$32, %rdx
+-	jb	L(less32bytesin128)
+-
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin128):
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+-
+-L(less512bytes):
+-	sub	$256, %rdx
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqu	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqu	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-
+-	movdqu	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqu	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-
+-	movdqu	96(%rdi), %xmm2
+-	pxor	96(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(112bytesin256)
+-
+-	movdqu	112(%rdi), %xmm2
+-	pxor	112(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(128bytesin256)
+-
+-	movdqu	128(%rdi), %xmm2
+-	pxor	128(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(144bytesin256)
+-
+-	movdqu	144(%rdi), %xmm2
+-	pxor	144(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(160bytesin256)
+-
+-	movdqu	160(%rdi), %xmm2
+-	pxor	160(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(176bytesin256)
+-
+-	movdqu	176(%rdi), %xmm2
+-	pxor	176(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(192bytesin256)
+-
+-	movdqu	192(%rdi), %xmm2
+-	pxor	192(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(208bytesin256)
+-
+-	movdqu	208(%rdi), %xmm2
+-	pxor	208(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(224bytesin256)
+-
+-	movdqu	224(%rdi), %xmm2
+-	pxor	224(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(240bytesin256)
+-
+-	movdqu	240(%rdi), %xmm2
+-	pxor	240(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(256bytesin256)
+-
+-	add	$256, %rsi
+-	add	$256, %rdi
+-
+-	cmp	$128, %rdx
+-	jae	L(less256bytes)
++	movdqu	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqu	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqu	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	addq	$64, %rdi
++	addq	$64, %rsi
++
++	movdqu	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqu	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqu	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqu	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	addq	$-128, %rdx
++	subq	$-64, %rsi
++	subq	$-64, %rdi
+ 
+ 	cmp	$64, %rdx
+-	jae	L(less128bytes)
++	ja	L(less128bytes)
+ 
+ 	cmp	$32, %rdx
+-	jb	L(less32bytesin256)
+-
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin256):
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	ja	L(last_64_bytes)
++
++	movdqu	-32(%rdi, %rdx), %xmm0
++	movdqu	-32(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end_16)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
+ 
+ 	.p2align 4
+-L(512bytesormore):
++L(unaligned_loop):
+ # ifdef DATA_CACHE_SIZE_HALF
+ 	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
+ # else
+ 	mov	__x86_data_cache_size_half(%rip), %R8_LP
+ # endif
+-	mov	%r8, %r9
+-	shr	$1, %r8
+-	add	%r9, %r8
+-	cmp	%r8, %rdx
+-	ja	L(L2_L3_cache_unaglined)
++	movq	%r8, %r9
++	addq	%r8, %r8
++	addq	%r9, %r8
++	cmpq	%r8, %rdx
++	ja	L(L2_L3_cache_unaligned)
+ 	sub	$64, %rdx
+ 	.p2align 4
+ L(64bytesormore_loop):
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	movdqa	%xmm2, %xmm1
++	movdqu	(%rdi), %xmm0
++	movdqu	16(%rdi), %xmm1
++	movdqu	32(%rdi), %xmm2
++	movdqu	48(%rdi), %xmm3
+ 
+-	movdqu	16(%rdi), %xmm3
+-	pxor	16(%rsi), %xmm3
+-	por	%xmm3, %xmm1
++	CMPEQ	(%rsi), %xmm0
++	CMPEQ	16(%rsi), %xmm1
++	CMPEQ	32(%rsi), %xmm2
++	CMPEQ	48(%rsi), %xmm3
+ 
+-	movdqu	32(%rdi), %xmm4
+-	pxor	32(%rsi), %xmm4
+-	por	%xmm4, %xmm1
++	pand	%xmm0, %xmm1
++	pand	%xmm2, %xmm3
++	pand	%xmm1, %xmm3
+ 
+-	movdqu	48(%rdi), %xmm5
+-	pxor	48(%rsi), %xmm5
+-	por	%xmm5, %xmm1
++	pmovmskb %xmm3, %eax
++	incw	%ax
++	jnz	L(64bytesormore_loop_end)
+ 
+-	ptest	%xmm1, %xmm0
+-	jnc	L(64bytesormore_loop_end)
+ 	add	$64, %rsi
+ 	add	$64, %rdi
+ 	sub	$64, %rdx
+-	jae	L(64bytesormore_loop)
++	ja	L(64bytesormore_loop)
+ 
+-	add	$64, %rdx
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	.p2align 4,, 6
++L(loop_tail):
++	addq	%rdx, %rdi
++	movdqu	(%rdi), %xmm0
++	movdqu	16(%rdi), %xmm1
++	movdqu	32(%rdi), %xmm2
++	movdqu	48(%rdi), %xmm3
++
++	addq	%rdx, %rsi
++	movdqu	(%rsi), %xmm4
++	movdqu	16(%rsi), %xmm5
++	movdqu	32(%rsi), %xmm6
++	movdqu	48(%rsi), %xmm7
++
++	CMPEQ	%xmm4, %xmm0
++	CMPEQ	%xmm5, %xmm1
++	CMPEQ	%xmm6, %xmm2
++	CMPEQ	%xmm7, %xmm3
++
++	pand	%xmm0, %xmm1
++	pand	%xmm2, %xmm3
++	pand	%xmm1, %xmm3
++
++	pmovmskb %xmm3, %eax
++	incw	%ax
++	jnz	L(64bytesormore_loop_end)
++	ret
+ 
+-L(L2_L3_cache_unaglined):
+-	sub	$64, %rdx
++L(L2_L3_cache_unaligned):
++	subq	$64, %rdx
+ 	.p2align 4
+ L(L2_L3_unaligned_128bytes_loop):
+ 	prefetchnta 0x1c0(%rdi)
+ 	prefetchnta 0x1c0(%rsi)
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	movdqa	%xmm2, %xmm1
+ 
+-	movdqu	16(%rdi), %xmm3
+-	pxor	16(%rsi), %xmm3
+-	por	%xmm3, %xmm1
++	movdqu	(%rdi), %xmm0
++	movdqu	16(%rdi), %xmm1
++	movdqu	32(%rdi), %xmm2
++	movdqu	48(%rdi), %xmm3
++
++	CMPEQ	(%rsi), %xmm0
++	CMPEQ	16(%rsi), %xmm1
++	CMPEQ	32(%rsi), %xmm2
++	CMPEQ	48(%rsi), %xmm3
+ 
+-	movdqu	32(%rdi), %xmm4
+-	pxor	32(%rsi), %xmm4
+-	por	%xmm4, %xmm1
++	pand	%xmm0, %xmm1
++	pand	%xmm2, %xmm3
++	pand	%xmm1, %xmm3
+ 
+-	movdqu	48(%rdi), %xmm5
+-	pxor	48(%rsi), %xmm5
+-	por	%xmm5, %xmm1
++	pmovmskb %xmm3, %eax
++	incw	%ax
++	jnz	L(64bytesormore_loop_end)
+ 
+-	ptest	%xmm1, %xmm0
+-	jnc	L(64bytesormore_loop_end)
+ 	add	$64, %rsi
+ 	add	$64, %rdi
+ 	sub	$64, %rdx
+-	jae	L(L2_L3_unaligned_128bytes_loop)
++	ja	L(L2_L3_unaligned_128bytes_loop)
++	jmp	L(loop_tail)
+ 
+-	add	$64, %rdx
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+ 
+-/*
+- * This case is for machines which are sensitive for unaligned instructions.
+- */
++	/* This case is for machines which are sensitive for unaligned
++	 * instructions.  */
+ 	.p2align 4
+ L(2aligned):
+ 	cmp	$128, %rdx
+ 	ja	L(128bytesormorein2aligned)
+ L(less128bytesin2aligned):
+-	sub	$64, %rdx
+-
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqa	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqa	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqa	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-	cmp	$32, %rdx
+-	jb	L(less32bytesin64in2alinged)
+-
+-	movdqa	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqa	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin64in2alinged):
+-	add	$64, %rdi
+-	add	$64, %rsi
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	movdqa	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqa	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqa	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqa	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	cmp	$96, %rdx
++	jb	L(32_to_64_bytes)
++
++	addq	$64, %rdi
++	addq	$64, %rsi
++	subq	$64, %rdx
++
++	.p2align 4,, 6
++L(aligned_last_64_bytes):
++	movdqa	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqa	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqu	-32(%rdi, %rdx), %xmm0
++	movdqu	-32(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end_16)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
+ 
+ 	.p2align 4
+ L(128bytesormorein2aligned):
+-	cmp	$512, %rdx
+-	ja	L(512bytesormorein2aligned)
+ 	cmp	$256, %rdx
+-	ja	L(256bytesormorein2aligned)
++	ja	L(aligned_loop)
+ L(less256bytesin2alinged):
+-	sub	$128, %rdx
+-
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqa	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqa	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqa	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-
+-	movdqa	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqa	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-
+-	movdqa	96(%rdi), %xmm2
+-	pxor	96(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(112bytesin256)
+-
+-	movdqa	112(%rdi), %xmm2
+-	pxor	112(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(128bytesin256)
+-
+-	add	$128, %rsi
+-	add	$128, %rdi
++	movdqa	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqa	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqa	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqa	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	addq	$64, %rdi
++	addq	$64, %rsi
++
++	movdqa	(%rdi), %xmm1
++	CMPEQ	(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin)
++
++	movdqa	16(%rdi), %xmm1
++	CMPEQ	16(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_16)
++
++	movdqa	32(%rdi), %xmm1
++	CMPEQ	32(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_32)
++
++	movdqa	48(%rdi), %xmm1
++	CMPEQ	48(%rsi), %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_begin_48)
++
++	addq	$-128, %rdx
++	subq	$-64, %rsi
++	subq	$-64, %rdi
+ 
+ 	cmp	$64, %rdx
+-	jae	L(less128bytesin2aligned)
++	ja	L(less128bytesin2aligned)
+ 
+ 	cmp	$32, %rdx
+-	jb	L(less32bytesin128in2aligned)
+-
+-	movdqu	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqu	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin128in2aligned):
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+-
+-	.p2align 4
+-L(256bytesormorein2aligned):
+-
+-	sub	$256, %rdx
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqa	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-
+-	movdqa	32(%rdi), %xmm2
+-	pxor	32(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(48bytesin256)
+-
+-	movdqa	48(%rdi), %xmm2
+-	pxor	48(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(64bytesin256)
+-
+-	movdqa	64(%rdi), %xmm2
+-	pxor	64(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(80bytesin256)
+-
+-	movdqa	80(%rdi), %xmm2
+-	pxor	80(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(96bytesin256)
+-
+-	movdqa	96(%rdi), %xmm2
+-	pxor	96(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(112bytesin256)
+-
+-	movdqa	112(%rdi), %xmm2
+-	pxor	112(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(128bytesin256)
+-
+-	movdqa	128(%rdi), %xmm2
+-	pxor	128(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(144bytesin256)
+-
+-	movdqa	144(%rdi), %xmm2
+-	pxor	144(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(160bytesin256)
+-
+-	movdqa	160(%rdi), %xmm2
+-	pxor	160(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(176bytesin256)
+-
+-	movdqa	176(%rdi), %xmm2
+-	pxor	176(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(192bytesin256)
+-
+-	movdqa	192(%rdi), %xmm2
+-	pxor	192(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(208bytesin256)
+-
+-	movdqa	208(%rdi), %xmm2
+-	pxor	208(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(224bytesin256)
+-
+-	movdqa	224(%rdi), %xmm2
+-	pxor	224(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(240bytesin256)
+-
+-	movdqa	240(%rdi), %xmm2
+-	pxor	240(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(256bytesin256)
+-
+-	add	$256, %rsi
+-	add	$256, %rdi
+-
+-	cmp	$128, %rdx
+-	jae	L(less256bytesin2alinged)
+-
+-	cmp	$64, %rdx
+-	jae	L(less128bytesin2aligned)
+-
+-	cmp	$32, %rdx
+-	jb	L(less32bytesin256in2alinged)
+-
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytesin256)
+-
+-	movdqa	16(%rdi), %xmm2
+-	pxor	16(%rsi), %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(32bytesin256)
+-	sub	$32, %rdx
+-	add	$32, %rdi
+-	add	$32, %rsi
+-L(less32bytesin256in2alinged):
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	ja	L(aligned_last_64_bytes)
++
++	movdqu	-32(%rdi, %rdx), %xmm0
++	movdqu	-32(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end_16)
++
++	movdqu	-16(%rdi, %rdx), %xmm0
++	movdqu	-16(%rsi, %rdx), %xmm1
++	CMPEQ	%xmm0, %xmm1
++	pmovmskb %xmm1, %eax
++	incw	%ax
++	jnz	L(vec_return_end)
++	ret
+ 
+ 	.p2align 4
+-L(512bytesormorein2aligned):
++L(aligned_loop):
+ # ifdef DATA_CACHE_SIZE_HALF
+ 	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
+ # else
+ 	mov	__x86_data_cache_size_half(%rip), %R8_LP
+ # endif
+-	mov	%r8, %r9
+-	shr	$1, %r8
+-	add	%r9, %r8
+-	cmp	%r8, %rdx
+-	ja	L(L2_L3_cache_aglined)
++	movq	%r8, %r9
++	addq	%r8, %r8
++	addq	%r9, %r8
++	cmpq	%r8, %rdx
++	ja	L(L2_L3_cache_aligned)
+ 
+ 	sub	$64, %rdx
+ 	.p2align 4
+ L(64bytesormore_loopin2aligned):
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	movdqa	%xmm2, %xmm1
+-
+-	movdqa	16(%rdi), %xmm3
+-	pxor	16(%rsi), %xmm3
+-	por	%xmm3, %xmm1
++	movdqa	(%rdi), %xmm0
++	movdqa	16(%rdi), %xmm1
++	movdqa	32(%rdi), %xmm2
++	movdqa	48(%rdi), %xmm3
+ 
+-	movdqa	32(%rdi), %xmm4
+-	pxor	32(%rsi), %xmm4
+-	por	%xmm4, %xmm1
++	CMPEQ	(%rsi), %xmm0
++	CMPEQ	16(%rsi), %xmm1
++	CMPEQ	32(%rsi), %xmm2
++	CMPEQ	48(%rsi), %xmm3
+ 
+-	movdqa	48(%rdi), %xmm5
+-	pxor	48(%rsi), %xmm5
+-	por	%xmm5, %xmm1
++	pand	%xmm0, %xmm1
++	pand	%xmm2, %xmm3
++	pand	%xmm1, %xmm3
+ 
+-	ptest	%xmm1, %xmm0
+-	jnc	L(64bytesormore_loop_end)
++	pmovmskb %xmm3, %eax
++	incw	%ax
++	jnz	L(64bytesormore_loop_end)
+ 	add	$64, %rsi
+ 	add	$64, %rdi
+ 	sub	$64, %rdx
+-	jae	L(64bytesormore_loopin2aligned)
+-
+-	add	$64, %rdx
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+-L(L2_L3_cache_aglined):
+-	sub	$64, %rdx
++	ja	L(64bytesormore_loopin2aligned)
++	jmp	L(loop_tail)
+ 
++L(L2_L3_cache_aligned):
++	subq	$64, %rdx
+ 	.p2align 4
+ L(L2_L3_aligned_128bytes_loop):
+ 	prefetchnta 0x1c0(%rdi)
+ 	prefetchnta 0x1c0(%rsi)
+-	movdqa	(%rdi), %xmm2
+-	pxor	(%rsi), %xmm2
+-	movdqa	%xmm2, %xmm1
+-
+-	movdqa	16(%rdi), %xmm3
+-	pxor	16(%rsi), %xmm3
+-	por	%xmm3, %xmm1
++	movdqa	(%rdi), %xmm0
++	movdqa	16(%rdi), %xmm1
++	movdqa	32(%rdi), %xmm2
++	movdqa	48(%rdi), %xmm3
+ 
+-	movdqa	32(%rdi), %xmm4
+-	pxor	32(%rsi), %xmm4
+-	por	%xmm4, %xmm1
++	CMPEQ	(%rsi), %xmm0
++	CMPEQ	16(%rsi), %xmm1
++	CMPEQ	32(%rsi), %xmm2
++	CMPEQ	48(%rsi), %xmm3
+ 
+-	movdqa	48(%rdi), %xmm5
+-	pxor	48(%rsi), %xmm5
+-	por	%xmm5, %xmm1
++	pand	%xmm0, %xmm1
++	pand	%xmm2, %xmm3
++	pand	%xmm1, %xmm3
+ 
+-	ptest	%xmm1, %xmm0
+-	jnc	L(64bytesormore_loop_end)
+-	add	$64, %rsi
+-	add	$64, %rdi
+-	sub	$64, %rdx
+-	jae	L(L2_L3_aligned_128bytes_loop)
+-
+-	add	$64, %rdx
+-	add	%rdx, %rsi
+-	add	%rdx, %rdi
+-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
++	pmovmskb %xmm3, %eax
++	incw	%ax
++	jnz	L(64bytesormore_loop_end)
+ 
++	addq	$64, %rsi
++	addq	$64, %rdi
++	subq	$64, %rdx
++	ja	L(L2_L3_aligned_128bytes_loop)
++	jmp	L(loop_tail)
+ 
+ 	.p2align 4
+ L(64bytesormore_loop_end):
+-	add	$16, %rdi
+-	add	$16, %rsi
+-	ptest	%xmm2, %xmm0
+-	jnc	L(16bytes)
+-
+-	add	$16, %rdi
+-	add	$16, %rsi
+-	ptest	%xmm3, %xmm0
+-	jnc	L(16bytes)
+-
+-	add	$16, %rdi
+-	add	$16, %rsi
+-	ptest	%xmm4, %xmm0
+-	jnc	L(16bytes)
+-
+-	add	$16, %rdi
+-	add	$16, %rsi
+-	jmp	L(16bytes)
+-
+-L(256bytesin256):
+-	add	$256, %rdi
+-	add	$256, %rsi
+-	jmp	L(16bytes)
+-L(240bytesin256):
+-	add	$240, %rdi
+-	add	$240, %rsi
+-	jmp	L(16bytes)
+-L(224bytesin256):
+-	add	$224, %rdi
+-	add	$224, %rsi
+-	jmp	L(16bytes)
+-L(208bytesin256):
+-	add	$208, %rdi
+-	add	$208, %rsi
+-	jmp	L(16bytes)
+-L(192bytesin256):
+-	add	$192, %rdi
+-	add	$192, %rsi
+-	jmp	L(16bytes)
+-L(176bytesin256):
+-	add	$176, %rdi
+-	add	$176, %rsi
+-	jmp	L(16bytes)
+-L(160bytesin256):
+-	add	$160, %rdi
+-	add	$160, %rsi
+-	jmp	L(16bytes)
+-L(144bytesin256):
+-	add	$144, %rdi
+-	add	$144, %rsi
+-	jmp	L(16bytes)
+-L(128bytesin256):
+-	add	$128, %rdi
+-	add	$128, %rsi
+-	jmp	L(16bytes)
+-L(112bytesin256):
+-	add	$112, %rdi
+-	add	$112, %rsi
+-	jmp	L(16bytes)
+-L(96bytesin256):
+-	add	$96, %rdi
+-	add	$96, %rsi
+-	jmp	L(16bytes)
+-L(80bytesin256):
+-	add	$80, %rdi
+-	add	$80, %rsi
+-	jmp	L(16bytes)
+-L(64bytesin256):
+-	add	$64, %rdi
+-	add	$64, %rsi
+-	jmp	L(16bytes)
+-L(48bytesin256):
+-	add	$16, %rdi
+-	add	$16, %rsi
+-L(32bytesin256):
+-	add	$16, %rdi
+-	add	$16, %rsi
+-L(16bytesin256):
+-	add	$16, %rdi
+-	add	$16, %rsi
+-L(16bytes):
+-	mov	-16(%rdi), %rax
+-	mov	-16(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-L(8bytes):
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(12bytes):
+-	mov	-12(%rdi), %rax
+-	mov	-12(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-L(4bytes):
+-	mov	-4(%rsi), %ecx
+-# ifndef USE_AS_WMEMCMP
+-	mov	-4(%rdi), %eax
+-	cmp	%eax, %ecx
+-# else
+-	cmp	-4(%rdi), %ecx
+-# endif
+-	jne	L(diffin4bytes)
+-L(0bytes):
+-	xor	%eax, %eax
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-/* unreal case for wmemcmp */
+-	.p2align 4
+-L(65bytes):
+-	movdqu	-65(%rdi), %xmm1
+-	movdqu	-65(%rsi), %xmm2
+-	mov	$-65, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(49bytes):
+-	movdqu	-49(%rdi), %xmm1
+-	movdqu	-49(%rsi), %xmm2
+-	mov	$-49, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(33bytes):
+-	movdqu	-33(%rdi), %xmm1
+-	movdqu	-33(%rsi), %xmm2
+-	mov	$-33, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(17bytes):
+-	mov	-17(%rdi), %rax
+-	mov	-17(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-L(9bytes):
+-	mov	-9(%rdi), %rax
+-	mov	-9(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	movzbl	-1(%rdi), %eax
+-	movzbl	-1(%rsi), %edx
+-	sub	%edx, %eax
+-	ret
+-
+-	.p2align 4
+-L(13bytes):
+-	mov	-13(%rdi), %rax
+-	mov	-13(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(5bytes):
+-	mov	-5(%rdi), %eax
+-	mov	-5(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	movzbl	-1(%rdi), %eax
+-	movzbl	-1(%rsi), %edx
+-	sub	%edx, %eax
+-	ret
+-
+-	.p2align 4
+-L(66bytes):
+-	movdqu	-66(%rdi), %xmm1
+-	movdqu	-66(%rsi), %xmm2
+-	mov	$-66, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(50bytes):
+-	movdqu	-50(%rdi), %xmm1
+-	movdqu	-50(%rsi), %xmm2
+-	mov	$-50, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(34bytes):
+-	movdqu	-34(%rdi), %xmm1
+-	movdqu	-34(%rsi), %xmm2
+-	mov	$-34, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(18bytes):
+-	mov	-18(%rdi), %rax
+-	mov	-18(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-L(10bytes):
+-	mov	-10(%rdi), %rax
+-	mov	-10(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	movzwl	-2(%rdi), %eax
+-	movzwl	-2(%rsi), %ecx
+-	cmp	%cl, %al
+-	jne	L(end)
+-	and	$0xffff, %eax
+-	and	$0xffff, %ecx
+-	sub	%ecx, %eax
+-	ret
+-
+-	.p2align 4
+-L(14bytes):
+-	mov	-14(%rdi), %rax
+-	mov	-14(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(6bytes):
+-	mov	-6(%rdi), %eax
+-	mov	-6(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-L(2bytes):
+-	movzwl	-2(%rsi), %ecx
+-	movzwl	-2(%rdi), %eax
+-	cmp	%cl, %al
+-	jne	L(end)
+-	and	$0xffff, %eax
+-	and	$0xffff, %ecx
+-	sub	%ecx, %eax
+-	ret
+-
+-	.p2align 4
+-L(67bytes):
+-	movdqu	-67(%rdi), %xmm2
+-	movdqu	-67(%rsi), %xmm1
+-	mov	$-67, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(51bytes):
+-	movdqu	-51(%rdi), %xmm2
+-	movdqu	-51(%rsi), %xmm1
+-	mov	$-51, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(35bytes):
+-	movdqu	-35(%rsi), %xmm1
+-	movdqu	-35(%rdi), %xmm2
+-	mov	$-35, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(19bytes):
+-	mov	-19(%rdi), %rax
+-	mov	-19(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-L(11bytes):
+-	mov	-11(%rdi), %rax
+-	mov	-11(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-4(%rdi), %eax
+-	mov	-4(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(15bytes):
+-	mov	-15(%rdi), %rax
+-	mov	-15(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(7bytes):
+-	mov	-7(%rdi), %eax
+-	mov	-7(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	mov	-4(%rdi), %eax
+-	mov	-4(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(3bytes):
+-	movzwl	-3(%rdi), %eax
+-	movzwl	-3(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin2bytes)
+-L(1bytes):
+-	movzbl	-1(%rdi), %eax
+-	movzbl	-1(%rsi), %ecx
+-	sub	%ecx, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(68bytes):
+-	movdqu	-68(%rdi), %xmm2
+-	movdqu	-68(%rsi), %xmm1
+-	mov	$-68, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(52bytes):
+-	movdqu	-52(%rdi), %xmm2
+-	movdqu	-52(%rsi), %xmm1
+-	mov	$-52, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(36bytes):
+-	movdqu	-36(%rdi), %xmm2
+-	movdqu	-36(%rsi), %xmm1
+-	mov	$-36, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(20bytes):
+-	movdqu	-20(%rdi), %xmm2
+-	movdqu	-20(%rsi), %xmm1
+-	mov	$-20, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-4(%rsi), %ecx
+-
+-# ifndef USE_AS_WMEMCMP
+-	mov	-4(%rdi), %eax
+-	cmp	%eax, %ecx
+-# else
+-	cmp	-4(%rdi), %ecx
+-# endif
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-/* unreal cases for wmemcmp */
+-	.p2align 4
+-L(69bytes):
+-	movdqu	-69(%rsi), %xmm1
+-	movdqu	-69(%rdi), %xmm2
+-	mov	$-69, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(53bytes):
+-	movdqu	-53(%rsi), %xmm1
+-	movdqu	-53(%rdi), %xmm2
+-	mov	$-53, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(37bytes):
+-	movdqu	-37(%rsi), %xmm1
+-	movdqu	-37(%rdi), %xmm2
+-	mov	$-37, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(21bytes):
+-	movdqu	-21(%rsi), %xmm1
+-	movdqu	-21(%rdi), %xmm2
+-	mov	$-21, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(70bytes):
+-	movdqu	-70(%rsi), %xmm1
+-	movdqu	-70(%rdi), %xmm2
+-	mov	$-70, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(54bytes):
+-	movdqu	-54(%rsi), %xmm1
+-	movdqu	-54(%rdi), %xmm2
+-	mov	$-54, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(38bytes):
+-	movdqu	-38(%rsi), %xmm1
+-	movdqu	-38(%rdi), %xmm2
+-	mov	$-38, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(22bytes):
+-	movdqu	-22(%rsi), %xmm1
+-	movdqu	-22(%rdi), %xmm2
+-	mov	$-22, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(71bytes):
+-	movdqu	-71(%rsi), %xmm1
+-	movdqu	-71(%rdi), %xmm2
+-	mov	$-71, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(55bytes):
+-	movdqu	-55(%rdi), %xmm2
+-	movdqu	-55(%rsi), %xmm1
+-	mov	$-55, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(39bytes):
+-	movdqu	-39(%rdi), %xmm2
+-	movdqu	-39(%rsi), %xmm1
+-	mov	$-39, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(23bytes):
+-	movdqu	-23(%rdi), %xmm2
+-	movdqu	-23(%rsi), %xmm1
+-	mov	$-23, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(72bytes):
+-	movdqu	-72(%rsi), %xmm1
+-	movdqu	-72(%rdi), %xmm2
+-	mov	$-72, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(56bytes):
+-	movdqu	-56(%rdi), %xmm2
+-	movdqu	-56(%rsi), %xmm1
+-	mov	$-56, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(40bytes):
+-	movdqu	-40(%rdi), %xmm2
+-	movdqu	-40(%rsi), %xmm1
+-	mov	$-40, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(24bytes):
+-	movdqu	-24(%rdi), %xmm2
+-	movdqu	-24(%rsi), %xmm1
+-	mov	$-24, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-
+-	mov	-8(%rsi), %rcx
+-	mov	-8(%rdi), %rax
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-/* unreal cases for wmemcmp */
+-	.p2align 4
+-L(73bytes):
+-	movdqu	-73(%rsi), %xmm1
+-	movdqu	-73(%rdi), %xmm2
+-	mov	$-73, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(57bytes):
+-	movdqu	-57(%rdi), %xmm2
+-	movdqu	-57(%rsi), %xmm1
+-	mov	$-57, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(41bytes):
+-	movdqu	-41(%rdi), %xmm2
+-	movdqu	-41(%rsi), %xmm1
+-	mov	$-41, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(25bytes):
+-	movdqu	-25(%rdi), %xmm2
+-	movdqu	-25(%rsi), %xmm1
+-	mov	$-25, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-9(%rdi), %rax
+-	mov	-9(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	movzbl	-1(%rdi), %eax
+-	movzbl	-1(%rsi), %ecx
+-	sub	%ecx, %eax
+-	ret
+-
+-	.p2align 4
+-L(74bytes):
+-	movdqu	-74(%rsi), %xmm1
+-	movdqu	-74(%rdi), %xmm2
+-	mov	$-74, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(58bytes):
+-	movdqu	-58(%rdi), %xmm2
+-	movdqu	-58(%rsi), %xmm1
+-	mov	$-58, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(42bytes):
+-	movdqu	-42(%rdi), %xmm2
+-	movdqu	-42(%rsi), %xmm1
+-	mov	$-42, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(26bytes):
+-	movdqu	-26(%rdi), %xmm2
+-	movdqu	-26(%rsi), %xmm1
+-	mov	$-26, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-10(%rdi), %rax
+-	mov	-10(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	movzwl	-2(%rdi), %eax
+-	movzwl	-2(%rsi), %ecx
+-	jmp	L(diffin2bytes)
+-
+-	.p2align 4
+-L(75bytes):
+-	movdqu	-75(%rsi), %xmm1
+-	movdqu	-75(%rdi), %xmm2
+-	mov	$-75, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(59bytes):
+-	movdqu	-59(%rdi), %xmm2
+-	movdqu	-59(%rsi), %xmm1
+-	mov	$-59, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(43bytes):
+-	movdqu	-43(%rdi), %xmm2
+-	movdqu	-43(%rsi), %xmm1
+-	mov	$-43, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(27bytes):
+-	movdqu	-27(%rdi), %xmm2
+-	movdqu	-27(%rsi), %xmm1
+-	mov	$-27, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-11(%rdi), %rax
+-	mov	-11(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-4(%rdi), %eax
+-	mov	-4(%rsi), %ecx
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-# endif
+-	.p2align 4
+-L(76bytes):
+-	movdqu	-76(%rsi), %xmm1
+-	movdqu	-76(%rdi), %xmm2
+-	mov	$-76, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(60bytes):
+-	movdqu	-60(%rdi), %xmm2
+-	movdqu	-60(%rsi), %xmm1
+-	mov	$-60, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(44bytes):
+-	movdqu	-44(%rdi), %xmm2
+-	movdqu	-44(%rsi), %xmm1
+-	mov	$-44, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(28bytes):
+-	movdqu	-28(%rdi), %xmm2
+-	movdqu	-28(%rsi), %xmm1
+-	mov	$-28, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-12(%rdi), %rax
+-	mov	-12(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-4(%rsi), %ecx
+-# ifndef USE_AS_WMEMCMP
+-	mov	-4(%rdi), %eax
+-	cmp	%eax, %ecx
+-# else
+-	cmp	-4(%rdi), %ecx
+-# endif
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-/* unreal cases for wmemcmp */
+-	.p2align 4
+-L(77bytes):
+-	movdqu	-77(%rsi), %xmm1
+-	movdqu	-77(%rdi), %xmm2
+-	mov	$-77, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(61bytes):
+-	movdqu	-61(%rdi), %xmm2
+-	movdqu	-61(%rsi), %xmm1
+-	mov	$-61, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(45bytes):
+-	movdqu	-45(%rdi), %xmm2
+-	movdqu	-45(%rsi), %xmm1
+-	mov	$-45, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(29bytes):
+-	movdqu	-29(%rdi), %xmm2
+-	movdqu	-29(%rsi), %xmm1
+-	mov	$-29, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-
+-	mov	-13(%rdi), %rax
+-	mov	-13(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(78bytes):
+-	movdqu	-78(%rsi), %xmm1
+-	movdqu	-78(%rdi), %xmm2
+-	mov	$-78, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(62bytes):
+-	movdqu	-62(%rdi), %xmm2
+-	movdqu	-62(%rsi), %xmm1
+-	mov	$-62, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(46bytes):
+-	movdqu	-46(%rdi), %xmm2
+-	movdqu	-46(%rsi), %xmm1
+-	mov	$-46, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(30bytes):
+-	movdqu	-30(%rdi), %xmm2
+-	movdqu	-30(%rsi), %xmm1
+-	mov	$-30, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-14(%rdi), %rax
+-	mov	-14(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-	.p2align 4
+-L(79bytes):
+-	movdqu	-79(%rsi), %xmm1
+-	movdqu	-79(%rdi), %xmm2
+-	mov	$-79, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(63bytes):
+-	movdqu	-63(%rdi), %xmm2
+-	movdqu	-63(%rsi), %xmm1
+-	mov	$-63, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(47bytes):
+-	movdqu	-47(%rdi), %xmm2
+-	movdqu	-47(%rsi), %xmm1
+-	mov	$-47, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(31bytes):
+-	movdqu	-31(%rdi), %xmm2
+-	movdqu	-31(%rsi), %xmm1
+-	mov	$-31, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-	mov	-15(%rdi), %rax
+-	mov	-15(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-# endif
+-	.p2align 4
+-L(64bytes):
+-	movdqu	-64(%rdi), %xmm2
+-	movdqu	-64(%rsi), %xmm1
+-	mov	$-64, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(48bytes):
+-	movdqu	-48(%rdi), %xmm2
+-	movdqu	-48(%rsi), %xmm1
+-	mov	$-48, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-L(32bytes):
+-	movdqu	-32(%rdi), %xmm2
+-	movdqu	-32(%rsi), %xmm1
+-	mov	$-32, %dl
+-	pxor	%xmm1, %xmm2
+-	ptest	%xmm2, %xmm0
+-	jnc	L(less16bytes)
+-
+-	mov	-16(%rdi), %rax
+-	mov	-16(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-
+-	mov	-8(%rdi), %rax
+-	mov	-8(%rsi), %rcx
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	xor	%eax, %eax
+-	ret
+-
+-/*
+- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
+- */
+-	.p2align 3
+-L(less16bytes):
+-	movsbq	%dl, %rdx
+-	mov	(%rsi, %rdx), %rcx
+-	mov	(%rdi, %rdx), %rax
+-	cmp	%rax, %rcx
+-	jne	L(diffin8bytes)
+-	mov	8(%rsi, %rdx), %rcx
+-	mov	8(%rdi, %rdx), %rax
+-L(diffin8bytes):
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	shr	$32, %rcx
+-	shr	$32, %rax
+-
++	pmovmskb %xmm0, %ecx
++	incw	%cx
++	jnz	L(loop_end_ret)
++
++	pmovmskb %xmm1, %ecx
++	notw	%cx
++	sall	$16, %ecx
++	jnz	L(loop_end_ret)
++
++	pmovmskb %xmm2, %ecx
++	notw	%cx
++	shlq	$32, %rcx
++	jnz	L(loop_end_ret)
++
++	addq	$48, %rdi
++	addq	$48, %rsi
++	movq	%rax, %rcx
++
++	.p2align 4,, 6
++L(loop_end_ret):
++	bsfq	%rcx, %rcx
+ # ifdef USE_AS_WMEMCMP
+-/* for wmemcmp */
+-	cmp	%eax, %ecx
+-	jne	L(diffin4bytes)
+-	xor	%eax, %eax
+-	ret
+-# endif
+-
+-L(diffin4bytes):
+-# ifndef USE_AS_WMEMCMP
+-	cmp	%cx, %ax
+-	jne	L(diffin2bytes)
+-	shr	$16, %ecx
+-	shr	$16, %eax
+-L(diffin2bytes):
+-	cmp	%cl, %al
+-	jne	L(end)
+-	and	$0xffff, %eax
+-	and	$0xffff, %ecx
+-	sub	%ecx, %eax
+-	ret
+-
+-	.p2align 4
+-L(end):
+-	and	$0xff, %eax
+-	and	$0xff, %ecx
+-	sub	%ecx, %eax
+-	ret
++	movl	(%rdi, %rcx), %eax
++	xorl	%edx, %edx
++	cmpl	(%rsi, %rcx), %eax
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
+ # else
+-
+-/* for wmemcmp */
+-	mov	$1, %eax
+-	jl	L(nequal_bigger)
+-	neg	%eax
+-	ret
+-
+-	.p2align 4
+-L(nequal_bigger):
+-	ret
+-
+-L(unreal_case):
+-	xor	%eax, %eax
+-	ret
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ # endif
+-
++	ret
+ END (MEMCMP)
+-
+-	.section .rodata.sse4.1,"a",@progbits
+-	.p2align 3
+-# ifndef USE_AS_WMEMCMP
+-L(table_64bytes):
+-	.int	JMPTBL (L(0bytes), L(table_64bytes))
+-	.int	JMPTBL (L(1bytes), L(table_64bytes))
+-	.int	JMPTBL (L(2bytes), L(table_64bytes))
+-	.int	JMPTBL (L(3bytes), L(table_64bytes))
+-	.int	JMPTBL (L(4bytes), L(table_64bytes))
+-	.int	JMPTBL (L(5bytes), L(table_64bytes))
+-	.int	JMPTBL (L(6bytes), L(table_64bytes))
+-	.int	JMPTBL (L(7bytes), L(table_64bytes))
+-	.int	JMPTBL (L(8bytes), L(table_64bytes))
+-	.int	JMPTBL (L(9bytes), L(table_64bytes))
+-	.int	JMPTBL (L(10bytes), L(table_64bytes))
+-	.int	JMPTBL (L(11bytes), L(table_64bytes))
+-	.int	JMPTBL (L(12bytes), L(table_64bytes))
+-	.int	JMPTBL (L(13bytes), L(table_64bytes))
+-	.int	JMPTBL (L(14bytes), L(table_64bytes))
+-	.int	JMPTBL (L(15bytes), L(table_64bytes))
+-	.int	JMPTBL (L(16bytes), L(table_64bytes))
+-	.int	JMPTBL (L(17bytes), L(table_64bytes))
+-	.int	JMPTBL (L(18bytes), L(table_64bytes))
+-	.int	JMPTBL (L(19bytes), L(table_64bytes))
+-	.int	JMPTBL (L(20bytes), L(table_64bytes))
+-	.int	JMPTBL (L(21bytes), L(table_64bytes))
+-	.int	JMPTBL (L(22bytes), L(table_64bytes))
+-	.int	JMPTBL (L(23bytes), L(table_64bytes))
+-	.int	JMPTBL (L(24bytes), L(table_64bytes))
+-	.int	JMPTBL (L(25bytes), L(table_64bytes))
+-	.int	JMPTBL (L(26bytes), L(table_64bytes))
+-	.int	JMPTBL (L(27bytes), L(table_64bytes))
+-	.int	JMPTBL (L(28bytes), L(table_64bytes))
+-	.int	JMPTBL (L(29bytes), L(table_64bytes))
+-	.int	JMPTBL (L(30bytes), L(table_64bytes))
+-	.int	JMPTBL (L(31bytes), L(table_64bytes))
+-	.int	JMPTBL (L(32bytes), L(table_64bytes))
+-	.int	JMPTBL (L(33bytes), L(table_64bytes))
+-	.int	JMPTBL (L(34bytes), L(table_64bytes))
+-	.int	JMPTBL (L(35bytes), L(table_64bytes))
+-	.int	JMPTBL (L(36bytes), L(table_64bytes))
+-	.int	JMPTBL (L(37bytes), L(table_64bytes))
+-	.int	JMPTBL (L(38bytes), L(table_64bytes))
+-	.int	JMPTBL (L(39bytes), L(table_64bytes))
+-	.int	JMPTBL (L(40bytes), L(table_64bytes))
+-	.int	JMPTBL (L(41bytes), L(table_64bytes))
+-	.int	JMPTBL (L(42bytes), L(table_64bytes))
+-	.int	JMPTBL (L(43bytes), L(table_64bytes))
+-	.int	JMPTBL (L(44bytes), L(table_64bytes))
+-	.int	JMPTBL (L(45bytes), L(table_64bytes))
+-	.int	JMPTBL (L(46bytes), L(table_64bytes))
+-	.int	JMPTBL (L(47bytes), L(table_64bytes))
+-	.int	JMPTBL (L(48bytes), L(table_64bytes))
+-	.int	JMPTBL (L(49bytes), L(table_64bytes))
+-	.int	JMPTBL (L(50bytes), L(table_64bytes))
+-	.int	JMPTBL (L(51bytes), L(table_64bytes))
+-	.int	JMPTBL (L(52bytes), L(table_64bytes))
+-	.int	JMPTBL (L(53bytes), L(table_64bytes))
+-	.int	JMPTBL (L(54bytes), L(table_64bytes))
+-	.int	JMPTBL (L(55bytes), L(table_64bytes))
+-	.int	JMPTBL (L(56bytes), L(table_64bytes))
+-	.int	JMPTBL (L(57bytes), L(table_64bytes))
+-	.int	JMPTBL (L(58bytes), L(table_64bytes))
+-	.int	JMPTBL (L(59bytes), L(table_64bytes))
+-	.int	JMPTBL (L(60bytes), L(table_64bytes))
+-	.int	JMPTBL (L(61bytes), L(table_64bytes))
+-	.int	JMPTBL (L(62bytes), L(table_64bytes))
+-	.int	JMPTBL (L(63bytes), L(table_64bytes))
+-	.int	JMPTBL (L(64bytes), L(table_64bytes))
+-	.int	JMPTBL (L(65bytes), L(table_64bytes))
+-	.int	JMPTBL (L(66bytes), L(table_64bytes))
+-	.int	JMPTBL (L(67bytes), L(table_64bytes))
+-	.int	JMPTBL (L(68bytes), L(table_64bytes))
+-	.int	JMPTBL (L(69bytes), L(table_64bytes))
+-	.int	JMPTBL (L(70bytes), L(table_64bytes))
+-	.int	JMPTBL (L(71bytes), L(table_64bytes))
+-	.int	JMPTBL (L(72bytes), L(table_64bytes))
+-	.int	JMPTBL (L(73bytes), L(table_64bytes))
+-	.int	JMPTBL (L(74bytes), L(table_64bytes))
+-	.int	JMPTBL (L(75bytes), L(table_64bytes))
+-	.int	JMPTBL (L(76bytes), L(table_64bytes))
+-	.int	JMPTBL (L(77bytes), L(table_64bytes))
+-	.int	JMPTBL (L(78bytes), L(table_64bytes))
+-	.int	JMPTBL (L(79bytes), L(table_64bytes))
+-# else
+-L(table_64bytes):
+-	.int	JMPTBL (L(0bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(4bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(8bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(12bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(16bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(20bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(24bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(28bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(32bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(36bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(40bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(44bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(48bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(52bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(56bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(60bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(64bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(68bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(72bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(76bytes), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+-# endif
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-64.patch b/glibc-RHEL-15696-64.patch
new file mode 100644
index 0000000..ba7f14a
--- /dev/null
+++ b/glibc-RHEL-15696-64.patch
@@ -0,0 +1,39 @@
+From 0b82747dc48d5bf0871bdc6da8cb6eec1256355f Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Nov 2021 06:31:51 -0800
+Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_lock_full [BZ
+ #28537]
+Content-type: text/plain; charset=UTF-8
+
+Replace boolean CAS with value CAS to avoid the extra load.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+---
+ nptl/pthread_mutex_lock.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index 29cc143e..60ada70d 100644
+--- a/nptl/pthread_mutex_lock.c
++++ b/nptl/pthread_mutex_lock.c
+@@ -292,12 +292,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
+-							oldval | FUTEX_WAITERS,
+-							oldval)
+-		  != 0)
++	      int val;
++	      if ((val = atomic_compare_and_exchange_val_acq
++		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
++		    oldval)) != oldval)
+ 		{
+-		  oldval = mutex->__data.__lock;
++		  oldval = val;
+ 		  continue;
+ 		}
+ 	      oldval |= FUTEX_WAITERS;
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-65.patch b/glibc-RHEL-15696-65.patch
new file mode 100644
index 0000000..296d4a9
--- /dev/null
+++ b/glibc-RHEL-15696-65.patch
@@ -0,0 +1,39 @@
+From 49302b8fdf9103b6fc0a398678668a22fa19574c Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Nov 2021 06:54:01 -0800
+Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_clocklock_common
+ [BZ #28537]
+Content-type: text/plain; charset=UTF-8
+
+Replace boolean CAS with value CAS to avoid the extra load.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+---
+ nptl/pthread_mutex_timedlock.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
+index 888c12fe..c4627ef6 100644
+--- a/nptl/pthread_mutex_timedlock.c
++++ b/nptl/pthread_mutex_timedlock.c
+@@ -269,12 +269,12 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
+-							oldval | FUTEX_WAITERS,
+-							oldval)
+-		  != 0)
++	      int val;
++	      if ((val = atomic_compare_and_exchange_val_acq
++		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
++		    oldval)) != oldval)
+ 		{
+-		  oldval = mutex->__data.__lock;
++		  oldval = val;
+ 		  continue;
+ 		}
+ 	      oldval |= FUTEX_WAITERS;
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-66.patch b/glibc-RHEL-15696-66.patch
new file mode 100644
index 0000000..4579636
--- /dev/null
+++ b/glibc-RHEL-15696-66.patch
@@ -0,0 +1,51 @@
+From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 2 Nov 2021 18:33:07 -0700
+Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537]
+Content-type: text/plain; charset=UTF-8
+
+CAS instruction is expensive.  From the x86 CPU's point of view, getting
+a cache line for writing is more expensive than reading.  See Appendix
+A.2 Spinlock in:
+
+https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
+
+The full compare and swap will grab the cache line exclusive and cause
+excessive cache line bouncing.
+
+Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock
+loop if compare may fail to reduce cache line bouncing on contended locks.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+---
+ nptl/pthread_mutex_lock.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index 60ada70d..eb4d8baa 100644
+--- a/nptl/pthread_mutex_lock.c
++++ b/nptl/pthread_mutex_lock.c
+@@ -56,6 +56,11 @@
+ #define FORCE_ELISION(m, s)
+ #endif
+ 
++#ifndef LLL_MUTEX_READ_LOCK
++# define LLL_MUTEX_READ_LOCK(mutex) \
++  atomic_load_relaxed (&(mutex)->__data.__lock)
++#endif
++
+ static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+      __attribute_noinline__;
+ 
+@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 		  break;
+ 		}
+ 	      atomic_spin_nop ();
++	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
++		continue;
+ 	    }
+ 	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
+ 
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-67.patch b/glibc-RHEL-15696-67.patch
new file mode 100644
index 0000000..73c8306
--- /dev/null
+++ b/glibc-RHEL-15696-67.patch
@@ -0,0 +1,71 @@
+From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 12 Nov 2021 11:47:42 -0800
+Subject: [PATCH] Move assignment out of the CAS condition
+Content-type: text/plain; charset=UTF-8
+
+Update
+
+commit 49302b8fdf9103b6fc0a398678668a22fa19574c
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Thu Nov 11 06:54:01 2021 -0800
+
+    Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537]
+
+    Replace boolean CAS with value CAS to avoid the extra load.
+
+and
+
+commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Thu Nov 11 06:31:51 2021 -0800
+
+    Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537]
+
+    Replace boolean CAS with value CAS to avoid the extra load.
+
+by moving assignment out of the CAS condition.
+---
+ nptl/pthread_mutex_lock.c      | 7 +++----
+ nptl/pthread_mutex_timedlock.c | 7 +++----
+ 2 files changed, 6 insertions(+), 8 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index eb4d8baa..a633d95e 100644
+--- a/nptl/pthread_mutex_lock.c
++++ b/nptl/pthread_mutex_lock.c
+@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      int val;
+-	      if ((val = atomic_compare_and_exchange_val_acq
+-		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+-		    oldval)) != oldval)
++	      int val = atomic_compare_and_exchange_val_acq
++		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
++	      if (val != oldval)
+ 		{
+ 		  oldval = val;
+ 		  continue;
+diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
+index c4627ef6..a76c30b7 100644
+--- a/nptl/pthread_mutex_timedlock.c
++++ b/nptl/pthread_mutex_timedlock.c
+@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      int val;
+-	      if ((val = atomic_compare_and_exchange_val_acq
+-		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+-		    oldval)) != oldval)
++	      int val = atomic_compare_and_exchange_val_acq
++		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
++	      if (val != oldval)
+ 		{
+ 		  oldval = val;
+ 		  continue;
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-68.patch b/glibc-RHEL-15696-68.patch
new file mode 100644
index 0000000..df35b31
--- /dev/null
+++ b/glibc-RHEL-15696-68.patch
@@ -0,0 +1,60 @@
+From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 3 Dec 2021 15:29:25 -0800
+Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646]
+Content-type: text/plain; charset=UTF-8
+
+Must use notl %edi here as lower bits are for CHAR comparisons
+potentially out of range thus can be 0 without indicating mismatch.
+This fixes BZ #28646.
+
+Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+Conflicts:
+	string/test-strcmp.c
+	(new check omitted)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 82f12ac8..6f5c4bf9 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -656,12 +656,13 @@ L(loop_cross_page):
+ 	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
+ 	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+ 	kmovd	%k3, %edi
++    /* Must use notl %edi here as lower bits are for CHAR
++	   comparisons potentially out of range thus can be 0 without
++	   indicating mismatch.  */
++	notl	%edi
+ # ifdef USE_AS_WCSCMP
+ 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	notl	%edi
+ 	andl	$0xff, %edi
+-# else
+-	incl	%edi
+ # endif
+ 
+ # ifdef USE_AS_WCSCMP
+@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
+ 	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+ 	kmovd	%k3, %edi
++	/* Must use notl %edi here as lower bits are for CHAR
++	   comparisons potentially out of range thus can be 0 without
++	   indicating mismatch.  */
++	notl	%edi
+ # ifdef USE_AS_WCSCMP
+ 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	notl	%edi
+ 	andl	$0xff, %edi
+-# else
+-	incl	%edi
+ # endif
+ 
+ # ifdef USE_AS_WCSCMP
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-69.patch b/glibc-RHEL-15696-69.patch
new file mode 100644
index 0000000..9f859f2
--- /dev/null
+++ b/glibc-RHEL-15696-69.patch
@@ -0,0 +1,35 @@
+From ceeffe968c01b1202e482f4855cb6baf5c6cb713 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 6 Dec 2021 07:14:12 -0800
+Subject: [PATCH] x86: Don't set Prefer_No_AVX512 for processors with AVX512
+ and AVX-VNNI
+Content-type: text/plain; charset=UTF-8
+
+Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
+they won't lower CPU frequency when ZMM load and store instructions are
+used.
+---
+ sysdeps/x86/cpu-features.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 956bfb4f..5ff2baa0 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -525,8 +525,11 @@ init_cpu_features (struct cpu_features *cpu_features)
+ 	  |= bit_arch_Prefer_No_VZEROUPPER;
+       else
+ 	{
+-	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
+-	    |= bit_arch_Prefer_No_AVX512;
++	  /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
++	     when ZMM load and store instructions are used.  */
++	  if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
++	    cpu_features->preferred[index_arch_Prefer_No_AVX512]
++	      |= bit_arch_Prefer_No_AVX512;
+ 
+ 	  /* Avoid RTM abort triggered by VZEROUPPER inside a
+ 	     transactionally executing RTM region.  */
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-7.patch b/glibc-RHEL-15696-7.patch
new file mode 100644
index 0000000..8ef468c
--- /dev/null
+++ b/glibc-RHEL-15696-7.patch
@@ -0,0 +1,153 @@
+From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:35:18 -0800
+Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ#
+ 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes strncpy for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy.
+	* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file.
+---
+ .../x86_64/multiarch/strcpy-sse2-unaligned.S  |  4 +-
+ sysdeps/x86_64/multiarch/strcpy-ssse3.S       |  6 +-
+ sysdeps/x86_64/x32/Makefile                   |  2 +-
+ sysdeps/x86_64/x32/tst-size_t-strncpy.c       | 58 +++++++++++++++++++
+ 4 files changed, 64 insertions(+), 6 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+	sysdeps/x86_64/multiarch/strcpy-avx2.S
+	(skipped, only needed for x32 arch)
+
+diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+index 72bf7e85..50aca22d 100644
+--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
++++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+@@ -40,8 +40,8 @@
+ .text
+ ENTRY (STRCPY)
+ #  ifdef USE_AS_STRNCPY
+-	mov	%rdx, %r8
+-	test	%r8, %r8
++	mov	%RDX_LP, %R8_LP
++	test	%R8_LP, %R8_LP
+ 	jz	L(ExitZero)
+ #  endif
+ 	mov	%rsi, %rcx
+diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+index 9858d0c4..0a62814a 100644
+--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
++++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+@@ -31,13 +31,13 @@ ENTRY (STRCPY)
+ 
+ 	mov	%rsi, %rcx
+ #  ifdef USE_AS_STRNCPY
+-	mov	%rdx, %r8
++	mov	%RDX_LP, %R8_LP
+ #  endif
+ 	mov	%rdi, %rdx
+ #  ifdef USE_AS_STRNCPY
+-	test	%r8, %r8
++	test	%R8_LP, %R8_LP
+ 	jz	L(Exit0)
+-	cmp	$8, %r8
++	cmp	$8, %R8_LP
+ 	jbe	L(StrncpyExit8Bytes)
+ # endif
+ 	cmpb	$0, (%rcx)
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index db302839..2a9e20a9 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -8,7 +8,7 @@ endif
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+-	 tst-size_t-strncmp
++	 tst-size_t-strncmp tst-size_t-strncpy
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
+new file mode 100644
+index 00000000..4dec71e6
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
+@@ -0,0 +1,58 @@
++/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_NAME "strncpy"
++#include "test-size_t.h"
++
++IMPL (strncpy, 1)
++
++typedef char *(*proto_t) (char *, const char*, size_t);
++
++static void *
++__attribute__ ((noinline, noclone))
++do_strncpy (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t dest = { { page_size }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      do_strncpy (dest, src);
++      int res = strncmp (dest.p, src.p, dest.len);
++      if (res)
++	{
++	  error (0, 0, "Wrong result in function %s: %i != 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-70.patch b/glibc-RHEL-15696-70.patch
new file mode 100644
index 0000000..8935ac5
--- /dev/null
+++ b/glibc-RHEL-15696-70.patch
@@ -0,0 +1,389 @@
+From abddd61de090ae84e380aff68a98bd94ef704667 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 24 Dec 2021 18:54:41 -0600
+Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+Optimizations are twofold.
+
+1) Replace page cross and 0/1 checks with masked load instructions in
+   L(less_vec). In applications this reduces branch-misses in the
+   hot [0, 32] case.
+2) Change controlflow so that L(less_vec) case gets the fall through.
+
+Change 2) helps copies in the [0, 32] size range but comes at the cost
+of copies in the [33, 64] size range.  From profiles of GCC and
+Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
+appears to be the right tradeoff.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++--------------
+ 1 file changed, 56 insertions(+), 193 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 640f6757..d2899e7c 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -62,15 +62,18 @@ Latency:
+ # define VMOVU		vmovdqu64
+ 
+ # ifdef USE_AS_WMEMCMP
++#  define VMOVU_MASK	vmovdqu32
+ #  define CHAR_SIZE	4
+ #  define VPCMP	vpcmpd
+ #  define VPTEST	vptestmd
+ # else
++#  define VMOVU_MASK	vmovdqu8
+ #  define CHAR_SIZE	1
+ #  define VPCMP	vpcmpub
+ #  define VPTEST	vptestmb
+ # endif
+ 
++
+ # define VEC_SIZE	32
+ # define PAGE_SIZE	4096
+ # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	movl	%edx, %edx
+ # endif
+ 	cmp	$CHAR_PER_VEC, %RDX_LP
+-	jb	L(less_vec)
++	/* Fall through for [0, VEC_SIZE] as its the hottest.  */
++	ja	L(more_1x_vec)
++
++	/* Create mask for CHAR's we want to compare. This allows us to
++	   avoid having to include page cross logic.  */
++	movl	$-1, %ecx
++	bzhil	%edx, %ecx, %ecx
++	kmovd	%ecx, %k2
++
++	/* Safe to load full ymm with mask.  */
++	VMOVU_MASK (%rsi), %YMM2{%k2}
++	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
++	kmovd	%k1, %eax
++	testl	%eax, %eax
++	jnz	L(return_vec_0)
++	ret
+ 
++	.p2align 4
++L(return_vec_0):
++	tzcntl	%eax, %eax
++# ifdef USE_AS_WMEMCMP
++	movl	(%rdi, %rax, CHAR_SIZE), %ecx
++	xorl	%edx, %edx
++	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
++	/* NB: no partial register stall here because xorl zero idiom
++	   above.  */
++	setg	%dl
++	leal	-1(%rdx, %rdx), %eax
++# else
++	movzbl	(%rsi, %rax), %ecx
++	movzbl	(%rdi, %rax), %eax
++	subl	%ecx, %eax
++# endif
++	ret
++
++
++	.p2align 4
++L(more_1x_vec):
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	(%rsi), %YMM1
+ 	/* Use compare not equals to directly check for mismatch.  */
+-	VPCMP	$4, (%rdi), %YMM1, %k1
++	VPCMP	$4,(%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	/* NB: eax must be destination register if going to
+ 	   L(return_vec_[0,2]). For L(return_vec_3) destination register
+@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 
+ 	/* Check third and fourth VEC no matter what.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
++	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
++	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ 	kmovd	%k1, %ecx
+ 	testl	%ecx, %ecx
+ 	jnz	L(return_vec_3)
+@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+ 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+ 	   oring with YMM1. Result is stored in YMM4.  */
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
++	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ 
+ 	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
+ 	/* NB: eax must be zero to reach here.  */
+ 	ret
+ 
+-	.p2align 4
++
++	.p2align 4,, 8
+ L(8x_end_return_vec_0_1_2_3):
+ 	movq	%rdx, %rdi
+ L(8x_return_vec_0_1_2_3):
+@@ -222,23 +262,6 @@ L(return_vec_3):
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(return_vec_0):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+-	/* NB: no partial register stall here because xorl zero idiom
+-	   above.  */
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rsi, %rax), %ecx
+-	movzbl	(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+ 
+ 	.p2align 4
+ L(return_vec_1):
+@@ -297,7 +320,7 @@ L(loop_4x_vec):
+ 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
++	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ 	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+@@ -324,7 +347,7 @@ L(loop_4x_vec):
+ 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+ 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
++	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+ 	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ 	VPTEST	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+@@ -336,14 +359,14 @@ L(loop_4x_vec):
+ 	/* Only entry is from L(more_8x_vec).  */
+ 	.p2align 4,, 10
+ L(8x_last_2x_vec):
+-	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
++	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(8x_return_vec_2)
+ 	/* Naturally aligned to 16 bytes.  */
+ L(8x_last_1x_vec):
+ 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+-	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
++	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(8x_return_vec_3)
+@@ -392,7 +415,9 @@ L(last_1x_vec):
+ 	jnz	L(return_vec_0_end)
+ 	ret
+ 
+-	.p2align 4,, 10
++
++	/* Don't align. Takes 2-fetch blocks either way and aligning
++	   will cause code to spill into another cacheline.  */
+ L(return_vec_1_end):
+ 	/* Use bsf to save code size. This is necessary to have
+ 	   L(one_or_less) fit in aligning bytes between.  */
+@@ -411,31 +436,8 @@ L(return_vec_1_end):
+ # endif
+ 	ret
+ 
+-	/* NB: L(one_or_less) fits in alignment padding between
+-	   L(return_vec_1_end) and L(return_vec_0_end).  */
+-# ifdef USE_AS_WMEMCMP
+-L(one_or_less):
+-	jb	L(zero)
+-	movl	(%rdi), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi), %ecx
+-	je	L(zero)
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-	ret
+-# else
+-L(one_or_less):
+-	jb	L(zero)
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
+-	ret
+-# endif
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-
+-	.p2align 4
++	/* Don't align. Takes 2-fetch blocks either way and aligning
++	   will cause code to spill into another cacheline.  */
+ L(return_vec_0_end):
+ 	tzcntl	%eax, %eax
+ 	addl	%edx, %eax
+@@ -451,146 +453,7 @@ L(return_vec_0_end):
+ 	subl	%ecx, %eax
+ # endif
+ 	ret
++	/* 1-byte until next cache line.  */
+ 
+-	.p2align 4
+-L(less_vec):
+-	/* Check if one or less CHAR. This is necessary for size == 0
+-	   but is also faster for size == CHAR_SIZE.  */
+-	cmpl	$1, %edx
+-	jbe	L(one_or_less)
+-
+-	/* Check if loading one VEC from either s1 or s2 could cause a
+-	   page cross. This can have false positives but is by far the
+-	   fastest method.  */
+-	movl	%edi, %eax
+-	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(page_cross_less_vec)
+-
+-	/* No page cross possible.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMP	$4, (%rdi), %YMM2, %k1
+-	kmovd	%k1, %eax
+-	/* Check if any matches where in bounds. Intentionally not
+-	   storing result in eax to limit dependency chain if it goes to
+-	   L(return_vec_0_lv).  */
+-	bzhil	%edx, %eax, %edx
+-	jnz	L(return_vec_0_lv)
+-	xorl	%eax, %eax
+-	ret
+-
+-	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+-	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+-	   the jump and ends up fitting in aligning bytes. As well fits on
+-	   same cache line as L(less_vec) so also saves a line from having
+-	   to be fetched on cold calls to memcmp.  */
+-	.p2align 4,, 4
+-L(return_vec_0_lv):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCMP
+-	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+-	xorl	%edx, %edx
+-	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+-	/* NB: no partial register stall here because xorl zero idiom
+-	   above.  */
+-	setg	%dl
+-	leal	-1(%rdx, %rdx), %eax
+-# else
+-	movzbl	(%rsi, %rax), %ecx
+-	movzbl	(%rdi, %rax), %eax
+-	subl	%ecx, %eax
+-# endif
+-	ret
+-
+-	.p2align 4
+-L(page_cross_less_vec):
+-	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+-	   bytes.  */
+-	cmpl	$(16 / CHAR_SIZE), %edx
+-	jae	L(between_16_31)
+-# ifndef USE_AS_WMEMCMP
+-	cmpl	$8, %edx
+-	jae	L(between_8_15)
+-	cmpl	$4, %edx
+-	jb	L(between_2_3)
+-
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	/* edx is guranteed to be positive int32 in range [4, 7].  */
+-	cmovne	%edx, %eax
+-	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+-	sbbl	%ecx, %ecx
+-	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+-	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+-	   eax doesn't matter.  */
+-	orl	%ecx, %eax
+-	ret
+-
+-	.p2align 4,, 8
+-L(between_8_15):
+-# endif
+-	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+-	vmovq	(%rdi), %xmm1
+-	vmovq	(%rsi), %xmm2
+-	VPCMP	$4, %xmm1, %xmm2, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_lv)
+-	/* Use overlapping loads to avoid branches.  */
+-	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+-	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+-	VPCMP	$4, %xmm1, %xmm2, %k1
+-	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_end)
+-	ret
+-
+-	.p2align 4,, 8
+-L(between_16_31):
+-	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-
+-	/* Use movups to save code size.  */
+-	vmovdqu	(%rsi), %xmm2
+-	VPCMP	$4, (%rdi), %xmm2, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_lv)
+-	/* Use overlapping loads to avoid branches.  */
+-	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+-	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+-	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(return_vec_0_end)
+-	ret
+-
+-# ifndef USE_AS_WMEMCMP
+-L(between_2_3):
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
+-	ret
+-# endif
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-71.patch b/glibc-RHEL-15696-71.patch
new file mode 100644
index 0000000..2d018d0
--- /dev/null
+++ b/glibc-RHEL-15696-71.patch
@@ -0,0 +1,43 @@
+From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001
+From: Jangwoong Kim <6812skiii@gmail.com>
+Date: Tue, 14 Dec 2021 21:30:51 +0900
+Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop
+Content-type: text/plain; charset=UTF-8
+
+The commit:
+"Add LLL_MUTEX_READ_LOCK [BZ #28537]"
+SHA1: d672a98a1af106bd68deb15576710cd61363f7a6
+
+introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop
+if atomic load fails. But, "continue" inside of do-while loop
+does not skip the evaluation of escape expression, thus CAS
+is not skipped.
+
+Replace do-while with while and skip LLL_MUTEX_TRYLOCK if
+LLL_MUTEX_READ_LOCK fails.
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ nptl/pthread_mutex_lock.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index a633d95e..d96a9933 100644
+--- a/nptl/pthread_mutex_lock.c
++++ b/nptl/pthread_mutex_lock.c
+@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 		  break;
+ 		}
+ 	      atomic_spin_nop ();
+-	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
+-		continue;
+ 	    }
+-	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
++	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
++		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
+ 
+ 	  mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
+ 	}
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-72.patch b/glibc-RHEL-15696-72.patch
new file mode 100644
index 0000000..34f2a61
--- /dev/null
+++ b/glibc-RHEL-15696-72.patch
@@ -0,0 +1,146 @@
+From 7835d611af0854e69a0c71e3806f8fe379282d6f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 18 Feb 2022 14:19:15 -0600
+Subject: [PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896]
+Content-type: text/plain; charset=UTF-8
+
+In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
+call strcmp-avx2 and wcscmp-avx2 respectively. Those would have
+no checks around vzeroupper and would trigger spurious
+aborts. This commit fixes that.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
+AVX2 machines with and without RTM.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/Makefile          |  5 ++++-
+ sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++---------
+ sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++
+ 3 files changed, 48 insertions(+), 10 deletions(-)
+ create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c
+
+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+index 2d814915..c2111f49 100644
+--- a/sysdeps/x86/Makefile
++++ b/sysdeps/x86/Makefile
+@@ -28,7 +28,9 @@ tests += \
+   tst-strcpy-rtm \
+   tst-strlen-rtm \
+   tst-strncmp-rtm \
+-  tst-strrchr-rtm
++  tst-strrchr-rtm \
++  tst-wcsncmp-rtm \
++# tests
+ 
+ CFLAGS-tst-memchr-rtm.c += -mrtm
+ CFLAGS-tst-memcmp-rtm.c += -mrtm
+@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm
+ CFLAGS-tst-strlen-rtm.c += -mrtm
+ CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
+ CFLAGS-tst-strrchr-rtm.c += -mrtm
++CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
+ endif
+ 
+ ifneq ($(enable-cet),no)
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index 4d0004b5..4e9f094f 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
++++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -19,18 +19,32 @@
+ #include <stdint.h>
+ #include <tst-string-rtm.h>
+ 
++#ifdef WIDE
++# define CHAR wchar_t
++# define MEMSET wmemset
++# define STRNCMP wcsncmp
++# define TEST_NAME wcsncmp
++#else /* !WIDE */
++# define CHAR char
++# define MEMSET memset
++# define STRNCMP strncmp
++# define TEST_NAME strncmp
++#endif /* !WIDE */
++
++
++
+ #define LOOP 3000
+ #define STRING_SIZE 1024
+-char string1[STRING_SIZE];
+-char string2[STRING_SIZE];
++CHAR string1[STRING_SIZE];
++CHAR string2[STRING_SIZE];
+ 
+ __attribute__ ((noinline, noclone))
+ static int
+ prepare (void)
+ {
+-  memset (string1, 'a', STRING_SIZE - 1);
+-  memset (string2, 'a', STRING_SIZE - 1);
+-  if (strncmp (string1, string2, STRING_SIZE) == 0)
++  MEMSET (string1, 'a', STRING_SIZE - 1);
++  MEMSET (string2, 'a', STRING_SIZE - 1);
++  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
+     return EXIT_SUCCESS;
+   else
+     return EXIT_FAILURE;
+@@ -40,7 +54,7 @@ __attribute__ ((noinline, noclone))
+ static int
+ function (void)
+ {
+-  if (strncmp (string1, string2, STRING_SIZE) == 0)
++  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
+     return 0;
+   else
+     return 1;
+@@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone))
+ static int
+ function_overflow (void)
+ {
+-  if (strncmp (string1, string2, SIZE_MAX) == 0)
++  if (STRNCMP (string1, string2, SIZE_MAX) == 0)
+     return 0;
+   else
+     return 1;
+@@ -59,9 +73,9 @@ function_overflow (void)
+ static int
+ do_test (void)
+ {
+-  int status = do_test_1 ("strncmp", LOOP, prepare, function);
++  int status = do_test_1 (TEST_NAME, LOOP, prepare, function);
+   if (status != EXIT_SUCCESS)
+     return status;
+-  status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
++  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
+   return status;
+ }
+diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c
+new file mode 100644
+index 00000000..bad3b863
+--- /dev/null
++++ b/sysdeps/x86/tst-wcsncmp-rtm.c
+@@ -0,0 +1,21 @@
++/* Test case for wcsncmp inside a transactionally executing RTM region.
++   Copyright (C) 2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include <wchar.h>
++#include "tst-strncmp-rtm.c"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-73.patch b/glibc-RHEL-15696-73.patch
new file mode 100644
index 0000000..e8cc3a2
--- /dev/null
+++ b/glibc-RHEL-15696-73.patch
@@ -0,0 +1,37 @@
+From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 18 Feb 2022 17:00:25 -0600
+Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c
+Content-type: text/plain; charset=UTF-8
+
+Previously TEST_NAME was passing a function pointer. This didn't fail
+because of the -Wno-error flag (to allow for overflow sizes passed
+to strncmp/wcsncmp).
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/tst-strncmp-rtm.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index 4e9f094f..aef9866c 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
++++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -23,12 +23,12 @@
+ # define CHAR wchar_t
+ # define MEMSET wmemset
+ # define STRNCMP wcsncmp
+-# define TEST_NAME wcsncmp
++# define TEST_NAME "wcsncmp"
+ #else /* !WIDE */
+ # define CHAR char
+ # define MEMSET memset
+ # define STRNCMP strncmp
+-# define TEST_NAME strncmp
++# define TEST_NAME "strncmp"
+ #endif /* !WIDE */
+ 
+ 
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-74.patch b/glibc-RHEL-15696-74.patch
new file mode 100644
index 0000000..e5e6842
--- /dev/null
+++ b/glibc-RHEL-15696-74.patch
@@ -0,0 +1,1798 @@
+From b77b06e0e296f1a2276c27a67e1d44f2cfa38d45 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 10 Jan 2022 15:35:38 -0600
+Subject: [PATCH] x86: Optimize strcmp-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+Optimizations are primarily to the loop logic and how the page cross
+logic interacts with the loop.
+
+The page cross logic is at times more expensive for short strings near
+the end of a page but not crossing the page. This is done to retest
+the page cross conditions with a non-faulty check and to improve the
+logic for entering the loop afterwards. This is only in particular cases,
+however, and is generally made up for by more than 10x improvements on
+the transition from the page cross -> loop case.
+
+The non-page cross cases are improved most for smaller sizes [0, 128]
+and are about even for (128, 4096]. The loop page cross logic is
+improved so some more significant speedup is seen there as well.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 1592 ++++++++++++++----------
+ 1 file changed, 940 insertions(+), 652 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strcmp-avx2.S
+	(account for sw28896 patches)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 70d8499b..554ffe4c 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -26,35 +26,57 @@
+ 
+ # define PAGE_SIZE	4096
+ 
+-/* VEC_SIZE = Number of bytes in a ymm register */
++	/* VEC_SIZE = Number of bytes in a ymm register.  */
+ # define VEC_SIZE	32
+ 
+-/* Shift for dividing by (VEC_SIZE * 4).  */
+-# define DIVIDE_BY_VEC_4_SHIFT	7
+-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+-# endif
++# define VMOVU	vmovdqu
++# define VMOVA	vmovdqa
+ 
+ # ifdef USE_AS_WCSCMP
+-/* Compare packed dwords.  */
++	/* Compare packed dwords.  */
+ #  define VPCMPEQ	vpcmpeqd
+-/* Compare packed dwords and store minimum.  */
++	/* Compare packed dwords and store minimum.  */
+ #  define VPMINU	vpminud
+-/* 1 dword char == 4 bytes.  */
++	/* 1 dword char == 4 bytes.  */
+ #  define SIZE_OF_CHAR	4
+ # else
+-/* Compare packed bytes.  */
++	/* Compare packed bytes.  */
+ #  define VPCMPEQ	vpcmpeqb
+-/* Compare packed bytes and store minimum.  */
++	/* Compare packed bytes and store minimum.  */
+ #  define VPMINU	vpminub
+-/* 1 byte char == 1 byte.  */
++	/* 1 byte char == 1 byte.  */
+ #  define SIZE_OF_CHAR	1
+ # endif
+ 
++# ifdef USE_AS_STRNCMP
++#  define LOOP_REG	r9d
++#  define LOOP_REG64	r9
++
++#  define OFFSET_REG8	r9b
++#  define OFFSET_REG	r9d
++#  define OFFSET_REG64	r9
++# else
++#  define LOOP_REG	edx
++#  define LOOP_REG64	rdx
++
++#  define OFFSET_REG8	dl
++#  define OFFSET_REG	edx
++#  define OFFSET_REG64	rdx
++# endif
++
+ # ifndef VZEROUPPER
+ #  define VZEROUPPER	vzeroupper
+ # endif
+ 
++# if defined USE_AS_STRNCMP
++#  define VEC_OFFSET	0
++# else
++#  define VEC_OFFSET	(-VEC_SIZE)
++# endif
++
++# define xmmZERO	xmm15
++# define ymmZERO	ymm15
++
+ # ifndef SECTION
+ #  define SECTION(p)	p##.avx
+ # endif
+@@ -79,783 +101,1049 @@
+    the maximum offset is reached before a difference is found, zero is
+    returned.  */
+ 
+-	.section SECTION(.text),"ax",@progbits
+-ENTRY (STRCMP)
++	.section SECTION(.text), "ax", @progbits
++ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+-	/* Check for simple cases (0 or 1) in offset.  */
++#  ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %rdx
++#  endif
+ 	cmp	$1, %RDX_LP
+-	je	L(char0)
+-	jb	L(zero)
++	/* Signed comparison intentional. We use this branch to also
++	   test cases where length >= 2^63. These very large sizes can be
++	   handled with strcmp as there is no way for that length to
++	   actually bound the buffer.  */
++	jle	L(one_or_less)
+ #  ifdef USE_AS_WCSCMP
+-#  ifndef __ILP32__
+ 	movq	%rdx, %rcx
+-	/* Check if length could overflow when multiplied by
+-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
+-	   overflow cases as well as redirect cases where its impossible to
+-	   length to bound a valid memory region. In these cases just use
+-	   'wcscmp'.  */
++
++	/* Multiplying length by sizeof(wchar_t) can result in overflow.
++	   Check if that is possible. All cases where overflow are possible
++	   are cases where length is large enough that it can never be a
++	   bound on valid memory so just use wcscmp.  */
+ 	shrq	$56, %rcx
+-	jnz	OVERFLOW_STRCMP
+-#  endif
+-	/* Convert units: from wide to byte char.  */
+-	shl	$2, %RDX_LP
++	jnz	__wcscmp_avx2
++
++	leaq	(, %rdx, 4), %rdx
+ #  endif
+-	/* Register %r11 tracks the maximum offset.  */
+-	mov	%RDX_LP, %R11_LP
+ # endif
++	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
+ 	movl	%edi, %eax
+-	xorl	%edx, %edx
+-	/* Make %xmm7 (%ymm7) all zeros in this function.  */
+-	vpxor	%xmm7, %xmm7, %xmm7
+ 	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
+-	jg	L(cross_page)
+-	/* Start comparing 4 vectors.  */
+-	vmovdqu	(%rdi), %ymm1
+-	VPCMPEQ	(%rsi), %ymm1, %ymm0
+-	VPMINU	%ymm1, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm0, %ymm0
+-	vpmovmskb %ymm0, %ecx
+-	testl	%ecx, %ecx
+-	je	L(next_3_vectors)
+-	tzcntl	%ecx, %edx
++	sall	$20, %eax
++	/* Check if s1 or s2 may cross a page  in next 4x VEC loads.  */
++	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
++	ja	L(page_cross)
++
++L(no_page_cross):
++	/* Safe to compare 4x vectors.  */
++	VMOVU	(%rdi), %ymm0
++	/* 1s where s1 and s2 equal.  */
++	VPCMPEQ	(%rsi), %ymm0, %ymm1
++	/* 1s at null CHAR.  */
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	/* 1s where s1 and s2 equal AND not null CHAR.  */
++	vpandn	%ymm1, %ymm2, %ymm1
++
++	/* All 1s -> keep going, any 0s -> return.  */
++	vpmovmskb %ymm1, %ecx
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx) is after the maximum
+-	   offset (%r11).   */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	$VEC_SIZE, %rdx
++	jbe	L(vec_0_test_len)
+ # endif
++
++	/* All 1s represents all equals. incl will overflow to zero in
++	   all equals case. Otherwise 1s will carry until position of first
++	   mismatch.  */
++	incl	%ecx
++	jz	L(more_3x_vec)
++
++	.p2align 4,, 4
++L(return_vec_0):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
++	movl	(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	je	L(return)
+-L(wcscmp_return):
++	cmpl	(%rsi, %rcx), %edx
++	je	L(ret0)
+ 	setl	%al
+ 	negl	%eax
+ 	orl	$1, %eax
+-L(return):
+ # else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ # endif
++L(ret0):
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+-	.p2align 4
+-L(return_vec_size):
+-	tzcntl	%ecx, %edx
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
+-	   the maximum offset (%r11).  */
+-	addq	$VEC_SIZE, %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
++	.p2align 4,, 8
++L(vec_0_test_len):
++	notl	%ecx
++	bzhil	%edx, %ecx, %eax
++	jnz	L(return_vec_0)
++	/* Align if will cross fetch block.  */
++	.p2align 4,, 2
++L(ret_zero):
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
++	VZEROUPPER_RETURN
++
++	.p2align 4,, 5
++L(one_or_less):
++	jb	L(ret_zero)
+ #  ifdef USE_AS_WCSCMP
++	/* 'nbe' covers the case where length is negative (large
++	   unsigned).  */
++	jnbe	__wcscmp_avx2
++	movl	(%rdi), %edx
+ 	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rdx), %ecx
+-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++	cmpl	(%rsi), %edx
++	je	L(ret1)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ #  else
+-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	/* 'nbe' covers the case where length is negative (large
++	   unsigned).  */
++
++	jnbe	__strcmp_avx2
++	movzbl	(%rdi), %eax
++	movzbl	(%rsi), %ecx
++	subl	%ecx, %eax
+ #  endif
++L(ret1):
++	ret
+ # endif
+-	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(return_2_vec_size):
+-	tzcntl	%ecx, %edx
++	.p2align 4,, 10
++L(return_vec_1):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
+-	   after the maximum offset (%r11).  */
+-	addq	$(VEC_SIZE * 2), %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
++	/* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of
++	   overflow.  */
++	addq	$-VEC_SIZE, %rdx
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero)
++# endif
++# ifdef USE_AS_WCSCMP
++	movl	VEC_SIZE(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	VEC_SIZE(%rsi, %rcx), %edx
++	je	L(ret2)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
++	movzbl	VEC_SIZE(%rdi, %rcx), %eax
++	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ # endif
++L(ret2):
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(return_3_vec_size):
+-	tzcntl	%ecx, %edx
++	.p2align 4,, 10
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
+-	   after the maximum offset (%r11).  */
+-	addq	$(VEC_SIZE * 3), %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
++L(return_vec_3):
++	salq	$32, %rcx
++# endif
++
++L(return_vec_2):
++# ifndef USE_AS_STRNCMP
++	tzcntl	%ecx, %ecx
++# else
++	tzcntq	%rcx, %rcx
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero)
++# endif
++
++# ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
++	je	L(ret3)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ # else
++	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++# endif
++L(ret3):
++	VZEROUPPER_RETURN
++
++# ifndef USE_AS_STRNCMP
++	.p2align 4,, 10
++L(return_vec_3):
++	tzcntl	%ecx, %ecx
+ #  ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
++	je	L(ret4)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ #  else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ #  endif
+-# endif
++L(ret4):
+ 	VZEROUPPER_RETURN
++# endif
++
++	.p2align 4,, 10
++L(more_3x_vec):
++	/* Safe to compare 4x vectors.  */
++	VMOVU	VEC_SIZE(%rdi), %ymm0
++	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_1)
++
++# ifdef USE_AS_STRNCMP
++	subq	$(VEC_SIZE * 2), %rdx
++	jbe	L(ret_zero)
++# endif
++
++	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
++	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_2)
++
++	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
++	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_3)
+ 
+-	.p2align 4
+-L(next_3_vectors):
+-	vmovdqu	VEC_SIZE(%rdi), %ymm6
+-	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
+-	VPMINU	%ymm6, %ymm3, %ymm3
+-	VPCMPEQ	%ymm7, %ymm3, %ymm3
+-	vpmovmskb %ymm3, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(return_vec_size)
+-	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
+-	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
+-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
+-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
+-	VPMINU	%ymm5, %ymm2, %ymm2
+-	VPCMPEQ	%ymm4, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm2, %ymm2
+-	vpmovmskb %ymm2, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(return_2_vec_size)
+-	VPMINU	%ymm4, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm0, %ymm0
+-	vpmovmskb %ymm0, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(return_3_vec_size)
+-L(main_loop_header):
+-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+-	movl	$PAGE_SIZE, %ecx
+-	/* Align load via RAX.  */
+-	andq	$-(VEC_SIZE * 4), %rdx
+-	subq	%rdi, %rdx
+-	leaq	(%rdi, %rdx), %rax
+ # ifdef USE_AS_STRNCMP
+-	/* Starting from this point, the maximum offset, or simply the
+-	   'offset', DECREASES by the same amount when base pointers are
+-	   moved forward.  Return 0 when:
+-	     1) On match: offset <= the matched vector index.
+-	     2) On mistmach, offset is before the mistmatched index.
++	cmpq	$(VEC_SIZE * 2), %rdx
++	jbe	L(ret_zero)
++# endif
++
++# ifdef USE_AS_WCSCMP
++	/* any non-zero positive value that doesn't inference with 0x1.
+ 	 */
+-	subq	%rdx, %r11
+-	jbe	L(zero)
+-# endif
+-	addq	%rsi, %rdx
+-	movq	%rdx, %rsi
+-	andl	$(PAGE_SIZE - 1), %esi
+-	/* Number of bytes before page crossing.  */
+-	subq	%rsi, %rcx
+-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
+-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
+-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
+-	movl	%ecx, %esi
+-	jmp	L(loop_start)
++	movl	$2, %r8d
+ 
++# else
++	xorl	%r8d, %r8d
++# endif
++
++	/* The prepare labels are various entry points from the page
++	   cross logic.  */
++L(prepare_loop):
++
++# ifdef USE_AS_STRNCMP
++	/* Store N + (VEC_SIZE * 4) and place check at the begining of
++	   the loop.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
++# endif
++L(prepare_loop_no_len):
++
++	/* Align s1 and adjust s2 accordingly.  */
++	subq	%rdi, %rsi
++	andq	$-(VEC_SIZE * 4), %rdi
++	addq	%rdi, %rsi
++
++# ifdef USE_AS_STRNCMP
++	subq	%rdi, %rdx
++# endif
++
++L(prepare_loop_aligned):
++	/* eax stores distance from rsi to next page cross. These cases
++	   need to be handled specially as the 4x loop could potentially
++	   read memory past the length of s1 or s2 and across a page
++	   boundary.  */
++	movl	$-(VEC_SIZE * 4), %eax
++	subl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++
++	/* Loop 4x comparisons at a time.  */
+ 	.p2align 4
+ L(loop):
++
++	/* End condition for strncmp.  */
+ # ifdef USE_AS_STRNCMP
+-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
+-	   the maximum offset (%r11) by the same amount.  */
+-	subq	$(VEC_SIZE * 4), %r11
+-	jbe	L(zero)
+-# endif
+-	addq	$(VEC_SIZE * 4), %rax
+-	addq	$(VEC_SIZE * 4), %rdx
+-L(loop_start):
+-	testl	%esi, %esi
+-	leal	-1(%esi), %esi
+-	je	L(loop_cross_page)
+-L(back_to_loop):
+-	/* Main loop, comparing 4 vectors are a time.  */
+-	vmovdqa	(%rax), %ymm0
+-	vmovdqa	VEC_SIZE(%rax), %ymm3
+-	VPCMPEQ	(%rdx), %ymm0, %ymm4
+-	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
+-	VPMINU	%ymm0, %ymm4, %ymm4
+-	VPMINU	%ymm3, %ymm1, %ymm1
+-	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
+-	VPMINU	%ymm1, %ymm4, %ymm0
+-	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
+-	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
+-	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
+-	VPMINU	%ymm2, %ymm5, %ymm5
+-	VPMINU	%ymm3, %ymm6, %ymm6
+-	VPMINU	%ymm5, %ymm0, %ymm0
+-	VPMINU	%ymm6, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm0, %ymm0
+-
+-	/* Test each mask (32 bits) individually because for VEC_SIZE
+-	   == 32 is not possible to OR the four masks and keep all bits
+-	   in a 64-bit integer register, differing from SSE2 strcmp
+-	   where ORing is possible.  */
+-	vpmovmskb %ymm0, %ecx
++	subq	$(VEC_SIZE * 4), %rdx
++	jbe	L(ret_zero)
++# endif
++
++	subq	$-(VEC_SIZE * 4), %rdi
++	subq	$-(VEC_SIZE * 4), %rsi
++
++	/* Check if rsi loads will cross a page boundary.  */
++	addl	$-(VEC_SIZE * 4), %eax
++	jnb	L(page_cross_during_loop)
++
++	/* Loop entry after handling page cross during loop.  */
++L(loop_skip_page_cross_check):
++	VMOVA	(VEC_SIZE * 0)(%rdi), %ymm0
++	VMOVA	(VEC_SIZE * 1)(%rdi), %ymm2
++	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
++	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
++
++	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
++	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
++
++	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
++	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
++	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
++
++
++	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
++	   zero.  */
++	vpand	%ymm0, %ymm1, %ymm1
++
++
++	vpand	%ymm2, %ymm3, %ymm3
++	vpand	%ymm4, %ymm5, %ymm5
++	vpand	%ymm6, %ymm7, %ymm7
++
++	VPMINU	%ymm1, %ymm3, %ymm3
++	VPMINU	%ymm5, %ymm7, %ymm7
++
++	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
++	VPMINU	%ymm3, %ymm7, %ymm7
++
++	/* If any 0 CHAR then done.  */
++	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
++	vpmovmskb %ymm7, %LOOP_REG
++	testl	%LOOP_REG, %LOOP_REG
++	jz	L(loop)
++
++	/* Find which VEC has the mismatch of end of string.  */
++	VPCMPEQ	%ymm1, %ymmZERO, %ymm1
++	vpmovmskb %ymm1, %ecx
+ 	testl	%ecx, %ecx
+-	je	L(loop)
+-	VPCMPEQ	%ymm7, %ymm4, %ymm0
+-	vpmovmskb %ymm0, %edi
+-	testl	%edi, %edi
+-	je	L(test_vec)
+-	tzcntl	%edi, %ecx
++	jnz	L(return_vec_0_end)
++
++
++	VPCMPEQ	%ymm3, %ymmZERO, %ymm3
++	vpmovmskb %ymm3, %ecx
++	testl	%ecx, %ecx
++	jnz	L(return_vec_1_end)
++
++L(return_vec_2_3_end):
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	subq	$(VEC_SIZE * 2), %rdx
++	jbe	L(ret_zero_end)
++# endif
++
++	VPCMPEQ	%ymm5, %ymmZERO, %ymm5
++	vpmovmskb %ymm5, %ecx
++	testl	%ecx, %ecx
++	jnz	L(return_vec_2_end)
++
++	/* LOOP_REG contains matches for null/mismatch from the loop. If
++	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
++	   must entirely be from VEC 3 which is fully represented by
++	   LOOP_REG.  */
++	tzcntl	%LOOP_REG, %LOOP_REG
++
++# ifdef USE_AS_STRNCMP
++	subl	$-(VEC_SIZE), %LOOP_REG
++	cmpq	%LOOP_REG64, %rdx
++	jbe	L(ret_zero_end)
++# endif
++
++# ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
++	je	L(ret5)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
++	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret5):
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(test_vec):
+ # ifdef USE_AS_STRNCMP
+-	/* The first vector matched.  Return 0 if the maximum offset
+-	   (%r11) <= VEC_SIZE.  */
+-	cmpq	$VEC_SIZE, %r11
+-	jbe	L(zero)
++	.p2align 4,, 2
++L(ret_zero_end):
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
+ # endif
+-	VPCMPEQ	%ymm7, %ymm1, %ymm1
+-	vpmovmskb %ymm1, %ecx
+-	testl	%ecx, %ecx
+-	je	L(test_2_vec)
+-	tzcntl	%ecx, %edi
++
++
++	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
++	   they use the value of `r8` to negate the return value. This is
++	   because the page cross logic can swap `rdi` and `rsi`.  */
++	.p2align 4,, 10
+ # ifdef USE_AS_STRNCMP
+-	addq	$VEC_SIZE, %rdi
+-	cmpq	%rdi, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++L(return_vec_1_end):
++	salq	$32, %rcx
++# endif
++L(return_vec_0_end):
++# ifndef USE_AS_STRNCMP
++	tzcntl	%ecx, %ecx
++# else
++	tzcntq	%rcx, %rcx
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero_end)
++# endif
++
++# ifdef USE_AS_WCSCMP
++	movl	(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rdi), %ecx
+-	cmpl	(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rdi), %eax
+-	movzbl	(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	(%rsi, %rcx), %edx
++	je	L(ret6)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ # else
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
++# endif
++L(ret6):
++	VZEROUPPER_RETURN
++
++# ifndef USE_AS_STRNCMP
++	.p2align 4,, 10
++L(return_vec_1_end):
++	tzcntl	%ecx, %ecx
+ #  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	movl	VEC_SIZE(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rsi, %rdi), %ecx
+-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
++	cmpl	VEC_SIZE(%rsi, %rcx), %edx
++	je	L(ret7)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ #  else
+-	movzbl	VEC_SIZE(%rax, %rdi), %eax
+-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
+-	subl	%edx, %eax
++	movzbl	VEC_SIZE(%rdi, %rcx), %eax
++	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ #  endif
+-# endif
++L(ret7):
+ 	VZEROUPPER_RETURN
++# endif
+ 
+-	.p2align 4
+-L(test_2_vec):
++	.p2align 4,, 10
++L(return_vec_2_end):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_STRNCMP
+-	/* The first 2 vectors matched.  Return 0 if the maximum offset
+-	   (%r11) <= 2 * VEC_SIZE.  */
+-	cmpq	$(VEC_SIZE * 2), %r11
+-	jbe	L(zero)
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero_page_cross)
+ # endif
+-	VPCMPEQ	%ymm7, %ymm5, %ymm5
+-	vpmovmskb %ymm5, %ecx
+-	testl	%ecx, %ecx
+-	je	L(test_3_vec)
+-	tzcntl	%ecx, %edi
+-# ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 2), %rdi
+-	cmpq	%rdi, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++# ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rdi), %ecx
+-	cmpl	(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rdi), %eax
+-	movzbl	(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
++	je	L(ret11)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
+-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
+-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
++	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret11):
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(test_3_vec):
++
++	/* Page cross in rsi in next 4x VEC.  */
++
++	/* TODO: Improve logic here.  */
++	.p2align 4,, 10
++L(page_cross_during_loop):
++	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
++
++	/* Optimistically rsi and rdi and both aligned inwhich case we
++	   don't need any logic here.  */
++	cmpl	$-(VEC_SIZE * 4), %eax
++	/* Don't adjust eax before jumping back to loop and we will
++	   never hit page cross case again.  */
++	je	L(loop_skip_page_cross_check)
++
++	/* Check if we can safely load a VEC.  */
++	cmpl	$-(VEC_SIZE * 3), %eax
++	jle	L(less_1x_vec_till_page_cross)
++
++	VMOVA	(%rdi), %ymm0
++	VPCMPEQ	(%rsi), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_0_end)
++
++	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
++	cmpl	$-(VEC_SIZE * 2), %eax
++	jg	L(more_2x_vec_till_page_cross)
++
++	.p2align 4,, 4
++L(less_1x_vec_till_page_cross):
++	subl	$-(VEC_SIZE * 4), %eax
++	/* Guranteed safe to read from rdi - VEC_SIZE here. The only
++	   concerning case is first iteration if incoming s1 was near start
++	   of a page and s2 near end. If s1 was near the start of the page
++	   we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
++	   to read back -VEC_SIZE. If rdi is truly at the start of a page
++	   here, it means the previous page (rdi - VEC_SIZE) has already
++	   been loaded earlier so must be valid.  */
++	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
++	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++
++	/* Mask of potentially valid bits. The lower bits can be out of
++	   range comparisons (but safe regarding page crosses).  */
++	movl	$-1, %r10d
++	shlxl	%esi, %r10d, %r10d
++	notl	%ecx
++
+ # ifdef USE_AS_STRNCMP
+-	/* The first 3 vectors matched.  Return 0 if the maximum offset
+-	   (%r11) <= 3 * VEC_SIZE.  */
+-	cmpq	$(VEC_SIZE * 3), %r11
+-	jbe	L(zero)
+-# endif
+-	VPCMPEQ	%ymm7, %ymm6, %ymm6
+-	vpmovmskb %ymm6, %esi
+-	tzcntl	%esi, %ecx
++	cmpq	%rax, %rdx
++	jbe	L(return_page_cross_end_check)
++# endif
++	movl	%eax, %OFFSET_REG
++	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
++
++	andl	%r10d, %ecx
++	jz	L(loop_skip_page_cross_check)
++
++	.p2align 4,, 3
++L(return_page_cross_end):
++	tzcntl	%ecx, %ecx
++
+ # ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 3), %rcx
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %esi
+-	cmpl	(%rdx, %rcx), %esi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	leal	-VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
++L(return_page_cross_cmp_mem):
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	addl	%OFFSET_REG, %ecx
++# endif
++# ifdef USE_AS_WCSCMP
++	movl	VEC_OFFSET(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
+-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
++	je	L(ret8)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
++# else
++	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
++	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret8):
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(loop_cross_page):
+-	xorl	%r10d, %r10d
+-	movq	%rdx, %rcx
+-	/* Align load via RDX.  We load the extra ECX bytes which should
+-	   be ignored.  */
+-	andl	$((VEC_SIZE * 4) - 1), %ecx
+-	/* R10 is -RCX.  */
+-	subq	%rcx, %r10
+-
+-	/* This works only if VEC_SIZE * 2 == 64. */
+-# if (VEC_SIZE * 2) != 64
+-#  error (VEC_SIZE * 2) != 64
+-# endif
+-
+-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
+-	cmpl	$(VEC_SIZE * 2), %ecx
+-	jge	L(loop_cross_page_2_vec)
+-
+-	vmovdqu	(%rax, %r10), %ymm2
+-	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
+-	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
+-	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
+-	VPMINU	%ymm2, %ymm0, %ymm0
+-	VPMINU	%ymm3, %ymm1, %ymm1
+-	VPCMPEQ	%ymm7, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm1, %ymm1
+-
+-	vpmovmskb %ymm0, %edi
+-	vpmovmskb %ymm1, %esi
+-
+-	salq	$32, %rsi
+-	xorq	%rsi, %rdi
+-
+-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
+-	shrq	%cl, %rdi
+-
+-	testq	%rdi, %rdi
+-	je	L(loop_cross_page_2_vec)
+-	tzcntq	%rdi, %rcx
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	.p2align 4,, 10
++L(return_page_cross_end_check):
++	tzcntl	%ecx, %ecx
++	leal	-VEC_SIZE(%rax, %rcx), %ecx
++	cmpl	%ecx, %edx
++	ja	L(return_page_cross_cmp_mem)
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# endif
+ 	VZEROUPPER_RETURN
++# endif
+ 
+-	.p2align 4
+-L(loop_cross_page_2_vec):
+-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
+-	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
+-	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
+-	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
+-	VPMINU	%ymm2, %ymm5, %ymm5
+-	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
+-	VPCMPEQ	%ymm7, %ymm5, %ymm5
+-	VPMINU	%ymm3, %ymm6, %ymm6
+-	VPCMPEQ	%ymm7, %ymm6, %ymm6
+-
+-	vpmovmskb %ymm5, %edi
+-	vpmovmskb %ymm6, %esi
+-
+-	salq	$32, %rsi
+-	xorq	%rsi, %rdi
+ 
+-	xorl	%r8d, %r8d
+-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
+-	subl	$(VEC_SIZE * 2), %ecx
+-	jle	1f
+-	/* Skip ECX bytes.  */
+-	shrq	%cl, %rdi
+-	/* R8 has number of bytes skipped.  */
+-	movl	%ecx, %r8d
+-1:
+-	/* Before jumping back to the loop, set ESI to the number of
+-	   VEC_SIZE * 4 blocks before page crossing.  */
+-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
+-
+-	testq	%rdi, %rdi
++	.p2align 4,, 10
++L(more_2x_vec_till_page_cross):
++	/* If more 2x vec till cross we will complete a full loop
++	   iteration here.  */
++
++	VMOVU	VEC_SIZE(%rdi), %ymm0
++	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_1_end)
++
+ # ifdef USE_AS_STRNCMP
+-	/* At this point, if %rdi value is 0, it already tested
+-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
+-	   checks whether strncmp maximum offset reached or not.  */
+-	je	L(string_nbyte_offset_check)
+-# else
+-	je	L(back_to_loop)
++	cmpq	$(VEC_SIZE * 2), %rdx
++	jbe	L(ret_zero_in_loop_page_cross)
+ # endif
+-	tzcntq	%rdi, %rcx
+-	addq	%r10, %rcx
+-	/* Adjust for number of bytes skipped.  */
+-	addq	%r8, %rcx
++
++	subl	$-(VEC_SIZE * 4), %eax
++
++	/* Safe to include comparisons from lower bytes.  */
++	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
++	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_page_cross_0)
++
++	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
++	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++	jnz	L(return_vec_page_cross_1)
++
+ # ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 2), %rcx
+-	subq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	/* Must check length here as length might proclude reading next
++	   page.  */
++	cmpq	%rax, %rdx
++	jbe	L(ret_zero_in_loop_page_cross)
++# endif
++
++	/* Finish the loop.  */
++	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
++	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
++
++	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
++	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
++	vpand	%ymm4, %ymm5, %ymm5
++	vpand	%ymm6, %ymm7, %ymm7
++	VPMINU	%ymm5, %ymm7, %ymm7
++	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
++	vpmovmskb %ymm7, %LOOP_REG
++	testl	%LOOP_REG, %LOOP_REG
++	jnz	L(return_vec_2_3_end)
++
++	/* Best for code size to include ucond-jmp here. Would be faster
++	   if this case is hot to duplicate the L(return_vec_2_3_end) code
++	   as fall-through and have jump back to loop on mismatch
++	   comparison.  */
++	subq	$-(VEC_SIZE * 4), %rdi
++	subq	$-(VEC_SIZE * 4), %rsi
++	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
++# ifdef USE_AS_STRNCMP
++	subq	$(VEC_SIZE * 4), %rdx
++	ja	L(loop_skip_page_cross_check)
++L(ret_zero_in_loop_page_cross):
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	VZEROUPPER_RETURN
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
+-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	jmp	L(loop_skip_page_cross_check)
+ # endif
+-	VZEROUPPER_RETURN
+ 
++
++	.p2align 4,, 10
++L(return_vec_page_cross_0):
++	addl	$-VEC_SIZE, %eax
++L(return_vec_page_cross_1):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_STRNCMP
+-L(string_nbyte_offset_check):
+-	leaq	(VEC_SIZE * 4)(%r10), %r10
+-	cmpq	%r10, %r11
+-	jbe	L(zero)
+-	jmp	L(back_to_loop)
++	leal	-VEC_SIZE(%rax, %rcx), %ecx
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero_in_loop_page_cross)
++# else
++	addl	%eax, %ecx
+ # endif
+ 
+-	.p2align 4
+-L(cross_page_loop):
+-	/* Check one byte/dword at a time.  */
+ # ifdef USE_AS_WCSCMP
+-	cmpl	%ecx, %eax
++	movl	VEC_OFFSET(%rdi, %rcx), %edx
++	xorl	%eax, %eax
++	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
++	je	L(ret9)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ # else
++	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
++	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+ 	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
+-	jne	L(different)
+-	addl	$SIZE_OF_CHAR, %edx
+-	cmpl	$(VEC_SIZE * 4), %edx
+-	je	L(main_loop_header)
+-# ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++L(ret9):
++	VZEROUPPER_RETURN
++
++
++	.p2align 4,, 10
++L(page_cross):
++# ifndef USE_AS_STRNCMP
++	/* If both are VEC aligned we don't need any special logic here.
++	   Only valid for strcmp where stop condition is guranteed to be
++	   reachable by just reading memory.  */
++	testl	$((VEC_SIZE - 1) << 20), %eax
++	jz	L(no_page_cross)
+ # endif
++
++	movl	%edi, %eax
++	movl	%esi, %ecx
++	andl	$(PAGE_SIZE - 1), %eax
++	andl	$(PAGE_SIZE - 1), %ecx
++
++	xorl	%OFFSET_REG, %OFFSET_REG
++
++	/* Check which is closer to page cross, s1 or s2.  */
++	cmpl	%eax, %ecx
++	jg	L(page_cross_s2)
++
++	/* The previous page cross check has false positives. Check for
++	   true positive as page cross logic is very expensive.  */
++	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
++	jbe	L(no_page_cross)
++
++	/* Set r8 to not interfere with normal return value (rdi and rsi
++	   did not swap).  */
+ # ifdef USE_AS_WCSCMP
+-	movl	(%rdi, %rdx), %eax
+-	movl	(%rsi, %rdx), %ecx
++	/* any non-zero positive value that doesn't inference with 0x1.
++	 */
++	movl	$2, %r8d
+ # else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %ecx
++	xorl	%r8d, %r8d
+ # endif
+-	/* Check null char.  */
+-	testl	%eax, %eax
+-	jne	L(cross_page_loop)
+-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
+-	   comparisons.  */
+-	subl	%ecx, %eax
+-# ifndef USE_AS_WCSCMP
+-L(different):
++
++	/* Check if less than 1x VEC till page cross.  */
++	subl	$(VEC_SIZE * 3), %eax
++	jg	L(less_1x_vec_till_page)
++
++	/* If more than 1x VEC till page cross, loop throuh safely
++	   loadable memory until within 1x VEC of page cross.  */
++
++	.p2align 4,, 10
++L(page_cross_loop):
++
++	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
++	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++	incl	%ecx
++
++	jnz	L(check_ret_vec_page_cross)
++	addl	$VEC_SIZE, %OFFSET_REG
++# ifdef USE_AS_STRNCMP
++	cmpq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross)
+ # endif
+-	VZEROUPPER_RETURN
++	addl	$VEC_SIZE, %eax
++	jl	L(page_cross_loop)
++
++	subl	%eax, %OFFSET_REG
++	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
++	   to not cross page so is safe to load. Since we have already
++	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
++	 */
++
++	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
++	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
++	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
++	vpandn	%ymm1, %ymm2, %ymm1
++	vpmovmskb %ymm1, %ecx
++
++# ifdef USE_AS_STRNCMP
++	leal	VEC_SIZE(%OFFSET_REG64), %eax
++	cmpq	%rax, %rdx
++	jbe	L(check_ret_vec_page_cross2)
++	addq	%rdi, %rdx
++# endif
++	incl	%ecx
++	jz	L(prepare_loop_no_len)
+ 
++	.p2align 4,, 4
++L(ret_vec_page_cross):
++# ifndef USE_AS_STRNCMP
++L(check_ret_vec_page_cross):
++# endif
++	tzcntl	%ecx, %ecx
++	addl	%OFFSET_REG, %ecx
++L(ret_vec_page_cross_cont):
+ # ifdef USE_AS_WCSCMP
+-	.p2align 4
+-L(different):
+-	/* Use movl to avoid modifying EFLAGS.  */
+-	movl	$0, %eax
++	movl	(%rdi, %rcx), %edx
++	xorl	%eax, %eax
++	cmpl	(%rsi, %rcx), %edx
++	je	L(ret12)
+ 	setl	%al
+ 	negl	%eax
+-	orl	$1, %eax
+-	VZEROUPPER_RETURN
++	xorl	%r8d, %eax
++# else
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret12):
++	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_STRNCMP
+-	.p2align 4
+-L(zero):
++	.p2align 4,, 10
++L(check_ret_vec_page_cross2):
++	incl	%ecx
++L(check_ret_vec_page_cross):
++	tzcntl	%ecx, %ecx
++	addl	%OFFSET_REG, %ecx
++	cmpq	%rcx, %rdx
++	ja	L(ret_vec_page_cross_cont)
++	.p2align 4,, 2
++L(ret_zero_page_cross):
+ 	xorl	%eax, %eax
+ 	VZEROUPPER_RETURN
++# endif
+ 
+-	.p2align 4
+-L(char0):
+-#  ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi), %ecx
+-	cmpl	(%rsi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
+-#  endif
+-	VZEROUPPER_RETURN
++	.p2align 4,, 4
++L(page_cross_s2):
++	/* Ensure this is a true page cross.  */
++	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
++	jbe	L(no_page_cross)
++
++
++	movl	%ecx, %eax
++	movq	%rdi, %rcx
++	movq	%rsi, %rdi
++	movq	%rcx, %rsi
++
++	/* set r8 to negate return value as rdi and rsi swapped.  */
++# ifdef USE_AS_WCSCMP
++	movl	$-4, %r8d
++# else
++	movl	$-1, %r8d
+ # endif
++	xorl	%OFFSET_REG, %OFFSET_REG
+ 
+-	.p2align 4
+-L(last_vector):
+-	addq	%rdx, %rdi
+-	addq	%rdx, %rsi
++	/* Check if more than 1x VEC till page cross.  */
++	subl	$(VEC_SIZE * 3), %eax
++	jle	L(page_cross_loop)
++
++	.p2align 4,, 6
++L(less_1x_vec_till_page):
++	/* Find largest load size we can use.  */
++	cmpl	$16, %eax
++	ja	L(less_16_till_page)
++
++	VMOVU	(%rdi), %xmm0
++	VPCMPEQ	(%rsi), %xmm0, %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	incw	%cx
++	jnz	L(check_ret_vec_page_cross)
++	movl	$16, %OFFSET_REG
+ # ifdef USE_AS_STRNCMP
+-	subq	%rdx, %r11
++	cmpq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subl	%eax, %OFFSET_REG
++# else
++	/* Explicit check for 16 byte alignment.  */
++	subl	%eax, %OFFSET_REG
++	jz	L(prepare_loop)
+ # endif
+-	tzcntl	%ecx, %edx
++
++	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
++	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	incw	%cx
++	jnz	L(check_ret_vec_page_cross)
++
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	addl	$16, %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subq	$-(VEC_SIZE * 4), %rdx
++
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
++# else
++	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+ # endif
+-# ifdef USE_AS_WCSCMP
++	jmp	L(prepare_loop_aligned)
++
++# ifdef USE_AS_STRNCMP
++	.p2align 4,, 2
++L(ret_zero_page_cross_slow_case0):
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-# else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	ret
+ # endif
+-	VZEROUPPER_RETURN
+ 
+-	/* Comparing on page boundary region requires special treatment:
+-	   It must done one vector at the time, starting with the wider
+-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
+-	   (xmm) still passes the boundary, byte comparison must be done.
+-	 */
+-	.p2align 4
+-L(cross_page):
+-	/* Try one ymm vector at a time.  */
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(cross_page_1_vector)
+-L(loop_1_vector):
+-	vmovdqu	(%rdi, %rdx), %ymm1
+-	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
+-	VPMINU	%ymm1, %ymm0, %ymm0
+-	VPCMPEQ	%ymm7, %ymm0, %ymm0
+-	vpmovmskb %ymm0, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(last_vector)
+ 
+-	addl	$VEC_SIZE, %edx
++	.p2align 4,, 10
++L(less_16_till_page):
++	/* Find largest load size we can use.  */
++	cmpl	$24, %eax
++	ja	L(less_8_till_page)
+ 
+-	addl	$VEC_SIZE, %eax
+-# ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-# endif
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jle	L(loop_1_vector)
+-L(cross_page_1_vector):
+-	/* Less than 32 bytes to check, try one xmm vector.  */
+-	cmpl	$(PAGE_SIZE - 16), %eax
+-	jg	L(cross_page_1_xmm)
+-	vmovdqu	(%rdi, %rdx), %xmm1
+-	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
+-	VPMINU	%xmm1, %xmm0, %xmm0
+-	VPCMPEQ	%xmm7, %xmm0, %xmm0
+-	vpmovmskb %xmm0, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(last_vector)
++	vmovq	(%rdi), %xmm0
++	vmovq	(%rsi), %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	incb	%cl
++	jnz	L(check_ret_vec_page_cross)
+ 
+-	addl	$16, %edx
+-# ifndef USE_AS_WCSCMP
+-	addl	$16, %eax
++
++# ifdef USE_AS_STRNCMP
++	cmpq	$8, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
+ # endif
++	movl	$24, %OFFSET_REG
++	/* Explicit check for 16 byte alignment.  */
++	subl	%eax, %OFFSET_REG
++
++
++
++	vmovq	(%rdi, %OFFSET_REG64), %xmm0
++	vmovq	(%rsi, %OFFSET_REG64), %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	incb	%cl
++	jnz	L(check_ret_vec_page_cross)
++
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-# endif
+-
+-L(cross_page_1_xmm):
+-# ifndef USE_AS_WCSCMP
+-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
+-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
+-	cmpl	$(PAGE_SIZE - 8), %eax
+-	jg	L(cross_page_8bytes)
+-	vmovq	(%rdi, %rdx), %xmm1
+-	vmovq	(%rsi, %rdx), %xmm0
+-	VPCMPEQ	%xmm0, %xmm1, %xmm0
+-	VPMINU	%xmm1, %xmm0, %xmm0
+-	VPCMPEQ	%xmm7, %xmm0, %xmm0
+-	vpmovmskb %xmm0, %ecx
+-	/* Only last 8 bits are valid.  */
+-	andl	$0xff, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(last_vector)
++	addl	$8, %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subq	$-(VEC_SIZE * 4), %rdx
+ 
+-	addl	$8, %edx
+-	addl	$8, %eax
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
++# else
++	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
++# endif
++	jmp	L(prepare_loop_aligned)
++
++
++	.p2align 4,, 10
++L(less_8_till_page):
++# ifdef USE_AS_WCSCMP
++	/* If using wchar then this is the only check before we reach
++	   the page boundary.  */
++	movl	(%rdi), %eax
++	movl	(%rsi), %ecx
++	cmpl	%ecx, %eax
++	jnz	L(ret_less_8_wcs)
+ #  ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	addq	%rdi, %rdx
++	/* We already checked for len <= 1 so cannot hit that case here.
++	 */
+ #  endif
++	testl	%eax, %eax
++	jnz	L(prepare_loop_no_len)
++	ret
+ 
+-L(cross_page_8bytes):
+-	/* Less than 8 bytes to check, try 4 byte vector.  */
+-	cmpl	$(PAGE_SIZE - 4), %eax
+-	jg	L(cross_page_4bytes)
+-	vmovd	(%rdi, %rdx), %xmm1
+-	vmovd	(%rsi, %rdx), %xmm0
+-	VPCMPEQ	%xmm0, %xmm1, %xmm0
+-	VPMINU	%xmm1, %xmm0, %xmm0
+-	VPCMPEQ	%xmm7, %xmm0, %xmm0
+-	vpmovmskb %xmm0, %ecx
+-	/* Only last 4 bits are valid.  */
+-	andl	$0xf, %ecx
+-	testl	%ecx, %ecx
+-	jne	L(last_vector)
++	.p2align 4,, 8
++L(ret_less_8_wcs):
++	setl	%OFFSET_REG8
++	negl	%OFFSET_REG
++	movl	%OFFSET_REG, %eax
++	xorl	%r8d, %eax
++	ret
++
++# else
++
++	/* Find largest load size we can use.  */
++	cmpl	$28, %eax
++	ja	L(less_4_till_page)
++
++	vmovd	(%rdi), %xmm0
++	vmovd	(%rsi), %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	subl	$0xf, %ecx
++	jnz	L(check_ret_vec_page_cross)
+ 
+-	addl	$4, %edx
+ #  ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	$4, %rdx
++	jbe	L(ret_zero_page_cross_slow_case1)
+ #  endif
++	movl	$28, %OFFSET_REG
++	/* Explicit check for 16 byte alignment.  */
++	subl	%eax, %OFFSET_REG
+ 
+-L(cross_page_4bytes):
+-# endif
+-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
+-# ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-# endif
+-# ifdef USE_AS_WCSCMP
+-	movl	(%rdi, %rdx), %eax
+-	movl	(%rsi, %rdx), %ecx
+-# else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %ecx
+-# endif
+-	testl	%eax, %eax
+-	jne	L(cross_page_loop)
++
++
++	vmovd	(%rdi, %OFFSET_REG64), %xmm0
++	vmovd	(%rsi, %OFFSET_REG64), %xmm1
++	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
++	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	vpandn	%xmm1, %xmm2, %xmm1
++	vpmovmskb %ymm1, %ecx
++	subl	$0xf, %ecx
++	jnz	L(check_ret_vec_page_cross)
++
++#  ifdef USE_AS_STRNCMP
++	addl	$4, %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case1)
++	subq	$-(VEC_SIZE * 4), %rdx
++
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
++#  else
++	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
++	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
++#  endif
++	jmp	L(prepare_loop_aligned)
++
++#  ifdef USE_AS_STRNCMP
++	.p2align 4,, 2
++L(ret_zero_page_cross_slow_case1):
++	xorl	%eax, %eax
++	ret
++#  endif
++
++	.p2align 4,, 10
++L(less_4_till_page):
++	subq	%rdi, %rsi
++	/* Extremely slow byte comparison loop.  */
++L(less_4_loop):
++	movzbl	(%rdi), %eax
++	movzbl	(%rsi, %rdi), %ecx
+ 	subl	%ecx, %eax
+-	VZEROUPPER_RETURN
+-END (STRCMP)
++	jnz	L(ret_less_4_loop)
++	testl	%ecx, %ecx
++	jz	L(ret_zero_4_loop)
++#  ifdef USE_AS_STRNCMP
++	decq	%rdx
++	jz	L(ret_zero_4_loop)
++#  endif
++	incq	%rdi
++	/* end condition is reach page boundary (rdi is aligned).  */
++	testl	$31, %edi
++	jnz	L(less_4_loop)
++	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
++	addq	$-(VEC_SIZE * 4), %rdi
++#  ifdef USE_AS_STRNCMP
++	subq	$-(VEC_SIZE * 4), %rdx
++#  endif
++	jmp	L(prepare_loop_aligned)
++
++L(ret_zero_4_loop):
++	xorl	%eax, %eax
++	ret
++L(ret_less_4_loop):
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
++	ret
++# endif
++END(STRCMP)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-75.patch b/glibc-RHEL-15696-75.patch
new file mode 100644
index 0000000..4bd0cd4
--- /dev/null
+++ b/glibc-RHEL-15696-75.patch
@@ -0,0 +1,1992 @@
+From 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 10 Jan 2022 15:35:39 -0600
+Subject: [PATCH] x86: Optimize strcmp-evex.S
+Content-type: text/plain; charset=UTF-8
+
+Optimizations are primarily to the loop logic and how the page cross
+logic interacts with the loop.
+
+The page cross logic is at times more expensive for short strings near
+the end of a page but not crossing the page. This is done to retest
+the page cross conditions with a non-faulty check and to improve the
+logic for entering the loop afterwards. This is only in particular cases,
+however, and is generally made up for by more than 10x improvements on
+the transition from the page cross -> loop case.
+
+The non-page cross cases as well are nearly universally improved.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 1712 +++++++++++++-----------
+ 1 file changed, 919 insertions(+), 793 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 6f5c4bf9..99d8409a 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -26,54 +26,69 @@
+ 
+ # define PAGE_SIZE	4096
+ 
+-/* VEC_SIZE = Number of bytes in a ymm register */
++	/* VEC_SIZE = Number of bytes in a ymm register.  */
+ # define VEC_SIZE	32
++# define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR)
+ 
+-/* Shift for dividing by (VEC_SIZE * 4).  */
+-# define DIVIDE_BY_VEC_4_SHIFT	7
+-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+-# endif
+-
+-# define VMOVU		vmovdqu64
+-# define VMOVA		vmovdqa64
++# define VMOVU	vmovdqu64
++# define VMOVA	vmovdqa64
+ 
+ # ifdef USE_AS_WCSCMP
+-/* Compare packed dwords.  */
+-#  define VPCMP		vpcmpd
++#  define TESTEQ	subl	$0xff,
++	/* Compare packed dwords.  */
++#  define VPCMP	vpcmpd
+ #  define VPMINU	vpminud
+ #  define VPTESTM	vptestmd
+-#  define SHIFT_REG32	r8d
+-#  define SHIFT_REG64	r8
+-/* 1 dword char == 4 bytes.  */
++	/* 1 dword char == 4 bytes.  */
+ #  define SIZE_OF_CHAR	4
+ # else
+-/* Compare packed bytes.  */
+-#  define VPCMP		vpcmpb
++#  define TESTEQ	incl
++	/* Compare packed bytes.  */
++#  define VPCMP	vpcmpb
+ #  define VPMINU	vpminub
+ #  define VPTESTM	vptestmb
+-#  define SHIFT_REG32	ecx
+-#  define SHIFT_REG64	rcx
+-/* 1 byte char == 1 byte.  */
++	/* 1 byte char == 1 byte.  */
+ #  define SIZE_OF_CHAR	1
+ # endif
+ 
++# ifdef USE_AS_STRNCMP
++#  define LOOP_REG	r9d
++#  define LOOP_REG64	r9
++
++#  define OFFSET_REG8	r9b
++#  define OFFSET_REG	r9d
++#  define OFFSET_REG64	r9
++# else
++#  define LOOP_REG	edx
++#  define LOOP_REG64	rdx
++
++#  define OFFSET_REG8	dl
++#  define OFFSET_REG	edx
++#  define OFFSET_REG64	rdx
++# endif
++
++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
++#  define VEC_OFFSET	0
++# else
++#  define VEC_OFFSET	(-VEC_SIZE)
++# endif
++
+ # define XMMZERO	xmm16
+-# define XMM0		xmm17
+-# define XMM1		xmm18
++# define XMM0	xmm17
++# define XMM1	xmm18
+ 
+ # define YMMZERO	ymm16
+-# define YMM0		ymm17
+-# define YMM1		ymm18
+-# define YMM2		ymm19
+-# define YMM3		ymm20
+-# define YMM4		ymm21
+-# define YMM5		ymm22
+-# define YMM6		ymm23
+-# define YMM7		ymm24
+-# define YMM8		ymm25
+-# define YMM9		ymm26
+-# define YMM10		ymm27
++# define YMM0	ymm17
++# define YMM1	ymm18
++# define YMM2	ymm19
++# define YMM3	ymm20
++# define YMM4	ymm21
++# define YMM5	ymm22
++# define YMM6	ymm23
++# define YMM7	ymm24
++# define YMM8	ymm25
++# define YMM9	ymm26
++# define YMM10	ymm27
+ 
+ /* Warning!
+            wcscmp/wcsncmp have to use SIGNED comparison for elements.
+@@ -96,985 +111,1096 @@
+    the maximum offset is reached before a difference is found, zero is
+    returned.  */
+ 
+-	.section .text.evex,"ax",@progbits
+-ENTRY (STRCMP)
++	.section .text.evex, "ax", @progbits
++ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+-	/* Check for simple cases (0 or 1) in offset.  */
+-	cmp	$1, %RDX_LP
+-	je	L(char0)
+-	jb	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-#  ifndef __ILP32__
+-	movq	%rdx, %rcx
+-	/* Check if length could overflow when multiplied by
+-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
+-	   overflow cases as well as redirect cases where its impossible to
+-	   length to bound a valid memory region. In these cases just use
+-	   'wcscmp'.  */
+-	shrq	$56, %rcx
+-	jnz	__wcscmp_evex
+-#  endif
+-	/* Convert units: from wide to byte char.  */
+-	shl	$2, %RDX_LP
++#  ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %rdx
+ #  endif
+-	/* Register %r11 tracks the maximum offset.  */
+-	mov	%RDX_LP, %R11_LP
++	cmp	$1, %RDX_LP
++	/* Signed comparison intentional. We use this branch to also
++	   test cases where length >= 2^63. These very large sizes can be
++	   handled with strcmp as there is no way for that length to
++	   actually bound the buffer.  */
++	jle	L(one_or_less)
+ # endif
+ 	movl	%edi, %eax
+-	xorl	%edx, %edx
+-	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
+-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+ 	orl	%esi, %eax
+-	andl	$(PAGE_SIZE - 1), %eax
+-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
+-	jg	L(cross_page)
+-	/* Start comparing 4 vectors.  */
++	/* Shift out the bits irrelivant to page boundary ([63:12]).  */
++	sall	$20, %eax
++	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
++	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
++	ja	L(page_cross)
++
++L(no_page_cross):
++	/* Safe to compare 4x vectors.  */
+ 	VMOVU	(%rdi), %YMM0
+-
+-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-
+ 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+ 	   in YMM0 and 32 bytes at (%rsi).  */
+ 	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+-
+ 	kmovd	%k1, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
+-# endif
+-	je	L(next_3_vectors)
+-	tzcntl	%ecx, %edx
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edx
+-# endif
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx) is after the maximum
+-	   offset (%r11).   */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	$CHAR_PER_VEC, %rdx
++	jbe	L(vec_0_test_len)
+ # endif
++
++	/* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
++	   wcscmp/wcsncmp.  */
++
++	/* All 1s represents all equals. TESTEQ will overflow to zero in
++	   all equals case. Otherwise 1s will carry until position of first
++	   mismatch.  */
++	TESTEQ	%ecx
++	jz	L(more_3x_vec)
++
++	.p2align 4,, 4
++L(return_vec_0):
++	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WCSCMP
++	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	je	L(return)
+-L(wcscmp_return):
++	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret0)
+ 	setl	%al
+ 	negl	%eax
+ 	orl	$1, %eax
+-L(return):
+ # else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ # endif
++L(ret0):
+ 	ret
+ 
+-L(return_vec_size):
+-	tzcntl	%ecx, %edx
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edx
+-# endif
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
+-	   the maximum offset (%r11).  */
+-	addq	$VEC_SIZE, %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
++	.p2align 4,, 4
++L(vec_0_test_len):
++	notl	%ecx
++	bzhil	%edx, %ecx, %eax
++	jnz	L(return_vec_0)
++	/* Align if will cross fetch block.  */
++	.p2align 4,, 2
++L(ret_zero):
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
++	ret
++
++	.p2align 4,, 5
++L(one_or_less):
++	jb	L(ret_zero)
+ #  ifdef USE_AS_WCSCMP
++	/* 'nbe' covers the case where length is negative (large
++	   unsigned).  */
++	jnbe	__wcscmp_evex
++	movl	(%rdi), %edx
+ 	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rdx), %ecx
+-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++	cmpl	(%rsi), %edx
++	je	L(ret1)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ #  else
+-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	/* 'nbe' covers the case where length is negative (large
++	   unsigned).  */
++	jnbe	__strcmp_evex
++	movzbl	(%rdi), %eax
++	movzbl	(%rsi), %ecx
++	subl	%ecx, %eax
+ #  endif
+-# endif
++L(ret1):
+ 	ret
++# endif
+ 
+-L(return_2_vec_size):
+-	tzcntl	%ecx, %edx
++	.p2align 4,, 10
++L(return_vec_1):
++	tzcntl	%ecx, %ecx
++# ifdef USE_AS_STRNCMP
++	/* rdx must be > CHAR_PER_VEC so its safe to subtract without
++	   worrying about underflow.  */
++	addq	$-CHAR_PER_VEC, %rdx
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero)
++# endif
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edx
++	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
++	xorl	%eax, %eax
++	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret2)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
++# else
++	movzbl	VEC_SIZE(%rdi, %rcx), %eax
++	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ # endif
++L(ret2):
++	ret
++
++	.p2align 4,, 10
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
+-	   after the maximum offset (%r11).  */
+-	addq	$(VEC_SIZE * 2), %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++L(return_vec_3):
++#  if CHAR_PER_VEC <= 16
++	sall	$CHAR_PER_VEC, %ecx
+ #  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	salq	$CHAR_PER_VEC, %rcx
+ #  endif
++# endif
++L(return_vec_2):
++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
++	tzcntl	%ecx, %ecx
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
++	tzcntq	%rcx, %rcx
+ # endif
+-	ret
+ 
+-L(return_3_vec_size):
+-	tzcntl	%ecx, %edx
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edx
+-# endif
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
+-	   after the maximum offset (%r11).  */
+-	addq	$(VEC_SIZE * 3), %rdx
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-#  ifdef USE_AS_WCSCMP
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero)
++# endif
++
++# ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret3)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ # else
++	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++# endif
++L(ret3):
++	ret
++
++# ifndef USE_AS_STRNCMP
++	.p2align 4,, 10
++L(return_vec_3):
++	tzcntl	%ecx, %ecx
+ #  ifdef USE_AS_WCSCMP
++	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
+ 	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret4)
++	setl	%al
++	negl	%eax
++	orl	$1, %eax
+ #  else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
++	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
+ #  endif
+-# endif
++L(ret4):
+ 	ret
++# endif
+ 
+-	.p2align 4
+-L(next_3_vectors):
+-	VMOVU	VEC_SIZE(%rdi), %YMM0
+-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
++	/* 32 byte align here ensures the main loop is ideally aligned
++	   for DSB.  */
++	.p2align 5
++L(more_3x_vec):
++	/* Safe to compare 4x vectors.  */
++	VMOVU	(VEC_SIZE)(%rdi), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
+-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
++	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_1)
++
++# ifdef USE_AS_STRNCMP
++	subq	$(CHAR_PER_VEC * 2), %rdx
++	jbe	L(ret_zero)
+ # endif
+-	jne	L(return_vec_size)
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
+-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
+ 	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
+-# endif
+-	jne	L(return_2_vec_size)
++	TESTEQ	%ecx
++	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
+-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+ 	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_3)
++
++# ifdef USE_AS_STRNCMP
++	cmpq	$(CHAR_PER_VEC * 2), %rdx
++	jbe	L(ret_zero)
++# endif
++
++
+ # ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
++	/* any non-zero positive value that doesn't inference with 0x1.
++	 */
++	movl	$2, %r8d
++
+ # else
+-	incl	%ecx
++	xorl	%r8d, %r8d
+ # endif
+-	jne	L(return_3_vec_size)
+-L(main_loop_header):
+-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+-	movl	$PAGE_SIZE, %ecx
+-	/* Align load via RAX.  */
+-	andq	$-(VEC_SIZE * 4), %rdx
+-	subq	%rdi, %rdx
+-	leaq	(%rdi, %rdx), %rax
++
++	/* The prepare labels are various entry points from the page
++	   cross logic.  */
++L(prepare_loop):
++
+ # ifdef USE_AS_STRNCMP
+-	/* Starting from this point, the maximum offset, or simply the
+-	   'offset', DECREASES by the same amount when base pointers are
+-	   moved forward.  Return 0 when:
+-	     1) On match: offset <= the matched vector index.
+-	     2) On mistmach, offset is before the mistmatched index.
+-	 */
+-	subq	%rdx, %r11
+-	jbe	L(zero)
++#  ifdef USE_AS_WCSCMP
++L(prepare_loop_no_len):
++	movl	%edi, %ecx
++	andl	$(VEC_SIZE * 4 - 1), %ecx
++	shrl	$2, %ecx
++	leaq	(CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
++#  else
++	/* Store N + (VEC_SIZE * 4) and place check at the begining of
++	   the loop.  */
++	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
++L(prepare_loop_no_len):
++#  endif
++# else
++L(prepare_loop_no_len):
+ # endif
+-	addq	%rsi, %rdx
+-	movq	%rdx, %rsi
+-	andl	$(PAGE_SIZE - 1), %esi
+-	/* Number of bytes before page crossing.  */
+-	subq	%rsi, %rcx
+-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
+-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
+-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
+-	movl	%ecx, %esi
+-	jmp	L(loop_start)
+ 
++	/* Align s1 and adjust s2 accordingly.  */
++	subq	%rdi, %rsi
++	andq	$-(VEC_SIZE * 4), %rdi
++L(prepare_loop_readj):
++	addq	%rdi, %rsi
++# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
++	subq	%rdi, %rdx
++# endif
++
++L(prepare_loop_aligned):
++	/* eax stores distance from rsi to next page cross. These cases
++	   need to be handled specially as the 4x loop could potentially
++	   read memory past the length of s1 or s2 and across a page
++	   boundary.  */
++	movl	$-(VEC_SIZE * 4), %eax
++	subl	%esi, %eax
++	andl	$(PAGE_SIZE - 1), %eax
++
++	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO
++
++	/* Loop 4x comparisons at a time.  */
+ 	.p2align 4
+ L(loop):
++
++	/* End condition for strncmp.  */
+ # ifdef USE_AS_STRNCMP
+-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
+-	   the maximum offset (%r11) by the same amount.  */
+-	subq	$(VEC_SIZE * 4), %r11
+-	jbe	L(zero)
++	subq	$(CHAR_PER_VEC * 4), %rdx
++	jbe	L(ret_zero)
+ # endif
+-	addq	$(VEC_SIZE * 4), %rax
+-	addq	$(VEC_SIZE * 4), %rdx
+-L(loop_start):
+-	testl	%esi, %esi
+-	leal	-1(%esi), %esi
+-	je	L(loop_cross_page)
+-L(back_to_loop):
+-	/* Main loop, comparing 4 vectors are a time.  */
+-	VMOVA	(%rax), %YMM0
+-	VMOVA	VEC_SIZE(%rax), %YMM2
+-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
+-	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
++
++	subq	$-(VEC_SIZE * 4), %rdi
++	subq	$-(VEC_SIZE * 4), %rsi
++
++	/* Check if rsi loads will cross a page boundary.  */
++	addl	$-(VEC_SIZE * 4), %eax
++	jnb	L(page_cross_during_loop)
++
++	/* Loop entry after handling page cross during loop.  */
++L(loop_skip_page_cross_check):
++	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
++	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
++	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
++	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+ 
+ 	VPMINU	%YMM0, %YMM2, %YMM8
+ 	VPMINU	%YMM4, %YMM6, %YMM9
+ 
+-	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
+-	VPMINU	%YMM8, %YMM9, %YMM8
++	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
++	VPMINU	%YMM8, %YMM9, %YMM9
+ 
+ 	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
+-	VPTESTM	%YMM8, %YMM8, %k1
++	VPTESTM	%YMM9, %YMM9, %k1
+ 
+-	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
+-	vpxorq	(%rdx), %YMM0, %YMM1
+-	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
+-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
+-	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
++	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
++	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
++	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
++	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
++	   oring with YMM1. Result is stored in YMM6.  */
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+ 
+-	vporq	%YMM1, %YMM3, %YMM9
+-	vporq	%YMM5, %YMM7, %YMM10
++	/* Or together YMM3, YMM5, and YMM6.  */
++	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
+ 
+-	/* A non-zero CHAR in YMM9 represents a mismatch.  */
+-	vporq	%YMM9, %YMM10, %YMM9
+ 
+-	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
+-	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
+-	kmovd   %k0, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
+-# endif
+-	je	 L(loop)
++	/* A non-zero CHAR in YMM6 represents a mismatch.  */
++	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
++	kmovd	%k0, %LOOP_REG
+ 
+-	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
++	TESTEQ	%LOOP_REG
++	jz	L(loop)
++
++
++	/* Find which VEC has the mismatch or end of string.  */
+ 	VPTESTM	%YMM0, %YMM0, %k1
+-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+-	   in YMM0 and (%rdx).  */
+ 	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
+ 	kmovd	%k0, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
+-# endif
+-	je	L(test_vec)
+-	tzcntl	%ecx, %ecx
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %ecx
+-# endif
+-# ifdef USE_AS_STRNCMP
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# endif
+-	ret
++	TESTEQ	%ecx
++	jnz	L(return_vec_0_end)
+ 
+-	.p2align 4
+-L(test_vec):
+-# ifdef USE_AS_STRNCMP
+-	/* The first vector matched.  Return 0 if the maximum offset
+-	   (%r11) <= VEC_SIZE.  */
+-	cmpq	$VEC_SIZE, %r11
+-	jbe	L(zero)
+-# endif
+-	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
+ 	VPTESTM	%YMM2, %YMM2, %k1
+-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+-	   in YMM2 and VEC_SIZE(%rdx).  */
+ 	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
+ 	kmovd	%k0, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
+-# else
+-	incl	%ecx
+-# endif
+-	je	L(test_2_vec)
+-	tzcntl	%ecx, %edi
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edi
+-# endif
+-# ifdef USE_AS_STRNCMP
+-	addq	$VEC_SIZE, %rdi
+-	cmpq	%rdi, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rdi), %ecx
+-	cmpl	(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rdi), %eax
+-	movzbl	(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rsi, %rdi), %ecx
+-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	VEC_SIZE(%rax, %rdi), %eax
+-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
+-# endif
+-	ret
++	TESTEQ	%ecx
++	jnz	L(return_vec_1_end)
+ 
+-	.p2align 4
+-L(test_2_vec):
++
++	/* Handle VEC 2 and 3 without branches.  */
++L(return_vec_2_3_end):
+ # ifdef USE_AS_STRNCMP
+-	/* The first 2 vectors matched.  Return 0 if the maximum offset
+-	   (%r11) <= 2 * VEC_SIZE.  */
+-	cmpq	$(VEC_SIZE * 2), %r11
+-	jbe	L(zero)
++	subq	$(CHAR_PER_VEC * 2), %rdx
++	jbe	L(ret_zero_end)
+ # endif
+-	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
++
+ 	VPTESTM	%YMM4, %YMM4, %k1
+-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+-	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
+ 	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
+ 	kmovd	%k0, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
++	TESTEQ	%ecx
++# if CHAR_PER_VEC <= 16
++	sall	$CHAR_PER_VEC, %LOOP_REG
++	orl	%ecx, %LOOP_REG
+ # else
+-	incl	%ecx
++	salq	$CHAR_PER_VEC, %LOOP_REG64
++	orq	%rcx, %LOOP_REG64
++# endif
++L(return_vec_3_end):
++	/* LOOP_REG contains matches for null/mismatch from the loop. If
++	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
++	   must entirely be from VEC 3 which is fully represented by
++	   LOOP_REG.  */
++# if CHAR_PER_VEC <= 16
++	tzcntl	%LOOP_REG, %LOOP_REG
++# else
++	tzcntq	%LOOP_REG64, %LOOP_REG64
++# endif
++# ifdef USE_AS_STRNCMP
++	cmpq	%LOOP_REG64, %rdx
++	jbe	L(ret_zero_end)
+ # endif
+-	je	L(test_3_vec)
+-	tzcntl	%ecx, %edi
++
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edi
++	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
++	xorl	%eax, %eax
++	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
++	je	L(ret5)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
++# else
++	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
++	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret5):
++	ret
++
+ # ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 2), %rdi
+-	cmpq	%rdi, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	.p2align 4,, 2
++L(ret_zero_end):
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rdi), %ecx
+-	cmpl	(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
++	ret
++# endif
++
++
++	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
++	   they use the value of `r8` to negate the return value. This is
++	   because the page cross logic can swap `rdi` and `rsi`.  */
++	.p2align 4,, 10
++# ifdef USE_AS_STRNCMP
++L(return_vec_1_end):
++#  if CHAR_PER_VEC <= 16
++	sall	$CHAR_PER_VEC, %ecx
+ #  else
+-	movzbl	(%rax, %rdi), %eax
+-	movzbl	(%rdx, %rdi), %edx
+-	subl	%edx, %eax
++	salq	$CHAR_PER_VEC, %rcx
+ #  endif
++# endif
++L(return_vec_0_end):
++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
++	tzcntl	%ecx, %ecx
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
+-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
+-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
+-	subl	%edx, %eax
+-#  endif
++	tzcntq	%rcx, %rcx
+ # endif
+-	ret
+ 
+-	.p2align 4
+-L(test_3_vec):
+ # ifdef USE_AS_STRNCMP
+-	/* The first 3 vectors matched.  Return 0 if the maximum offset
+-	   (%r11) <= 3 * VEC_SIZE.  */
+-	cmpq	$(VEC_SIZE * 3), %r11
+-	jbe	L(zero)
++	cmpq	%rcx, %rdx
++	jbe	L(ret_zero_end)
+ # endif
+-	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
+-	VPTESTM	%YMM6, %YMM6, %k1
+-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
+-	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
+-	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
+-	kmovd	%k0, %ecx
++
+ # ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
++	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
++	xorl	%eax, %eax
++	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret6)
++	setl	%al
++	negl	%eax
++	/* This is the non-zero case for `eax` so just xorl with `r8d`
++	   to flip if `rdi` and `rsi` were swapped.  */
++	xorl	%r8d, %eax
+ # else
+-	incl	%ecx
++	movzbl	(%rdi, %rcx), %eax
++	movzbl	(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
++	   logic. Subtract `r8d` after xor for zero case.  */
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret6):
++	ret
++
++# ifndef USE_AS_STRNCMP
++	.p2align 4,, 10
++L(return_vec_1_end):
+ 	tzcntl	%ecx, %ecx
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %ecx
+-# endif
+-# ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 3), %rcx
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+ #  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %esi
+-	cmpl	(%rdx, %rcx), %esi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
+-# else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
+-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
+-	jne	L(wcscmp_return)
++	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret7)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ #  else
+-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
+-	subl	%edx, %eax
++	movzbl	VEC_SIZE(%rdi, %rcx), %eax
++	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ #  endif
+-# endif
++L(ret7):
+ 	ret
+-
+-	.p2align 4
+-L(loop_cross_page):
+-	xorl	%r10d, %r10d
+-	movq	%rdx, %rcx
+-	/* Align load via RDX.  We load the extra ECX bytes which should
+-	   be ignored.  */
+-	andl	$((VEC_SIZE * 4) - 1), %ecx
+-	/* R10 is -RCX.  */
+-	subq	%rcx, %r10
+-
+-	/* This works only if VEC_SIZE * 2 == 64. */
+-# if (VEC_SIZE * 2) != 64
+-#  error (VEC_SIZE * 2) != 64
+ # endif
+ 
+-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
+-	cmpl	$(VEC_SIZE * 2), %ecx
+-	jge	L(loop_cross_page_2_vec)
+ 
+-	VMOVU	(%rax, %r10), %YMM2
+-	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
++	/* Page cross in rsi in next 4x VEC.  */
+ 
+-	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
+-	VPTESTM	%YMM2, %YMM2, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM2 and 32 bytes at (%rdx, %r10).  */
+-	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
+-	kmovd	%k1, %r9d
+-	/* Don't use subl since it is the lower 16/32 bits of RDI
+-	   below.  */
+-	notl	%r9d
+-# ifdef USE_AS_WCSCMP
+-	/* Only last 8 bits are valid.  */
+-	andl	$0xff, %r9d
+-# endif
++	/* TODO: Improve logic here.  */
++	.p2align 4,, 10
++L(page_cross_during_loop):
++	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
+ 
+-	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
+-	VPTESTM	%YMM3, %YMM3, %k4
+-	/* Each bit cleared in K3 represents a mismatch or a null CHAR
+-	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
+-	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+-	kmovd	%k3, %edi
+-    /* Must use notl %edi here as lower bits are for CHAR
+-	   comparisons potentially out of range thus can be 0 without
+-	   indicating mismatch.  */
+-	notl	%edi
+-# ifdef USE_AS_WCSCMP
+-	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	andl	$0xff, %edi
+-# endif
++	/* Optimistically rsi and rdi are both aligned in which case we
++	   don't need any logic here.  */
++	cmpl	$-(VEC_SIZE * 4), %eax
++	/* Don't adjust eax before jumping back to loop and we will
++	   never hit page cross case again.  */
++	je	L(loop_skip_page_cross_check)
+ 
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
+-	sall	$8, %edi
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+-	   bytes.  */
+-	movl	%ecx, %SHIFT_REG32
+-	sarl	$2, %SHIFT_REG32
+-
+-	/* Each bit in EDI represents a null CHAR or a mismatch.  */
+-	orl	%r9d, %edi
+-# else
+-	salq	$32, %rdi
++	/* Check if we can safely load a VEC.  */
++	cmpl	$-(VEC_SIZE * 3), %eax
++	jle	L(less_1x_vec_till_page_cross)
+ 
+-	/* Each bit in RDI represents a null CHAR or a mismatch.  */
+-	orq	%r9, %rdi
+-# endif
++	VMOVA	(%rdi), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_0_end)
++
++	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
++	cmpl	$-(VEC_SIZE * 2), %eax
++	jg	L(more_2x_vec_till_page_cross)
++
++	.p2align 4,, 4
++L(less_1x_vec_till_page_cross):
++	subl	$-(VEC_SIZE * 4), %eax
++	/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
++	   concerning case is first iteration if incoming s1 was near start
++	   of a page and s2 near end. If s1 was near the start of the page
++	   we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe
++	   to read back -VEC_SIZE. If rdi is truly at the start of a page
++	   here, it means the previous page (rdi - VEC_SIZE) has already
++	   been loaded earlier so must be valid.  */
++	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
++
++	/* Mask of potentially valid bits. The lower bits can be out of
++	   range comparisons (but safe regarding page crosses).  */
+ 
+-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
+-	shrxq	%SHIFT_REG64, %rdi, %rdi
+-	testq	%rdi, %rdi
+-	je	L(loop_cross_page_2_vec)
+-	tzcntq	%rdi, %rcx
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %ecx
++	movl	$-1, %r10d
++	movl	%esi, %ecx
++	andl	$(VEC_SIZE - 1), %ecx
++	shrl	$2, %ecx
++	shlxl	%ecx, %r10d, %ecx
++	movzbl	%cl, %r10d
++# else
++	movl	$-1, %ecx
++	shlxl	%esi, %ecx, %r10d
+ # endif
++
++	kmovd	%k1, %ecx
++	notl	%ecx
++
++
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%rcx, %r11
+-	jbe	L(zero)
+ #  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
++	movl	%eax, %r11d
++	shrl	$2, %r11d
++	cmpq	%r11, %rdx
+ #  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
++	cmpq	%rax, %rdx
+ #  endif
++	jbe	L(return_page_cross_end_check)
++# endif
++	movl	%eax, %OFFSET_REG
++
++	/* Readjust eax before potentially returning to the loop.  */
++	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
++
++	andl	%r10d, %ecx
++	jz	L(loop_skip_page_cross_check)
++
++	.p2align 4,, 3
++L(return_page_cross_end):
++	tzcntl	%ecx, %ecx
++
++# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
++	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
++L(return_page_cross_cmp_mem):
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	addl	%OFFSET_REG, %ecx
++# endif
++# ifdef USE_AS_WCSCMP
++	movl	VEC_OFFSET(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
++	je	L(ret8)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
++# else
++	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
++	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret8):
+ 	ret
+ 
+-	.p2align 4
+-L(loop_cross_page_2_vec):
+-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
+-	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
+-	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
++# ifdef USE_AS_STRNCMP
++	.p2align 4,, 10
++L(return_page_cross_end_check):
++	tzcntl	%ecx, %ecx
++	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
++#  ifdef USE_AS_WCSCMP
++	sall	$2, %edx
++#  endif
++	cmpl	%ecx, %edx
++	ja	L(return_page_cross_cmp_mem)
++	xorl	%eax, %eax
++	ret
++# endif
++
+ 
++	.p2align 4,, 10
++L(more_2x_vec_till_page_cross):
++	/* If more 2x vec till cross we will complete a full loop
++	   iteration here.  */
++
++	VMOVA	VEC_SIZE(%rdi), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
+-	kmovd	%k1, %r9d
+-	/* Don't use subl since it is the lower 16/32 bits of RDI
+-	   below.  */
+-	notl	%r9d
+-# ifdef USE_AS_WCSCMP
+-	/* Only last 8 bits are valid.  */
+-	andl	$0xff, %r9d
+-# endif
++	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_1_end)
+ 
+-	VPTESTM	%YMM1, %YMM1, %k4
+-	/* Each bit cleared in K3 represents a mismatch or a null CHAR
+-	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+-	kmovd	%k3, %edi
+-	/* Must use notl %edi here as lower bits are for CHAR
+-	   comparisons potentially out of range thus can be 0 without
+-	   indicating mismatch.  */
+-	notl	%edi
+-# ifdef USE_AS_WCSCMP
+-	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	andl	$0xff, %edi
++# ifdef USE_AS_STRNCMP
++	cmpq	$(CHAR_PER_VEC * 2), %rdx
++	jbe	L(ret_zero_in_loop_page_cross)
+ # endif
+ 
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
+-	sall	$8, %edi
++	subl	$-(VEC_SIZE * 4), %eax
+ 
+-	/* Each bit in EDI represents a null CHAR or a mismatch.  */
+-	orl	%r9d, %edi
+-# else
+-	salq	$32, %rdi
++	/* Safe to include comparisons from lower bytes.  */
++	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_page_cross_0)
++
++	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(return_vec_page_cross_1)
+ 
+-	/* Each bit in RDI represents a null CHAR or a mismatch.  */
+-	orq	%r9, %rdi
++# ifdef USE_AS_STRNCMP
++	/* Must check length here as length might preclude reading next
++	   page.  */
++#  ifdef USE_AS_WCSCMP
++	movl	%eax, %r11d
++	shrl	$2, %r11d
++	cmpq	%r11, %rdx
++#  else
++	cmpq	%rax, %rdx
++#  endif
++	jbe	L(ret_zero_in_loop_page_cross)
+ # endif
+ 
+-	xorl	%r8d, %r8d
+-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
+-	subl	$(VEC_SIZE * 2), %ecx
+-	jle	1f
+-	/* R8 has number of bytes skipped.  */
+-	movl	%ecx, %r8d
+-# ifdef USE_AS_WCSCMP
+-	/* NB: Divide shift count by 4 since each bit in RDI represent 4
+-	   bytes.  */
+-	sarl	$2, %ecx
+-	/* Skip ECX bytes.  */
+-	shrl	%cl, %edi
++	/* Finish the loop.  */
++	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
++	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
++	VPMINU	%YMM4, %YMM6, %YMM9
++	VPTESTM	%YMM9, %YMM9, %k1
++
++	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
++	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
++	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
++
++	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
++	kmovd	%k0, %LOOP_REG
++	TESTEQ	%LOOP_REG
++	jnz	L(return_vec_2_3_end)
++
++	/* Best for code size to include ucond-jmp here. Would be faster
++	   if this case is hot to duplicate the L(return_vec_2_3_end) code
++	   as fall-through and have jump back to loop on mismatch
++	   comparison.  */
++	subq	$-(VEC_SIZE * 4), %rdi
++	subq	$-(VEC_SIZE * 4), %rsi
++	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
++# ifdef USE_AS_STRNCMP
++	subq	$(CHAR_PER_VEC * 4), %rdx
++	ja	L(loop_skip_page_cross_check)
++L(ret_zero_in_loop_page_cross):
++	xorl	%eax, %eax
++	ret
+ # else
+-	/* Skip ECX bytes.  */
+-	shrq	%cl, %rdi
++	jmp	L(loop_skip_page_cross_check)
+ # endif
+-1:
+-	/* Before jumping back to the loop, set ESI to the number of
+-	   VEC_SIZE * 4 blocks before page crossing.  */
+-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
+ 
+-	testq	%rdi, %rdi
+-# ifdef USE_AS_STRNCMP
+-	/* At this point, if %rdi value is 0, it already tested
+-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
+-	   checks whether strncmp maximum offset reached or not.  */
+-	je	L(string_nbyte_offset_check)
++
++	.p2align 4,, 10
++L(return_vec_page_cross_0):
++	addl	$-VEC_SIZE, %eax
++L(return_vec_page_cross_1):
++	tzcntl	%ecx, %ecx
++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
++	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
++#  ifdef USE_AS_STRNCMP
++#   ifdef USE_AS_WCSCMP
++	/* Must divide ecx instead of multiply rdx due to overflow.  */
++	movl	%ecx, %eax
++	shrl	$2, %eax
++	cmpq	%rax, %rdx
++#   else
++	cmpq	%rcx, %rdx
++#   endif
++	jbe	L(ret_zero_in_loop_page_cross)
++#  endif
+ # else
+-	je	L(back_to_loop)
++	addl	%eax, %ecx
+ # endif
+-	tzcntq	%rdi, %rcx
++
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %ecx
+-# endif
+-	addq	%r10, %rcx
+-	/* Adjust for number of bytes skipped.  */
+-	addq	%r8, %rcx
+-# ifdef USE_AS_STRNCMP
+-	addq	$(VEC_SIZE * 2), %rcx
+-	subq	%rcx, %r11
+-	jbe	L(zero)
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
++	movl	VEC_OFFSET(%rdi, %rcx), %edx
+ 	xorl	%eax, %eax
+-	movl	(%rsi, %rcx), %edi
+-	cmpl	(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rax, %rcx), %eax
+-	movzbl	(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
++	je	L(ret9)
++	setl	%al
++	negl	%eax
++	xorl	%r8d, %eax
+ # else
+-#  ifdef USE_AS_WCSCMP
+-	movq	%rax, %rsi
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
+-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
+-	subl	%edx, %eax
+-#  endif
++	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
++	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret9):
+ 	ret
+ 
+-# ifdef USE_AS_STRNCMP
+-L(string_nbyte_offset_check):
+-	leaq	(VEC_SIZE * 4)(%r10), %r10
+-	cmpq	%r10, %r11
+-	jbe	L(zero)
+-	jmp	L(back_to_loop)
++
++	.p2align 4,, 10
++L(page_cross):
++# ifndef USE_AS_STRNCMP
++	/* If both are VEC aligned we don't need any special logic here.
++	   Only valid for strcmp where stop condition is guaranteed to be
++	   reachable by just reading memory.  */
++	testl	$((VEC_SIZE - 1) << 20), %eax
++	jz	L(no_page_cross)
+ # endif
+ 
+-	.p2align 4
+-L(cross_page_loop):
+-	/* Check one byte/dword at a time.  */
++	movl	%edi, %eax
++	movl	%esi, %ecx
++	andl	$(PAGE_SIZE - 1), %eax
++	andl	$(PAGE_SIZE - 1), %ecx
++
++	xorl	%OFFSET_REG, %OFFSET_REG
++
++	/* Check which is closer to page cross, s1 or s2.  */
++	cmpl	%eax, %ecx
++	jg	L(page_cross_s2)
++
++	/* The previous page cross check has false positives. Check for
++	   true positive as page cross logic is very expensive.  */
++	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
++	jbe	L(no_page_cross)
++
++
++	/* Set r8 to not interfere with normal return value (rdi and rsi
++	   did not swap).  */
+ # ifdef USE_AS_WCSCMP
+-	cmpl	%ecx, %eax
++	/* Any non-zero positive value that doesn't interfere with 0x1.
++	 */
++	movl	$2, %r8d
+ # else
+-	subl	%ecx, %eax
++	xorl	%r8d, %r8d
+ # endif
+-	jne	L(different)
+-	addl	$SIZE_OF_CHAR, %edx
+-	cmpl	$(VEC_SIZE * 4), %edx
+-	je	L(main_loop_header)
++
++	/* Check if less than 1x VEC till page cross.  */
++	subl	$(VEC_SIZE * 3), %eax
++	jg	L(less_1x_vec_till_page)
++
++
++	/* If more than 1x VEC till page cross, loop through safely
++	   loadable memory until within 1x VEC of page cross.  */
++	.p2align 4,, 8
++L(page_cross_loop):
++	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
++	kmovd	%k1, %ecx
++	TESTEQ	%ecx
++	jnz	L(check_ret_vec_page_cross)
++	addl	$CHAR_PER_VEC, %OFFSET_REG
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross)
+ # endif
++	addl	$VEC_SIZE, %eax
++	jl	L(page_cross_loop)
++
+ # ifdef USE_AS_WCSCMP
+-	movl	(%rdi, %rdx), %eax
+-	movl	(%rsi, %rdx), %ecx
+-# else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %ecx
++	shrl	$2, %eax
+ # endif
+-	/* Check null CHAR.  */
+-	testl	%eax, %eax
+-	jne	L(cross_page_loop)
+-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
+-	   comparisons.  */
+-	subl	%ecx, %eax
+-# ifndef USE_AS_WCSCMP
+-L(different):
++
++
++	subl	%eax, %OFFSET_REG
++	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
++	   to not cross page so is safe to load. Since we have already
++	   loaded at least 1 VEC from rsi it is also guaranteed to be safe.
++	 */
++	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
++	VPTESTM	%YMM0, %YMM0, %k2
++	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
++
++	kmovd	%k1, %ecx
++# ifdef USE_AS_STRNCMP
++	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
++	cmpq	%rax, %rdx
++	jbe	L(check_ret_vec_page_cross2)
++#  ifdef USE_AS_WCSCMP
++	addq	$-(CHAR_PER_VEC * 2), %rdx
++#  else
++	addq	%rdi, %rdx
++#  endif
+ # endif
+-	ret
++	TESTEQ	%ecx
++	jz	L(prepare_loop_no_len)
+ 
++	.p2align 4,, 4
++L(ret_vec_page_cross):
++# ifndef USE_AS_STRNCMP
++L(check_ret_vec_page_cross):
++# endif
++	tzcntl	%ecx, %ecx
++	addl	%OFFSET_REG, %ecx
++L(ret_vec_page_cross_cont):
+ # ifdef USE_AS_WCSCMP
+-	.p2align 4
+-L(different):
+-	/* Use movl to avoid modifying EFLAGS.  */
+-	movl	$0, %eax
++	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
++	xorl	%eax, %eax
++	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
++	je	L(ret12)
+ 	setl	%al
+ 	negl	%eax
+-	orl	$1, %eax
+-	ret
++	xorl	%r8d, %eax
++# else
++	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
++	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
++	subl	%ecx, %eax
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ # endif
++L(ret12):
++	ret
++
+ 
+ # ifdef USE_AS_STRNCMP
+-	.p2align 4
+-L(zero):
++	.p2align 4,, 10
++L(check_ret_vec_page_cross2):
++	TESTEQ	%ecx
++L(check_ret_vec_page_cross):
++	tzcntl	%ecx, %ecx
++	addl	%OFFSET_REG, %ecx
++	cmpq	%rcx, %rdx
++	ja	L(ret_vec_page_cross_cont)
++	.p2align 4,, 2
++L(ret_zero_page_cross):
+ 	xorl	%eax, %eax
+ 	ret
++# endif
+ 
+-	.p2align 4
+-L(char0):
+-#  ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi), %ecx
+-	cmpl	(%rsi), %ecx
+-	jne	L(wcscmp_return)
+-#  else
+-	movzbl	(%rsi), %ecx
+-	movzbl	(%rdi), %eax
+-	subl	%ecx, %eax
+-#  endif
+-	ret
++	.p2align 4,, 4
++L(page_cross_s2):
++	/* Ensure this is a true page cross.  */
++	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
++	jbe	L(no_page_cross)
++
++
++	movl	%ecx, %eax
++	movq	%rdi, %rcx
++	movq	%rsi, %rdi
++	movq	%rcx, %rsi
++
++	/* set r8 to negate return value as rdi and rsi swapped.  */
++# ifdef USE_AS_WCSCMP
++	movl	$-4, %r8d
++# else
++	movl	$-1, %r8d
+ # endif
++	xorl	%OFFSET_REG, %OFFSET_REG
+ 
+-	.p2align 4
+-L(last_vector):
+-	addq	%rdx, %rdi
+-	addq	%rdx, %rsi
+-# ifdef USE_AS_STRNCMP
+-	subq	%rdx, %r11
++	/* Check if more than 1x VEC till page cross.  */
++	subl	$(VEC_SIZE * 3), %eax
++	jle	L(page_cross_loop)
++
++	.p2align 4,, 6
++L(less_1x_vec_till_page):
++# ifdef USE_AS_WCSCMP
++	shrl	$2, %eax
+ # endif
+-	tzcntl	%ecx, %edx
++	/* Find largest load size we can use.  */
++	cmpl	$(16 / SIZE_OF_CHAR), %eax
++	ja	L(less_16_till_page)
++
++	/* Use 16 byte comparison.  */
++	vmovdqu	(%rdi), %xmm0
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, (%rsi), %xmm0, %k1{%k2}
++	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %edx
++	subl	$0xf, %ecx
++# else
++	incw	%cx
+ # endif
++	jnz	L(check_ret_vec_page_cross)
++	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
+ # ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subl	%eax, %OFFSET_REG
++# else
++	/* Explicit check for 16 byte alignment.  */
++	subl	%eax, %OFFSET_REG
++	jz	L(prepare_loop)
+ # endif
++	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
++	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rdx), %ecx
+-	cmpl	(%rsi, %rdx), %ecx
+-	jne	L(wcscmp_return)
++	subl	$0xf, %ecx
+ # else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %edx
+-	subl	%edx, %eax
++	incw	%cx
+ # endif
++	jnz	L(check_ret_vec_page_cross)
++# ifdef USE_AS_STRNCMP
++	addl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subq	$-(CHAR_PER_VEC * 4), %rdx
++
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
++# else
++	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
++# endif
++	jmp	L(prepare_loop_aligned)
++
++# ifdef USE_AS_STRNCMP
++	.p2align 4,, 2
++L(ret_zero_page_cross_slow_case0):
++	xorl	%eax, %eax
+ 	ret
++# endif
+ 
+-	/* Comparing on page boundary region requires special treatment:
+-	   It must done one vector at the time, starting with the wider
+-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
+-	   (xmm) still passes the boundary, byte comparison must be done.
+-	 */
+-	.p2align 4
+-L(cross_page):
+-	/* Try one ymm vector at a time.  */
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jg	L(cross_page_1_vector)
+-L(loop_1_vector):
+-	VMOVU	(%rdi, %rdx), %YMM0
+ 
+-	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
+-	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
++	.p2align 4,, 10
++L(less_16_till_page):
++	cmpl	$(24 / SIZE_OF_CHAR), %eax
++	ja	L(less_8_till_page)
++
++	/* Use 8 byte comparison.  */
++	vmovq	(%rdi), %xmm0
++	vmovq	(%rsi), %xmm1
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	subl	$0xff, %ecx
++	subl	$0x3, %ecx
+ # else
+-	incl	%ecx
++	incb	%cl
+ # endif
+-	jne	L(last_vector)
++	jnz	L(check_ret_vec_page_cross)
+ 
+-	addl	$VEC_SIZE, %edx
+ 
+-	addl	$VEC_SIZE, %eax
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	$(8 / SIZE_OF_CHAR), %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
+ # endif
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+-	jle	L(loop_1_vector)
+-L(cross_page_1_vector):
+-	/* Less than 32 bytes to check, try one xmm vector.  */
+-	cmpl	$(PAGE_SIZE - 16), %eax
+-	jg	L(cross_page_1_xmm)
+-	VMOVU	(%rdi, %rdx), %XMM0
++	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
++	subl	%eax, %OFFSET_REG
+ 
+-	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
+-	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
++	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
++	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+-	subl	$0xf, %ecx
++	subl	$0x3, %ecx
+ # else
+-	subl	$0xffff, %ecx
++	incb	%cl
+ # endif
+-	jne	L(last_vector)
++	jnz	L(check_ret_vec_page_cross)
++
+ 
+-	addl	$16, %edx
+-# ifndef USE_AS_WCSCMP
+-	addl	$16, %eax
+-# endif
+ # ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	addl	$(8 / SIZE_OF_CHAR), %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case0)
++	subq	$-(CHAR_PER_VEC * 4), %rdx
++
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
++# else
++	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+ # endif
++	jmp	L(prepare_loop_aligned)
+ 
+-L(cross_page_1_xmm):
+-# ifndef USE_AS_WCSCMP
+-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
+-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
+-	cmpl	$(PAGE_SIZE - 8), %eax
+-	jg	L(cross_page_8bytes)
+-	vmovq	(%rdi, %rdx), %XMM0
+-	vmovq	(%rsi, %rdx), %XMM1
+ 
+-	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in XMM0 and XMM1.  */
+-	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+-	kmovb	%k1, %ecx
++
++
++	.p2align 4,, 10
++L(less_8_till_page):
+ # ifdef USE_AS_WCSCMP
+-	subl	$0x3, %ecx
++	/* If using wchar then this is the only check before we reach
++	   the page boundary.  */
++	movl	(%rdi), %eax
++	movl	(%rsi), %ecx
++	cmpl	%ecx, %eax
++	jnz	L(ret_less_8_wcs)
++#  ifdef USE_AS_STRNCMP
++	addq	$-(CHAR_PER_VEC * 2), %rdx
++	/* We already checked for len <= 1 so cannot hit that case here.
++	 */
++#  endif
++	testl	%eax, %eax
++	jnz	L(prepare_loop)
++	ret
++
++	.p2align 4,, 8
++L(ret_less_8_wcs):
++	setl	%OFFSET_REG8
++	negl	%OFFSET_REG
++	movl	%OFFSET_REG, %eax
++	xorl	%r8d, %eax
++	ret
++
+ # else
+-	subl	$0xff, %ecx
+-# endif
+-	jne	L(last_vector)
++	cmpl	$28, %eax
++	ja	L(less_4_till_page)
++
++	vmovd	(%rdi), %xmm0
++	vmovd	(%rsi), %xmm1
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
++	kmovd	%k1, %ecx
++	subl	$0xf, %ecx
++	jnz	L(check_ret_vec_page_cross)
+ 
+-	addl	$8, %edx
+-	addl	$8, %eax
+ #  ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	cmpq	$4, %rdx
++	jbe	L(ret_zero_page_cross_slow_case1)
+ #  endif
++	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
++	subl	%eax, %OFFSET_REG
+ 
+-L(cross_page_8bytes):
+-	/* Less than 8 bytes to check, try 4 byte vector.  */
+-	cmpl	$(PAGE_SIZE - 4), %eax
+-	jg	L(cross_page_4bytes)
+-	vmovd	(%rdi, %rdx), %XMM0
+-	vmovd	(%rsi, %rdx), %XMM1
+-
+-	VPTESTM	%YMM0, %YMM0, %k2
+-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+-	   in XMM0 and XMM1.  */
+-	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
++	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
++	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
++	VPTESTM	%xmm0, %xmm0, %k2
++	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+ 	kmovd	%k1, %ecx
+-# ifdef USE_AS_WCSCMP
+-	subl	$0x1, %ecx
+-# else
+ 	subl	$0xf, %ecx
+-# endif
+-	jne	L(last_vector)
++	jnz	L(check_ret_vec_page_cross)
++#  ifdef USE_AS_STRNCMP
++	addl	$(4 / SIZE_OF_CHAR), %OFFSET_REG
++	subq	%OFFSET_REG64, %rdx
++	jbe	L(ret_zero_page_cross_slow_case1)
++	subq	$-(CHAR_PER_VEC * 4), %rdx
++
++	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
++#  else
++	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
++	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
++#  endif
++	jmp	L(prepare_loop_aligned)
++
+ 
+-	addl	$4, %edx
+ #  ifdef USE_AS_STRNCMP
+-	/* Return 0 if the current offset (%rdx) >= the maximum offset
+-	   (%r11).  */
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
++	.p2align 4,, 2
++L(ret_zero_page_cross_slow_case1):
++	xorl	%eax, %eax
++	ret
+ #  endif
+ 
+-L(cross_page_4bytes):
+-# endif
+-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
+-# ifdef USE_AS_STRNCMP
+-	cmpq	%r11, %rdx
+-	jae	L(zero)
+-# endif
+-# ifdef USE_AS_WCSCMP
+-	movl	(%rdi, %rdx), %eax
+-	movl	(%rsi, %rdx), %ecx
+-# else
+-	movzbl	(%rdi, %rdx), %eax
+-	movzbl	(%rsi, %rdx), %ecx
+-# endif
+-	testl	%eax, %eax
+-	jne	L(cross_page_loop)
++	.p2align 4,, 10
++L(less_4_till_page):
++	subq	%rdi, %rsi
++	/* Extremely slow byte comparison loop.  */
++L(less_4_loop):
++	movzbl	(%rdi), %eax
++	movzbl	(%rsi, %rdi), %ecx
+ 	subl	%ecx, %eax
++	jnz	L(ret_less_4_loop)
++	testl	%ecx, %ecx
++	jz	L(ret_zero_4_loop)
++#  ifdef USE_AS_STRNCMP
++	decq	%rdx
++	jz	L(ret_zero_4_loop)
++#  endif
++	incq	%rdi
++	/* end condition is reach page boundary (rdi is aligned).  */
++	testl	$31, %edi
++	jnz	L(less_4_loop)
++	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
++	addq	$-(VEC_SIZE * 4), %rdi
++#  ifdef USE_AS_STRNCMP
++	subq	$-(CHAR_PER_VEC * 4), %rdx
++#  endif
++	jmp	L(prepare_loop_aligned)
++
++L(ret_zero_4_loop):
++	xorl	%eax, %eax
++	ret
++L(ret_less_4_loop):
++	xorl	%r8d, %eax
++	subl	%r8d, %eax
+ 	ret
+-END (STRCMP)
++# endif
++END(STRCMP)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-76.patch b/glibc-RHEL-15696-76.patch
new file mode 100644
index 0000000..84d9a6f
--- /dev/null
+++ b/glibc-RHEL-15696-76.patch
@@ -0,0 +1,33 @@
+From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 4 Feb 2022 11:09:10 -0800
+Subject: [PATCH] x86-64: Fix strcmp-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+Change "movl %edx, %rdx" to "movl %edx, %edx" in:
+
+commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Mon Jan 10 15:35:38 2022 -0600
+
+    x86: Optimize strcmp-avx2.S
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 554ffe4c..04675aa4 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -106,7 +106,7 @@ ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+-	movl	%edx, %rdx
++	movl	%edx, %edx
+ #  endif
+ 	cmp	$1, %RDX_LP
+ 	/* Signed comparison intentional. We use this branch to also
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-77.patch b/glibc-RHEL-15696-77.patch
new file mode 100644
index 0000000..1a1cdae
--- /dev/null
+++ b/glibc-RHEL-15696-77.patch
@@ -0,0 +1,33 @@
+From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 4 Feb 2022 11:11:08 -0800
+Subject: [PATCH] x86-64: Fix strcmp-evex.S
+Content-type: text/plain; charset=UTF-8
+
+Change "movl %edx, %rdx" to "movl %edx, %edx" in:
+
+commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Mon Jan 10 15:35:39 2022 -0600
+
+    x86: Optimize strcmp-evex.S
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 99d8409a..ed56af8e 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -116,7 +116,7 @@ ENTRY(STRCMP)
+ # ifdef USE_AS_STRNCMP
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+-	movl	%edx, %rdx
++	movl	%edx, %edx
+ #  endif
+ 	cmp	$1, %RDX_LP
+ 	/* Signed comparison intentional. We use this branch to also
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-78.patch b/glibc-RHEL-15696-78.patch
new file mode 100644
index 0000000..885b715
--- /dev/null
+++ b/glibc-RHEL-15696-78.patch
@@ -0,0 +1,459 @@
+From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 6 Feb 2022 00:54:18 -0600
+Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+Split vec generation into multiple steps. This allows the
+broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
+case. This saves an expensive lane-cross instruction and removes
+the need for 'vzeroupper'.
+
+For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
+byte broadcast.
+
+Results for memset-avx2 small (geomean of N = 20 benchset runs).
+
+size, New Time, Old Time, New / Old
+   0,    4.100,    3.831,     0.934
+   1,    5.074,    4.399,     0.867
+   2,    4.433,    4.411,     0.995
+   4,    4.487,    4.415,     0.984
+   8,    4.454,    4.396,     0.987
+  16,    4.502,    4.443,     0.987
+
+All relevant string/wcsmbs tests are passing.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/memset.S                       |  21 ++-
+ .../multiarch/memset-avx2-unaligned-erms.S    |  18 +-
+ .../multiarch/memset-avx512-unaligned-erms.S  |  18 +-
+ .../multiarch/memset-evex-unaligned-erms.S    |  18 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 164 +++++++++++-------
+ 5 files changed, 152 insertions(+), 87 deletions(-)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 8672b030..27debd2b 100644
+--- a/sysdeps/x86_64/memset.S
++++ b/sysdeps/x86_64/memset.S
+@@ -28,17 +28,22 @@
+ #define VMOVU     movups
+ #define VMOVA     movaps
+ 
+-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-  movq r, %rax; \
+-  punpcklbw %xmm0, %xmm0; \
+-  punpcklwd %xmm0, %xmm0; \
+-  pshufd $0, %xmm0, %xmm0
++  pxor %xmm1, %xmm1; \
++  pshufb %xmm1, %xmm0; \
++  movq r, %rax
+ 
+-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-  movq r, %rax; \
+-  pshufd $0, %xmm0, %xmm0
++  pshufd $0, %xmm0, %xmm0; \
++  movq r, %rax
++
++# define MEMSET_VDUP_TO_VEC0_HIGH()
++# define MEMSET_VDUP_TO_VEC0_LOW()
++
++# define WMEMSET_VDUP_TO_VEC0_HIGH()
++# define WMEMSET_VDUP_TO_VEC0_LOW()
+ 
+ #define SECTION(p)		p
+ 
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index 1af668af..c0bf2875 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -10,15 +10,18 @@
+ # define VMOVU     vmovdqu
+ # define VMOVA     vmovdqa
+ 
+-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   vmovd d, %xmm0; \
+-  movq r, %rax; \
+-  vpbroadcastb %xmm0, %ymm0
++  movq r, %rax;
+ 
+-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+-  movq r, %rax; \
+-  vpbroadcastd %xmm0, %ymm0
++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
++  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
++
++# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
++# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
++
++# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
++# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
+ 
+ # ifndef SECTION
+ #  define SECTION(p)		p##.avx
+@@ -30,5 +33,6 @@
+ #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+ # endif
+ 
++# define USE_XMM_LESS_VEC
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index f14d6f84..5241216a 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -15,13 +15,19 @@
+ 
+ # define VZEROUPPER
+ 
+-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastb d, %VEC0
++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
++  vpbroadcastb d, %VEC0; \
++  movq r, %rax
+ 
+-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastd d, %VEC0
++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
++  vpbroadcastd d, %VEC0; \
++  movq r, %rax
++
++# define MEMSET_VDUP_TO_VEC0_HIGH()
++# define MEMSET_VDUP_TO_VEC0_LOW()
++
++# define WMEMSET_VDUP_TO_VEC0_HIGH()
++# define WMEMSET_VDUP_TO_VEC0_LOW()
+ 
+ # define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index 64b09e77..63700215 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -15,13 +15,19 @@
+ 
+ # define VZEROUPPER
+ 
+-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastb d, %VEC0
++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
++  vpbroadcastb d, %VEC0; \
++  movq r, %rax
+ 
+-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  movq r, %rax; \
+-  vpbroadcastd d, %VEC0
++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
++  vpbroadcastd d, %VEC0; \
++  movq r, %rax
++
++# define MEMSET_VDUP_TO_VEC0_HIGH()
++# define MEMSET_VDUP_TO_VEC0_LOW()
++
++# define WMEMSET_VDUP_TO_VEC0_HIGH()
++# define WMEMSET_VDUP_TO_VEC0_LOW()
+ 
+ # define SECTION(p)		p##.evex
+ # define MEMSET_SYMBOL(p,s)	p##_evex_##s
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index f08b7323..a67f9833 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -58,8 +58,10 @@
+ #ifndef MOVQ
+ # if VEC_SIZE > 16
+ #  define MOVQ				vmovq
++#  define MOVD				vmovd
+ # else
+ #  define MOVQ				movq
++#  define MOVD				movd
+ # endif
+ #endif
+ 
+@@ -72,9 +74,17 @@
+ #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ # define END_REG	rcx
+ # define LOOP_REG	rdi
++# define LESS_VEC_REG	rax
+ #else
+ # define END_REG	rdi
+ # define LOOP_REG	rdx
++# define LESS_VEC_REG	rdi
++#endif
++
++#ifdef USE_XMM_LESS_VEC
++# define XMM_SMALL	1
++#else
++# define XMM_SMALL	0
+ #endif
+ 
+ #define PAGE_SIZE 4096
+@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+ 
+ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+ 	shl	$2, %RDX_LP
+-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+-	jmp	L(entry_from_bzero)
++	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
++	WMEMSET_VDUP_TO_VEC0_LOW()
++	cmpq	$VEC_SIZE, %rdx
++	jb	L(less_vec_no_vdup)
++	WMEMSET_VDUP_TO_VEC0_HIGH()
++	jmp	L(entry_from_wmemset)
+ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+ #endif
+ 
+@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
++	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+ L(entry_from_bzero):
+ 	cmpq	$VEC_SIZE, %rdx
+ 	jb	L(less_vec)
++	MEMSET_VDUP_TO_VEC0_HIGH()
++L(entry_from_wmemset):
+ 	cmpq	$(VEC_SIZE * 2), %rdx
+ 	ja	L(more_2x_vec)
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
+-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
++	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ # ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+ # endif
+ 	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
++	MEMSET_VDUP_TO_VEC0_HIGH ()
+ 	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+-	 */
+-	VMOVU	%VEC(0), (%rax)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
++	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
++	VMOVU	%VEC(0), (%rdi)
++	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+ 
+-	.p2align 4,, 10
++	.p2align 4,, 4
+ L(last_2x_vec):
+ #ifdef USE_LESS_VEC_MASK_STORE
+-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
++	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
++	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+ #else
+ 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
+ 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
+@@ -212,6 +228,7 @@ L(last_2x_vec):
+ #ifdef USE_LESS_VEC_MASK_STORE
+ 	.p2align 4,, 10
+ L(less_vec):
++L(less_vec_no_vdup):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
+ 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+ 	   and (4x, 8x] jump to target.  */
+ L(more_2x_vec):
+-
+-	/* Two different methods of setting up pointers / compare. The
+-	   two methods are based on the fact that EVEX/AVX512 mov
+-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
+-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
+-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
+-	   this saves code size and keeps a few targets in one fetch block.
+-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
+-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+-	   LOOP_4X_OFFSET) with LEA_BID.  */
+-
+-	/* END_REG is rcx for EVEX/AVX512.  */
+-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+-#endif
+-
+-	/* Stores to first 2x VEC before cmp as any path forward will
+-	   require it.  */
+-	VMOVU	%VEC(0), (%rax)
+-	VMOVU	%VEC(0), VEC_SIZE(%rax)
++	/* Store next 2x vec regardless.  */
++	VMOVU	%VEC(0), (%rdi)
++	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
+ 
+ 
++	/* Two different methods of setting up pointers / compare. The two
++	   methods are based on the fact that EVEX/AVX512 mov instructions take
++	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
++	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
++	   address mode. For EVEX/AVX512 this saves code size and keeps a few
++	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
++	   bottlenecks.  */
+ #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+ 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
+ 	addq	%rdx, %END_REG
+@@ -292,6 +299,15 @@ L(more_2x_vec):
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	jbe	L(last_2x_vec)
+ 
++
++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
++	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
++	   LEA_BID.  */
++
++	/* END_REG is rcx for EVEX/AVX512.  */
++	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
++#endif
++
+ 	/* Store next 2x vec regardless.  */
+ 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+@@ -355,65 +371,93 @@ L(stosb_local):
+ 	/* Define L(less_vec) only if not otherwise defined.  */
+ 	.p2align 4
+ L(less_vec):
++	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
++	   xmm). This is only does anything for AVX2.  */
++	MEMSET_VDUP_TO_VEC0_LOW ()
++L(less_vec_no_vdup):
+ #endif
+ L(cross_page):
+ #if VEC_SIZE > 32
+ 	cmpl	$32, %edx
+-	jae	L(between_32_63)
++	jge	L(between_32_63)
+ #endif
+ #if VEC_SIZE > 16
+ 	cmpl	$16, %edx
+-	jae	L(between_16_31)
++	jge	L(between_16_31)
++#endif
++#ifndef USE_XMM_LESS_VEC
++	MOVQ	%XMM0, %rcx
+ #endif
+-	MOVQ	%XMM0, %rdi
+ 	cmpl	$8, %edx
+-	jae	L(between_8_15)
++	jge	L(between_8_15)
+ 	cmpl	$4, %edx
+-	jae	L(between_4_7)
++	jge	L(between_4_7)
+ 	cmpl	$1, %edx
+-	ja	L(between_2_3)
+-	jb	L(return)
+-	movb	%sil, (%rax)
+-	VZEROUPPER_RETURN
++	jg	L(between_2_3)
++	jl	L(between_0_0)
++	movb	%sil, (%LESS_VEC_REG)
++L(between_0_0):
++	ret
+ 
+-	/* Align small targets only if not doing so would cross a fetch
+-	   line.  */
++	/* Align small targets only if not doing so would cross a fetch line.
++	 */
+ #if VEC_SIZE > 32
+ 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, (%rax)
+-	VMOVU	%YMM0, -32(%rax, %rdx)
++	VMOVU	%YMM0, (%LESS_VEC_REG)
++	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+ 
+ #if VEC_SIZE >= 32
+-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
++	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
+ L(between_16_31):
+ 	/* From 16 to 31.  No branch when size == 16.  */
+-	VMOVU	%XMM0, (%rax)
+-	VMOVU	%XMM0, -16(%rax, %rdx)
+-	VZEROUPPER_RETURN
++	VMOVU	%XMM0, (%LESS_VEC_REG)
++	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
++	ret
+ #endif
+ 
+-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
++	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
++	 */
++	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
+ L(between_8_15):
+ 	/* From 8 to 15.  No branch when size == 8.  */
+-	movq	%rdi, (%rax)
+-	movq	%rdi, -8(%rax, %rdx)
+-	VZEROUPPER_RETURN
++#ifdef USE_XMM_LESS_VEC
++	MOVQ	%XMM0, (%rdi)
++	MOVQ	%XMM0, -8(%rdi, %rdx)
++#else
++	movq	%rcx, (%LESS_VEC_REG)
++	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
++#endif
++	ret
+ 
+-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
++	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
++	 */
++	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%edi, (%rax)
+-	movl	%edi, -4(%rax, %rdx)
+-	VZEROUPPER_RETURN
++#ifdef USE_XMM_LESS_VEC
++	MOVD	%XMM0, (%rdi)
++	MOVD	%XMM0, -4(%rdi, %rdx)
++#else
++	movl	%ecx, (%LESS_VEC_REG)
++	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
++#endif
++	ret
+ 
+-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
++	/* 4 * XMM_SMALL for the third mov for AVX2.  */
++	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%di, (%rax)
+-	movb	%dil, -1(%rax, %rdx)
+-	VZEROUPPER_RETURN
++#ifdef USE_XMM_LESS_VEC
++	movb	%sil, (%rdi)
++	movb	%sil, 1(%rdi)
++	movb	%sil, -1(%rdi, %rdx)
++#else
++	movw	%cx, (%LESS_VEC_REG)
++	movb	%sil, -1(%LESS_VEC_REG, %rdx)
++#endif
++	ret
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-79.patch b/glibc-RHEL-15696-79.patch
new file mode 100644
index 0000000..91e850f
--- /dev/null
+++ b/glibc-RHEL-15696-79.patch
@@ -0,0 +1,40 @@
+From 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 7 Feb 2022 00:32:23 -0600
+Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2
+ Only)
+Content-type: text/plain; charset=UTF-8
+
+commit b62ace2740a106222e124cc86956448fa07abf4d
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Sun Feb 6 00:54:18 2022 -0600
+
+    x86: Improve vec generation in memset-vec-unaligned-erms.S
+
+Revert usage of 'pshufb' in broadcast logic as it is an SSSE3
+instruction and memset.S is restricted to only SSE2 instructions.
+---
+ sysdeps/x86_64/memset.S | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 27debd2b..4cb4aa71 100644
+--- a/sysdeps/x86_64/memset.S
++++ b/sysdeps/x86_64/memset.S
+@@ -30,9 +30,10 @@
+ 
+ # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-  pxor %xmm1, %xmm1; \
+-  pshufb %xmm1, %xmm0; \
+-  movq r, %rax
++  movq r, %rax; \
++  punpcklbw %xmm0, %xmm0; \
++  punpcklwd %xmm0, %xmm0; \
++  pshufd $0, %xmm0, %xmm0
+ 
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-8.patch b/glibc-RHEL-15696-8.patch
new file mode 100644
index 0000000..5cf7633
--- /dev/null
+++ b/glibc-RHEL-15696-8.patch
@@ -0,0 +1,218 @@
+From 5165de69c0908e28a380cbd4bb054e55ea4abc95 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:36:36 -0800
+Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes strnlen/wcsnlen for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length.
+	Clear the upper 32 bits of RSI register.
+	* sysdeps/x86_64/strlen.S: Use RSI_LP for length.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen
+	and tst-size_t-wcsnlen.
+	* sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise.
+---
+ sysdeps/x86_64/multiarch/strlen-avx2.S  |  9 ++--
+ sysdeps/x86_64/strlen.S                 | 12 ++---
+ sysdeps/x86_64/x32/Makefile             |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++
+ 5 files changed, 106 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index fb2418cd..645e0446 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -42,12 +42,15 @@
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+ 	/* Check for zero length.  */
+-	testq	%rsi, %rsi
++	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
+ #  ifdef USE_AS_WCSLEN
+-	shl	$2, %rsi
++	shl	$2, %RSI_LP
++#  elif defined __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%esi, %esi
+ #  endif
+-	movq	%rsi, %r8
++	mov	%RSI_LP, %R8_LP
+ # endif
+ 	movl	%edi, %ecx
+ 	movq	%rdi, %rdx
+diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
+index 01cb5fa8..f845f3d4 100644
+--- a/sysdeps/x86_64/strlen.S
++++ b/sysdeps/x86_64/strlen.S
+@@ -59,21 +59,21 @@ ENTRY(strlen)
+ 
+ #ifdef AS_STRNLEN
+ /* Do not read anything when n==0.  */
+-	test	%rsi, %rsi
++	test	%RSI_LP, %RSI_LP
+ 	jne	L(n_nonzero)
+ 	xor	%rax, %rax
+ 	ret
+ L(n_nonzero):
+ # ifdef AS_WCSLEN
+-	shlq	$2, %rsi
++	shl	$2, %RSI_LP
+ # endif
+ 
+ /* Initialize long lived registers.  */
+ 
+-	add	%rdi, %rsi
+-	mov	%rsi, %r10
+-	and	$-64, %r10
+-	mov	%rsi, %r11
++	add	%RDI_LP, %RSI_LP
++	mov	%RSI_LP, %R10_LP
++	and	$-64, %R10_LP
++	mov	%RSI_LP, %R11_LP
+ #endif
+ 
+ 	pxor	%xmm0, %xmm0
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 2a9e20a9..1557724b 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -8,10 +8,10 @@ endif
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+-	 tst-size_t-strncmp tst-size_t-strncpy
++	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+ tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
+-	 tst-size_t-wcsncmp
++	 tst-size_t-wcsncmp tst-size_t-wcsnlen
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
+new file mode 100644
+index 00000000..690a4a8a
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
+@@ -0,0 +1,72 @@
++/* Test strnlen with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#ifdef WIDE
++# define TEST_NAME "wcsnlen"
++#else
++# define TEST_NAME "strnlen"
++#endif /* WIDE */
++
++#include "test-size_t.h"
++
++#ifdef WIDE
++# include <wchar.h>
++# define STRNLEN wcsnlen
++# define CHAR wchar_t
++#else
++# define STRNLEN strnlen
++# define CHAR char
++#endif /* WIDE */
++
++IMPL (STRNLEN, 1)
++
++typedef size_t (*proto_t) (const CHAR *, size_t);
++
++static size_t
++__attribute__ ((noinline, noclone))
++do_strnlen (parameter_t a, parameter_t b)
++{
++  return CALL (&a, a.p, b.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  size_t size = page_size / sizeof (CHAR);
++  parameter_t src = { { 0 }, buf2 };
++  parameter_t c = { { size }, (void *) (uintptr_t) 'a' };
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      size_t res = do_strnlen (src, c);
++      if (res != size)
++	{
++	  error (0, 0, "Wrong result in function %s: 0x%x != 0x%x",
++		 impl->name, res, size);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
+new file mode 100644
+index 00000000..093b4bbe
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
+@@ -0,0 +1,20 @@
++/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define WIDE 1
++#include "tst-size_t-strnlen.c"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-80.patch b/glibc-RHEL-15696-80.patch
new file mode 100644
index 0000000..53a3e7e
--- /dev/null
+++ b/glibc-RHEL-15696-80.patch
@@ -0,0 +1,753 @@
+From 3d9f171bfb5325bd5f427e9fc386453358c6e840 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 7 Feb 2022 05:55:15 -0800
+Subject: [PATCH] x86-64: Optimize bzero
+Content-type: text/plain; charset=UTF-8
+
+memset with zero as the value to set is by far the most common case (99%+
+for Python3 and GCC).
+
+bzero can be slightly more optimized for this case by using a zero-idiom
+xor for broadcasting the set value to a register (vector or GPR).
+
+Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/generic/ifunc-init.h                  |   5 +-
+ sysdeps/x86_64/memset.S                       |   8 +
+ sysdeps/x86_64/multiarch/Makefile             | 205 +++++++++++-------
+ sysdeps/x86_64/multiarch/bzero.c              | 106 +++++++++
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 ++++
+ .../memset-avx2-unaligned-erms-rtm.S          |   1 +
+ .../multiarch/memset-avx2-unaligned-erms.S    |   6 +
+ .../multiarch/memset-avx512-unaligned-erms.S  |   3 +
+ .../multiarch/memset-evex-unaligned-erms.S    |   3 +
+ .../multiarch/memset-sse2-unaligned-erms.S    |   1 +
+ .../multiarch/memset-vec-unaligned-erms.S     | 110 +++++++---
+ 11 files changed, 384 insertions(+), 106 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/bzero.c
+
+Conflicts:
+	sysdeps/generic/ifunc-init.h
+	(needs macros from cf4fd28ea453d1a9cec93939bc88b58ccef5437a (memcmpeq))
+	sysdeps/x86_64/multiarch/Makefile
+	(file ordering)
+
+diff --git a/sysdeps/generic/ifunc-init.h b/sysdeps/generic/ifunc-init.h
+index 241e4161..f7a72375 100644
+--- a/sysdeps/generic/ifunc-init.h
++++ b/sysdeps/generic/ifunc-init.h
+@@ -50,5 +50,8 @@
+    '__<symbol>_<variant>' as the optimized implementation and
+    '<symbol>_ifunc_selector' as the IFUNC selector.  */
+ #define REDIRECT_NAME	EVALUATOR1 (__redirect, SYMBOL_NAME)
+-#define OPTIMIZE(name)	EVALUATOR2 (SYMBOL_NAME, name)
++#define OPTIMIZE1(name)	EVALUATOR1 (SYMBOL_NAME, name)
++#define OPTIMIZE2(name)	EVALUATOR2 (SYMBOL_NAME, name)
++/* Default is to use OPTIMIZE2.  */
++#define OPTIMIZE(name)	OPTIMIZE2(name)
+ #define IFUNC_SELECTOR	EVALUATOR1 (SYMBOL_NAME, ifunc_selector)
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 4cb4aa71..a1353f89 100644
+--- a/sysdeps/x86_64/memset.S
++++ b/sysdeps/x86_64/memset.S
+@@ -35,6 +35,9 @@
+   punpcklwd %xmm0, %xmm0; \
+   pshufd $0, %xmm0, %xmm0
+ 
++# define BZERO_ZERO_VEC0() \
++  pxor %xmm0, %xmm0
++
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+   pshufd $0, %xmm0, %xmm0; \
+@@ -53,6 +56,10 @@
+ # define MEMSET_SYMBOL(p,s)	memset
+ #endif
+ 
++#ifndef BZERO_SYMBOL
++# define BZERO_SYMBOL(p,s)	__bzero
++#endif
++
+ #ifndef WMEMSET_SYMBOL
+ # define WMEMSET_CHK_SYMBOL(p,s) p
+ # define WMEMSET_SYMBOL(p,s)	__wmemset
+@@ -63,6 +70,7 @@
+ libc_hidden_builtin_def (memset)
+ 
+ #if IS_IN (libc)
++weak_alias (__bzero, bzero)
+ libc_hidden_def (__wmemset)
+ weak_alias (__wmemset, wmemset)
+ libc_hidden_weak (wmemset)
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 26be4095..37d8d6f0 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -1,85 +1,130 @@
+ ifeq ($(subdir),string)
+ 
+-sysdep_routines += strncat-c stpncpy-c strncpy-c \
+-		   strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3  \
+-		   strcmp-sse4_2 strcmp-avx2 \
+-		   strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \
+-		   memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
+-		   memrchr-sse2 memrchr-avx2 \
+-		   memcmp-sse2 \
+-		   memcmp-avx2-movbe \
+-		   memcmp-sse4 memcpy-ssse3 \
+-		   memmove-ssse3 \
+-		   memcpy-ssse3-back \
+-		   memmove-ssse3-back \
+-		   memmove-avx512-no-vzeroupper \
+-		   strcasecmp_l-sse2 strcasecmp_l-ssse3 \
+-		   strcasecmp_l-sse4_2 strcasecmp_l-avx \
+-		   strncase_l-sse2 strncase_l-ssse3 \
+-		   strncase_l-sse4_2 strncase_l-avx \
+-		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
+-		   strrchr-sse2 strrchr-avx2 \
+-		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+-		   strcat-avx2 strncat-avx2 \
+-		   strcat-ssse3 strncat-ssse3\
+-		   strcpy-avx2 strncpy-avx2 \
+-		   strcpy-sse2 stpcpy-sse2 \
+-		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
+-		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
+-		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+-		   stpcpy-avx2 stpncpy-avx2 \
+-		   strcat-sse2 \
+-		   strcat-sse2-unaligned strncat-sse2-unaligned \
+-		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
+-		   strcspn-sse2 strpbrk-sse2 strspn-sse2 \
+-		   strcspn-c strpbrk-c strspn-c varshift \
+-		   memset-avx512-no-vzeroupper \
+-		   memmove-sse2-unaligned-erms \
+-		   memmove-avx-unaligned-erms \
+-		   memmove-avx512-unaligned-erms \
+-		   memset-sse2-unaligned-erms \
+-		   memset-avx2-unaligned-erms \
+-		   memset-avx512-unaligned-erms \
+-		   memchr-avx2-rtm \
+-		   memcmp-avx2-movbe-rtm \
+-		   memmove-avx-unaligned-erms-rtm \
+-		   memrchr-avx2-rtm \
+-		   memset-avx2-unaligned-erms-rtm \
+-		   rawmemchr-avx2-rtm \
+-		   strchr-avx2-rtm \
+-		   strcmp-avx2-rtm \
+-		   strchrnul-avx2-rtm \
+-		   stpcpy-avx2-rtm \
+-		   stpncpy-avx2-rtm \
+-		   strcat-avx2-rtm \
+-		   strcpy-avx2-rtm \
+-		   strlen-avx2-rtm \
+-		   strncat-avx2-rtm \
+-		   strncmp-avx2-rtm \
+-		   strncpy-avx2-rtm \
+-		   strnlen-avx2-rtm \
+-		   strrchr-avx2-rtm \
+-		   memchr-evex \
+-		   memcmp-evex-movbe \
+-		   memmove-evex-unaligned-erms \
+-		   memrchr-evex \
+-		   memset-evex-unaligned-erms \
+-		   rawmemchr-evex \
+-		   stpcpy-evex \
+-		   stpncpy-evex \
+-		   strcat-evex \
+-		   strchr-evex \
+-		   strchrnul-evex \
+-		   strcmp-evex \
+-		   strcpy-evex \
+-		   strlen-evex \
+-		   strncat-evex \
+-		   strncmp-evex \
+-		   strncpy-evex \
+-		   strnlen-evex \
+-		   strrchr-evex \
+-		   memchr-evex-rtm \
+-		   rawmemchr-evex-rtm
++sysdep_routines += \
++  bzero \
++  memchr-avx2 \
++  memchr-avx2-rtm \
++  memchr-evex \
++  memchr-evex-rtm \
++  memchr-sse2 \
++  memcmp-avx2-movbe \
++  memcmp-avx2-movbe-rtm \
++  memcmp-evex-movbe \
++  memcmp-sse2 \
++  memcmp-sse4 \
++  memcmp-ssse3 \
++  memcpy-ssse3 \
++  memcpy-ssse3-back \
++  memmove-avx-unaligned-erms \
++  memmove-avx-unaligned-erms-rtm \
++  memmove-avx512-no-vzeroupper \
++  memmove-avx512-unaligned-erms \
++  memmove-evex-unaligned-erms \
++  memmove-sse2-unaligned-erms \
++  memmove-ssse3 \
++  memmove-ssse3-back \
++  memrchr-avx2 \
++  memrchr-avx2-rtm \
++  memrchr-evex \
++  memrchr-sse2 \
++  memset-avx2-unaligned-erms \
++  memset-avx2-unaligned-erms-rtm \
++  memset-avx512-no-vzeroupper \
++  memset-avx512-unaligned-erms \
++  memset-evex-unaligned-erms \
++  memset-sse2-unaligned-erms \
++  rawmemchr-avx2 \
++  rawmemchr-avx2-rtm \
++  rawmemchr-evex \
++  rawmemchr-evex-rtm \
++  rawmemchr-sse2 \
++  stpcpy-avx2 \
++  stpcpy-avx2-rtm \
++  stpcpy-evex \
++  stpcpy-sse2 \
++  stpcpy-sse2-unaligned \
++  stpcpy-ssse3 \
++  stpncpy-avx2 \
++  stpncpy-avx2-rtm \
++  stpncpy-c \
++  stpncpy-evex \
++  stpncpy-sse2-unaligned \
++  stpncpy-ssse3 \
++  strcasecmp_l-avx \
++  strcasecmp_l-sse2 \
++  strcasecmp_l-sse4_2 \
++  strcasecmp_l-ssse3 \
++  strcat-avx2 \
++  strcat-avx2-rtm \
++  strcat-evex \
++  strcat-sse2 \
++  strcat-sse2-unaligned \
++  strcat-ssse3 \
++  strchr-avx2 \
++  strchr-avx2-rtm \
++  strchr-evex \
++  strchr-sse2 \
++  strchr-sse2-no-bsf \
++  strchrnul-avx2 \
++  strchrnul-avx2-rtm \
++  strchrnul-evex \
++  strchrnul-sse2 \
++  strcmp-avx2 \
++  strcmp-avx2-rtm \
++  strcmp-evex \
++  strcmp-sse2 \
++  strcmp-sse2-unaligned \
++  strcmp-sse4_2 \
++  strcmp-ssse3 \
++  strcpy-avx2 \
++  strcpy-avx2-rtm \
++  strcpy-evex \
++  strcpy-sse2 \
++  strcpy-sse2-unaligned \
++  strcpy-ssse3 \
++  strcspn-c \
++  strcspn-sse2 \
++  strlen-avx2 \
++  strlen-avx2-rtm \
++  strlen-evex \
++  strlen-sse2 \
++  strncase_l-avx \
++  strncase_l-sse2 \
++  strncase_l-sse4_2 \
++  strncase_l-ssse3 \
++  strncat-avx2 \
++  strncat-avx2-rtm \
++  strncat-c \
++  strncat-evex \
++  strncat-sse2-unaligned \
++  strncat-ssse3 \
++  strncmp-avx2 \
++  strncmp-avx2-rtm \
++  strncmp-evex \
++  strncmp-sse2 \
++  strncmp-sse4_2 \
++  strncmp-ssse3 \
++  strncpy-avx2 \
++  strncpy-avx2-rtm \
++  strncpy-c \
++  strncpy-evex \
++  strncpy-sse2-unaligned \
++  strncpy-ssse3 \
++  strnlen-avx2 \
++  strnlen-avx2-rtm \
++  strnlen-evex \
++  strnlen-sse2 \
++  strpbrk-c \
++  strpbrk-sse2 \
++  strrchr-avx2 \
++  strrchr-avx2-rtm \
++  strrchr-evex \
++  strrchr-sse2 \
++  strspn-c \
++  strspn-sse2 \
++  strstr-sse2-unaligned \
++  varshift \
++# sysdep_routines
+ CFLAGS-varshift.c += -msse4
+ CFLAGS-strcspn-c.c += -msse4
+ CFLAGS-strpbrk-c.c += -msse4
+diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
+new file mode 100644
+index 00000000..58a14b2c
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/bzero.c
+@@ -0,0 +1,106 @@
++/* Multiple versions of bzero.
++   All versions must be listed in ifunc-impl-list.c.
++   Copyright (C) 2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++/* Define multiple versions only for the definition in libc.  */
++#if IS_IN (libc)
++# define __bzero __redirect___bzero
++# include <string.h>
++# undef __bzero
++
++# define SYMBOL_NAME __bzero
++# include <init-arch.h>
++
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
++  attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
++  attribute_hidden;
++
++static inline void *
++IFUNC_SELECTOR (void)
++{
++  const struct cpu_features* cpu_features = __get_cpu_features ();
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
++      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE1 (avx512_unaligned_erms);
++
++	  return OPTIMIZE1 (avx512_unaligned);
++	}
++    }
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
++          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE1 (evex_unaligned_erms);
++
++	  return OPTIMIZE1 (evex_unaligned);
++	}
++
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE1 (avx2_unaligned_erms_rtm);
++
++	  return OPTIMIZE1 (avx2_unaligned_rtm);
++	}
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++	{
++	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++	    return OPTIMIZE1 (avx2_unaligned_erms);
++
++	  return OPTIMIZE1 (avx2_unaligned);
++	}
++    }
++
++  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++    return OPTIMIZE1 (sse2_unaligned_erms);
++
++  return OPTIMIZE1 (sse2_unaligned);
++}
++
++libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
++
++weak_alias (__bzero, bzero)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 8be0d78a..c963d391 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_avx512_no_vzeroupper)
+ 	     )
+ 
++  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
++  IFUNC_IMPL (i, name, bzero,
++	      IFUNC_IMPL_ADD (array, i, bzero, 1,
++			      __bzero_sse2_unaligned)
++	      IFUNC_IMPL_ADD (array, i, bzero, 1,
++			      __bzero_sse2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      CPU_FEATURE_USABLE (AVX2),
++			      __bzero_avx2_unaligned)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      CPU_FEATURE_USABLE (AVX2),
++			      __bzero_avx2_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __bzero_avx2_unaligned_rtm)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __bzero_avx2_unaligned_erms_rtm)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __bzero_evex_unaligned)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __bzero_evex_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __bzero_avx512_unaligned_erms)
++	      IFUNC_IMPL_ADD (array, i, bzero,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)
++			       && CPU_FEATURE_USABLE (BMI2)),
++			      __bzero_avx512_unaligned)
++	     )
++
+   /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
+   IFUNC_IMPL (i, name, rawmemchr,
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+index 8ac3e479..5a5ee6f6 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+@@ -5,6 +5,7 @@
+ 
+ #define SECTION(p) p##.avx.rtm
+ #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
++#define BZERO_SYMBOL(p,s)	p##_avx2_##s##_rtm
+ #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
+ 
+ #include "memset-avx2-unaligned-erms.S"
+diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+index c0bf2875..a093a283 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+@@ -14,6 +14,9 @@
+   vmovd d, %xmm0; \
+   movq r, %rax;
+ 
++# define BZERO_ZERO_VEC0() \
++  vpxor %xmm0, %xmm0, %xmm0
++
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+ 
+@@ -29,6 +32,9 @@
+ # ifndef MEMSET_SYMBOL
+ #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+ # endif
++# ifndef BZERO_SYMBOL
++#  define BZERO_SYMBOL(p,s)	p##_avx2_##s
++# endif
+ # ifndef WMEMSET_SYMBOL
+ #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+ # endif
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 5241216a..727c9213 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -19,6 +19,9 @@
+   vpbroadcastb d, %VEC0; \
+   movq r, %rax
+ 
++# define BZERO_ZERO_VEC0() \
++  vpxorq %XMM0, %XMM0, %XMM0
++
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   vpbroadcastd d, %VEC0; \
+   movq r, %rax
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index 63700215..5d8fa78f 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -19,6 +19,9 @@
+   vpbroadcastb d, %VEC0; \
+   movq r, %rax
+ 
++# define BZERO_ZERO_VEC0() \
++  vpxorq %XMM0, %XMM0, %XMM0
++
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   vpbroadcastd d, %VEC0; \
+   movq r, %rax
+diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+index 56b81f5c..8f579ad6 100644
+--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+@@ -22,6 +22,7 @@
+ 
+ #if IS_IN (libc)
+ # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
++# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
+ # define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
+ 
+ # ifdef SHARED
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index a67f9833..06f5f5d7 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -26,6 +26,10 @@
+ 
+ #include <sysdep.h>
+ 
++#ifndef BZERO_SYMBOL
++# define BZERO_SYMBOL(p,s)		MEMSET_SYMBOL (p, s)
++#endif
++
+ #ifndef MEMSET_CHK_SYMBOL
+ # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
+ #endif
+@@ -87,6 +91,18 @@
+ # define XMM_SMALL	0
+ #endif
+ 
++#ifdef USE_LESS_VEC_MASK_STORE
++# define SET_REG64	rcx
++# define SET_REG32	ecx
++# define SET_REG16	cx
++# define SET_REG8	cl
++#else
++# define SET_REG64	rsi
++# define SET_REG32	esi
++# define SET_REG16	si
++# define SET_REG8	sil
++#endif
++
+ #define PAGE_SIZE 4096
+ 
+ /* Macro to calculate size of small memset block for aligning
+@@ -96,18 +112,6 @@
+ 
+ #ifndef SECTION
+ # error SECTION is not defined!
+-#endif
+-
+-	.section SECTION(.text),"ax",@progbits
+-#if VEC_SIZE == 16 && IS_IN (libc)
+-ENTRY (__bzero)
+-	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+-	mov	%RSI_LP, %RDX_LP /* Set n.  */
+-	xorl	%esi, %esi
+-	pxor	%XMM0, %XMM0
+-	jmp	L(entry_from_bzero)
+-END (__bzero)
+-weak_alias (__bzero, bzero)
+ #endif
+ 
+ #if IS_IN (libc)
+@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+ 	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ 	WMEMSET_VDUP_TO_VEC0_LOW()
+ 	cmpq	$VEC_SIZE, %rdx
+-	jb	L(less_vec_no_vdup)
++	jb	L(less_vec_from_wmemset)
+ 	WMEMSET_VDUP_TO_VEC0_HIGH()
+ 	jmp	L(entry_from_wmemset)
+ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+ #endif
+ 
++ENTRY (BZERO_SYMBOL(__bzero, unaligned))
++#if VEC_SIZE > 16
++	BZERO_ZERO_VEC0 ()
++#endif
++	mov	%RDI_LP, %RAX_LP
++	mov	%RSI_LP, %RDX_LP
++#ifndef USE_LESS_VEC_MASK_STORE
++	xorl	%esi, %esi
++#endif
++	cmp	$VEC_SIZE, %RDX_LP
++	jb	L(less_vec_no_vdup)
++#ifdef USE_LESS_VEC_MASK_STORE
++	xorl	%esi, %esi
++#endif
++#if VEC_SIZE <= 16
++	BZERO_ZERO_VEC0 ()
++#endif
++	cmp	$(VEC_SIZE * 2), %RDX_LP
++	ja	L(more_2x_vec)
++	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
++	VMOVU	%VEC(0), (%rdi)
++	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
++	VZEROUPPER_RETURN
++END (BZERO_SYMBOL(__bzero, unaligned))
++
+ #if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+ 	cmp	%RDX_LP, %RCX_LP
+@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+ 	/* Clear the upper 32 bits.  */
+ 	mov	%edx, %edx
+ # endif
+-L(entry_from_bzero):
+ 	cmpq	$VEC_SIZE, %rdx
+ 	jb	L(less_vec)
+ 	MEMSET_VDUP_TO_VEC0_HIGH()
+@@ -187,6 +215,31 @@ END (__memset_erms)
+ END (MEMSET_SYMBOL (__memset, erms))
+ # endif
+ 
++ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
++# if VEC_SIZE > 16
++	BZERO_ZERO_VEC0 ()
++# endif
++	mov	%RDI_LP, %RAX_LP
++	mov	%RSI_LP, %RDX_LP
++# ifndef USE_LESS_VEC_MASK_STORE
++	xorl	%esi, %esi
++# endif
++	cmp	$VEC_SIZE, %RDX_LP
++	jb	L(less_vec_no_vdup)
++# ifdef USE_LESS_VEC_MASK_STORE
++	xorl	%esi, %esi
++# endif
++# if VEC_SIZE <= 16
++	BZERO_ZERO_VEC0 ()
++# endif
++	cmp	$(VEC_SIZE * 2), %RDX_LP
++	ja	L(stosb_more_2x_vec)
++	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
++	VMOVU	%VEC(0), (%rdi)
++	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
++	VZEROUPPER_RETURN
++END (BZERO_SYMBOL(__bzero, unaligned_erms))
++
+ # if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ 	cmp	%RDX_LP, %RCX_LP
+@@ -229,6 +282,7 @@ L(last_2x_vec):
+ 	.p2align 4,, 10
+ L(less_vec):
+ L(less_vec_no_vdup):
++L(less_vec_from_wmemset):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+@@ -374,8 +428,11 @@ L(less_vec):
+ 	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+ 	   xmm). This is only does anything for AVX2.  */
+ 	MEMSET_VDUP_TO_VEC0_LOW ()
++L(less_vec_from_wmemset):
++#if VEC_SIZE > 16
+ L(less_vec_no_vdup):
+ #endif
++#endif
+ L(cross_page):
+ #if VEC_SIZE > 32
+ 	cmpl	$32, %edx
+@@ -386,7 +443,10 @@ L(cross_page):
+ 	jge	L(between_16_31)
+ #endif
+ #ifndef USE_XMM_LESS_VEC
+-	MOVQ	%XMM0, %rcx
++	MOVQ	%XMM0, %SET_REG64
++#endif
++#if VEC_SIZE <= 16
++L(less_vec_no_vdup):
+ #endif
+ 	cmpl	$8, %edx
+ 	jge	L(between_8_15)
+@@ -395,7 +455,7 @@ L(cross_page):
+ 	cmpl	$1, %edx
+ 	jg	L(between_2_3)
+ 	jl	L(between_0_0)
+-	movb	%sil, (%LESS_VEC_REG)
++	movb	%SET_REG8, (%LESS_VEC_REG)
+ L(between_0_0):
+ 	ret
+ 
+@@ -428,8 +488,8 @@ L(between_8_15):
+ 	MOVQ	%XMM0, (%rdi)
+ 	MOVQ	%XMM0, -8(%rdi, %rdx)
+ #else
+-	movq	%rcx, (%LESS_VEC_REG)
+-	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
++	movq	%SET_REG64, (%LESS_VEC_REG)
++	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
+ #endif
+ 	ret
+ 
+@@ -442,8 +502,8 @@ L(between_4_7):
+ 	MOVD	%XMM0, (%rdi)
+ 	MOVD	%XMM0, -4(%rdi, %rdx)
+ #else
+-	movl	%ecx, (%LESS_VEC_REG)
+-	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
++	movl	%SET_REG32, (%LESS_VEC_REG)
++	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
+ #endif
+ 	ret
+ 
+@@ -452,12 +512,12 @@ L(between_4_7):
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+ #ifdef USE_XMM_LESS_VEC
+-	movb	%sil, (%rdi)
+-	movb	%sil, 1(%rdi)
+-	movb	%sil, -1(%rdi, %rdx)
++	movb	%SET_REG8, (%rdi)
++	movb	%SET_REG8, 1(%rdi)
++	movb	%SET_REG8, -1(%rdi, %rdx)
+ #else
+-	movw	%cx, (%LESS_VEC_REG)
+-	movb	%sil, -1(%LESS_VEC_REG, %rdx)
++	movw	%SET_REG16, (%LESS_VEC_REG)
++	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
+ #endif
+ 	ret
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-81.patch b/glibc-RHEL-15696-81.patch
new file mode 100644
index 0000000..960a4cc
--- /dev/null
+++ b/glibc-RHEL-15696-81.patch
@@ -0,0 +1,33 @@
+From 7912236f4a597deb092650ca79f33504ddb4af28 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sat, 12 Feb 2022 00:45:00 -0600
+Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms
+Content-type: text/plain; charset=UTF-8
+
+commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Mon Feb 7 05:55:15 2022 -0800
+
+    x86-64: Optimize bzero
+
+That commit removed setting the .text section for the code. This
+commit adds that back.
+---
+ sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 06f5f5d7..4fb475c0 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -114,6 +114,7 @@
+ # error SECTION is not defined!
+ #endif
+ 
++	.section SECTION(.text), "ax", @progbits
+ #if IS_IN (libc)
+ # if defined SHARED
+ ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-82.patch b/glibc-RHEL-15696-82.patch
new file mode 100644
index 0000000..23ee46e
--- /dev/null
+++ b/glibc-RHEL-15696-82.patch
@@ -0,0 +1,90 @@
+From e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 15 Feb 2022 20:27:21 -0600
+Subject: [PATCH] x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895]
+Content-type: text/plain; charset=UTF-8
+
+Logic can read before the start of `s1` / `s2` if both `s1` and `s2`
+are near the start of a page. To avoid having the result contaminated by
+these comparisons the `strcmp` variants would mask off these
+comparisons. This was missing in the `strncmp` variants causing
+the bug. This commit adds the masking to `strncmp` so that out of
+range comparisons don't affect the result.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as
+well as a full xcheck on x86_64 linux.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ string/test-strncmp.c                  | 23 +++++++++++++++++++++++
+ sysdeps/x86_64/multiarch/strcmp-avx2.S |  1 +
+ sysdeps/x86_64/multiarch/strcmp-evex.S |  1 +
+ 3 files changed, 25 insertions(+)
+
+diff --git a/string/test-strncmp.c b/string/test-strncmp.c
+index 927a6daa..e61fffd9 100644
+--- a/string/test-strncmp.c
++++ b/string/test-strncmp.c
+@@ -403,6 +403,28 @@ check2 (void)
+   free (s2);
+ }
+ 
++static void
++check4 (void)
++{
++  /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of
++     the end of the page. 2) For there to be no mismatch/null byte before the
++     first page cross. 3) For length (`n`) to be large enough for one string to
++     cross the page. And 4) for there to be either mismatch/null bytes before
++     the start of the strings.  */
++
++  size_t size = 10;
++  size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1);
++  CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa));
++  CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed));
++  int exp_result;
++
++  STRCPY (s1, L ("tst-tlsmod%"));
++  STRCPY (s2, L ("tst-tls-manydynamic73mod"));
++  exp_result = SIMPLE_STRNCMP (s1, s2, size);
++  FOR_EACH_IMPL (impl, 0)
++  check_result (impl, s1, s2, size, exp_result);
++}
++
+ static void
+ check3 (void)
+ {
+@@ -445,6 +467,7 @@ test_main (void)
+   check1 ();
+   check2 ();
+   check3 ();
++  check4 ();
+ 
+   printf ("%23s", "");
+   FOR_EACH_IMPL (impl, 0)
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 04675aa4..179cc0e3 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -661,6 +661,7 @@ L(ret8):
+ # ifdef USE_AS_STRNCMP
+ 	.p2align 4,, 10
+ L(return_page_cross_end_check):
++	andl	%r10d, %ecx
+ 	tzcntl	%ecx, %ecx
+ 	leal	-VEC_SIZE(%rax, %rcx), %ecx
+ 	cmpl	%ecx, %edx
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index ed56af8e..0dfa62bd 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -689,6 +689,7 @@ L(ret8):
+ # ifdef USE_AS_STRNCMP
+ 	.p2align 4,, 10
+ L(return_page_cross_end_check):
++	andl	%r10d, %ecx
+ 	tzcntl	%ecx, %ecx
+ 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
+ #  ifdef USE_AS_WCSCMP
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-83.patch b/glibc-RHEL-15696-83.patch
new file mode 100644
index 0000000..e7475a8
--- /dev/null
+++ b/glibc-RHEL-15696-83.patch
@@ -0,0 +1,77 @@
+From 9fef7039a7d04947bc89296ee0d187bc8d89b772 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 24 Mar 2022 15:50:33 -0500
+Subject: [PATCH] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ
+ #28896]
+Content-type: text/plain; charset=UTF-8
+
+Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
+__wcscmp_avx2.
+
+commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Sun Jan 9 16:02:21 2022 -0600
+
+    x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
+
+That commit set the wrong fallback function for `__wcsncmp_avx2_rtm`:
+it fell back to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm`, which
+can cause spurious aborts.
+
+This change will need to be backported.
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/tst-strncmp-rtm.c          | 15 +++++++++++++++
+ sysdeps/x86_64/multiarch/strcmp-avx2.S |  2 +-
+ 2 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index aef9866c..ba6543be 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
++++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -70,6 +70,16 @@ function_overflow (void)
+     return 1;
+ }
+ 
++__attribute__ ((noinline, noclone))
++static int
++function_overflow2 (void)
++{
++  if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
++    return 0;
++  else
++    return 1;
++}
++
+ static int
+ do_test (void)
+ {
+@@ -77,5 +87,10 @@ do_test (void)
+   if (status != EXIT_SUCCESS)
+     return status;
+   status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
++  if (status != EXIT_SUCCESS)
++    return status;
++  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
++  if (status != EXIT_SUCCESS)
++    return status;
+   return status;
+ }
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 179cc0e3..782f9472 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -122,7 +122,7 @@ ENTRY(STRCMP)
+ 	   are cases where length is large enough that it can never be a
+ 	   bound on valid memory so just use wcscmp.  */
+ 	shrq	$56, %rcx
+-	jnz	__wcscmp_avx2
++	jnz	OVERFLOW_STRCMP
+ 
+ 	leaq	(, %rdx, 4), %rdx
+ #  endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-84.patch b/glibc-RHEL-15696-84.patch
new file mode 100644
index 0000000..e998eff
--- /dev/null
+++ b/glibc-RHEL-15696-84.patch
@@ -0,0 +1,27 @@
+From 1283948f236f209b7d3f44b69a42b96806fa6da0 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 5 Feb 2022 11:06:01 -0800
+Subject: [PATCH] x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ))
+Content-type: text/plain; charset=UTF-8
+
+---
+ sysdeps/x86/sysdep.h | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
+index a70bb3a2..49b0efe2 100644
+--- a/sysdeps/x86/sysdep.h
++++ b/sysdeps/x86/sysdep.h
+@@ -111,7 +111,8 @@ enum cf_protection_level
+ /* Local label name for asm code. */
+ #ifndef L
+ /* ELF-like local names start with `.L'.  */
+-# define L(name)	.L##name
++# define LOCAL_LABEL(name) .L##name
++# define L(name)	LOCAL_LABEL(name)
+ #endif
+ 
+ #define atom_text_section .section ".text.atom", "ax"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-85.patch b/glibc-RHEL-15696-85.patch
new file mode 100644
index 0000000..18f8a47
--- /dev/null
+++ b/glibc-RHEL-15696-85.patch
@@ -0,0 +1,108 @@
+From c328d0152d4b14cca58407ec68143894c8863004 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 5 Feb 2022 11:52:33 -0800
+Subject: [PATCH] x86_64/multiarch: Sort sysdep_routines and put one entry per
+ line
+Content-type: text/plain; charset=UTF-8
+
+Conflicts:
+	sysdeps/x86_64/multiarch/Makefile
+	(test order changed)
+
+---
+ sysdeps/x86_64/multiarch/Makefile | 78 +++++++++++++++++++------------
+ 1 file changed, 48 insertions(+), 30 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 37d8d6f0..8c9e7812 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -132,37 +132,55 @@ CFLAGS-strspn-c.c += -msse4
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+-		   wmemcmp-avx2-movbe \
+-		   wmemchr-sse2 wmemchr-avx2 \
+-		   wcscmp-sse2 wcscmp-avx2 \
+-		   wcsncmp-sse2 wcsncmp-avx2 \
+-		   wcscpy-ssse3 wcscpy-c \
+-		   wcschr-sse2 wcschr-avx2 \
+-		   wcsrchr-sse2 wcsrchr-avx2 \
+-		   wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
+-		   wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
+-		   wcschr-avx2-rtm \
+-		   wcscmp-avx2-rtm \
+-		   wcslen-avx2-rtm \
+-		   wcsncmp-avx2-rtm \
+-		   wcsnlen-avx2-rtm \
+-		   wcsrchr-avx2-rtm \
+-		   wmemchr-avx2-rtm \
+-		   wmemcmp-avx2-movbe-rtm \
+-		   wcschr-evex \
+-		   wcscmp-evex \
+-		   wcslen-evex \
+-		   wcsncmp-evex \
+-		   wcsnlen-evex \
+-		   wcsrchr-evex \
+-		   wmemchr-evex \
+-		   wmemcmp-evex-movbe \
+-		   wmemchr-evex-rtm
++sysdep_routines += \
++  wcschr-avx2 \
++  wcschr-avx2-rtm \
++  wcschr-evex \
++  wcschr-sse2 \
++  wcscmp-avx2 \
++  wcscmp-avx2-rtm \
++  wcscmp-evex \
++  wcscmp-sse2 \
++  wcscpy-c \
++  wcscpy-ssse3 \
++  wcslen-avx2 \
++  wcslen-avx2-rtm \
++  wcslen-evex \
++  wcslen-sse2 \
++  wcslen-sse4_1 \
++  wcsncmp-avx2 \
++  wcsncmp-avx2-rtm \
++  wcsncmp-evex \
++  wcsncmp-sse2 \
++  wcsnlen-avx2 \
++  wcsnlen-avx2-rtm \
++  wcsnlen-c \
++  wcsnlen-evex \
++  wcsnlen-sse4_1 \
++  wcsrchr-avx2 \
++  wcsrchr-avx2-rtm \
++  wcsrchr-evex \
++  wcsrchr-sse2 \
++  wmemchr-avx2 \
++  wmemchr-avx2-rtm \
++  wmemchr-evex \
++  wmemchr-evex-rtm \
++  wmemchr-sse2 \
++  wmemcmp-avx2-movbe \
++  wmemcmp-avx2-movbe-rtm \
++  wmemcmp-c \
++  wmemcmp-evex-movbe \
++  wmemcmp-sse4 \
++  wmemcmp-ssse3 \
++# sysdep_routines
+ endif
+ 
+ ifeq ($(subdir),debug)
+-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \
+-		   memmove_chk-nonshared memset_chk-nonshared \
+-		   wmemset_chk-nonshared
++sysdep_routines += \
++  memcpy_chk-nonshared \
++  memmove_chk-nonshared \
++  mempcpy_chk-nonshared \
++  memset_chk-nonshared \
++  wmemset_chk-nonshared \
++# sysdep_routines
+ endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-86.patch b/glibc-RHEL-15696-86.patch
new file mode 100644
index 0000000..d4fb42f
--- /dev/null
+++ b/glibc-RHEL-15696-86.patch
@@ -0,0 +1,36 @@
+From 0fb8800029d230b3711bf722b2a47db92d0e273f Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 10 Feb 2022 11:52:50 -0800
+Subject: [PATCH] x86-64: Remove bzero weak alias in SSE2 memset
+Content-type: text/plain; charset=UTF-8
+
+commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Mon Feb 7 05:55:15 2022 -0800
+
+    x86-64: Optimize bzero
+
+added the optimized bzero.  Remove bzero weak alias in SSE2 memset to
+avoid undefined __bzero in memset-sse2-unaligned-erms.
+---
+ sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+index 8f579ad6..af51362b 100644
+--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+@@ -31,9 +31,7 @@
+ # endif
+ 
+ # undef weak_alias
+-# define weak_alias(original, alias) \
+-	.weak bzero; bzero = __bzero
+-
++# define weak_alias(original, alias)
+ # undef strong_alias
+ # define strong_alias(ignored1, ignored2)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-87.patch b/glibc-RHEL-15696-87.patch
new file mode 100644
index 0000000..4882613
--- /dev/null
+++ b/glibc-RHEL-15696-87.patch
@@ -0,0 +1,29 @@
+From bf92893a14ebc161b08b28acc24fa06ae6be19cb Mon Sep 17 00:00:00 2001
+From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date: Thu, 10 Feb 2022 11:23:24 -0300
+Subject: [PATCH] x86_64: Remove bcopy optimizations
+Content-type: text/plain; charset=UTF-8
+
+The symbol is not present in the current POSIX specification and the
+compiler already generates a memmove call.
+---
+ sysdeps/x86_64/multiarch/bcopy.S | 7 -------
+ 1 file changed, 7 deletions(-)
+ delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S
+
+diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
+deleted file mode 100644
+index 639f02bd..00000000
+--- a/sysdeps/x86_64/multiarch/bcopy.S
++++ /dev/null
+@@ -1,7 +0,0 @@
+-#include <sysdep.h>
+-
+-	.text
+-ENTRY(bcopy)
+-	xchg	%rdi, %rsi
+-	jmp	__libc_memmove	/* Branch to IFUNC memmove.  */
+-END(bcopy)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-88.patch b/glibc-RHEL-15696-88.patch
new file mode 100644
index 0000000..d075f80
--- /dev/null
+++ b/glibc-RHEL-15696-88.patch
@@ -0,0 +1,372 @@
+From a6fbf4d51e9ba8063c4f8331564892ead9c67344 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:16 -0500
+Subject: [PATCH] x86: Code cleanup in strchr-avx2 and comment justifying
+ branch
+Content-type: text/plain; charset=UTF-8
+
+Small code cleanup for size: -53 bytes.
+
+Add comment justifying using a branch to do NULL/non-null return.
+
+All string/memory tests pass and no regressions in benchtests.
+
+geometric_mean(N=20) of all benchmarks Original / New: 1.00
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
+ 1 file changed, 107 insertions(+), 97 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index 5884726b..89dd2bf7 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -48,13 +48,13 @@
+ # define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+-ENTRY (STRCHR)
++ENTRY_P2ALIGN (STRCHR, 5)
+ 	/* Broadcast CHAR to YMM0.	*/
+ 	vmovd	%esi, %xmm0
+ 	movl	%edi, %eax
+ 	andl	$(PAGE_SIZE - 1), %eax
+ 	VPBROADCAST	%xmm0, %ymm0
+-	vpxor	%xmm9, %xmm9, %xmm9
++	vpxor	%xmm1, %xmm1, %xmm1
+ 
+ 	/* Check if we cross page boundary with one vector load.  */
+ 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+@@ -62,37 +62,29 @@ ENTRY (STRCHR)
+ 
+ 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
+ 	   null byte.  */
+-	vmovdqu	(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
++	vmovdqu	(%rdi), %ymm2
++	VPCMPEQ	%ymm2, %ymm0, %ymm3
++	VPCMPEQ	%ymm2, %ymm1, %ymm2
++	vpor	%ymm3, %ymm2, %ymm3
++	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+-	/* Found CHAR or the null byte.	 */
+-	cmp	(%rdi, %rax), %CHAR_REG
+-	jne	L(zero)
+-# endif
+-	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
+-
+-	/* .p2align 5 helps keep performance more consistent if ENTRY()
+-	   alignment % 32 was either 16 or 0. As well this makes the
+-	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+-	   easier.  */
+-	.p2align 5
+-L(first_vec_x4):
+-	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 3 + 1), %rdi
+-# ifndef USE_AS_STRCHRNUL
+-	/* Found CHAR or the null byte.	 */
++	/* Found CHAR or the null byte.  */
+ 	cmp	(%rdi, %rax), %CHAR_REG
++	/* NB: Use a branch instead of cmovcc here. The expectation is
++	   that with strchr the user will branch based on input being
++	   null. Since this branch will be 100% predictive of the user
++	   branch a branch miss here should save what otherwise would
++	   be branch miss in the user code. Otherwise using a branch 1)
++	   saves code size and 2) is faster in highly predictable
++	   environments.  */
+ 	jne	L(zero)
+ # endif
+ 	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
++L(return_vzeroupper):
++	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ # ifndef USE_AS_STRCHRNUL
+ L(zero):
+@@ -103,7 +95,8 @@ L(zero):
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+-	tzcntl	%eax, %eax
++	/* Use bsf to save code size.  */
++	bsfl	%eax, %eax
+ 	incq	%rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+@@ -113,9 +106,10 @@ L(first_vec_x1):
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
++	.p2align 4,, 10
+ L(first_vec_x2):
+-	tzcntl	%eax, %eax
++	/* Use bsf to save code size.  */
++	bsfl	%eax, %eax
+ 	addq	$(VEC_SIZE + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+@@ -125,9 +119,10 @@ L(first_vec_x2):
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
++	.p2align 4,, 8
+ L(first_vec_x3):
+-	tzcntl	%eax, %eax
++	/* Use bsf to save code size.  */
++	bsfl	%eax, %eax
+ 	addq	$(VEC_SIZE * 2 + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+@@ -137,6 +132,21 @@ L(first_vec_x3):
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
++	.p2align 4,, 10
++L(first_vec_x4):
++	/* Use bsf to save code size.  */
++	bsfl	%eax, %eax
++	addq	$(VEC_SIZE * 3 + 1), %rdi
++# ifndef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
++	cmp	(%rdi, %rax), %CHAR_REG
++	jne	L(zero)
++# endif
++	addq	%rdi, %rax
++	VZEROUPPER_RETURN
++
++
++
+ 	.p2align 4
+ L(aligned_more):
+ 	/* Align data to VEC_SIZE - 1. This is the same number of
+@@ -146,90 +156,92 @@ L(aligned_more):
+ L(cross_page_continue):
+ 	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	vmovdqa	1(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
++	vmovdqa	1(%rdi), %ymm2
++	VPCMPEQ	%ymm2, %ymm0, %ymm3
++	VPCMPEQ	%ymm2, %ymm1, %ymm2
++	vpor	%ymm3, %ymm2, %ymm3
++	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
++	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm2
++	VPCMPEQ	%ymm2, %ymm0, %ymm3
++	VPCMPEQ	%ymm2, %ymm1, %ymm2
++	vpor	%ymm3, %ymm2, %ymm3
++	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
++	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm2
++	VPCMPEQ	%ymm2, %ymm0, %ymm3
++	VPCMPEQ	%ymm2, %ymm1, %ymm2
++	vpor	%ymm3, %ymm2, %ymm3
++	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
++	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm2
++	VPCMPEQ	%ymm2, %ymm0, %ymm3
++	VPCMPEQ	%ymm2, %ymm1, %ymm2
++	vpor	%ymm3, %ymm2, %ymm3
++	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x4)
+-	/* Align data to VEC_SIZE * 4 - 1.	*/
+-	addq	$(VEC_SIZE * 4 + 1), %rdi
+-	andq	$-(VEC_SIZE * 4), %rdi
++	/* Align data to VEC_SIZE * 4 - 1.  */
++	incq	%rdi
++	orq	$(VEC_SIZE * 4 - 1), %rdi
+ 	.p2align 4
+ L(loop_4x_vec):
+ 	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa	(%rdi), %ymm5
+-	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
++	vmovdqa	1(%rdi), %ymm6
++	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm7
+ 
+ 	/* Leaves only CHARS matching esi as 0.	 */
+-	vpxor	%ymm5, %ymm0, %ymm1
+ 	vpxor	%ymm6, %ymm0, %ymm2
+ 	vpxor	%ymm7, %ymm0, %ymm3
+-	vpxor	%ymm8, %ymm0, %ymm4
+ 
+-	VPMINU	%ymm1, %ymm5, %ymm1
+ 	VPMINU	%ymm2, %ymm6, %ymm2
+ 	VPMINU	%ymm3, %ymm7, %ymm3
+-	VPMINU	%ymm4, %ymm8, %ymm4
+ 
+-	VPMINU	%ymm1, %ymm2, %ymm5
+-	VPMINU	%ymm3, %ymm4, %ymm6
++	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm6
++	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm7
++
++	vpxor	%ymm6, %ymm0, %ymm4
++	vpxor	%ymm7, %ymm0, %ymm5
++
++	VPMINU	%ymm4, %ymm6, %ymm4
++	VPMINU	%ymm5, %ymm7, %ymm5
+ 
+-	VPMINU	%ymm5, %ymm6, %ymm6
++	VPMINU	%ymm2, %ymm3, %ymm6
++	VPMINU	%ymm4, %ymm5, %ymm7
+ 
+-	VPCMPEQ	%ymm6, %ymm9, %ymm6
+-	vpmovmskb %ymm6, %ecx
++	VPMINU	%ymm6, %ymm7, %ymm7
++
++	VPCMPEQ	%ymm7, %ymm1, %ymm7
++	vpmovmskb %ymm7, %ecx
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	testl	%ecx, %ecx
+ 	jz	L(loop_4x_vec)
+ 
+-
+-	VPCMPEQ	%ymm1, %ymm9, %ymm1
+-	vpmovmskb %ymm1, %eax
++	VPCMPEQ	%ymm2, %ymm1, %ymm2
++	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x0)
+ 
+ 
+-	VPCMPEQ	%ymm5, %ymm9, %ymm2
+-	vpmovmskb %ymm2, %eax
++	VPCMPEQ	%ymm3, %ymm1, %ymm3
++	vpmovmskb %ymm3, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x1)
+ 
+-	VPCMPEQ	%ymm3, %ymm9, %ymm3
+-	vpmovmskb %ymm3, %eax
++	VPCMPEQ	%ymm4, %ymm1, %ymm4
++	vpmovmskb %ymm4, %eax
+ 	/* rcx has combined result from all 4 VEC. It will only be used
+ 	   if the first 3 other VEC all did not contain a match.  */
+ 	salq	$32, %rcx
+ 	orq	%rcx, %rax
+ 	tzcntq	%rax, %rax
+-	subq	$(VEC_SIZE * 2), %rdi
++	subq	$(VEC_SIZE * 2 - 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+ 	cmp	(%rdi, %rax), %CHAR_REG
+@@ -239,10 +251,11 @@ L(loop_4x_vec):
+ 	VZEROUPPER_RETURN
+ 
+ 
+-	.p2align 4
++	.p2align 4,, 10
+ L(last_vec_x0):
+-	tzcntl	%eax, %eax
+-	addq	$-(VEC_SIZE * 4), %rdi
++	/* Use bsf to save code size.  */
++	bsfl	%eax, %eax
++	addq	$-(VEC_SIZE * 4 - 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+ 	cmp	(%rdi, %rax), %CHAR_REG
+@@ -251,16 +264,11 @@ L(last_vec_x0):
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+-# ifndef USE_AS_STRCHRNUL
+-L(zero_end):
+-	xorl	%eax, %eax
+-	VZEROUPPER_RETURN
+-# endif
+ 
+-	.p2align 4
++	.p2align 4,, 10
+ L(last_vec_x1):
+ 	tzcntl	%eax, %eax
+-	subq	$(VEC_SIZE * 3), %rdi
++	subq	$(VEC_SIZE * 3 - 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+ 	cmp	(%rdi, %rax), %CHAR_REG
+@@ -269,18 +277,23 @@ L(last_vec_x1):
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
++# ifndef USE_AS_STRCHRNUL
++L(zero_end):
++	xorl	%eax, %eax
++	VZEROUPPER_RETURN
++# endif
+ 
+ 	/* Cold case for crossing page with first load.	 */
+-	.p2align 4
++	.p2align 4,, 8
+ L(cross_page_boundary):
+ 	movq	%rdi, %rdx
+ 	/* Align rdi to VEC_SIZE - 1.  */
+ 	orq	$(VEC_SIZE - 1), %rdi
+-	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
+-	VPCMPEQ	%ymm8, %ymm0, %ymm1
+-	VPCMPEQ	%ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
++	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm2
++	VPCMPEQ	%ymm2, %ymm0, %ymm3
++	VPCMPEQ	%ymm2, %ymm1, %ymm2
++	vpor	%ymm3, %ymm2, %ymm3
++	vpmovmskb %ymm3, %eax
+ 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ 	   so no need to manually mod edx.  */
+ 	sarxl	%edx, %eax, %eax
+@@ -291,13 +304,10 @@ L(cross_page_boundary):
+ 	xorl	%ecx, %ecx
+ 	/* Found CHAR or the null byte.	 */
+ 	cmp	(%rdx, %rax), %CHAR_REG
+-	leaq	(%rdx, %rax), %rax
+-	cmovne	%rcx, %rax
+-# else
+-	addq	%rdx, %rax
++	jne	L(zero_end)
+ # endif
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
++	addq	%rdx, %rax
++	VZEROUPPER_RETURN
+ 
+ END (STRCHR)
+-# endif
++#endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-89.patch b/glibc-RHEL-15696-89.patch
new file mode 100644
index 0000000..45ee946
--- /dev/null
+++ b/glibc-RHEL-15696-89.patch
@@ -0,0 +1,343 @@
+From ec285ea90415458225623ddc0492ae3f705af043 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:18 -0500
+Subject: [PATCH] x86: Code cleanup in strchr-evex and comment justifying
+ branch
+Content-type: text/plain; charset=UTF-8
+
+Small code cleanup for size: -81 bytes.
+
+Add comment justifying using a branch to do NULL/non-null return.
+
+All string/memory tests pass and no regressions in benchtests.
+
+geometric_mean(N=20) of all benchmarks New / Original: .985
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
+ 1 file changed, 80 insertions(+), 66 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
+index 7f9d4ee4..0b49e0ac 100644
+--- a/sysdeps/x86_64/multiarch/strchr-evex.S
++++ b/sysdeps/x86_64/multiarch/strchr-evex.S
+@@ -30,6 +30,7 @@
+ # ifdef USE_AS_WCSCHR
+ #  define VPBROADCAST	vpbroadcastd
+ #  define VPCMP		vpcmpd
++#  define VPTESTN	vptestnmd
+ #  define VPMINU	vpminud
+ #  define CHAR_REG	esi
+ #  define SHIFT_REG	ecx
+@@ -37,6 +38,7 @@
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMP		vpcmpb
++#  define VPTESTN	vptestnmb
+ #  define VPMINU	vpminub
+ #  define CHAR_REG	sil
+ #  define SHIFT_REG	edx
+@@ -61,13 +63,11 @@
+ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section .text.evex,"ax",@progbits
+-ENTRY (STRCHR)
++ENTRY_P2ALIGN (STRCHR, 5)
+ 	/* Broadcast CHAR to YMM0.	*/
+ 	VPBROADCAST	%esi, %YMM0
+ 	movl	%edi, %eax
+ 	andl	$(PAGE_SIZE - 1), %eax
+-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+-
+ 	/* Check if we cross page boundary with one vector load.
+ 	   Otherwise it is safe to use an unaligned load.  */
+ 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+@@ -81,49 +81,35 @@ ENTRY (STRCHR)
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
++# ifndef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.  */
++	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	/* NB: Use a branch instead of cmovcc here. The expectation is
++	   that with strchr the user will branch based on input being
++	   null. Since this branch will be 100% predictive of the user
++	   branch a branch miss here should save what otherwise would
++	   be branch miss in the user code. Otherwise using a branch 1)
++	   saves code size and 2) is faster in highly predictable
++	   environments.  */
++	jne	L(zero)
++# endif
+ # ifdef USE_AS_WCSCHR
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+ 	 */
+ 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+ 	addq	%rdi, %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	/* Found CHAR or the null byte.	 */
+-	cmp	(%rax), %CHAR_REG
+-	jne	L(zero)
+ # endif
+ 	ret
+ 
+-	/* .p2align 5 helps keep performance more consistent if ENTRY()
+-	   alignment % 32 was either 16 or 0. As well this makes the
+-	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+-	   easier.  */
+-	.p2align 5
+-L(first_vec_x3):
+-	tzcntl	%eax, %eax
+-# ifndef USE_AS_STRCHRNUL
+-	/* Found CHAR or the null byte.	 */
+-	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+-	jne	L(zero)
+-# endif
+-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+-	   bytes.  */
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+-	ret
+ 
+-# ifndef USE_AS_STRCHRNUL
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+ 
+-	.p2align 4
++	.p2align 4,, 10
+ L(first_vec_x4):
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Check to see if first match was CHAR (k0) or null (k1).  */
+@@ -144,9 +130,18 @@ L(first_vec_x4):
+ 	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
++# ifndef USE_AS_STRCHRNUL
++L(zero):
++	xorl	%eax, %eax
++	ret
++# endif
++
++
+ 	.p2align 4
+ L(first_vec_x1):
+-	tzcntl	%eax, %eax
++	/* Use bsf here to save 1-byte keeping keeping the block in 1x
++	   fetch block. eax guranteed non-zero.  */
++	bsfl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+ 	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+@@ -158,7 +153,7 @@ L(first_vec_x1):
+ 	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+-	.p2align 4
++	.p2align 4,, 10
+ L(first_vec_x2):
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Check to see if first match was CHAR (k0) or null (k1).  */
+@@ -179,6 +174,21 @@ L(first_vec_x2):
+ 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
++	.p2align 4,, 10
++L(first_vec_x3):
++	/* Use bsf here to save 1-byte keeping keeping the block in 1x
++	   fetch block. eax guranteed non-zero.  */
++	bsfl	%eax, %eax
++# ifndef USE_AS_STRCHRNUL
++	/* Found CHAR or the null byte.	 */
++	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	jne	L(zero)
++# endif
++	/* NB: Multiply sizeof char type (1 or 4) to get the number of
++	   bytes.  */
++	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
++	ret
++
+ 	.p2align 4
+ L(aligned_more):
+ 	/* Align data to VEC_SIZE.  */
+@@ -195,7 +205,7 @@ L(cross_page_continue):
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+@@ -206,7 +216,7 @@ L(cross_page_continue):
+ 	/* Each bit in K0 represents a CHAR in YMM1.  */
+ 	VPCMP	$0, %YMM1, %YMM0, %k0
+ 	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMM1, %YMMZERO, %k1
++	VPTESTN	%YMM1, %YMM1, %k1
+ 	kortestd	%k0, %k1
+ 	jnz	L(first_vec_x2)
+ 
+@@ -215,7 +225,7 @@ L(cross_page_continue):
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+@@ -224,7 +234,7 @@ L(cross_page_continue):
+ 	/* Each bit in K0 represents a CHAR in YMM1.  */
+ 	VPCMP	$0, %YMM1, %YMM0, %k0
+ 	/* Each bit in K1 represents a CHAR in YMM1.  */
+-	VPCMP	$0, %YMM1, %YMMZERO, %k1
++	VPTESTN	%YMM1, %YMM1, %k1
+ 	kortestd	%k0, %k1
+ 	jnz	L(first_vec_x4)
+ 
+@@ -265,33 +275,33 @@ L(loop_4x_vec):
+ 	VPMINU	%YMM3, %YMM4, %YMM4
+ 	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
+ 
+-	VPCMP	$0, %YMMZERO, %YMM4, %k1
++	VPTESTN	%YMM4, %YMM4, %k1
+ 	kmovd	%k1, %ecx
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	testl	%ecx, %ecx
+ 	jz	L(loop_4x_vec)
+ 
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
++	VPTESTN	%YMM1, %YMM1, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x1)
+ 
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x2)
+ 
+-	VPCMP	$0, %YMMZERO, %YMM3, %k0
++	VPTESTN	%YMM3, %YMM3, %k0
+ 	kmovd	%k0, %eax
+ 	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
+ # ifdef USE_AS_WCSCHR
+ 	sall	$8, %ecx
+ 	orl	%ecx, %eax
+-	tzcntl	%eax, %eax
++	bsfl	%eax, %eax
+ # else
+ 	salq	$32, %rcx
+ 	orq	%rcx, %rax
+-	tzcntq	%rax, %rax
++	bsfq	%rax, %rax
+ # endif
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Check if match was CHAR or null.  */
+@@ -303,28 +313,28 @@ L(loop_4x_vec):
+ 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+-# ifndef USE_AS_STRCHRNUL
+-L(zero_end):
+-	xorl	%eax, %eax
+-	ret
++	.p2align 4,, 8
++L(last_vec_x1):
++	bsfl	%eax, %eax
++# ifdef USE_AS_WCSCHR
++	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
++	   */
++	leaq	(%rdi, %rax, CHAR_SIZE), %rax
++# else
++	addq	%rdi, %rax
+ # endif
+ 
+-	.p2align 4
+-L(last_vec_x1):
+-	tzcntl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Check if match was null.  */
+-	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
++	cmp	(%rax), %CHAR_REG
+ 	jne	L(zero_end)
+ # endif
+-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+-	   bytes.  */
+-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
++
+ 	ret
+ 
+-	.p2align 4
++	.p2align 4,, 8
+ L(last_vec_x2):
+-	tzcntl	%eax, %eax
++	bsfl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+ 	/* Check if match was null.  */
+ 	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+@@ -336,7 +346,7 @@ L(last_vec_x2):
+ 	ret
+ 
+ 	/* Cold case for crossing page with first load.	 */
+-	.p2align 4
++	.p2align 4,, 8
+ L(cross_page_boundary):
+ 	movq	%rdi, %rdx
+ 	/* Align rdi.  */
+@@ -346,9 +356,9 @@ L(cross_page_boundary):
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
++	VPTESTN	%YMM2, %YMM2, %k0
+ 	kmovd	%k0, %eax
+-	/* Remove the leading bits.	 */
++	/* Remove the leading bits.  */
+ # ifdef USE_AS_WCSCHR
+ 	movl	%edx, %SHIFT_REG
+ 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+@@ -360,20 +370,24 @@ L(cross_page_boundary):
+ 	/* If eax is zero continue.  */
+ 	testl	%eax, %eax
+ 	jz	L(cross_page_continue)
+-	tzcntl	%eax, %eax
+-# ifndef USE_AS_STRCHRNUL
+-	/* Check to see if match was CHAR or null.  */
+-	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
+-	jne	L(zero_end)
+-# endif
++	bsfl	%eax, %eax
++
+ # ifdef USE_AS_WCSCHR
+ 	/* NB: Multiply wchar_t count by 4 to get the number of
+ 	   bytes.  */
+ 	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+ # else
+ 	addq	%rdx, %rax
++# endif
++# ifndef USE_AS_STRCHRNUL
++	/* Check to see if match was CHAR or null.  */
++	cmp	(%rax), %CHAR_REG
++	je	L(cross_page_ret)
++L(zero_end):
++	xorl	%eax, %eax
++L(cross_page_ret):
+ # endif
+ 	ret
+ 
+ END (STRCHR)
+-# endif
++#endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-9.patch b/glibc-RHEL-15696-9.patch
new file mode 100644
index 0000000..5aa3e7b
--- /dev/null
+++ b/glibc-RHEL-15696-9.patch
@@ -0,0 +1,206 @@
+From 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 4 Feb 2019 06:31:01 -0800
+Subject: [PATCH] x86-64 memcmp: Use unsigned Jcc instructions on size [BZ
+ #24155]
+Content-type: text/plain; charset=UTF-8
+
+Since the size argument is unsigned, we should use unsigned Jcc
+instructions, instead of signed, to check size.
+
+Tested on x86-64 and x32, with and without --disable-multi-arch.
+
+	[BZ #24155]
+	CVE-2019-7309
+	* NEWS: Updated for CVE-2019-7309.
+	* sysdeps/x86_64/memcmp.S: Use RDX_LP for size.  Clear the
+	upper 32 bits of RDX register for x32.  Use unsigned Jcc
+	instructions, instead of signed.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2.
+	* sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test.
+---
+ sysdeps/x86_64/memcmp.S                  | 20 +++---
+ sysdeps/x86_64/x32/Makefile              |  3 +-
+ sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++
+ 3 files changed, 93 insertions(+), 9 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+	NEWS
+	(removed)
+
+diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
+index bcb4a2e8..45918d37 100644
+--- a/sysdeps/x86_64/memcmp.S
++++ b/sysdeps/x86_64/memcmp.S
+@@ -21,14 +21,18 @@
+ 
+ 	.text
+ ENTRY (memcmp)
+-	test	%rdx, %rdx
++#ifdef __ILP32__
++	/* Clear the upper 32 bits.  */
++	movl	%edx, %edx
++#endif
++	test	%RDX_LP, %RDX_LP
+ 	jz	L(finz)
+ 	cmpq	$1, %rdx
+-	jle	L(finr1b)
++	jbe	L(finr1b)
+ 	subq	%rdi, %rsi
+ 	movq	%rdx, %r10
+ 	cmpq	$32, %r10
+-	jge	L(gt32)
++	jae	L(gt32)
+ 	/* Handle small chunks and last block of less than 32 bytes.  */
+ L(small):
+ 	testq	$1, %r10
+@@ -156,7 +160,7 @@ L(A32):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
++        jae	L(mt16)
+ 	/* Pre-unroll to be ready for unrolled 64B loop.  */
+ 	testq	$32, %rdi
+ 	jz	L(A64)
+@@ -178,7 +182,7 @@ L(A64):
+ 	movq	%r11, %r10
+ 	andq	$-64, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt32)
++        jae	L(mt32)
+ 
+ L(A64main):
+ 	movdqu    (%rdi,%rsi), %xmm0
+@@ -216,7 +220,7 @@ L(mt32):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
++        jae	L(mt16)
+ 
+ L(A32main):
+ 	movdqu    (%rdi,%rsi), %xmm0
+@@ -254,7 +258,7 @@ L(ATR):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
++        jae	L(mt16)
+ 	testq	$16, %rdi
+ 	jz	L(ATR32)
+ 
+@@ -325,7 +329,7 @@ L(ATR64main):
+ 	movq	%r11, %r10
+ 	andq	$-32, %r10
+ 	cmpq	%r10, %rdi
+-        jge	L(mt16)
++        jae	L(mt16)
+ 
+ L(ATR32res):
+ 	movdqa    (%rdi,%rsi), %xmm0
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 1557724b..87489565 100644
+--- a/sysdeps/x86_64/x32/Makefile
++++ b/sysdeps/x86_64/x32/Makefile
+@@ -8,7 +8,8 @@ endif
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+-	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
++	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \
++	 tst-size_t-memcmp-2
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
+new file mode 100644
+index 00000000..d8ae1a08
+--- /dev/null
++++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
+@@ -0,0 +1,79 @@
++/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
++   Copyright (C) 2019 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <http://www.gnu.org/licenses/>.  */
++
++#define TEST_MAIN
++#ifdef WIDE
++# define TEST_NAME "wmemcmp"
++#else
++# define TEST_NAME "memcmp"
++#endif
++
++#include "test-size_t.h"
++
++#ifdef WIDE
++# include <inttypes.h>
++# include <wchar.h>
++
++# define MEMCMP wmemcmp
++# define CHAR wchar_t
++#else
++# define MEMCMP memcmp
++# define CHAR char
++#endif
++
++IMPL (MEMCMP, 1)
++
++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
++
++static int
++__attribute__ ((noinline, noclone))
++do_memcmp (parameter_t a, parameter_t b)
++{
++  return CALL (&b, a.p, b.p, a.len);
++}
++
++static int
++test_main (void)
++{
++  test_init ();
++
++  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
++  parameter_t src = { { 0 }, buf2 };
++
++  memcpy (buf1, buf2, page_size);
++
++  CHAR *p = (CHAR *) buf1;
++  p[page_size / sizeof (CHAR) - 1] = (CHAR) 1;
++
++  int ret = 0;
++  FOR_EACH_IMPL (impl, 0)
++    {
++      src.fn = impl->fn;
++      int res = do_memcmp (dest, src);
++      if (res >= 0)
++	{
++	  error (0, 0, "Wrong result in function %s: %i >= 0",
++		 impl->name, res);
++	  ret = 1;
++	}
++    }
++
++  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
++}
++
++#include <support/test-driver.c>
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-90.patch b/glibc-RHEL-15696-90.patch
new file mode 100644
index 0000000..11835aa
--- /dev/null
+++ b/glibc-RHEL-15696-90.patch
@@ -0,0 +1,147 @@
+From 30d627d477d7255345a4b713cf352ac32d644d61 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:22 -0500
+Subject: [PATCH] x86: Optimize strcspn and strpbrk in strcspn-c.c
+Content-type: text/plain; charset=UTF-8
+
+Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
+_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
+sign extensions.
+
+geometric_mean(N=20) of all benchmarks that don't fall back on
+sse2/strlen; New / Original: .928
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
+ 1 file changed, 37 insertions(+), 46 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
+index 857af104..6cce4296 100644
+--- a/sysdeps/x86_64/multiarch/strcspn-c.c
++++ b/sysdeps/x86_64/multiarch/strcspn-c.c
+@@ -85,83 +85,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
+     RETURN (NULL, strlen (s));
+ 
+   const char *aligned;
+-  __m128i mask;
+-  int offset = (int) ((size_t) a & 15);
++  __m128i mask, maskz, zero;
++  unsigned int maskz_bits;
++  unsigned int offset = (unsigned int) ((size_t) a & 15);
++  zero = _mm_set1_epi8 (0);
+   if (offset != 0)
+     {
+       /* Load masks.  */
+       aligned = (const char *) ((size_t) a & -16L);
+       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+-
+-      mask = __m128i_shift_right (mask0, offset);
++      maskz = _mm_cmpeq_epi8 (mask0, zero);
+ 
+       /* Find where the NULL terminator is.  */
+-      int length = _mm_cmpistri (mask, mask, 0x3a);
+-      if (length == 16 - offset)
+-	{
+-	  /* There is no NULL terminator.  */
+-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
+-	  length += index;
+-
+-	  /* Don't use SSE4.2 if the length of A > 16.  */
+-	  if (length > 16)
+-	    return STRCSPN_SSE2 (s, a);
+-
+-	  if (index != 0)
+-	    {
+-	      /* Combine mask0 and mask1.  We could play games with
+-		 palignr, but frankly this data should be in L1 now
+-		 so do the merge via an unaligned load.  */
+-	      mask = _mm_loadu_si128 ((__m128i *) a);
+-	    }
+-	}
++      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
++      if (maskz_bits != 0)
++        {
++          mask = __m128i_shift_right (mask0, offset);
++          offset = (unsigned int) ((size_t) s & 15);
++          if (offset)
++            goto start_unaligned;
++
++          aligned = s;
++          goto start_loop;
++        }
+     }
+-  else
+-    {
+-      /* A is aligned.  */
+-      mask = _mm_load_si128 ((__m128i *) a);
+ 
+-      /* Find where the NULL terminator is.  */
+-      int length = _mm_cmpistri (mask, mask, 0x3a);
+-      if (length == 16)
+-	{
+-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
+-	     of A > 16.  */
+-	  if (a[16] != 0)
+-	    return STRCSPN_SSE2 (s, a);
+-	}
++  /* A is aligned.  */
++  mask = _mm_loadu_si128 ((__m128i *) a);
++  /* Find where the NULL terminator is.  */
++  maskz = _mm_cmpeq_epi8 (mask, zero);
++  maskz_bits = _mm_movemask_epi8 (maskz);
++  if (maskz_bits == 0)
++    {
++      /* There is no NULL terminator.  Don't use SSE4.2 if the length
++         of A > 16.  */
++      if (a[16] != 0)
++        return STRCSPN_SSE2 (s, a);
+     }
+ 
+-  offset = (int) ((size_t) s & 15);
++  aligned = s;
++  offset = (unsigned int) ((size_t) s & 15);
+   if (offset != 0)
+     {
++    start_unaligned:
+       /* Check partial string.  */
+       aligned = (const char *) ((size_t) s & -16L);
+       __m128i value = _mm_load_si128 ((__m128i *) aligned);
+ 
+       value = __m128i_shift_right (value, offset);
+ 
+-      int length = _mm_cmpistri (mask, value, 0x2);
++      unsigned int length = _mm_cmpistri (mask, value, 0x2);
+       /* No need to check ZFlag since ZFlag is always 1.  */
+-      int cflag = _mm_cmpistrc (mask, value, 0x2);
++      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
+       if (cflag)
+ 	RETURN ((char *) (s + length), length);
+       /* Find where the NULL terminator is.  */
+-      int index = _mm_cmpistri (value, value, 0x3a);
++      unsigned int index = _mm_cmpistri (value, value, 0x3a);
+       if (index < 16 - offset)
+ 	RETURN (NULL, index);
+       aligned += 16;
+     }
+-  else
+-    aligned = s;
+ 
++start_loop:
+   while (1)
+     {
+       __m128i value = _mm_load_si128 ((__m128i *) aligned);
+-      int index = _mm_cmpistri (mask, value, 0x2);
+-      int cflag = _mm_cmpistrc (mask, value, 0x2);
+-      int zflag = _mm_cmpistrz (mask, value, 0x2);
++      unsigned int index = _mm_cmpistri (mask, value, 0x2);
++      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
++      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
+       if (cflag)
+ 	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
+       if (zflag)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-91.patch b/glibc-RHEL-15696-91.patch
new file mode 100644
index 0000000..de3c8ec
--- /dev/null
+++ b/glibc-RHEL-15696-91.patch
@@ -0,0 +1,147 @@
+From 412d10343168b05b8cf6c3683457cf9711d28046 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:24 -0500
+Subject: [PATCH] x86: Optimize strspn in strspn-c.c
+Content-type: text/plain; charset=UTF-8
+
+Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
+_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
+sign extensions.
+
+geometric_mean(N=20) of all benchmarks that don't fall back on
+sse2; New / Original: .901
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
+ 1 file changed, 39 insertions(+), 47 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
+index 4554cff0..87c5e4bf 100644
+--- a/sysdeps/x86_64/multiarch/strspn-c.c
++++ b/sysdeps/x86_64/multiarch/strspn-c.c
+@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a)
+     return 0;
+ 
+   const char *aligned;
+-  __m128i mask;
+-  int offset = (int) ((size_t) a & 15);
++  __m128i mask, maskz, zero;
++  unsigned int maskz_bits;
++  unsigned int offset = (int) ((size_t) a & 15);
++  zero = _mm_set1_epi8 (0);
+   if (offset != 0)
+     {
+       /* Load masks.  */
+       aligned = (const char *) ((size_t) a & -16L);
+       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+-
+-      mask = __m128i_shift_right (mask0, offset);
++      maskz = _mm_cmpeq_epi8 (mask0, zero);
+ 
+       /* Find where the NULL terminator is.  */
+-      int length = _mm_cmpistri (mask, mask, 0x3a);
+-      if (length == 16 - offset)
+-	{
+-	  /* There is no NULL terminator.  */
+-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
+-	  length += index;
+-
+-	  /* Don't use SSE4.2 if the length of A > 16.  */
+-	  if (length > 16)
+-	    return __strspn_sse2 (s, a);
+-
+-	  if (index != 0)
+-	    {
+-	      /* Combine mask0 and mask1.  We could play games with
+-		 palignr, but frankly this data should be in L1 now
+-		 so do the merge via an unaligned load.  */
+-	      mask = _mm_loadu_si128 ((__m128i *) a);
+-	    }
+-	}
++      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
++      if (maskz_bits != 0)
++        {
++          mask = __m128i_shift_right (mask0, offset);
++          offset = (unsigned int) ((size_t) s & 15);
++          if (offset)
++            goto start_unaligned;
++
++          aligned = s;
++          goto start_loop;
++        }
+     }
+-  else
+-    {
+-      /* A is aligned.  */
+-      mask = _mm_load_si128 ((__m128i *) a);
+ 
+-      /* Find where the NULL terminator is.  */
+-      int length = _mm_cmpistri (mask, mask, 0x3a);
+-      if (length == 16)
+-	{
+-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
+-	     of A > 16.  */
+-	  if (a[16] != 0)
+-	    return __strspn_sse2 (s, a);
+-	}
++  /* A is aligned.  */
++  mask = _mm_loadu_si128 ((__m128i *) a);
++
++  /* Find where the NULL terminator is.  */
++  maskz = _mm_cmpeq_epi8 (mask, zero);
++  maskz_bits = _mm_movemask_epi8 (maskz);
++  if (maskz_bits == 0)
++    {
++      /* There is no NULL terminator.  Don't use SSE4.2 if the length
++         of A > 16.  */
++      if (a[16] != 0)
++        return __strspn_sse2 (s, a);
+     }
++  aligned = s;
++  offset = (unsigned int) ((size_t) s & 15);
+ 
+-  offset = (int) ((size_t) s & 15);
+   if (offset != 0)
+     {
++    start_unaligned:
+       /* Check partial string.  */
+       aligned = (const char *) ((size_t) s & -16L);
+       __m128i value = _mm_load_si128 ((__m128i *) aligned);
++      __m128i adj_value = __m128i_shift_right (value, offset);
+ 
+-      value = __m128i_shift_right (value, offset);
+-
+-      int length = _mm_cmpistri (mask, value, 0x12);
++      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
+       /* No need to check CFlag since it is always 1.  */
+       if (length < 16 - offset)
+ 	return length;
+       /* Find where the NULL terminator is.  */
+-      int index = _mm_cmpistri (value, value, 0x3a);
+-      if (index < 16 - offset)
++      maskz = _mm_cmpeq_epi8 (value, zero);
++      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
++      if (maskz_bits != 0)
+ 	return length;
+       aligned += 16;
+     }
+-  else
+-    aligned = s;
+ 
++start_loop:
+   while (1)
+     {
+       __m128i value = _mm_load_si128 ((__m128i *) aligned);
+-      int index = _mm_cmpistri (mask, value, 0x12);
+-      int cflag = _mm_cmpistrc (mask, value, 0x12);
++      unsigned int index = _mm_cmpistri (mask, value, 0x12);
++      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
+       if (cflag)
+ 	return (size_t) (aligned + index - s);
+       aligned += 16;
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-92.patch b/glibc-RHEL-15696-92.patch
new file mode 100644
index 0000000..f19914e
--- /dev/null
+++ b/glibc-RHEL-15696-92.patch
@@ -0,0 +1,175 @@
+From fe28e7d9d9535ebab4081d195c553b4fbf39d9ae Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:26 -0500
+Subject: [PATCH] x86: Remove strcspn-sse2.S and use the generic implementation
+Content-type: text/plain; charset=UTF-8
+
+The generic implementation is faster.
+
+geometric_mean(N=20) of all benchmarks New / Original: .678
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ .../{strcspn-sse2.S => strcspn-sse2.c}        |   6 +-
+ sysdeps/x86_64/strcspn.S                      | 122 ------------------
+ 2 files changed, 3 insertions(+), 125 deletions(-)
+ rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (89%)
+ delete mode 100644 sysdeps/x86_64/strcspn.S
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strcspn-sse2.S
+	(copyright header)
+
+diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c
+similarity index 89%
+rename from sysdeps/x86_64/multiarch/strcspn-sse2.S
+rename to sysdeps/x86_64/multiarch/strcspn-sse2.c
+index 8a0c69d7..32debee4 100644
+--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S
++++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c
+@@ -19,10 +19,10 @@
+ #if IS_IN (libc)
+ 
+ # include <sysdep.h>
+-# define strcspn __strcspn_sse2
++# define STRCSPN __strcspn_sse2
+ 
+ # undef libc_hidden_builtin_def
+-# define libc_hidden_builtin_def(strcspn)
++# define libc_hidden_builtin_def(STRCSPN)
+ #endif
+ 
+-#include <sysdeps/x86_64/strcspn.S>
++#include <string/strcspn.c>
+diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
+deleted file mode 100644
+index 7f9202d6..00000000
+--- a/sysdeps/x86_64/strcspn.S
++++ /dev/null
+@@ -1,122 +0,0 @@
+-/* strcspn (str, ss) -- Return the length of the initial segment of STR
+-			which contains no characters from SS.
+-   For AMD x86-64.
+-   Copyright (C) 1994-2018 Free Software Foundation, Inc.
+-   This file is part of the GNU C Library.
+-   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
+-   Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>.
+-   Adopted for x86-64 by Andreas Jaeger <aj@suse.de>.
+-
+-   The GNU C Library is free software; you can redistribute it and/or
+-   modify it under the terms of the GNU Lesser General Public
+-   License as published by the Free Software Foundation; either
+-   version 2.1 of the License, or (at your option) any later version.
+-
+-   The GNU C Library is distributed in the hope that it will be useful,
+-   but WITHOUT ANY WARRANTY; without even the implied warranty of
+-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+-   Lesser General Public License for more details.
+-
+-   You should have received a copy of the GNU Lesser General Public
+-   License along with the GNU C Library; if not, see
+-   <http://www.gnu.org/licenses/>.  */
+-
+-#include <sysdep.h>
+-#include "asm-syntax.h"
+-
+-	.text
+-ENTRY (strcspn)
+-
+-	movq %rdi, %rdx		/* Save SRC.  */
+-
+-	/* First we create a table with flags for all possible characters.
+-	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+-	   supported by the C string functions we have 256 characters.
+-	   Before inserting marks for the stop characters we clear the whole
+-	   table.  */
+-	movq %rdi, %r8			/* Save value.  */
+-	subq $256, %rsp			/* Make space for 256 bytes.  */
+-	cfi_adjust_cfa_offset(256)
+-	movl $32,  %ecx			/* 32*8 bytes = 256 bytes.  */
+-	movq %rsp, %rdi
+-	xorl %eax, %eax			/* We store 0s.  */
+-	cld
+-	rep
+-	stosq
+-
+-	movq %rsi, %rax			/* Setup skipset.  */
+-
+-/* For understanding the following code remember that %rcx == 0 now.
+-   Although all the following instruction only modify %cl we always
+-   have a correct zero-extended 64-bit value in %rcx.  */
+-
+-	.p2align 4
+-L(2):	movb (%rax), %cl	/* get byte from skipset */
+-	testb %cl, %cl		/* is NUL char? */
+-	jz L(1)			/* yes => start compare loop */
+-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
+-
+-	movb 1(%rax), %cl	/* get byte from skipset */
+-	testb $0xff, %cl	/* is NUL char? */
+-	jz L(1)			/* yes => start compare loop */
+-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
+-
+-	movb 2(%rax), %cl	/* get byte from skipset */
+-	testb $0xff, %cl	/* is NUL char? */
+-	jz L(1)			/* yes => start compare loop */
+-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
+-
+-	movb 3(%rax), %cl	/* get byte from skipset */
+-	addq $4, %rax		/* increment skipset pointer */
+-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
+-	testb $0xff, %cl	/* is NUL char? */
+-	jnz L(2)		/* no => process next dword from skipset */
+-
+-L(1):	leaq -4(%rdx), %rax	/* prepare loop */
+-
+-	/* We use a neat trick for the following loop.  Normally we would
+-	   have to test for two termination conditions
+-	   1. a character in the skipset was found
+-	   and
+-	   2. the end of the string was found
+-	   But as a sign that the character is in the skipset we store its
+-	   value in the table.  But the value of NUL is NUL so the loop
+-	   terminates for NUL in every case.  */
+-
+-	.p2align 4
+-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
+-
+-	movb (%rax), %cl	/* get byte from string */
+-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
+-	je L(4)			/* yes => return */
+-
+-	movb 1(%rax), %cl	/* get byte from string */
+-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
+-	je L(5)			/* yes => return */
+-
+-	movb 2(%rax), %cl	/* get byte from string */
+-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
+-	jz L(6)			/* yes => return */
+-
+-	movb 3(%rax), %cl	/* get byte from string */
+-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
+-	jne L(3)		/* no => start loop again */
+-
+-	incq %rax		/* adjust pointer */
+-L(6):	incq %rax
+-L(5):	incq %rax
+-
+-L(4):	addq $256, %rsp		/* remove skipset */
+-	cfi_adjust_cfa_offset(-256)
+-#ifdef USE_AS_STRPBRK
+-	xorl %edx,%edx
+-	orb %cl, %cl		/* was last character NUL? */
+-	cmovzq %rdx, %rax	/* Yes:	return NULL */
+-#else
+-	subq %rdx, %rax		/* we have to return the number of valid
+-				   characters, so compute distance to first
+-				   non-valid character */
+-#endif
+-	ret
+-END (strcspn)
+-libc_hidden_builtin_def (strcspn)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-93.patch b/glibc-RHEL-15696-93.patch
new file mode 100644
index 0000000..45c8527
--- /dev/null
+++ b/glibc-RHEL-15696-93.patch
@@ -0,0 +1,55 @@
+From 653358535280a599382cb6c77538a187dac6a87f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:27 -0500
+Subject: [PATCH] x86: Remove strpbrk-sse2.S and use the generic implementation
+Content-type: text/plain; charset=UTF-8
+
+The generic implementation is faster (see strcspn commit).
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c}    | 7 +++----
+ sysdeps/x86_64/strpbrk.S                                   | 3 ---
+ 2 files changed, 3 insertions(+), 7 deletions(-)
+ rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (87%)
+ delete mode 100644 sysdeps/x86_64/strpbrk.S
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strpbrk-sse2.S
+	(copyright header)
+
+diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
+similarity index 87%
+rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S
+rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c
+index 3c6a74db..ec0b6fda 100644
+--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S
++++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
+@@ -19,11 +19,10 @@
+ #if IS_IN (libc)
+ 
+ # include <sysdep.h>
+-# define strcspn __strpbrk_sse2
++# define STRPBRK __strpbrk_sse2
+ 
+ # undef libc_hidden_builtin_def
+-# define libc_hidden_builtin_def(strpbrk)
++# define libc_hidden_builtin_def(STRPBRK)
+ #endif
+ 
+-#define USE_AS_STRPBRK
+-#include <sysdeps/x86_64/strcspn.S>
++#include <string/strpbrk.c>
+diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S
+deleted file mode 100644
+index 21888a5b..00000000
+--- a/sysdeps/x86_64/strpbrk.S
++++ /dev/null
+@@ -1,3 +0,0 @@
+-#define strcspn strpbrk
+-#define USE_AS_STRPBRK
+-#include <sysdeps/x86_64/strcspn.S>
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-94.patch b/glibc-RHEL-15696-94.patch
new file mode 100644
index 0000000..2fa86da
--- /dev/null
+++ b/glibc-RHEL-15696-94.patch
@@ -0,0 +1,168 @@
+From 9c8a6ad620b49a27120ecdd7049c26bf05900397 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:29 -0500
+Subject: [PATCH] x86: Remove strspn-sse2.S and use the generic implementation
+Content-type: text/plain; charset=UTF-8
+
+The generic implementation is faster.
+
+geometric_mean(N=20) of all benchmarks New / Original: .710
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ .../{strspn-sse2.S => strspn-sse2.c}          |   6 +-
+ sysdeps/x86_64/strspn.S                       | 115 ------------------
+ 2 files changed, 3 insertions(+), 118 deletions(-)
+ rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (89%)
+ delete mode 100644 sysdeps/x86_64/strspn.S
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strspn-sse2.c
+	(copyright header)
+
+diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c
+similarity index 89%
+rename from sysdeps/x86_64/multiarch/strspn-sse2.S
+rename to sysdeps/x86_64/multiarch/strspn-sse2.c
+index 4686cdd5..ab0dae40 100644
+--- a/sysdeps/x86_64/multiarch/strspn-sse2.S
++++ b/sysdeps/x86_64/multiarch/strspn-sse2.c
+@@ -19,10 +19,10 @@
+ #if IS_IN (libc)
+ 
+ # include <sysdep.h>
+-# define strspn __strspn_sse2
++# define STRSPN __strspn_sse2
+ 
+ # undef libc_hidden_builtin_def
+-# define libc_hidden_builtin_def(strspn)
++# define libc_hidden_builtin_def(STRSPN)
+ #endif
+ 
+-#include <sysdeps/x86_64/strspn.S>
++#include <string/strspn.c>
+diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
+deleted file mode 100644
+index 635f1bc6..00000000
+--- a/sysdeps/x86_64/strspn.S
++++ /dev/null
+@@ -1,115 +0,0 @@
+-/* strspn (str, ss) -- Return the length of the initial segment of STR
+-			which contains only characters from SS.
+-   For AMD x86-64.
+-   Copyright (C) 1994-2018 Free Software Foundation, Inc.
+-   This file is part of the GNU C Library.
+-   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
+-   Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>.
+-   Adopted for x86-64 by Andreas Jaeger <aj@suse.de>.
+-
+-   The GNU C Library is free software; you can redistribute it and/or
+-   modify it under the terms of the GNU Lesser General Public
+-   License as published by the Free Software Foundation; either
+-   version 2.1 of the License, or (at your option) any later version.
+-
+-   The GNU C Library is distributed in the hope that it will be useful,
+-   but WITHOUT ANY WARRANTY; without even the implied warranty of
+-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+-   Lesser General Public License for more details.
+-
+-   You should have received a copy of the GNU Lesser General Public
+-   License along with the GNU C Library; if not, see
+-   <http://www.gnu.org/licenses/>.  */
+-
+-#include <sysdep.h>
+-
+-	.text
+-ENTRY (strspn)
+-
+-	movq %rdi, %rdx		/* Save SRC.  */
+-
+-	/* First we create a table with flags for all possible characters.
+-	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+-	   supported by the C string functions we have 256 characters.
+-	   Before inserting marks for the stop characters we clear the whole
+-	   table.  */
+-	movq %rdi, %r8			/* Save value.  */
+-	subq $256, %rsp			/* Make space for 256 bytes.  */
+-	cfi_adjust_cfa_offset(256)
+-	movl $32,  %ecx			/* 32*8 bytes = 256 bytes.  */
+-	movq %rsp, %rdi
+-	xorl %eax, %eax			/* We store 0s.  */
+-	cld
+-	rep
+-	stosq
+-
+-	movq %rsi, %rax			/* Setup stopset.  */
+-
+-/* For understanding the following code remember that %rcx == 0 now.
+-   Although all the following instruction only modify %cl we always
+-   have a correct zero-extended 64-bit value in %rcx.  */
+-
+-	.p2align 4
+-L(2):	movb (%rax), %cl	/* get byte from stopset */
+-	testb %cl, %cl		/* is NUL char? */
+-	jz L(1)			/* yes => start compare loop */
+-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
+-
+-	movb 1(%rax), %cl	/* get byte from stopset */
+-	testb $0xff, %cl	/* is NUL char? */
+-	jz L(1)			/* yes => start compare loop */
+-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
+-
+-	movb 2(%rax), %cl	/* get byte from stopset */
+-	testb $0xff, %cl	/* is NUL char? */
+-	jz L(1)			/* yes => start compare loop */
+-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
+-
+-	movb 3(%rax), %cl	/* get byte from stopset */
+-	addq $4, %rax		/* increment stopset pointer */
+-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
+-	testb $0xff, %cl	/* is NUL char? */
+-	jnz L(2)		/* no => process next dword from stopset */
+-
+-L(1):	leaq -4(%rdx), %rax	/* prepare loop */
+-
+-	/* We use a neat trick for the following loop.  Normally we would
+-	   have to test for two termination conditions
+-	   1. a character in the stopset was found
+-	   and
+-	   2. the end of the string was found
+-	   But as a sign that the character is in the stopset we store its
+-	   value in the table.  But the value of NUL is NUL so the loop
+-	   terminates for NUL in every case.  */
+-
+-	.p2align 4
+-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
+-
+-	movb (%rax), %cl	/* get byte from string */
+-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
+-	jz L(4)			/* no => return */
+-
+-	movb 1(%rax), %cl	/* get byte from string */
+-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
+-	jz L(5)			/* no => return */
+-
+-	movb 2(%rax), %cl	/* get byte from string */
+-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
+-	jz L(6)			/* no => return */
+-
+-	movb 3(%rax), %cl	/* get byte from string */
+-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
+-	jnz L(3)		/* yes => start loop again */
+-
+-	incq %rax		/* adjust pointer */
+-L(6):	incq %rax
+-L(5):	incq %rax
+-
+-L(4):	addq $256, %rsp		/* remove stopset */
+-	cfi_adjust_cfa_offset(-256)
+-	subq %rdx, %rax		/* we have to return the number of valid
+-				   characters, so compute distance to first
+-				   non-valid character */
+-	ret
+-END (strspn)
+-libc_hidden_builtin_def (strspn)
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-95.patch b/glibc-RHEL-15696-95.patch
new file mode 100644
index 0000000..cf21b96
--- /dev/null
+++ b/glibc-RHEL-15696-95.patch
@@ -0,0 +1,122 @@
+From 670b54bc585ea4a94f3b2e9272ba44aa6b730b73 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:36 -0500
+Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
+Content-type: text/plain; charset=UTF-8
+
+Slightly faster method of doing TOLOWER that saves an
+instruction.
+
+Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with
+CET enabled this NOP misaligned the entry to strcasecmp.
+
+geometric_mean(N=40) of all benchmarks New / Original: .894
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
+ 1 file changed, 29 insertions(+), 35 deletions(-)
+
+diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
+index aa6df898..f454ce5b 100644
+--- a/sysdeps/x86_64/strcmp.S
++++ b/sysdeps/x86_64/strcmp.S
+@@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp)
+ 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
+ 	mov	%fs:(%rax),%RDX_LP
+ 
+-	// XXX 5 byte should be before the function
+-	/* 5-byte NOP.  */
+-	.byte	0x0f,0x1f,0x44,0x00,0x00
++	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
++	.p2align 4
+ END2 (__strcasecmp)
+ # ifndef NO_NOLOCALE_ALIAS
+ weak_alias (__strcasecmp, strcasecmp)
+@@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp)
+ 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
+ 	mov	%fs:(%rax),%RCX_LP
+ 
+-	// XXX 5 byte should be before the function
+-	/* 5-byte NOP.  */
+-	.byte	0x0f,0x1f,0x44,0x00,0x00
++	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
++	.p2align 4
+ END2 (__strncasecmp)
+ # ifndef NO_NOLOCALE_ALIAS
+ weak_alias (__strncasecmp, strncasecmp)
+@@ -149,22 +147,22 @@ ENTRY (STRCMP)
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ 	.section .rodata.cst16,"aM",@progbits,16
+ 	.align 16
+-.Lbelowupper:
+-	.quad	0x4040404040404040
+-	.quad	0x4040404040404040
+-.Ltopupper:
+-	.quad	0x5b5b5b5b5b5b5b5b
+-	.quad	0x5b5b5b5b5b5b5b5b
+-.Ltouppermask:
++.Llcase_min:
++	.quad	0x3f3f3f3f3f3f3f3f
++	.quad	0x3f3f3f3f3f3f3f3f
++.Llcase_max:
++	.quad	0x9999999999999999
++	.quad	0x9999999999999999
++.Lcase_add:
+ 	.quad	0x2020202020202020
+ 	.quad	0x2020202020202020
+ 	.previous
+-	movdqa	.Lbelowupper(%rip), %xmm5
+-# define UCLOW_reg %xmm5
+-	movdqa	.Ltopupper(%rip), %xmm6
+-# define UCHIGH_reg %xmm6
+-	movdqa	.Ltouppermask(%rip), %xmm7
+-# define LCQWORD_reg %xmm7
++	movdqa	.Llcase_min(%rip), %xmm5
++# define LCASE_MIN_reg %xmm5
++	movdqa	.Llcase_max(%rip), %xmm6
++# define LCASE_MAX_reg %xmm6
++	movdqa	.Lcase_add(%rip), %xmm7
++# define CASE_ADD_reg %xmm7
+ #endif
+ 	cmp	$0x30, %ecx
+ 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
+@@ -175,22 +173,18 @@ ENTRY (STRCMP)
+ 	movhpd	8(%rdi), %xmm1
+ 	movhpd	8(%rsi), %xmm2
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+-# define TOLOWER(reg1, reg2) \
+-	movdqa	reg1, %xmm8;					\
+-	movdqa	UCHIGH_reg, %xmm9;				\
+-	movdqa	reg2, %xmm10;					\
+-	movdqa	UCHIGH_reg, %xmm11;				\
+-	pcmpgtb	UCLOW_reg, %xmm8;				\
+-	pcmpgtb	reg1, %xmm9;					\
+-	pcmpgtb	UCLOW_reg, %xmm10;				\
+-	pcmpgtb	reg2, %xmm11;					\
+-	pand	%xmm9, %xmm8;					\
+-	pand	%xmm11, %xmm10;					\
+-	pand	LCQWORD_reg, %xmm8;				\
+-	pand	LCQWORD_reg, %xmm10;				\
+-	por	%xmm8, reg1;					\
+-	por	%xmm10, reg2
+-	TOLOWER (%xmm1, %xmm2)
++#  define TOLOWER(reg1, reg2) \
++	movdqa	LCASE_MIN_reg, %xmm8;					\
++	movdqa	LCASE_MIN_reg, %xmm9;					\
++	paddb	reg1, %xmm8;					\
++	paddb	reg2, %xmm9;					\
++	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
++	pcmpgtb	LCASE_MAX_reg, %xmm9;				\
++	pandn	CASE_ADD_reg, %xmm8;					\
++	pandn	CASE_ADD_reg, %xmm9;					\
++	paddb	%xmm8, reg1;					\
++	paddb	%xmm9, reg2
++	TOLOWER	(%xmm1, %xmm2)
+ #else
+ # define TOLOWER(reg1, reg2)
+ #endif
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-96.patch b/glibc-RHEL-15696-96.patch
new file mode 100644
index 0000000..2d3b891
--- /dev/null
+++ b/glibc-RHEL-15696-96.patch
@@ -0,0 +1,143 @@
+From d154758e618ec9324f5d339c46db0aa27e8b1226 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:38 -0500
+Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
+Content-type: text/plain; charset=UTF-8
+
+Slightly faster method of doing TOLOWER that saves an
+instruction.
+
+Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with
+CET enabled this NOP misaligned the entry to strcasecmp.
+
+geometric_mean(N=40) of all benchmarks New / Original: .920
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
+ 1 file changed, 35 insertions(+), 48 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+index d8fdeb3a..59e8ddfc 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+@@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp))
+ 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
+ 	mov	%fs:(%rax),%RDX_LP
+ 
+-	// XXX 5 byte should be before the function
+-	/* 5-byte NOP.  */
+-	.byte	0x0f,0x1f,0x44,0x00,0x00
++	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
++	.p2align 4
+ END (GLABEL(__strcasecmp))
+ 	/* FALLTHROUGH to strcasecmp_l.  */
+ #endif
+@@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp))
+ 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
+ 	mov	%fs:(%rax),%RCX_LP
+ 
+-	// XXX 5 byte should be before the function
+-	/* 5-byte NOP.  */
+-	.byte	0x0f,0x1f,0x44,0x00,0x00
++	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
++	.p2align 4
+ END (GLABEL(__strncasecmp))
+ 	/* FALLTHROUGH to strncasecmp_l.  */
+ #endif
+@@ -170,27 +168,22 @@ STRCMP_SSE42:
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ 	.section .rodata.cst16,"aM",@progbits,16
+ 	.align 16
+-LABEL(belowupper):
+-	.quad	0x4040404040404040
+-	.quad	0x4040404040404040
+-LABEL(topupper):
+-# ifdef USE_AVX
+-	.quad	0x5a5a5a5a5a5a5a5a
+-	.quad	0x5a5a5a5a5a5a5a5a
+-# else
+-	.quad	0x5b5b5b5b5b5b5b5b
+-	.quad	0x5b5b5b5b5b5b5b5b
+-# endif
+-LABEL(touppermask):
++LABEL(lcase_min):
++	.quad	0x3f3f3f3f3f3f3f3f
++	.quad	0x3f3f3f3f3f3f3f3f
++LABEL(lcase_max):
++	.quad	0x9999999999999999
++	.quad	0x9999999999999999
++LABEL(case_add):
+ 	.quad	0x2020202020202020
+ 	.quad	0x2020202020202020
+ 	.previous
+-	movdqa	LABEL(belowupper)(%rip), %xmm4
+-# define UCLOW_reg %xmm4
+-	movdqa	LABEL(topupper)(%rip), %xmm5
+-# define UCHIGH_reg %xmm5
+-	movdqa	LABEL(touppermask)(%rip), %xmm6
+-# define LCQWORD_reg %xmm6
++	movdqa	LABEL(lcase_min)(%rip), %xmm4
++# define LCASE_MIN_reg %xmm4
++	movdqa	LABEL(lcase_max)(%rip), %xmm5
++# define LCASE_MAX_reg %xmm5
++	movdqa	LABEL(case_add)(%rip), %xmm6
++# define CASE_ADD_reg %xmm6
+ #endif
+ 	cmp	$0x30, %ecx
+ 	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
+@@ -201,32 +194,26 @@ LABEL(touppermask):
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ # ifdef USE_AVX
+ #  define TOLOWER(reg1, reg2) \
+-	vpcmpgtb UCLOW_reg, reg1, %xmm7;			\
+-	vpcmpgtb UCHIGH_reg, reg1, %xmm8;			\
+-	vpcmpgtb UCLOW_reg, reg2, %xmm9;			\
+-	vpcmpgtb UCHIGH_reg, reg2, %xmm10;			\
+-	vpandn	%xmm7, %xmm8, %xmm8;					\
+-	vpandn	%xmm9, %xmm10, %xmm10;					\
+-	vpand	LCQWORD_reg, %xmm8, %xmm8;				\
+-	vpand	LCQWORD_reg, %xmm10, %xmm10;				\
+-	vpor	reg1, %xmm8, reg1;					\
+-	vpor	reg2, %xmm10, reg2
++	vpaddb	LCASE_MIN_reg, reg1, %xmm7;					\
++	vpaddb	LCASE_MIN_reg, reg2, %xmm8;					\
++	vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;					\
++	vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;					\
++	vpandn	CASE_ADD_reg, %xmm7, %xmm7;					\
++	vpandn	CASE_ADD_reg, %xmm8, %xmm8;					\
++	vpaddb	%xmm7, reg1, reg1;					\
++	vpaddb	%xmm8, reg2, reg2
+ # else
+ #  define TOLOWER(reg1, reg2) \
+-	movdqa	reg1, %xmm7;					\
+-	movdqa	UCHIGH_reg, %xmm8;				\
+-	movdqa	reg2, %xmm9;					\
+-	movdqa	UCHIGH_reg, %xmm10;				\
+-	pcmpgtb	UCLOW_reg, %xmm7;				\
+-	pcmpgtb	reg1, %xmm8;					\
+-	pcmpgtb	UCLOW_reg, %xmm9;				\
+-	pcmpgtb	reg2, %xmm10;					\
+-	pand	%xmm8, %xmm7;					\
+-	pand	%xmm10, %xmm9;					\
+-	pand	LCQWORD_reg, %xmm7;				\
+-	pand	LCQWORD_reg, %xmm9;				\
+-	por	%xmm7, reg1;					\
+-	por	%xmm9, reg2
++	movdqa	LCASE_MIN_reg, %xmm7;					\
++	movdqa	LCASE_MIN_reg, %xmm8;					\
++	paddb	reg1, %xmm7;					\
++	paddb	reg2, %xmm8;					\
++	pcmpgtb	LCASE_MAX_reg, %xmm7;				\
++	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
++	pandn	CASE_ADD_reg, %xmm7;					\
++	pandn	CASE_ADD_reg, %xmm8;					\
++	paddb	%xmm7, reg1;					\
++	paddb	%xmm8, reg2
+ # endif
+ 	TOLOWER (%xmm1, %xmm2)
+ #else
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-97.patch b/glibc-RHEL-15696-97.patch
new file mode 100644
index 0000000..9592795
--- /dev/null
+++ b/glibc-RHEL-15696-97.patch
@@ -0,0 +1,759 @@
+From bbf81222343fed5cd704001a2ae0d86c71544151 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 24 Mar 2022 18:56:12 -0500
+Subject: [PATCH] x86: Add AVX2 optimized str{n}casecmp
+Content-type: text/plain; charset=UTF-8
+
+geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile             |   4 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  28 +++
+ sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |  12 +
+ .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |  15 ++
+ sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S  |  23 ++
+ sysdeps/x86_64/multiarch/strcmp-avx2.S        | 237 +++++++++++++++---
+ .../x86_64/multiarch/strncase_l-avx2-rtm.S    |  16 ++
+ sysdeps/x86_64/multiarch/strncase_l-avx2.S    |  27 ++
+ 8 files changed, 331 insertions(+), 31 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 8c9e7812..711ecf2e 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -51,6 +51,8 @@ sysdep_routines += \
+   stpncpy-sse2-unaligned \
+   stpncpy-ssse3 \
+   strcasecmp_l-avx \
++  strcasecmp_l-avx2 \
++  strcasecmp_l-avx2-rtm \
+   strcasecmp_l-sse2 \
+   strcasecmp_l-sse4_2 \
+   strcasecmp_l-ssse3 \
+@@ -89,6 +91,8 @@ sysdep_routines += \
+   strlen-evex \
+   strlen-sse2 \
+   strncase_l-avx \
++  strncase_l-avx2 \
++  strncase_l-avx2-rtm \
+   strncase_l-sse2 \
+   strncase_l-sse4_2 \
+   strncase_l-ssse3 \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index c963d391..d873e1be 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -418,6 +418,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 
+   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
+   IFUNC_IMPL (i, name, strcasecmp,
++	      IFUNC_IMPL_ADD (array, i, strcasecmp,
++			      CPU_FEATURE_USABLE (AVX2),
++			      __strcasecmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcasecmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strcasecmp_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __strcasecmp_avx)
+@@ -431,6 +438,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 
+   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
+   IFUNC_IMPL (i, name, strcasecmp_l,
++	      IFUNC_IMPL_ADD (array, i, strcasecmp,
++			      CPU_FEATURE_USABLE (AVX2),
++			      __strcasecmp_l_avx2)
++	      IFUNC_IMPL_ADD (array, i, strcasecmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strcasecmp_l_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __strcasecmp_l_avx)
+@@ -558,6 +572,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 
+   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
+   IFUNC_IMPL (i, name, strncasecmp,
++	      IFUNC_IMPL_ADD (array, i, strncasecmp,
++			      CPU_FEATURE_USABLE (AVX2),
++			      __strncasecmp_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncasecmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strncasecmp_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __strncasecmp_avx)
+@@ -572,6 +593,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 
+   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
+   IFUNC_IMPL (i, name, strncasecmp_l,
++	      IFUNC_IMPL_ADD (array, i, strncasecmp,
++			      CPU_FEATURE_USABLE (AVX2),
++			      __strncasecmp_l_avx2)
++	      IFUNC_IMPL_ADD (array, i, strncasecmp,
++			      (CPU_FEATURE_USABLE (AVX2)
++			       && CPU_FEATURE_USABLE (RTM)),
++			      __strncasecmp_l_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __strncasecmp_l_avx)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+index 6a4bb078..926508c4 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
++  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
++      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
++    {
++      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
++        return OPTIMIZE (avx2_rtm);
++
++      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
++        return OPTIMIZE (avx2);
++    }
++
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
+     return OPTIMIZE (avx);
+ 
+diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
+new file mode 100644
+index 00000000..09957fc3
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
+@@ -0,0 +1,15 @@
++#ifndef STRCMP
++# define STRCMP	__strcasecmp_l_avx2_rtm
++#endif
++
++#define _GLABEL(x)	x ## _rtm
++#define GLABEL(x)	_GLABEL(x)
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
++	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
++
++#define SECTION(p)	p##.avx.rtm
++
++#include "strcasecmp_l-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
+new file mode 100644
+index 00000000..e2762f2a
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
+@@ -0,0 +1,23 @@
++/* strcasecmp_l optimized with AVX2.
++   Copyright (C) 2017-2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#ifndef STRCMP
++# define STRCMP	__strcasecmp_l_avx2
++#endif
++#define USE_AS_STRCASECMP_L
++#include "strcmp-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 782f9472..28cc98b6 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -20,6 +20,10 @@
+ 
+ # include <sysdep.h>
+ 
++# if defined USE_AS_STRCASECMP_L
++#  include "locale-defines.h"
++# endif
++
+ # ifndef STRCMP
+ #  define STRCMP	__strcmp_avx2
+ # endif
+@@ -74,13 +78,88 @@
+ #  define VEC_OFFSET	(-VEC_SIZE)
+ # endif
+ 
++# ifdef USE_AS_STRCASECMP_L
++#  define BYTE_LOOP_REG	OFFSET_REG
++# else
++#  define BYTE_LOOP_REG	ecx
++# endif
++
++# ifdef USE_AS_STRCASECMP_L
++#  ifdef USE_AS_STRNCMP
++#   define STRCASECMP	__strncasecmp_avx2
++#   define LOCALE_REG	rcx
++#   define LOCALE_REG_LP	RCX_LP
++#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
++#  else
++#   define STRCASECMP	__strcasecmp_avx2
++#   define LOCALE_REG	rdx
++#   define LOCALE_REG_LP	RDX_LP
++#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
++#  endif
++# endif
++
+ # define xmmZERO	xmm15
+ # define ymmZERO	ymm15
+ 
++# define LCASE_MIN_ymm	%ymm10
++# define LCASE_MAX_ymm	%ymm11
++# define CASE_ADD_ymm	%ymm12
++
++# define LCASE_MIN_xmm	%xmm10
++# define LCASE_MAX_xmm	%xmm11
++# define CASE_ADD_xmm	%xmm12
++
++	/* r11 is never used elsewhere so this is safe to maintain.  */
++# define TOLOWER_BASE	%r11
++
+ # ifndef SECTION
+ #  define SECTION(p)	p##.avx
+ # endif
+ 
++# ifdef USE_AS_STRCASECMP_L
++#  define REG(x, y) x ## y
++#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)			\
++	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);				\
++	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);				\
++	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);			\
++	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);			\
++	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);			\
++	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);			\
++	vpaddb	REG(%ext, 8), reg1_in, reg1_out;							\
++	vpaddb	REG(%ext, 9), reg2_in, reg2_out
++
++#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
++#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
++#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)
++
++#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)			\
++	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext);					\
++	VPCMPEQ	scratch_reg, s2_reg, reg_out
++
++#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)			\
++	VMOVU	s2_mem, reg_out;											\
++	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
++
++#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
++#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
++
++#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
++#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
++
++# else
++#  define TOLOWER_gpr(...)
++#  define TOLOWER_ymm(...)
++#  define TOLOWER_xmm(...)
++
++#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)			\
++	VPCMPEQ	s2_reg, s1_reg, reg_out
++
++#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
++
++#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
++#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
++# endif
++
+ /* Warning!
+            wcscmp/wcsncmp have to use SIGNED comparison for elements.
+            strcmp/strncmp have to use UNSIGNED comparison for elements.
+@@ -102,8 +181,49 @@
+    returned.  */
+ 
+ 	.section SECTION(.text), "ax", @progbits
+-ENTRY(STRCMP)
++	.align	16
++	.type	STRCMP, @function
++	.globl	STRCMP
++	.hidden	STRCMP
++
++# ifndef GLABEL
++#  define GLABEL(...)	__VA_ARGS__
++# endif
++
++# ifdef USE_AS_STRCASECMP_L
++ENTRY (GLABEL(STRCASECMP))
++	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
++	mov	%fs:(%rax), %LOCALE_REG_LP
++
++	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
++	.p2align 4
++END (GLABEL(STRCASECMP))
++	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
++# endif
++
++	.p2align 4
++STRCMP:
++	cfi_startproc
++	_CET_ENDBR
++	CALL_MCOUNT
++
++# if defined USE_AS_STRCASECMP_L
++	/* We have to fall back on the C implementation for locales with
++	   encodings not matching ASCII for single bytes.  */
++#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
++	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
++#  else
++	mov	(%LOCALE_REG), %RAX_LP
++#  endif
++	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
++	jne	STRCASECMP_NONASCII
++	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
++# endif
++
+ # ifdef USE_AS_STRNCMP
++	/* Don't overwrite LOCALE_REG (rcx) until we have passed
++	   L(one_or_less). Otherwise we might use the wrong locale in
++	   the OVERFLOW_STRCMP (strcasecmp_l).  */
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+@@ -128,6 +248,30 @@ ENTRY(STRCMP)
+ #  endif
+ # endif
+ 	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
++# if defined USE_AS_STRCASECMP_L
++	.section .rodata.cst32, "aM", @progbits, 32
++	.align	32
++L(lcase_min):
++	.quad	0x3f3f3f3f3f3f3f3f
++	.quad	0x3f3f3f3f3f3f3f3f
++	.quad	0x3f3f3f3f3f3f3f3f
++	.quad	0x3f3f3f3f3f3f3f3f
++L(lcase_max):
++	.quad	0x9999999999999999
++	.quad	0x9999999999999999
++	.quad	0x9999999999999999
++	.quad	0x9999999999999999
++L(case_add):
++	.quad	0x2020202020202020
++	.quad	0x2020202020202020
++	.quad	0x2020202020202020
++	.quad	0x2020202020202020
++	.previous
++
++	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
++	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
++	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
++# endif
+ 	movl	%edi, %eax
+ 	orl	%esi, %eax
+ 	sall	$20, %eax
+@@ -138,8 +282,10 @@ ENTRY(STRCMP)
+ L(no_page_cross):
+ 	/* Safe to compare 4x vectors.  */
+ 	VMOVU	(%rdi), %ymm0
+-	/* 1s where s1 and s2 equal.  */
+-	VPCMPEQ	(%rsi), %ymm0, %ymm1
++	/* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
++	   Otherwise converts ymm0 and load from rsi to lower. ymm2 is
++	   scratch and ymm1 is the return.  */
++	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
+ 	/* 1s at null CHAR.  */
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	/* 1s where s1 and s2 equal AND not null CHAR.  */
+@@ -172,6 +318,8 @@ L(return_vec_0):
+ # else
+ 	movzbl	(%rdi, %rcx), %eax
+ 	movzbl	(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ # endif
+ L(ret0):
+@@ -192,6 +340,10 @@ L(ret_zero):
+ 
+ 	.p2align 4,, 5
+ L(one_or_less):
++#  ifdef USE_AS_STRCASECMP_L
++	/* Set locale argument for strcasecmp.  */
++	movq	%LOCALE_REG, %rdx
++#  endif
+ 	jb	L(ret_zero)
+ #  ifdef USE_AS_WCSCMP
+ 	/* 'nbe' covers the case where length is negative (large
+@@ -211,6 +363,8 @@ L(one_or_less):
+ 	jnbe	__strcmp_avx2
+ 	movzbl	(%rdi), %eax
+ 	movzbl	(%rsi), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ #  endif
+ L(ret1):
+@@ -238,6 +392,8 @@ L(return_vec_1):
+ # else
+ 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+ 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ # endif
+ L(ret2):
+@@ -269,6 +425,8 @@ L(return_vec_2):
+ # else
+ 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+ 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ # endif
+ L(ret3):
+@@ -289,6 +447,8 @@ L(return_vec_3):
+ #  else
+ 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+ 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ #  endif
+ L(ret4):
+@@ -299,7 +459,7 @@ L(ret4):
+ L(more_3x_vec):
+ 	/* Safe to compare 4x vectors.  */
+ 	VMOVU	VEC_SIZE(%rdi), %ymm0
+-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
++	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	vpandn	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -312,7 +472,7 @@ L(more_3x_vec):
+ # endif
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
+-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
++	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	vpandn	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -320,7 +480,7 @@ L(more_3x_vec):
+ 	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
+-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
++	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	vpandn	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check):
+ 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
+ 
+ 	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
+-	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
+-
+-	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
+-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
+-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+-
++	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
++	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
++	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
++	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
+ 
+ 	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
+ 	   zero.  */
+@@ -469,6 +627,8 @@ L(return_vec_2_3_end):
+ # else
+ 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
+ 	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -512,6 +672,8 @@ L(return_vec_0_end):
+ # else
+ 	movzbl	(%rdi, %rcx), %eax
+ 	movzbl	(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -534,6 +696,8 @@ L(return_vec_1_end):
+ #  else
+ 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+ 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -560,6 +724,8 @@ L(return_vec_2_end):
+ # else
+ 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+ 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -587,7 +753,7 @@ L(page_cross_during_loop):
+ 	jle	L(less_1x_vec_till_page_cross)
+ 
+ 	VMOVA	(%rdi), %ymm0
+-	VPCMPEQ	(%rsi), %ymm0, %ymm1
++	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	vpandn	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross):
+ 	   here, it means the previous page (rdi - VEC_SIZE) has already
+ 	   been loaded earlier so must be valid.  */
+ 	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
+-	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
++	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	vpandn	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem):
+ # else
+ 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+ 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross):
+ 	   iteration here.  */
+ 
+ 	VMOVU	VEC_SIZE(%rdi), %ymm0
+-	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
++	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	vpandn	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross):
+ 
+ 	/* Safe to include comparisons from lower bytes.  */
+ 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
+-	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
++	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	vpandn	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross):
+ 	jnz	L(return_vec_page_cross_0)
+ 
+ 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
+-	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
++	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	vpandn	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross):
+ 	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
+ 	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
+ 
+-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
+-	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
++	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
++	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
+ 	vpand	%ymm4, %ymm5, %ymm5
+ 	vpand	%ymm6, %ymm7, %ymm7
+ 	VPMINU	%ymm5, %ymm7, %ymm7
+@@ -771,6 +939,8 @@ L(return_vec_page_cross_1):
+ # else
+ 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+ 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -826,7 +996,7 @@ L(page_cross):
+ L(page_cross_loop):
+ 
+ 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
+-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
++	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	vpandn	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -844,11 +1014,11 @@ L(page_cross_loop):
+ 	subl	%eax, %OFFSET_REG
+ 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
+ 	   to not cross page so is safe to load. Since we have already
+-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
+-	 */
++	   loaded at least 1 VEC from rsi it is also guranteed to be
++	   safe.  */
+ 
+ 	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
+-	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
++	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
+ 	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+ 	vpandn	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont):
+ # else
+ 	movzbl	(%rdi, %rcx), %eax
+ 	movzbl	(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page):
+ 	ja	L(less_16_till_page)
+ 
+ 	VMOVU	(%rdi), %xmm0
+-	VPCMPEQ	(%rsi), %xmm0, %xmm1
++	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
+ 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+ 	vpandn	%xmm1, %xmm2, %xmm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page):
+ # endif
+ 
+ 	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
+-	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
++	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
+ 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+ 	vpandn	%xmm1, %xmm2, %xmm1
+ 	vpmovmskb %ymm1, %ecx
+@@ -990,7 +1162,7 @@ L(less_16_till_page):
+ 	vmovq	(%rdi), %xmm0
+ 	vmovq	(%rsi), %xmm1
+ 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+-	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+ 	vpandn	%xmm1, %xmm2, %xmm1
+ 	vpmovmskb %ymm1, %ecx
+ 	incb	%cl
+@@ -1010,7 +1182,7 @@ L(less_16_till_page):
+ 	vmovq	(%rdi, %OFFSET_REG64), %xmm0
+ 	vmovq	(%rsi, %OFFSET_REG64), %xmm1
+ 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+-	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+ 	vpandn	%xmm1, %xmm2, %xmm1
+ 	vpmovmskb %ymm1, %ecx
+ 	incb	%cl
+@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs):
+ 	vmovd	(%rdi), %xmm0
+ 	vmovd	(%rsi), %xmm1
+ 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+-	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+ 	vpandn	%xmm1, %xmm2, %xmm1
+ 	vpmovmskb %ymm1, %ecx
+ 	subl	$0xf, %ecx
+@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs):
+ 	vmovd	(%rdi, %OFFSET_REG64), %xmm0
+ 	vmovd	(%rsi, %OFFSET_REG64), %xmm1
+ 	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+-	VPCMPEQ	%xmm1, %xmm0, %xmm1
++	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+ 	vpandn	%xmm1, %xmm2, %xmm1
+ 	vpmovmskb %ymm1, %ecx
+ 	subl	$0xf, %ecx
+@@ -1119,7 +1291,9 @@ L(less_4_till_page):
+ L(less_4_loop):
+ 	movzbl	(%rdi), %eax
+ 	movzbl	(%rsi, %rdi), %ecx
+-	subl	%ecx, %eax
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
++	subl	%BYTE_LOOP_REG, %eax
+ 	jnz	L(ret_less_4_loop)
+ 	testl	%ecx, %ecx
+ 	jz	L(ret_zero_4_loop)
+@@ -1146,5 +1320,6 @@ L(ret_less_4_loop):
+ 	subl	%r8d, %eax
+ 	ret
+ # endif
+-END(STRCMP)
++	cfi_endproc
++	.size	STRCMP, .-STRCMP
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
+new file mode 100644
+index 00000000..58c05dcf
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
+@@ -0,0 +1,16 @@
++#ifndef STRCMP
++# define STRCMP	__strncasecmp_l_avx2_rtm
++#endif
++
++#define _GLABEL(x)	x ## _rtm
++#define GLABEL(x)	_GLABEL(x)
++
++#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
++	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
++
++#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
++
++#define SECTION(p)	p##.avx.rtm
++#define OVERFLOW_STRCMP	__strcasecmp_l_avx2_rtm
++
++#include "strncase_l-avx2.S"
+diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
+new file mode 100644
+index 00000000..48c0aa21
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
+@@ -0,0 +1,27 @@
++/* strncasecmp_l optimized with AVX2.
++   Copyright (C) 2017-2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#ifndef STRCMP
++# define STRCMP	__strncasecmp_l_avx2
++#endif
++#define USE_AS_STRCASECMP_L
++#define USE_AS_STRNCMP
++#ifndef OVERFLOW_STRCMP
++# define OVERFLOW_STRCMP	__strcasecmp_l_avx2
++#endif
++#include "strcmp-avx2.S"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-98.patch b/glibc-RHEL-15696-98.patch
new file mode 100644
index 0000000..9941bcc
--- /dev/null
+++ b/glibc-RHEL-15696-98.patch
@@ -0,0 +1,814 @@
+From 84e7c46df4086873eae28a1fb87d2cf5388b1e16 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 24 Mar 2022 18:56:13 -0500
+Subject: [PATCH] x86: Add EVEX optimized str{n}casecmp
+Content-type: text/plain; charset=UTF-8
+
+geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile            |   2 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 +
+ sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
+ sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
+ sysdeps/x86_64/multiarch/strcmp-evex.S       | 290 ++++++++++++++++---
+ sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
+ 6 files changed, 321 insertions(+), 40 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
+ create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 711ecf2e..359712c1 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -53,6 +53,7 @@ sysdep_routines += \
+   strcasecmp_l-avx \
+   strcasecmp_l-avx2 \
+   strcasecmp_l-avx2-rtm \
++  strcasecmp_l-evex \
+   strcasecmp_l-sse2 \
+   strcasecmp_l-sse4_2 \
+   strcasecmp_l-ssse3 \
+@@ -93,6 +94,7 @@ sysdep_routines += \
+   strncase_l-avx \
+   strncase_l-avx2 \
+   strncase_l-avx2-rtm \
++  strncase_l-evex \
+   strncase_l-sse2 \
+   strncase_l-sse4_2 \
+   strncase_l-ssse3 \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index d873e1be..1dedc637 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -418,6 +418,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 
+   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
+   IFUNC_IMPL (i, name, strcasecmp,
++	      IFUNC_IMPL_ADD (array, i, strcasecmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strcasecmp_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strcasecmp_avx2)
+@@ -438,6 +442,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 
+   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
+   IFUNC_IMPL (i, name, strcasecmp_l,
++	      IFUNC_IMPL_ADD (array, i, strcasecmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strcasecmp_l_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strcasecmp_l_avx2)
+@@ -572,6 +580,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 
+   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
+   IFUNC_IMPL (i, name, strncasecmp,
++	      IFUNC_IMPL_ADD (array, i, strncasecmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strncasecmp_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strncasecmp_avx2)
+@@ -593,6 +605,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 
+   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
+   IFUNC_IMPL (i, name, strncasecmp_l,
++	      IFUNC_IMPL_ADD (array, i, strncasecmp,
++			      (CPU_FEATURE_USABLE (AVX512VL)
++			       && CPU_FEATURE_USABLE (AVX512BW)),
++			      __strncasecmp_l_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __strncasecmp_l_avx2)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+index 926508c4..6dd49a21 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
++      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
++          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
++        return OPTIMIZE (evex);
++
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+         return OPTIMIZE (avx2_rtm);
+ 
+diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
+new file mode 100644
+index 00000000..58642db7
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
+@@ -0,0 +1,23 @@
++/* strcasecmp_l optimized with EVEX.
++   Copyright (C) 2017-2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#ifndef STRCMP
++# define STRCMP	__strcasecmp_l_evex
++#endif
++#define USE_AS_STRCASECMP_L
++#include "strcmp-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 0dfa62bd..b81b5775 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -19,6 +19,9 @@
+ #if IS_IN (libc)
+ 
+ # include <sysdep.h>
++# if defined USE_AS_STRCASECMP_L
++#  include "locale-defines.h"
++# endif
+ 
+ # ifndef STRCMP
+ #  define STRCMP	__strcmp_evex
+@@ -34,19 +37,29 @@
+ # define VMOVA	vmovdqa64
+ 
+ # ifdef USE_AS_WCSCMP
+-#  define TESTEQ	subl	$0xff,
++#  ifndef OVERFLOW_STRCMP
++#   define OVERFLOW_STRCMP	__wcscmp_evex
++#  endif
++
++#  define TESTEQ	subl $0xff,
+ 	/* Compare packed dwords.  */
+ #  define VPCMP	vpcmpd
+ #  define VPMINU	vpminud
+ #  define VPTESTM	vptestmd
++#  define VPTESTNM	vptestnmd
+ 	/* 1 dword char == 4 bytes.  */
+ #  define SIZE_OF_CHAR	4
+ # else
++#  ifndef OVERFLOW_STRCMP
++#   define OVERFLOW_STRCMP	__strcmp_evex
++#  endif
++
+ #  define TESTEQ	incl
+ 	/* Compare packed bytes.  */
+ #  define VPCMP	vpcmpb
+ #  define VPMINU	vpminub
+ #  define VPTESTM	vptestmb
++#  define VPTESTNM	vptestnmb
+ 	/* 1 byte char == 1 byte.  */
+ #  define SIZE_OF_CHAR	1
+ # endif
+@@ -73,11 +86,16 @@
+ #  define VEC_OFFSET	(-VEC_SIZE)
+ # endif
+ 
+-# define XMMZERO	xmm16
+ # define XMM0	xmm17
+ # define XMM1	xmm18
+ 
+-# define YMMZERO	ymm16
++# define XMM10	xmm27
++# define XMM11	xmm28
++# define XMM12	xmm29
++# define XMM13	xmm30
++# define XMM14	xmm31
++
++
+ # define YMM0	ymm17
+ # define YMM1	ymm18
+ # define YMM2	ymm19
+@@ -89,6 +107,87 @@
+ # define YMM8	ymm25
+ # define YMM9	ymm26
+ # define YMM10	ymm27
++# define YMM11	ymm28
++# define YMM12	ymm29
++# define YMM13	ymm30
++# define YMM14	ymm31
++
++# ifdef USE_AS_STRCASECMP_L
++#  define BYTE_LOOP_REG	OFFSET_REG
++# else
++#  define BYTE_LOOP_REG	ecx
++# endif
++
++# ifdef USE_AS_STRCASECMP_L
++#  ifdef USE_AS_STRNCMP
++#   define STRCASECMP	__strncasecmp_evex
++#   define LOCALE_REG	rcx
++#   define LOCALE_REG_LP	RCX_LP
++#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
++#  else
++#   define STRCASECMP	__strcasecmp_evex
++#   define LOCALE_REG	rdx
++#   define LOCALE_REG_LP	RDX_LP
++#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
++#  endif
++# endif
++
++# define LCASE_MIN_YMM	%YMM12
++# define LCASE_MAX_YMM	%YMM13
++# define CASE_ADD_YMM	%YMM14
++
++# define LCASE_MIN_XMM	%XMM12
++# define LCASE_MAX_XMM	%XMM13
++# define CASE_ADD_XMM	%XMM14
++
++	/* NB: wcsncmp uses r11 but strcasecmp is never used in
++	   conjunction with wcscmp.  */
++# define TOLOWER_BASE	%r11
++
++# ifdef USE_AS_STRCASECMP_L
++#  define _REG(x, y) x ## y
++#  define REG(x, y) _REG(x, y)
++#  define TOLOWER(reg1, reg2, ext)										\
++	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
++	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
++	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
++	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
++	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
++	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
++
++#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
++#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
++#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
++
++#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
++	TOLOWER	(s1_reg, s2_reg, ext);										\
++	VPCMP	$0, s1_reg, s2_reg, reg_out
++
++#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
++	VMOVU	s2_mem, s2_reg;												\
++	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
++
++#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
++#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
++
++#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
++#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
++
++# else
++#  define TOLOWER_gpr(...)
++#  define TOLOWER_YMM(...)
++#  define TOLOWER_XMM(...)
++
++#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
++	VPCMP	$0, s2_reg, s1_reg, reg_out
++
++#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
++
++#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
++	VPCMP	$0, s2_mem, s1_reg, reg_out
++
++#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
++# endif
+ 
+ /* Warning!
+            wcscmp/wcsncmp have to use SIGNED comparison for elements.
+@@ -112,8 +211,45 @@
+    returned.  */
+ 
+ 	.section .text.evex, "ax", @progbits
+-ENTRY(STRCMP)
++	.align	16
++	.type	STRCMP, @function
++	.globl	STRCMP
++	.hidden	STRCMP
++
++# ifdef USE_AS_STRCASECMP_L
++ENTRY (STRCASECMP)
++	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
++	mov	%fs:(%rax), %LOCALE_REG_LP
++
++	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
++	.p2align 4
++END (STRCASECMP)
++	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
++# endif
++
++	.p2align 4
++STRCMP:
++	cfi_startproc
++	_CET_ENDBR
++	CALL_MCOUNT
++
++# if defined USE_AS_STRCASECMP_L
++	/* We have to fall back on the C implementation for locales with
++	   encodings not matching ASCII for single bytes.  */
++#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
++	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
++#  else
++	mov	(%LOCALE_REG), %RAX_LP
++#  endif
++	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
++	jne	STRCASECMP_NONASCII
++	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
++# endif
++
+ # ifdef USE_AS_STRNCMP
++	/* Don't overwrite LOCALE_REG (rcx) until we have pass
++	   L(one_or_less). Otherwise we might use the wrong locale in
++	   the OVERFLOW_STRCMP (strcasecmp_l).  */
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+@@ -125,6 +261,32 @@ ENTRY(STRCMP)
+ 	   actually bound the buffer.  */
+ 	jle	L(one_or_less)
+ # endif
++
++# if defined USE_AS_STRCASECMP_L
++	.section .rodata.cst32, "aM", @progbits, 32
++	.align	32
++L(lcase_min):
++	.quad	0x4141414141414141
++	.quad	0x4141414141414141
++	.quad	0x4141414141414141
++	.quad	0x4141414141414141
++L(lcase_max):
++	.quad	0x1a1a1a1a1a1a1a1a
++	.quad	0x1a1a1a1a1a1a1a1a
++	.quad	0x1a1a1a1a1a1a1a1a
++	.quad	0x1a1a1a1a1a1a1a1a
++L(case_add):
++	.quad	0x2020202020202020
++	.quad	0x2020202020202020
++	.quad	0x2020202020202020
++	.quad	0x2020202020202020
++	.previous
++
++	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
++	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
++	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
++# endif
++
+ 	movl	%edi, %eax
+ 	orl	%esi, %eax
+ 	/* Shift out the bits irrelivant to page boundary ([63:12]).  */
+@@ -139,7 +301,7 @@ L(no_page_cross):
+ 	VPTESTM	%YMM0, %YMM0, %k2
+ 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+ 	   in YMM0 and 32 bytes at (%rsi).  */
+-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
++	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ # ifdef USE_AS_STRNCMP
+ 	cmpq	$CHAR_PER_VEC, %rdx
+@@ -169,6 +331,8 @@ L(return_vec_0):
+ # else
+ 	movzbl	(%rdi, %rcx), %eax
+ 	movzbl	(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ # endif
+ L(ret0):
+@@ -188,11 +352,15 @@ L(ret_zero):
+ 
+ 	.p2align 4,, 5
+ L(one_or_less):
++#  ifdef USE_AS_STRCASECMP_L
++	/* Set locale argument for strcasecmp.  */
++	movq	%LOCALE_REG, %rdx
++#  endif
+ 	jb	L(ret_zero)
+-#  ifdef USE_AS_WCSCMP
+ 	/* 'nbe' covers the case where length is negative (large
+ 	   unsigned).  */
+-	jnbe	__wcscmp_evex
++	jnbe	OVERFLOW_STRCMP
++#  ifdef USE_AS_WCSCMP
+ 	movl	(%rdi), %edx
+ 	xorl	%eax, %eax
+ 	cmpl	(%rsi), %edx
+@@ -201,11 +369,10 @@ L(one_or_less):
+ 	negl	%eax
+ 	orl	$1, %eax
+ #  else
+-	/* 'nbe' covers the case where length is negative (large
+-	   unsigned).  */
+-	jnbe	__strcmp_evex
+ 	movzbl	(%rdi), %eax
+ 	movzbl	(%rsi), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ #  endif
+ L(ret1):
+@@ -233,6 +400,8 @@ L(return_vec_1):
+ # else
+ 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+ 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ # endif
+ L(ret2):
+@@ -270,6 +439,8 @@ L(return_vec_2):
+ # else
+ 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+ 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ # endif
+ L(ret3):
+@@ -290,6 +461,8 @@ L(return_vec_3):
+ #  else
+ 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+ 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ #  endif
+ L(ret4):
+@@ -303,7 +476,7 @@ L(more_3x_vec):
+ 	/* Safe to compare 4x vectors.  */
+ 	VMOVU	(VEC_SIZE)(%rdi), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
++	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ 	TESTEQ	%ecx
+ 	jnz	L(return_vec_1)
+@@ -315,14 +488,14 @@ L(more_3x_vec):
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
++	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ 	TESTEQ	%ecx
+ 	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
++	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ 	TESTEQ	%ecx
+ 	jnz	L(return_vec_3)
+@@ -381,7 +554,6 @@ L(prepare_loop_aligned):
+ 	subl	%esi, %eax
+ 	andl	$(PAGE_SIZE - 1), %eax
+ 
+-	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO
+ 
+ 	/* Loop 4x comparisons at a time.  */
+ 	.p2align 4
+@@ -413,22 +585,35 @@ L(loop_skip_page_cross_check):
+ 	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
+ 	VPMINU	%YMM8, %YMM9, %YMM9
+ 
+-	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
++	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
+ 	VPTESTM	%YMM9, %YMM9, %k1
+-
++# ifndef USE_AS_STRCASECMP_L
+ 	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
+ 	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
+ 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+ 	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
+ 	   oring with YMM1. Result is stored in YMM6.  */
+ 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+-
++# else
++	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
++	TOLOWER_YMM (%YMM0, %YMM1)
++	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
++	TOLOWER_YMM (%YMM2, %YMM3)
++	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
++	TOLOWER_YMM (%YMM4, %YMM5)
++	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
++	TOLOWER_YMM (%YMM6, %YMM7)
++	vpxorq	%YMM0, %YMM1, %YMM1
++	vpxorq	%YMM2, %YMM3, %YMM3
++	vpxorq	%YMM4, %YMM5, %YMM5
++	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
++# endif
+ 	/* Or together YMM3, YMM5, and YMM6.  */
+ 	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
+ 
+ 
+ 	/* A non-zero CHAR in YMM6 represents a mismatch.  */
+-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
++	VPTESTNM %YMM6, %YMM6, %k0{%k1}
+ 	kmovd	%k0, %LOOP_REG
+ 
+ 	TESTEQ	%LOOP_REG
+@@ -437,13 +622,13 @@ L(loop_skip_page_cross_check):
+ 
+ 	/* Find which VEC has the mismatch of end of string.  */
+ 	VPTESTM	%YMM0, %YMM0, %k1
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
++	VPTESTNM %YMM1, %YMM1, %k0{%k1}
+ 	kmovd	%k0, %ecx
+ 	TESTEQ	%ecx
+ 	jnz	L(return_vec_0_end)
+ 
+ 	VPTESTM	%YMM2, %YMM2, %k1
+-	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
++	VPTESTNM %YMM3, %YMM3, %k0{%k1}
+ 	kmovd	%k0, %ecx
+ 	TESTEQ	%ecx
+ 	jnz	L(return_vec_1_end)
+@@ -457,7 +642,7 @@ L(return_vec_2_3_end):
+ # endif
+ 
+ 	VPTESTM	%YMM4, %YMM4, %k1
+-	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
++	VPTESTNM %YMM5, %YMM5, %k0{%k1}
+ 	kmovd	%k0, %ecx
+ 	TESTEQ	%ecx
+ # if CHAR_PER_VEC <= 16
+@@ -493,6 +678,8 @@ L(return_vec_3_end):
+ # else
+ 	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
+ 	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -545,6 +732,8 @@ L(return_vec_0_end):
+ # else
+ 	movzbl	(%rdi, %rcx), %eax
+ 	movzbl	(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	/* Flip `eax` if `rdi` and `rsi` where swapped in page cross
+ 	   logic. Subtract `r8d` after xor for zero case.  */
+@@ -569,6 +758,8 @@ L(return_vec_1_end):
+ #  else
+ 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+ 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -598,7 +789,7 @@ L(page_cross_during_loop):
+ 
+ 	VMOVA	(%rdi), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
++	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ 	TESTEQ	%ecx
+ 	jnz	L(return_vec_0_end)
+@@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross):
+ 	   been loaded earlier so must be valid.  */
+ 	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
+-
++	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
+ 	/* Mask of potentially valid bits. The lower bits can be out of
+ 	   range comparisons (but safe regarding page crosses).  */
+ 
+@@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross):
+ 
+ # ifdef USE_AS_STRNCMP
+ #  ifdef USE_AS_WCSCMP
++	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
++	   safe.  */
+ 	movl	%eax, %r11d
+ 	shrl	$2, %r11d
+ 	cmpq	%r11, %rdx
+@@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem):
+ # else
+ 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+ 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross):
+ 
+ 	VMOVA	VEC_SIZE(%rdi), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
++	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ 	TESTEQ	%ecx
+ 	jnz	L(return_vec_1_end)
+@@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross):
+ 	/* Safe to include comparisons from lower bytes.  */
+ 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	VPCMP	$0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
++	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ 	TESTEQ	%ecx
+ 	jnz	L(return_vec_page_cross_0)
+ 
+ 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	VPCMP	$0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
++	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ 	TESTEQ	%ecx
+ 	jnz	L(return_vec_page_cross_1)
+@@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross):
+ 	/* Must check length here as length might proclude reading next
+ 	   page.  */
+ #  ifdef USE_AS_WCSCMP
++	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
++	   safe.  */
+ 	movl	%eax, %r11d
+ 	shrl	$2, %r11d
+ 	cmpq	%r11, %rdx
+@@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross):
+ 	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+ 	VPMINU	%YMM4, %YMM6, %YMM9
+ 	VPTESTM	%YMM9, %YMM9, %k1
+-
++# ifndef USE_AS_STRCASECMP_L
+ 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+ 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
+ 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
+-
+-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
++# else
++	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
++	TOLOWER_YMM (%YMM4, %YMM5)
++	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
++	TOLOWER_YMM (%YMM6, %YMM7)
++	vpxorq	%YMM4, %YMM5, %YMM5
++	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
++# endif
++	VPTESTNM %YMM6, %YMM6, %k0{%k1}
+ 	kmovd	%k0, %LOOP_REG
+ 	TESTEQ	%LOOP_REG
+ 	jnz	L(return_vec_2_3_end)
+@@ -815,6 +1018,8 @@ L(return_vec_page_cross_1):
+ # else
+ 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+ 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -871,7 +1076,7 @@ L(page_cross):
+ L(page_cross_loop):
+ 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
++	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ 	TESTEQ	%ecx
+ 	jnz	L(check_ret_vec_page_cross)
+@@ -895,7 +1100,7 @@ L(page_cross_loop):
+ 	 */
+ 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
+ 	VPTESTM	%YMM0, %YMM0, %k2
+-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
++	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+ 
+ 	kmovd	%k1, %ecx
+ # ifdef USE_AS_STRNCMP
+@@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont):
+ # else
+ 	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
+ 	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %ecx)
+ 	subl	%ecx, %eax
+ 	xorl	%r8d, %eax
+ 	subl	%r8d, %eax
+@@ -989,7 +1196,7 @@ L(less_1x_vec_till_page):
+ 	/* Use 16 byte comparison.  */
+ 	vmovdqu	(%rdi), %xmm0
+ 	VPTESTM	%xmm0, %xmm0, %k2
+-	VPCMP	$0, (%rsi), %xmm0, %k1{%k2}
++	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	subl	$0xf, %ecx
+@@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page):
+ # endif
+ 	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+ 	VPTESTM	%xmm0, %xmm0, %k2
+-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
++	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	subl	$0xf, %ecx
+@@ -1048,7 +1255,7 @@ L(less_16_till_page):
+ 	vmovq	(%rdi), %xmm0
+ 	vmovq	(%rsi), %xmm1
+ 	VPTESTM	%xmm0, %xmm0, %k2
+-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
++	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	subl	$0x3, %ecx
+@@ -1068,7 +1275,7 @@ L(less_16_till_page):
+ 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+ 	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+ 	VPTESTM	%xmm0, %xmm0, %k2
+-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
++	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ # ifdef USE_AS_WCSCMP
+ 	subl	$0x3, %ecx
+@@ -1128,7 +1335,7 @@ L(ret_less_8_wcs):
+ 	vmovd	(%rdi), %xmm0
+ 	vmovd	(%rsi), %xmm1
+ 	VPTESTM	%xmm0, %xmm0, %k2
+-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
++	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ 	subl	$0xf, %ecx
+ 	jnz	L(check_ret_vec_page_cross)
+@@ -1143,7 +1350,7 @@ L(ret_less_8_wcs):
+ 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+ 	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+ 	VPTESTM	%xmm0, %xmm0, %k2
+-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
++	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
+ 	kmovd	%k1, %ecx
+ 	subl	$0xf, %ecx
+ 	jnz	L(check_ret_vec_page_cross)
+@@ -1176,7 +1383,9 @@ L(less_4_till_page):
+ L(less_4_loop):
+ 	movzbl	(%rdi), %eax
+ 	movzbl	(%rsi, %rdi), %ecx
+-	subl	%ecx, %eax
++	TOLOWER_gpr (%rax, %eax)
++	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
++	subl	%BYTE_LOOP_REG, %eax
+ 	jnz	L(ret_less_4_loop)
+ 	testl	%ecx, %ecx
+ 	jz	L(ret_zero_4_loop)
+@@ -1203,5 +1412,6 @@ L(ret_less_4_loop):
+ 	subl	%r8d, %eax
+ 	ret
+ # endif
+-END(STRCMP)
++	cfi_endproc
++	.size	STRCMP, .-STRCMP
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
+new file mode 100644
+index 00000000..8a5af369
+--- /dev/null
++++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
+@@ -0,0 +1,25 @@
++/* strncasecmp_l optimized with EVEX.
++   Copyright (C) 2017-2022 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#ifndef STRCMP
++# define STRCMP	__strncasecmp_l_evex
++#endif
++#define OVERFLOW_STRCMP	__strcasecmp_l_evex
++#define USE_AS_STRCASECMP_L
++#define USE_AS_STRNCMP
++#include "strcmp-evex.S"
+-- 
+GitLab
+
diff --git a/glibc-RHEL-15696-99.patch b/glibc-RHEL-15696-99.patch
new file mode 100644
index 0000000..06d5d53
--- /dev/null
+++ b/glibc-RHEL-15696-99.patch
@@ -0,0 +1,913 @@
+From 305769b2a15c2e96f9e1b5195d3c4e0d6f0f4b68 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Mar 2022 16:57:46 -0500
+Subject: [PATCH] x86: Remove AVX str{n}casecmp
+Content-type: text/plain; charset=UTF-8
+
+The rationale is:
+
+1. SSE42 has nearly identical logic so any benefit is minimal (3.4%
+   regression on Tigerlake using SSE42 versus AVX across the
+   benchtest suite).
+2. AVX2 version covers the majority of targets that previously
+   preferred it.
+3. The targets where AVX would still be best (SnB and IVB) are
+   becoming outdated.
+
+All in all, the code size savings are worth it.
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile           |   2 -
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  12 -
+ sysdeps/x86_64/multiarch/ifunc-strcasecmp.h |   4 -
+ sysdeps/x86_64/multiarch/strcasecmp_l-avx.S |  22 --
+ sysdeps/x86_64/multiarch/strcmp-sse42.S     | 240 +++++++++-----------
+ sysdeps/x86_64/multiarch/strncase_l-avx.S   |  22 --
+ 6 files changed, 105 insertions(+), 197 deletions(-)
+ delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
+ delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 359712c1..bca82e38 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -50,7 +50,6 @@ sysdep_routines += \
+   stpncpy-evex \
+   stpncpy-sse2-unaligned \
+   stpncpy-ssse3 \
+-  strcasecmp_l-avx \
+   strcasecmp_l-avx2 \
+   strcasecmp_l-avx2-rtm \
+   strcasecmp_l-evex \
+@@ -91,7 +90,6 @@ sysdep_routines += \
+   strlen-avx2-rtm \
+   strlen-evex \
+   strlen-sse2 \
+-  strncase_l-avx \
+   strncase_l-avx2 \
+   strncase_l-avx2-rtm \
+   strncase_l-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 1dedc637..14314367 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -429,9 +429,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strcasecmp_avx2_rtm)
+-	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+-			      CPU_FEATURE_USABLE (AVX),
+-			      __strcasecmp_avx)
+ 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+ 			      CPU_FEATURE_USABLE (SSE4_2),
+ 			      __strcasecmp_sse42)
+@@ -453,9 +450,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strcasecmp_l_avx2_rtm)
+-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+-			      CPU_FEATURE_USABLE (AVX),
+-			      __strcasecmp_l_avx)
+ 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+ 			      CPU_FEATURE_USABLE (SSE4_2),
+ 			      __strcasecmp_l_sse42)
+@@ -591,9 +585,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strncasecmp_avx2_rtm)
+-	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+-			      CPU_FEATURE_USABLE (AVX),
+-			      __strncasecmp_avx)
+ 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+ 			      CPU_FEATURE_USABLE (SSE4_2),
+ 			      __strncasecmp_sse42)
+@@ -616,9 +607,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strncasecmp_l_avx2_rtm)
+-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+-			      CPU_FEATURE_USABLE (AVX),
+-			      __strncasecmp_l_avx)
+ 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+ 			      CPU_FEATURE_USABLE (SSE4_2),
+ 			      __strncasecmp_l_sse42)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+index 6dd49a21..34cfbb8f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+@@ -22,7 +22,6 @@
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+@@ -46,9 +45,6 @@ IFUNC_SELECTOR (void)
+         return OPTIMIZE (avx2);
+     }
+ 
+-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
+-    return OPTIMIZE (avx);
+-
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
+     return OPTIMIZE (sse42);
+diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
+deleted file mode 100644
+index 56a03547..00000000
+--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
++++ /dev/null
+@@ -1,22 +0,0 @@
+-/* strcasecmp_l optimized with AVX.
+-   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+-   This file is part of the GNU C Library.
+-
+-   The GNU C Library is free software; you can redistribute it and/or
+-   modify it under the terms of the GNU Lesser General Public
+-   License as published by the Free Software Foundation; either
+-   version 2.1 of the License, or (at your option) any later version.
+-
+-   The GNU C Library is distributed in the hope that it will be useful,
+-   but WITHOUT ANY WARRANTY; without even the implied warranty of
+-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+-   Lesser General Public License for more details.
+-
+-   You should have received a copy of the GNU Lesser General Public
+-   License along with the GNU C Library; if not, see
+-   <http://www.gnu.org/licenses/>.  */
+-
+-#define STRCMP_SSE42 __strcasecmp_l_avx
+-#define USE_AVX 1
+-#define USE_AS_STRCASECMP_L
+-#include "strcmp-sse42.S"
+diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+index 59e8ddfc..0a42b7a4 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+@@ -42,13 +42,8 @@
+ # define UPDATE_STRNCMP_COUNTER
+ #endif
+ 
+-#ifdef USE_AVX
+-# define SECTION	avx
+-# define GLABEL(l)	l##_avx
+-#else
+-# define SECTION	sse4.2
+-# define GLABEL(l)	l##_sse42
+-#endif
++#define SECTION	sse4.2
++#define GLABEL(l)	l##_sse42
+ 
+ #define LABEL(l)	.L##l
+ 
+@@ -106,21 +101,7 @@ END (GLABEL(__strncasecmp))
+ #endif
+ 
+ 
+-#ifdef USE_AVX
+-# define movdqa vmovdqa
+-# define movdqu vmovdqu
+-# define pmovmskb vpmovmskb
+-# define pcmpistri vpcmpistri
+-# define psubb vpsubb
+-# define pcmpeqb vpcmpeqb
+-# define psrldq vpsrldq
+-# define pslldq vpslldq
+-# define palignr vpalignr
+-# define pxor vpxor
+-# define D(arg) arg, arg
+-#else
+-# define D(arg) arg
+-#endif
++#define arg arg
+ 
+ STRCMP_SSE42:
+ 	cfi_startproc
+@@ -192,18 +173,7 @@ LABEL(case_add):
+ 	movdqu	(%rdi), %xmm1
+ 	movdqu	(%rsi), %xmm2
+ #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+-# ifdef USE_AVX
+-#  define TOLOWER(reg1, reg2) \
+-	vpaddb	LCASE_MIN_reg, reg1, %xmm7;					\
+-	vpaddb	LCASE_MIN_reg, reg2, %xmm8;					\
+-	vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;					\
+-	vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;					\
+-	vpandn	CASE_ADD_reg, %xmm7, %xmm7;					\
+-	vpandn	CASE_ADD_reg, %xmm8, %xmm8;					\
+-	vpaddb	%xmm7, reg1, reg1;					\
+-	vpaddb	%xmm8, reg2, reg2
+-# else
+-#  define TOLOWER(reg1, reg2) \
++# define TOLOWER(reg1, reg2) \
+ 	movdqa	LCASE_MIN_reg, %xmm7;					\
+ 	movdqa	LCASE_MIN_reg, %xmm8;					\
+ 	paddb	reg1, %xmm7;					\
+@@ -214,15 +184,15 @@ LABEL(case_add):
+ 	pandn	CASE_ADD_reg, %xmm8;					\
+ 	paddb	%xmm7, reg1;					\
+ 	paddb	%xmm8, reg2
+-# endif
++
+ 	TOLOWER (%xmm1, %xmm2)
+ #else
+ # define TOLOWER(reg1, reg2)
+ #endif
+-	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char checks */
+-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
+-	pcmpeqb	%xmm2, D(%xmm1)		/* compare first 16 bytes for equality */
+-	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
++	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
++	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
++	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
++	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+ 	pmovmskb %xmm1, %edx
+ 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
+ 	jnz	LABEL(less16bytes)/* If not, find different value or null char */
+@@ -246,7 +216,7 @@ LABEL(crosscache):
+ 	xor	%r8d, %r8d
+ 	and	$0xf, %ecx		/* offset of rsi */
+ 	and	$0xf, %eax		/* offset of rdi */
+-	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char check */
++	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char check */
+ 	cmp	%eax, %ecx
+ 	je	LABEL(ashr_0)		/* rsi and rdi relative offset same */
+ 	ja	LABEL(bigger)
+@@ -260,7 +230,7 @@ LABEL(bigger):
+ 	sub	%rcx, %r9
+ 	lea	LABEL(unaligned_table)(%rip), %r10
+ 	movslq	(%r10, %r9,4), %r9
+-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
++	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
+ 	lea	(%r10, %r9), %r10
+ 	_CET_NOTRACK jmp *%r10		/* jump to corresponding case */
+ 
+@@ -273,15 +243,15 @@ LABEL(bigger):
+ LABEL(ashr_0):
+ 
+ 	movdqa	(%rsi), %xmm1
+-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
++	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+-	pcmpeqb	(%rdi), D(%xmm1)	/* compare 16 bytes for equality */
++	pcmpeqb	(%rdi), %xmm1		/* compare 16 bytes for equality */
+ #else
+ 	movdqa	(%rdi), %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm2, D(%xmm1)		/* compare 16 bytes for equality */
++	pcmpeqb	%xmm2, %xmm1		/* compare 16 bytes for equality */
+ #endif
+-	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
++	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+ 	pmovmskb %xmm1, %r9d
+ 	shr	%cl, %edx		/* adjust 0xffff for offset */
+ 	shr	%cl, %r9d		/* adjust for 16-byte offset */
+@@ -361,10 +331,10 @@ LABEL(ashr_0_exit_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_1):
+-	pslldq	$15, D(%xmm2)		/* shift first string to align with second */
++	pslldq	$15, %xmm2		/* shift first string to align with second */
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)		/* compare 16 bytes for equality */
+-	psubb	%xmm0, D(%xmm2)		/* packed sub of comparison results*/
++	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
++	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx		/* adjust 0xffff for offset */
+ 	shr	%cl, %r9d		/* adjust for 16-byte offset */
+@@ -392,7 +362,7 @@ LABEL(loop_ashr_1_use):
+ 
+ LABEL(nibble_ashr_1_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $1, -16(%rdi, %rdx), D(%xmm0)
++	palignr $1, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -411,7 +381,7 @@ LABEL(nibble_ashr_1_restart_use):
+ 	jg	LABEL(nibble_ashr_1_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $1, -16(%rdi, %rdx), D(%xmm0)
++	palignr $1, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -431,7 +401,7 @@ LABEL(nibble_ashr_1_restart_use):
+ LABEL(nibble_ashr_1_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$1, D(%xmm0)
++	psrldq	$1, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -449,10 +419,10 @@ LABEL(nibble_ashr_1_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_2):
+-	pslldq	$14, D(%xmm2)
++	pslldq	$14, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -480,7 +450,7 @@ LABEL(loop_ashr_2_use):
+ 
+ LABEL(nibble_ashr_2_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $2, -16(%rdi, %rdx), D(%xmm0)
++	palignr $2, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -499,7 +469,7 @@ LABEL(nibble_ashr_2_restart_use):
+ 	jg	LABEL(nibble_ashr_2_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $2, -16(%rdi, %rdx), D(%xmm0)
++	palignr $2, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -519,7 +489,7 @@ LABEL(nibble_ashr_2_restart_use):
+ LABEL(nibble_ashr_2_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$2, D(%xmm0)
++	psrldq	$2, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -537,10 +507,10 @@ LABEL(nibble_ashr_2_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_3):
+-	pslldq	$13, D(%xmm2)
++	pslldq	$13, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -568,7 +538,7 @@ LABEL(loop_ashr_3_use):
+ 
+ LABEL(nibble_ashr_3_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $3, -16(%rdi, %rdx), D(%xmm0)
++	palignr $3, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -587,7 +557,7 @@ LABEL(nibble_ashr_3_restart_use):
+ 	jg	LABEL(nibble_ashr_3_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $3, -16(%rdi, %rdx), D(%xmm0)
++	palignr $3, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -607,7 +577,7 @@ LABEL(nibble_ashr_3_restart_use):
+ LABEL(nibble_ashr_3_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$3, D(%xmm0)
++	psrldq	$3, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -625,10 +595,10 @@ LABEL(nibble_ashr_3_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_4):
+-	pslldq	$12, D(%xmm2)
++	pslldq	$12, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -657,7 +627,7 @@ LABEL(loop_ashr_4_use):
+ 
+ LABEL(nibble_ashr_4_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $4, -16(%rdi, %rdx), D(%xmm0)
++	palignr $4, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -676,7 +646,7 @@ LABEL(nibble_ashr_4_restart_use):
+ 	jg	LABEL(nibble_ashr_4_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $4, -16(%rdi, %rdx), D(%xmm0)
++	palignr $4, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -696,7 +666,7 @@ LABEL(nibble_ashr_4_restart_use):
+ LABEL(nibble_ashr_4_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$4, D(%xmm0)
++	psrldq	$4, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -714,10 +684,10 @@ LABEL(nibble_ashr_4_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_5):
+-	pslldq	$11, D(%xmm2)
++	pslldq	$11, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -746,7 +716,7 @@ LABEL(loop_ashr_5_use):
+ 
+ LABEL(nibble_ashr_5_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $5, -16(%rdi, %rdx), D(%xmm0)
++	palignr $5, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -766,7 +736,7 @@ LABEL(nibble_ashr_5_restart_use):
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+ 
+-	palignr $5, -16(%rdi, %rdx), D(%xmm0)
++	palignr $5, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -786,7 +756,7 @@ LABEL(nibble_ashr_5_restart_use):
+ LABEL(nibble_ashr_5_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$5, D(%xmm0)
++	psrldq	$5, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -804,10 +774,10 @@ LABEL(nibble_ashr_5_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_6):
+-	pslldq	$10, D(%xmm2)
++	pslldq	$10, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -836,7 +806,7 @@ LABEL(loop_ashr_6_use):
+ 
+ LABEL(nibble_ashr_6_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $6, -16(%rdi, %rdx), D(%xmm0)
++	palignr $6, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -855,7 +825,7 @@ LABEL(nibble_ashr_6_restart_use):
+ 	jg	LABEL(nibble_ashr_6_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $6, -16(%rdi, %rdx), D(%xmm0)
++	palignr $6, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -875,7 +845,7 @@ LABEL(nibble_ashr_6_restart_use):
+ LABEL(nibble_ashr_6_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$6, D(%xmm0)
++	psrldq	$6, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -893,10 +863,10 @@ LABEL(nibble_ashr_6_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_7):
+-	pslldq	$9, D(%xmm2)
++	pslldq	$9, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -925,7 +895,7 @@ LABEL(loop_ashr_7_use):
+ 
+ LABEL(nibble_ashr_7_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $7, -16(%rdi, %rdx), D(%xmm0)
++	palignr $7, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -944,7 +914,7 @@ LABEL(nibble_ashr_7_restart_use):
+ 	jg	LABEL(nibble_ashr_7_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $7, -16(%rdi, %rdx), D(%xmm0)
++	palignr $7, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
+ #else
+@@ -964,7 +934,7 @@ LABEL(nibble_ashr_7_restart_use):
+ LABEL(nibble_ashr_7_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$7, D(%xmm0)
++	psrldq	$7, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -982,10 +952,10 @@ LABEL(nibble_ashr_7_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_8):
+-	pslldq	$8, D(%xmm2)
++	pslldq	$8, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -1014,7 +984,7 @@ LABEL(loop_ashr_8_use):
+ 
+ LABEL(nibble_ashr_8_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $8, -16(%rdi, %rdx), D(%xmm0)
++	palignr $8, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1033,7 +1003,7 @@ LABEL(nibble_ashr_8_restart_use):
+ 	jg	LABEL(nibble_ashr_8_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $8, -16(%rdi, %rdx), D(%xmm0)
++	palignr $8, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1053,7 +1023,7 @@ LABEL(nibble_ashr_8_restart_use):
+ LABEL(nibble_ashr_8_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$8, D(%xmm0)
++	psrldq	$8, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -1071,10 +1041,10 @@ LABEL(nibble_ashr_8_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_9):
+-	pslldq	$7, D(%xmm2)
++	pslldq	$7, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -1104,7 +1074,7 @@ LABEL(loop_ashr_9_use):
+ LABEL(nibble_ashr_9_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+ 
+-	palignr $9, -16(%rdi, %rdx), D(%xmm0)
++	palignr $9, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1123,7 +1093,7 @@ LABEL(nibble_ashr_9_restart_use):
+ 	jg	LABEL(nibble_ashr_9_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $9, -16(%rdi, %rdx), D(%xmm0)
++	palignr $9, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1143,7 +1113,7 @@ LABEL(nibble_ashr_9_restart_use):
+ LABEL(nibble_ashr_9_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$9, D(%xmm0)
++	psrldq	$9, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -1161,10 +1131,10 @@ LABEL(nibble_ashr_9_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_10):
+-	pslldq	$6, D(%xmm2)
++	pslldq	$6, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -1193,7 +1163,7 @@ LABEL(loop_ashr_10_use):
+ 
+ LABEL(nibble_ashr_10_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $10, -16(%rdi, %rdx), D(%xmm0)
++	palignr $10, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1212,7 +1182,7 @@ LABEL(nibble_ashr_10_restart_use):
+ 	jg	LABEL(nibble_ashr_10_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $10, -16(%rdi, %rdx), D(%xmm0)
++	palignr $10, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1232,7 +1202,7 @@ LABEL(nibble_ashr_10_restart_use):
+ LABEL(nibble_ashr_10_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$10, D(%xmm0)
++	psrldq	$10, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -1250,10 +1220,10 @@ LABEL(nibble_ashr_10_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_11):
+-	pslldq	$5, D(%xmm2)
++	pslldq	$5, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -1282,7 +1252,7 @@ LABEL(loop_ashr_11_use):
+ 
+ LABEL(nibble_ashr_11_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $11, -16(%rdi, %rdx), D(%xmm0)
++	palignr $11, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1301,7 +1271,7 @@ LABEL(nibble_ashr_11_restart_use):
+ 	jg	LABEL(nibble_ashr_11_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $11, -16(%rdi, %rdx), D(%xmm0)
++	palignr $11, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1321,7 +1291,7 @@ LABEL(nibble_ashr_11_restart_use):
+ LABEL(nibble_ashr_11_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$11, D(%xmm0)
++	psrldq	$11, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -1339,10 +1309,10 @@ LABEL(nibble_ashr_11_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_12):
+-	pslldq	$4, D(%xmm2)
++	pslldq	$4, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -1371,7 +1341,7 @@ LABEL(loop_ashr_12_use):
+ 
+ LABEL(nibble_ashr_12_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $12, -16(%rdi, %rdx), D(%xmm0)
++	palignr $12, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1390,7 +1360,7 @@ LABEL(nibble_ashr_12_restart_use):
+ 	jg	LABEL(nibble_ashr_12_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $12, -16(%rdi, %rdx), D(%xmm0)
++	palignr $12, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1410,7 +1380,7 @@ LABEL(nibble_ashr_12_restart_use):
+ LABEL(nibble_ashr_12_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$12, D(%xmm0)
++	psrldq	$12, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -1428,10 +1398,10 @@ LABEL(nibble_ashr_12_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_13):
+-	pslldq	$3, D(%xmm2)
++	pslldq	$3, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -1461,7 +1431,7 @@ LABEL(loop_ashr_13_use):
+ 
+ LABEL(nibble_ashr_13_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $13, -16(%rdi, %rdx), D(%xmm0)
++	palignr $13, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1480,7 +1450,7 @@ LABEL(nibble_ashr_13_restart_use):
+ 	jg	LABEL(nibble_ashr_13_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $13, -16(%rdi, %rdx), D(%xmm0)
++	palignr $13, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1500,7 +1470,7 @@ LABEL(nibble_ashr_13_restart_use):
+ LABEL(nibble_ashr_13_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$13, D(%xmm0)
++	psrldq	$13, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -1518,10 +1488,10 @@ LABEL(nibble_ashr_13_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_14):
+-	pslldq  $2, D(%xmm2)
++	pslldq  $2, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -1551,7 +1521,7 @@ LABEL(loop_ashr_14_use):
+ 
+ LABEL(nibble_ashr_14_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $14, -16(%rdi, %rdx), D(%xmm0)
++	palignr $14, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1570,7 +1540,7 @@ LABEL(nibble_ashr_14_restart_use):
+ 	jg	LABEL(nibble_ashr_14_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $14, -16(%rdi, %rdx), D(%xmm0)
++	palignr $14, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1590,7 +1560,7 @@ LABEL(nibble_ashr_14_restart_use):
+ LABEL(nibble_ashr_14_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$14, D(%xmm0)
++	psrldq	$14, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+@@ -1608,10 +1578,10 @@ LABEL(nibble_ashr_14_use):
+  */
+ 	.p2align 4
+ LABEL(ashr_15):
+-	pslldq	$1, D(%xmm2)
++	pslldq	$1, %xmm2
+ 	TOLOWER (%xmm1, %xmm2)
+-	pcmpeqb	%xmm1, D(%xmm2)
+-	psubb	%xmm0, D(%xmm2)
++	pcmpeqb	%xmm1, %xmm2
++	psubb	%xmm0, %xmm2
+ 	pmovmskb %xmm2, %r9d
+ 	shr	%cl, %edx
+ 	shr	%cl, %r9d
+@@ -1643,7 +1613,7 @@ LABEL(loop_ashr_15_use):
+ 
+ LABEL(nibble_ashr_15_restart_use):
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $15, -16(%rdi, %rdx), D(%xmm0)
++	palignr $15, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1662,7 +1632,7 @@ LABEL(nibble_ashr_15_restart_use):
+ 	jg	LABEL(nibble_ashr_15_use)
+ 
+ 	movdqa	(%rdi, %rdx), %xmm0
+-	palignr $15, -16(%rdi, %rdx), D(%xmm0)
++	palignr $15, -16(%rdi, %rdx), %xmm0
+ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+ #else
+@@ -1682,7 +1652,7 @@ LABEL(nibble_ashr_15_restart_use):
+ LABEL(nibble_ashr_15_use):
+ 	sub	$0x1000, %r10
+ 	movdqa	-16(%rdi, %rdx), %xmm0
+-	psrldq	$15, D(%xmm0)
++	psrldq	$15, %xmm0
+ 	pcmpistri      $0x3a,%xmm0, %xmm0
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ 	cmp	%r11, %rcx
+diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S
+deleted file mode 100644
+index 0c4e525b..00000000
+--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S
++++ /dev/null
+@@ -1,22 +0,0 @@
+-/* strncasecmp_l optimized with AVX.
+-   Copyright (C) 2017-2018 Free Software Foundation, Inc.
+-   This file is part of the GNU C Library.
+-
+-   The GNU C Library is free software; you can redistribute it and/or
+-   modify it under the terms of the GNU Lesser General Public
+-   License as published by the Free Software Foundation; either
+-   version 2.1 of the License, or (at your option) any later version.
+-
+-   The GNU C Library is distributed in the hope that it will be useful,
+-   but WITHOUT ANY WARRANTY; without even the implied warranty of
+-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+-   Lesser General Public License for more details.
+-
+-   You should have received a copy of the GNU Lesser General Public
+-   License along with the GNU C Library; if not, see
+-   <http://www.gnu.org/licenses/>.  */
+-
+-#define STRCMP_SSE42 __strncasecmp_l_avx
+-#define USE_AVX 1
+-#define USE_AS_STRNCASECMP_L
+-#include "strcmp-sse42.S"
+-- 
+GitLab
+
diff --git a/glibc.spec b/glibc.spec
index a5ff930..2d5c641 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -1,6 +1,6 @@
 %define glibcsrcdir glibc-2.28
 %define glibcversion 2.28
-%define glibcrelease 245%{?dist}
+%define glibcrelease 246%{?dist}
 # Pre-release tarballs are pulled in from git using a command that is
 # effectively:
 #
@@ -1065,6 +1065,116 @@ Patch877: glibc-RHEL-16825-1.patch
 Patch878: glibc-RHEL-16825-2.patch
 Patch879: glibc-RHEL-16825-3.patch
 Patch880: glibc-RHEL-16825-4.patch
+Patch881: glibc-RHEL-15696-1.patch
+Patch882: glibc-RHEL-15696-2.patch
+Patch883: glibc-RHEL-15696-3.patch
+Patch884: glibc-RHEL-15696-4.patch
+Patch885: glibc-RHEL-15696-5.patch
+Patch886: glibc-RHEL-15696-6.patch
+Patch887: glibc-RHEL-15696-7.patch
+Patch888: glibc-RHEL-15696-8.patch
+Patch889: glibc-RHEL-15696-9.patch
+Patch890: glibc-RHEL-15696-10.patch
+Patch891: glibc-RHEL-15696-11.patch
+Patch892: glibc-RHEL-15696-12.patch
+Patch893: glibc-RHEL-15696-13.patch
+Patch894: glibc-RHEL-15696-14.patch
+Patch895: glibc-RHEL-15696-15.patch
+Patch896: glibc-RHEL-15696-16.patch
+Patch897: glibc-RHEL-15696-17.patch
+Patch898: glibc-RHEL-15696-18.patch
+Patch899: glibc-RHEL-15696-19.patch
+Patch900: glibc-RHEL-15696-20.patch
+Patch901: glibc-RHEL-15696-21.patch
+Patch902: glibc-RHEL-15696-22.patch
+Patch903: glibc-RHEL-15696-23.patch
+Patch904: glibc-RHEL-15696-24.patch
+Patch905: glibc-RHEL-15696-25.patch
+Patch906: glibc-RHEL-15696-26.patch
+Patch907: glibc-RHEL-15696-27.patch
+Patch908: glibc-RHEL-15696-28.patch
+Patch909: glibc-RHEL-15696-29.patch
+Patch910: glibc-RHEL-15696-30.patch
+Patch911: glibc-RHEL-15696-31.patch
+Patch912: glibc-RHEL-15696-32.patch
+Patch913: glibc-RHEL-15696-33.patch
+Patch914: glibc-RHEL-15696-34.patch
+Patch915: glibc-RHEL-15696-35.patch
+Patch916: glibc-RHEL-15696-36.patch
+Patch917: glibc-RHEL-15696-37.patch
+Patch918: glibc-RHEL-15696-38.patch
+Patch919: glibc-RHEL-15696-39.patch
+Patch920: glibc-RHEL-15696-40.patch
+Patch921: glibc-RHEL-15696-41.patch
+Patch922: glibc-RHEL-15696-42.patch
+Patch923: glibc-RHEL-15696-43.patch
+Patch924: glibc-RHEL-15696-44.patch
+Patch925: glibc-RHEL-15696-45.patch
+Patch926: glibc-RHEL-15696-46.patch
+Patch927: glibc-RHEL-15696-47.patch
+Patch928: glibc-RHEL-15696-48.patch
+Patch929: glibc-RHEL-15696-49.patch
+Patch930: glibc-RHEL-15696-50.patch
+Patch931: glibc-RHEL-15696-51.patch
+Patch932: glibc-RHEL-15696-52.patch
+Patch933: glibc-RHEL-15696-53.patch
+Patch934: glibc-RHEL-15696-54.patch
+Patch935: glibc-RHEL-15696-55.patch
+Patch936: glibc-RHEL-15696-56.patch
+Patch937: glibc-RHEL-15696-57.patch
+Patch938: glibc-RHEL-15696-58.patch
+Patch939: glibc-RHEL-15696-59.patch
+Patch940: glibc-RHEL-15696-60.patch
+Patch941: glibc-RHEL-15696-61.patch
+Patch942: glibc-RHEL-15696-62.patch
+Patch943: glibc-RHEL-15696-63.patch
+Patch944: glibc-RHEL-15696-64.patch
+Patch945: glibc-RHEL-15696-65.patch
+Patch946: glibc-RHEL-15696-66.patch
+Patch947: glibc-RHEL-15696-67.patch
+Patch948: glibc-RHEL-15696-68.patch
+Patch949: glibc-RHEL-15696-69.patch
+Patch950: glibc-RHEL-15696-70.patch
+Patch951: glibc-RHEL-15696-71.patch
+Patch952: glibc-RHEL-15696-72.patch
+Patch953: glibc-RHEL-15696-73.patch
+Patch954: glibc-RHEL-15696-74.patch
+Patch955: glibc-RHEL-15696-75.patch
+Patch956: glibc-RHEL-15696-76.patch
+Patch957: glibc-RHEL-15696-77.patch
+Patch958: glibc-RHEL-15696-78.patch
+Patch959: glibc-RHEL-15696-79.patch
+Patch960: glibc-RHEL-15696-80.patch
+Patch961: glibc-RHEL-15696-81.patch
+Patch962: glibc-RHEL-15696-82.patch
+Patch963: glibc-RHEL-15696-83.patch
+Patch964: glibc-RHEL-15696-84.patch
+Patch965: glibc-RHEL-15696-85.patch
+Patch966: glibc-RHEL-15696-86.patch
+Patch967: glibc-RHEL-15696-87.patch
+Patch968: glibc-RHEL-15696-88.patch
+Patch969: glibc-RHEL-15696-89.patch
+Patch970: glibc-RHEL-15696-90.patch
+Patch971: glibc-RHEL-15696-91.patch
+Patch972: glibc-RHEL-15696-92.patch
+Patch973: glibc-RHEL-15696-93.patch
+Patch974: glibc-RHEL-15696-94.patch
+Patch975: glibc-RHEL-15696-95.patch
+Patch976: glibc-RHEL-15696-96.patch
+Patch977: glibc-RHEL-15696-97.patch
+Patch978: glibc-RHEL-15696-98.patch
+Patch979: glibc-RHEL-15696-99.patch
+Patch980: glibc-RHEL-15696-100.patch
+Patch981: glibc-RHEL-15696-101.patch
+Patch982: glibc-RHEL-15696-102.patch
+Patch983: glibc-RHEL-15696-103.patch
+Patch984: glibc-RHEL-15696-104.patch
+Patch985: glibc-RHEL-15696-105.patch
+Patch986: glibc-RHEL-15696-106.patch
+Patch987: glibc-RHEL-15696-107.patch
+Patch988: glibc-RHEL-15696-108.patch
+Patch989: glibc-RHEL-15696-109.patch
+Patch990: glibc-RHEL-15696-110.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2896,6 +3006,9 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 
 %changelog
+* Thu Dec 14 2023 DJ Delorie <dj@redhat.com> - 2.28-246
+- Include CentOS Hyperscaler SIG patches backported by Intel (RHEL-15696)
+
 * Fri Dec  8 2023 Florian Weimer <fweimer@redhat.com> - 2.28-245
 - Improve compatibility between underlinking and IFUNC resolvers (RHEL-16825)