Import glibc-2.34-40.fc35 from f35

* Fri Jul 22 2022 Arjun Shankar <arjun@redhat.com> - 2.34-40
- Sync with upstream branch release/2.34/master,
  commit b2f32e746492615a6eb3e66fac1e766e32e8deb1:
- malloc: Simplify implementation of __malloc_assert
- Update syscall-names.list for Linux 5.18
- x86: Add missing IS_IN (libc) check to strncmp-sse4_2.S
- x86: Move mem{p}{mov|cpy}_{chk_}erms to its own file
- x86: Move and slightly improve memset_erms
- x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list
- x86: Put wcs{n}len-sse4.1 in the sse4.1 text section
- x86: Align entry for memrchr to 64-bytes.
- x86: Add BMI1/BMI2 checks for ISA_V3 check
- x86: Cleanup bounds checking in large memcpy case
- x86: Add bounds `x86_non_temporal_threshold`
- x86: Add sse42 implementation to strcmp's ifunc
- x86: Fix misordered logic for setting `rep_movsb_stop_threshold`
- x86: Align varshift table to 32-bytes
- x86: ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST expect no transactions
- x86: Shrink code size of memchr-evex.S
- x86: Shrink code size of memchr-avx2.S
- x86: Optimize memrchr-avx2.S
- x86: Optimize memrchr-evex.S
- x86: Optimize memrchr-sse2.S
- x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret`
- x86: Create header for VEC classes in x86 strings library
- x86_64: Add strstr function with 512-bit EVEX
- x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT
- x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
- x86_64: Remove bzero optimization
- x86_64: Remove end of line trailing spaces
- nptl: Fix ___pthread_unregister_cancel_restore asynchronous restore
- linux: Fix mq_timereceive check for 32 bit fallback code (BZ 29304)

Resolves: #2109505
Arjun Shankar 2022-07-22 15:47:47 +02:00
parent bc1778ead1
commit 668eaab0c7
30 changed files with 5186 additions and 1 deletion

@@ -0,0 +1,27 @@
commit 4b246b2bbd1d5a77035bb990d6097b7337c34bbb
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu Jun 30 09:08:31 2022 -0300
linux: Fix mq_timereceive check for 32 bit fallback code (BZ 29304)
On success, mq_receive() and mq_timedreceive() return the number of
bytes in the received message, so the fallback code must check for a
non-negative return value rather than comparing it against 0.
Checked on i686-linux-gnu.
(cherry picked from commit 71d87d85bf54f6522813aec97c19bdd24997341e)
diff --git a/sysdeps/unix/sysv/linux/mq_timedreceive.c b/sysdeps/unix/sysv/linux/mq_timedreceive.c
index 7f3a112d7f2cbbe7..1fc98752e7d6d506 100644
--- a/sysdeps/unix/sysv/linux/mq_timedreceive.c
+++ b/sysdeps/unix/sysv/linux/mq_timedreceive.c
@@ -41,7 +41,7 @@ ___mq_timedreceive_time64 (mqd_t mqdes, char *__restrict msg_ptr, size_t msg_len
{
int r = SYSCALL_CANCEL (mq_timedreceive_time64, mqdes, msg_ptr, msg_len,
msg_prio, abs_timeout);
- if (r == 0 || errno != ENOSYS)
+ if (r >= 0 || errno != ENOSYS)
return r;
__set_errno (EOVERFLOW);
return -1;
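For illustration only (not part of the patch): a minimal user-space sketch showing that a successful receive yields a positive byte count, which is exactly the case the old `r == 0` test on the fallback path mishandled. The queue name is made up; link with -lrt on older glibc.
```c
#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>
#include <time.h>

int
main (void)
{
  struct mq_attr attr = { .mq_maxmsg = 4, .mq_msgsize = 32 };
  mqd_t q = mq_open ("/bz29304-demo", O_CREAT | O_RDWR, 0600, &attr);
  char buf[32];
  struct timespec ts;

  mq_send (q, "hi", 2, 0);
  clock_gettime (CLOCK_REALTIME, &ts);
  ts.tv_sec += 1;
  /* Returns 2 (bytes in the message) on success, not 0 -- the value
     the old check did not treat as success.  */
  ssize_t n = mq_timedreceive (q, buf, sizeof buf, NULL, &ts);
  printf ("mq_timedreceive returned %zd\n", n);

  mq_close (q);
  mq_unlink ("/bz29304-demo");
  return 0;
}
```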

@@ -0,0 +1,25 @@
commit 7789a849234f8b303a571134abe72691ce8c2540
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Wed Jul 13 10:37:32 2022 -0300
nptl: Fix ___pthread_unregister_cancel_restore asynchronous restore
This was due to an incorrect revert done in commit 404656009b459658.
Checked on x86_64-linux-gnu and i686-linux-gnu.
(cherry picked from commit f27e5e21787abc9f719879af47687221aa1027b3)
diff --git a/nptl/cleanup_defer.c b/nptl/cleanup_defer.c
index 35ba40fb0247c7cc..59571229d8ccf481 100644
--- a/nptl/cleanup_defer.c
+++ b/nptl/cleanup_defer.c
@@ -72,7 +72,7 @@ ___pthread_unregister_cancel_restore (__pthread_unwind_buf_t *buf)
return;
int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
- if (cancelhandling & CANCELTYPE_BITMASK)
+ if ((cancelhandling & CANCELTYPE_BITMASK) == 0)
{
int newval;
do
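For context (illustration only, not from the patch): ___pthread_unregister_cancel_restore backs the GNU extension pthread_cleanup_pop_restore_np, which restores the cancellation type saved by pthread_cleanup_push_defer_np. A minimal, hypothetical usage sketch of that API pair (compile with -pthread):
```c
#define _GNU_SOURCE
#include <pthread.h>

static void
unlock (void *arg)
{
  pthread_mutex_unlock (arg);
}

static void *
worker (void *arg)
{
  pthread_mutex_t *m = arg;

  /* Run with asynchronous cancellation enabled.  */
  pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

  pthread_mutex_lock (m);
  /* Temporarily force deferred cancellation around the critical
     section; the matching pop is expected to restore the saved
     (asynchronous) type -- the restore path fixed above.  */
  pthread_cleanup_push_defer_np (unlock, m);
  /* ... critical section ... */
  pthread_cleanup_pop_restore_np (1);

  return NULL;
}

int
main (void)
{
  pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
  pthread_t t;
  pthread_create (&t, NULL, worker, &m);
  pthread_join (t, NULL);
  return 0;
}
```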

@@ -0,0 +1,29 @@
commit 8d324019e69203f5998f223d0e905de1395330ea
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Mon Jul 18 18:38:48 2022 -0700
x86_64: Remove end of line trailing spaces
This commit removes the trailing spaces introduced by the following commit.
commit a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 23 01:56:29 2021 -0400
x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index b7657282bd2e6fa5..031753a91763b351 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -66,8 +66,8 @@ ENTRY(strlen)
L(n_nonzero):
# ifdef AS_WCSLEN
/* Check for overflow from maxlen * sizeof(wchar_t). If it would
- overflow the only way this program doesn't have undefined behavior
- is if there is a null terminator in valid memory so wcslen will
+ overflow the only way this program doesn't have undefined behavior
+ is if there is a null terminator in valid memory so wcslen will
suffice. */
mov %RSI_LP, %R10_LP
sar $62, %R10_LP

@@ -0,0 +1,457 @@
commit eb9aa96facc5231e208de0946870f10c21aee9f3
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Fri May 13 09:33:30 2022 -0300
x86_64: Remove bzero optimization
Both symbols are marked as legacy in POSIX.1-2001 and were removed in
POSIX.1-2008, although the prototypes are still defined for _GNU_SOURCE
or _DEFAULT_SOURCE.
GCC also replaces bcopy with memmove and bzero with memset in its default
configuration (to actually get a bzero libc call, the code must omit the
string.h inclusion and be built with -fno-builtin), so it is highly
unlikely that programs actually call the libc bzero symbol.
On a recent Linux distro (Ubuntu 22.04), there are no bzero calls
from the installed binaries.
$ cat count_bstring.sh
#!/bin/bash
files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
total=0
for file in $files; do
  symbols=`objdump -R $file 2>&1`
  if [ $? -eq 0 ]; then
    ncalls=`echo $symbols | grep -w $1 | wc -l`
    ((total=total+ncalls))
    if [ $ncalls -gt 0 ]; then
      echo "$file: $ncalls"
    fi
  fi
done
echo "TOTAL=$total"
$ ./count_bstring.sh bzero
TOTAL=0
Checked on x86_64-linux-gnu.
(cherry picked from commit 9403b71ae97e3f1a91c796ddcbb4e6f044434734)
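As a hedged illustration of the point above (assumed file name and build command, not from the commit): the only way to observe a call to the libc bzero symbol is to omit string.h and disable the builtin, e.g. `gcc -fno-builtin demo.c`; the resulting binary then shows a bzero relocation under `objdump -R`, which is what the script above counts.
```c
/* string.h deliberately not included so GCC cannot substitute the
   builtin; declare bzero by hand instead.  */
#include <stddef.h>

extern void bzero (void *s, size_t n);

int
main (void)
{
  char buf[64];
  bzero (buf, sizeof buf);
  return buf[0];
}
```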
diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
deleted file mode 100644
index f96d567fd87696af..0000000000000000
--- a/sysdeps/x86_64/bzero.S
+++ /dev/null
@@ -1 +0,0 @@
-/* Implemented in memset.S. */
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 0358210c7ff3a976..2b64741fd10a8ec2 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -1,4 +1,4 @@
-/* memset/bzero -- set memory area to CH/0
+/* memset -- set memory area to CH/0
Optimized version for x86-64.
Copyright (C) 2002-2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -35,9 +35,6 @@
punpcklwd %xmm0, %xmm0; \
pshufd $0, %xmm0, %xmm0
-# define BZERO_ZERO_VEC0() \
- pxor %xmm0, %xmm0
-
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
pshufd $0, %xmm0, %xmm0; \
@@ -56,10 +53,6 @@
# define MEMSET_SYMBOL(p,s) memset
#endif
-#ifndef BZERO_SYMBOL
-# define BZERO_SYMBOL(p,s) __bzero
-#endif
-
#ifndef WMEMSET_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s) p
# define WMEMSET_SYMBOL(p,s) __wmemset
@@ -70,7 +63,6 @@
libc_hidden_builtin_def (memset)
#if IS_IN (libc)
-weak_alias (__bzero, bzero)
libc_hidden_def (__wmemset)
weak_alias (__wmemset, wmemset)
libc_hidden_weak (wmemset)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index b503e4b81e92a11c..67401162d526f664 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,7 +1,6 @@
ifeq ($(subdir),string)
sysdep_routines += \
- bzero \
memchr-avx2 \
memchr-avx2-rtm \
memchr-evex \
diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
deleted file mode 100644
index 13e399a9a1fbdeb2..0000000000000000
--- a/sysdeps/x86_64/multiarch/bzero.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Multiple versions of bzero.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
-# define __bzero __redirect___bzero
-# include <string.h>
-# undef __bzero
-
-/* OPTIMIZE1 definition required for bzero patch. */
-# define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name)
-# define SYMBOL_NAME __bzero
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
- attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
- const struct cpu_features* cpu_features = __get_cpu_features ();
-
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
- {
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
- {
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE1 (avx512_unaligned_erms);
-
- return OPTIMIZE1 (avx512_unaligned);
- }
- }
-
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
- {
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
- {
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE1 (evex_unaligned_erms);
-
- return OPTIMIZE1 (evex_unaligned);
- }
-
- if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
- {
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE1 (avx2_unaligned_erms_rtm);
-
- return OPTIMIZE1 (avx2_unaligned_rtm);
- }
-
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- {
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE1 (avx2_unaligned_erms);
-
- return OPTIMIZE1 (avx2_unaligned);
- }
- }
-
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE1 (sse2_unaligned_erms);
-
- return OPTIMIZE1 (sse2_unaligned);
-}
-
-libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
-
-weak_alias (__bzero, bzero)
-#endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index e5e48b36c3175e68..d990a7149489efd9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -280,48 +280,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memset_avx512_no_vzeroupper)
)
- /* Support sysdeps/x86_64/multiarch/bzero.c. */
- IFUNC_IMPL (i, name, bzero,
- IFUNC_IMPL_ADD (array, i, bzero, 1,
- __bzero_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, bzero, 1,
- __bzero_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, bzero,
- CPU_FEATURE_USABLE (AVX2),
- __bzero_avx2_unaligned)
- IFUNC_IMPL_ADD (array, i, bzero,
- CPU_FEATURE_USABLE (AVX2),
- __bzero_avx2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, bzero,
- (CPU_FEATURE_USABLE (AVX2)
- && CPU_FEATURE_USABLE (RTM)),
- __bzero_avx2_unaligned_rtm)
- IFUNC_IMPL_ADD (array, i, bzero,
- (CPU_FEATURE_USABLE (AVX2)
- && CPU_FEATURE_USABLE (RTM)),
- __bzero_avx2_unaligned_erms_rtm)
- IFUNC_IMPL_ADD (array, i, bzero,
- (CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)
- && CPU_FEATURE_USABLE (BMI2)),
- __bzero_evex_unaligned)
- IFUNC_IMPL_ADD (array, i, bzero,
- (CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)
- && CPU_FEATURE_USABLE (BMI2)),
- __bzero_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, bzero,
- (CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)
- && CPU_FEATURE_USABLE (BMI2)),
- __bzero_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, bzero,
- (CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)
- && CPU_FEATURE_USABLE (BMI2)),
- __bzero_avx512_unaligned)
- )
-
/* Support sysdeps/x86_64/multiarch/rawmemchr.c. */
IFUNC_IMPL (i, name, rawmemchr,
IFUNC_IMPL_ADD (array, i, rawmemchr,
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 5a5ee6f67299400b..8ac3e479bba488be 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -5,7 +5,6 @@
#define SECTION(p) p##.avx.rtm
#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
-#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm
#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
#include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index a093a2831f3dfa0d..c0bf2875d03d51ab 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -14,9 +14,6 @@
vmovd d, %xmm0; \
movq r, %rax;
-# define BZERO_ZERO_VEC0() \
- vpxor %xmm0, %xmm0, %xmm0
-
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
@@ -32,9 +29,6 @@
# ifndef MEMSET_SYMBOL
# define MEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
-# ifndef BZERO_SYMBOL
-# define BZERO_SYMBOL(p,s) p##_avx2_##s
-# endif
# ifndef WMEMSET_SYMBOL
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 727c92133a15900f..5241216a77bf72b7 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -19,9 +19,6 @@
vpbroadcastb d, %VEC0; \
movq r, %rax
-# define BZERO_ZERO_VEC0() \
- vpxorq %XMM0, %XMM0, %XMM0
-
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vpbroadcastd d, %VEC0; \
movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 5d8fa78f05476b10..637002150659123c 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -19,9 +19,6 @@
vpbroadcastb d, %VEC0; \
movq r, %rax
-# define BZERO_ZERO_VEC0() \
- vpxorq %XMM0, %XMM0, %XMM0
-
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vpbroadcastd d, %VEC0; \
movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 2951f7f5f70e274a..c47f3a9c955508a2 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -22,7 +22,6 @@
#if IS_IN (libc)
# define MEMSET_SYMBOL(p,s) p##_sse2_##s
-# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s)
# define WMEMSET_SYMBOL(p,s) p##_sse2_##s
# ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index d9c577fb5ff9700f..abc12d9cda1b3843 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -1,5 +1,5 @@
-/* memset/bzero with unaligned store and rep stosb
- Copyright (C) 2016-2021 Free Software Foundation, Inc.
+/* memset with unaligned store and rep stosb
+ Copyright (C) 2016-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -26,10 +26,6 @@
#include <sysdep.h>
-#ifndef BZERO_SYMBOL
-# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s)
-#endif
-
#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif
@@ -134,31 +130,6 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif
-ENTRY (BZERO_SYMBOL(__bzero, unaligned))
-#if VEC_SIZE > 16
- BZERO_ZERO_VEC0 ()
-#endif
- mov %RDI_LP, %RAX_LP
- mov %RSI_LP, %RDX_LP
-#ifndef USE_LESS_VEC_MASK_STORE
- xorl %esi, %esi
-#endif
- cmp $VEC_SIZE, %RDX_LP
- jb L(less_vec_no_vdup)
-#ifdef USE_LESS_VEC_MASK_STORE
- xorl %esi, %esi
-#endif
-#if VEC_SIZE <= 16
- BZERO_ZERO_VEC0 ()
-#endif
- cmp $(VEC_SIZE * 2), %RDX_LP
- ja L(more_2x_vec)
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
- VZEROUPPER_RETURN
-END (BZERO_SYMBOL(__bzero, unaligned))
-
#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
cmp %RDX_LP, %RCX_LP
@@ -216,31 +187,6 @@ END (__memset_erms)
END (MEMSET_SYMBOL (__memset, erms))
# endif
-ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
-# if VEC_SIZE > 16
- BZERO_ZERO_VEC0 ()
-# endif
- mov %RDI_LP, %RAX_LP
- mov %RSI_LP, %RDX_LP
-# ifndef USE_LESS_VEC_MASK_STORE
- xorl %esi, %esi
-# endif
- cmp $VEC_SIZE, %RDX_LP
- jb L(less_vec_no_vdup)
-# ifdef USE_LESS_VEC_MASK_STORE
- xorl %esi, %esi
-# endif
-# if VEC_SIZE <= 16
- BZERO_ZERO_VEC0 ()
-# endif
- cmp $(VEC_SIZE * 2), %RDX_LP
- ja L(stosb_more_2x_vec)
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
- VZEROUPPER_RETURN
-END (BZERO_SYMBOL(__bzero, unaligned_erms))
-
# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP
@@ -282,7 +228,6 @@ L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
.p2align 4,, 10
L(less_vec):
-L(less_vec_no_vdup):
L(less_vec_from_wmemset):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -430,9 +375,6 @@ L(less_vec):
xmm). This is only does anything for AVX2. */
MEMSET_VDUP_TO_VEC0_LOW ()
L(less_vec_from_wmemset):
-#if VEC_SIZE > 16
-L(less_vec_no_vdup):
-#endif
#endif
L(cross_page):
#if VEC_SIZE > 32
@@ -445,9 +387,6 @@ L(cross_page):
#endif
#ifndef USE_XMM_LESS_VEC
MOVQ %XMM0, %SET_REG64
-#endif
-#if VEC_SIZE <= 16
-L(less_vec_no_vdup):
#endif
cmpl $8, %edx
jge L(between_8_15)

@@ -0,0 +1,460 @@
commit 8ab861d295b90177b89288a2bc95c5de5e4e5bc6
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Sun Feb 27 16:39:47 2022 -0800
x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
This patch implements the following evex512 versions of string functions.
The performance gain for the evex512 versions is up to 50% compared to
evex, depending on length and alignment.
These are placeholder functions, not used by any processor at the moment.
- String length function using 512 bit vectors.
- String N length using 512 bit vectors.
- Wide string length using 512 bit vectors.
- Wide string N length using 512 bit vectors.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
(cherry picked from commit 9c66efb86fe384f77435f7e326333fb2e4e10676)
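As a rough, hedged sketch of the approach (AVX-512 intrinsics rather than the assembly below; assumes AVX512F/AVX512BW and compilation with -mavx512f -mavx512bw; it is not the glibc implementation): align the pointer down to a 64-byte boundary so loads never cross a page, compare a full ZMM vector against zero to get a 64-bit null mask, and take the bit index of the first hit.
```c
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

size_t
strlen_zmm_sketch (const char *s)
{
  /* Aligned 64-byte loads cannot cross a page boundary.  */
  const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 63);
  const __m512i zero = _mm512_setzero_si512 ();

  __m512i v = _mm512_load_si512 ((const void *) p);
  uint64_t m = (uint64_t) _mm512_cmpeq_epi8_mask (v, zero);
  m >>= (uintptr_t) s & 63;	/* drop bytes before the start */
  if (m != 0)
    return (size_t) __builtin_ctzll (m);

  for (;;)
    {
      p += 64;
      v = _mm512_load_si512 ((const void *) p);
      m = (uint64_t) _mm512_cmpeq_epi8_mask (v, zero);
      if (m != 0)
	return (size_t) (p - s) + __builtin_ctzll (m);
    }
}
```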
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 67401162d526f664..4d4ad2a3686b5bc3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -87,6 +87,7 @@ sysdep_routines += \
strlen-avx2 \
strlen-avx2-rtm \
strlen-evex \
+ strlen-evex512 \
strlen-sse2 \
strncase_l-avx2 \
strncase_l-avx2-rtm \
@@ -115,6 +116,7 @@ sysdep_routines += \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
+ strnlen-evex512 \
strnlen-sse2 \
strpbrk-c \
strpbrk-sse2 \
@@ -148,6 +150,7 @@ sysdep_routines += \
wcslen-avx2 \
wcslen-avx2-rtm \
wcslen-evex \
+ wcslen-evex512 \
wcslen-sse2 \
wcslen-sse4_1 \
wcsncmp-avx2 \
@@ -158,6 +161,7 @@ sysdep_routines += \
wcsnlen-avx2-rtm \
wcsnlen-c \
wcsnlen-evex \
+ wcsnlen-evex512 \
wcsnlen-sse4_1 \
wcsrchr-avx2 \
wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d990a7149489efd9..6b75a7106e174bce 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
+ IFUNC_IMPL_ADD (array, i, strlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strlen_evex512)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
@@ -335,6 +340,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strnlen_evex512)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
@@ -714,6 +724,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcslen_evex512)
IFUNC_IMPL_ADD (array, i, wcslen,
CPU_FEATURE_USABLE (SSE4_1),
__wcslen_sse4_1)
@@ -735,6 +750,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcsnlen_evex512)
IFUNC_IMPL_ADD (array, i, wcsnlen,
CPU_FEATURE_USABLE (SSE4_1),
__wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
new file mode 100644
index 0000000000000000..278c899691d89ba7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -0,0 +1,302 @@
+/* Placeholder function, not used by any processor at the moment.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+# define VPCMP vpcmpd
+# define VPTESTN vptestnmd
+# define VPMINU vpminud
+# define CHAR_SIZE 4
+# else
+# define VPCMP vpcmpb
+# define VPTESTN vptestnmb
+# define VPMINU vpminub
+# define CHAR_SIZE 1
+# endif
+
+# define XMM0 xmm16
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+# if VEC_SIZE == 64
+# define KMOV kmovq
+# define KORTEST kortestq
+# define RAX rax
+# define RCX rcx
+# define RDX rdx
+# define SHR shrq
+# define TEXTSUFFIX evex512
+# define VMM0 zmm16
+# define VMM1 zmm17
+# define VMM2 zmm18
+# define VMM3 zmm19
+# define VMM4 zmm20
+# define VMOVA vmovdqa64
+# elif VEC_SIZE == 32
+/* Currently Unused. */
+# define KMOV kmovd
+# define KORTEST kortestd
+# define RAX eax
+# define RCX ecx
+# define RDX edx
+# define SHR shrl
+# define TEXTSUFFIX evex256
+# define VMM0 ymm16
+# define VMM1 ymm17
+# define VMM2 ymm18
+# define VMM3 ymm19
+# define VMM4 ymm20
+# define VMOVA vmovdqa32
+# endif
+
+ .section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning entry point to 64 byte, provides better performance for
+ one vector length string. */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+ /* Check zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(ret_max)
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+# endif
+# endif
+
+ movl %edi, %eax
+ vpxorq %XMM0, %XMM0, %XMM0
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(page_cross)
+
+ /* Compare [w]char for null, mask bit will be set for match. */
+ VPCMP $0, (%rdi), %VMM0, %k0
+ KMOV %k0, %RAX
+ test %RAX, %RAX
+ jz L(align_more)
+
+ bsf %RAX, %RAX
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+ /* At this point vector max length reached. */
+# ifdef USE_AS_STRNLEN
+ .p2align 4,,3
+L(ret_max):
+ movq %rsi, %rax
+ ret
+# endif
+
+L(align_more):
+ leaq VEC_SIZE(%rdi), %rax
+ /* Align rax to VEC_SIZE. */
+ andq $-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+ movq %rax, %rdx
+ subq %rdi, %rdx
+# ifdef USE_AS_WCSLEN
+ SHR $2, %RDX
+# endif
+ /* At this point rdx contains [w]chars already compared. */
+ subq %rsi, %rdx
+ jae L(ret_max)
+ negq %rdx
+ /* At this point rdx contains number of w[char] needs to go.
+ Now onwards rdx will keep decrementing with each compare. */
+# endif
+
+ /* Loop unroll 4 times for 4 vector loop. */
+ VPCMP $0, (%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x1)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x2)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x3)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x4)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+ /* Save pointer before 4 x VEC_SIZE alignment. */
+ movq %rax, %rcx
+# endif
+
+ /* Align address to VEC_SIZE * 4 for loop. */
+ andq $-(VEC_SIZE * 4), %rax
+
+# ifdef USE_AS_STRNLEN
+ subq %rax, %rcx
+# ifdef USE_AS_WCSLEN
+ SHR $2, %RCX
+# endif
+ /* rcx contains number of [w]char will be recompared due to
+ alignment fixes. rdx must be incremented by rcx to offset
+ alignment adjustment. */
+ addq %rcx, %rdx
+ /* Need jump as we don't want to add/subtract rdx for first
+ iteration of 4 x VEC_SIZE aligned loop. */
+ jmp L(loop_entry)
+# endif
+
+ .p2align 4,,11
+L(loop):
+# ifdef USE_AS_STRNLEN
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(ret_max)
+L(loop_entry):
+# endif
+ /* VPMINU and VPCMP combination provide better performance as
+ compared to alternative combinations. */
+ VMOVA (VEC_SIZE * 4)(%rax), %VMM1
+ VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
+ VMOVA (VEC_SIZE * 6)(%rax), %VMM3
+ VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+
+ VPTESTN %VMM2, %VMM2, %k0
+ VPTESTN %VMM4, %VMM4, %k1
+
+ subq $-(VEC_SIZE * 4), %rax
+ KORTEST %k0, %k1
+ jz L(loop)
+
+ VPTESTN %VMM1, %VMM1, %k2
+ KMOV %k2, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x1)
+
+ KMOV %k0, %RCX
+ /* At this point, if k0 is non zero, null char must be in the
+ second vector. */
+ test %RCX, %RCX
+ jnz L(ret_vec_x2)
+
+ VPTESTN %VMM3, %VMM3, %k3
+ KMOV %k3, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x3)
+ /* At this point null [w]char must be in the fourth vector so no
+ need to check. */
+ KMOV %k1, %RCX
+
+ /* Fourth, third, second vector terminating are pretty much
+ same, implemented this way to avoid branching and reuse code
+ from pre loop exit condition. */
+L(ret_vec_x4):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ subq $-(VEC_SIZE * 3), %rax
+ shrq $2, %rax
+ addq %rcx, %rax
+# else
+ leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(ret_vec_x3):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ subq $-(VEC_SIZE * 2), %rax
+ shrq $2, %rax
+ addq %rcx, %rax
+# else
+ leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(ret_vec_x2):
+ subq $-VEC_SIZE, %rax
+L(ret_vec_x1):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ addq %rcx, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(page_cross):
+ movl %eax, %ecx
+# ifdef USE_AS_WCSLEN
+ andl $(VEC_SIZE - 1), %ecx
+ sarl $2, %ecx
+# endif
+ /* ecx contains number of w[char] to be skipped as a result
+ of address alignment. */
+ xorq %rdi, %rax
+ VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
+ KMOV %k0, %RAX
+ /* Ignore number of character for alignment adjustment. */
+ SHR %cl, %RAX
+ jz L(align_more)
+
+ bsf %RAX, %RAX
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000000000..116f8981c8954e2e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,7 @@
+#ifndef STRLEN
+# define STRLEN __strlen_evex512
+#endif
+
+#define VEC_SIZE 64
+
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000000000..0b7f220214a7c33c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000000000..f59c372b78b4fb8c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000000000..73dcf2f210a85aac
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"

@@ -0,0 +1,33 @@
commit f6bc52f080e4a0195c707c01f54e2eae0ff89010
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 20 19:21:48 2022 -0700
x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT
According to x86-64 psABI, r_addend should be ignored for R_X86_64_GLOB_DAT
and R_X86_64_JUMP_SLOT. Since linkers always set their r_addends to 0, we
can ignore their r_addends.
Reviewed-by: Fangrui Song <maskray@google.com>
(cherry picked from commit f8587a61892cbafd98ce599131bf4f103466f084)
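A hedged, self-contained sketch of the rule being encoded (an illustrative helper using <elf.h> types, not the dl-machine.h code): GLOB_DAT/JUMP_SLOT store the resolved value as-is, while a relocation such as R_X86_64_64 still applies r_addend.
```c
#include <elf.h>

/* Illustrative only: how the addend rule differs per relocation type.  */
void
apply_reloc (Elf64_Addr *reloc_addr, const Elf64_Rela *reloc,
             Elf64_Addr value)
{
  switch (ELF64_R_TYPE (reloc->r_info))
    {
    case R_X86_64_64:
      *reloc_addr = value + reloc->r_addend;	/* addend is used */
      break;
    case R_X86_64_GLOB_DAT:
    case R_X86_64_JUMP_SLOT:
      *reloc_addr = value;			/* addend is ignored */
      break;
    }
}
```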
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 94296719d4d9fb82..742682517179fab5 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -347,11 +347,13 @@ and creates an unsatisfiable circular dependency.\n",
# endif
/* Set to symbol size plus addend. */
value = sym->st_size;
+ *reloc_addr = value + reloc->r_addend;
+ break;
# endif
- /* Fall through. */
+
case R_X86_64_GLOB_DAT:
case R_X86_64_JUMP_SLOT:
- *reloc_addr = value + reloc->r_addend;
+ *reloc_addr = value;
break;
# ifndef RESOLVE_CONFLICT_FIND_MAP

@@ -0,0 +1,356 @@
commit 82a707aeb74f23bb1783af0f9e93790a2038ff7e
Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Mon Jun 6 12:17:43 2022 -0700
x86_64: Add strstr function with 512-bit EVEX
Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
(1) We spend a few cycles at the beginning to peek into the needle. We
locate an edge in the needle (the first occurrence of 2 consecutive
distinct characters) and also store its first 64 bytes into a zmm register.
(2) We search for the edge in the haystack by looking into one cache
line of the haystack at a time. This avoids having to read past a page
boundary which can cause a seg fault.
(3) If an edge is found in the haystack we first compare the first
64-bytes of the needle (already stored in a zmm register) before we
proceed with a full string compare performed byte by byte.
Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512)
Geometric mean of all benchmarks: new / old = 0.66
Difficult skiptable(0) : new / old = 0.02
Difficult skiptable(1) : new / old = 0.01
Difficult 2-way : new / old = 0.25
Difficult testing first 2 : new / old = 1.26
Difficult skiptable(0) : new / old = 0.05
Difficult skiptable(1) : new / old = 0.06
Difficult 2-way : new / old = 0.26
Difficult testing first 2 : new / old = 1.05
Difficult skiptable(0) : new / old = 0.42
Difficult skiptable(1) : new / old = 0.24
Difficult 2-way : new / old = 0.21
Difficult testing first 2 : new / old = 1.04
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 5082a287d5e9a1f9cb98b7c982a708a3684f1d5c)
x86: Remove __mmask intrinsics in strstr-avx512.c
The intrinsics are not available before GCC7 and using standard
operators generates code of equivalent or better quality.
Removed:
_cvtmask64_u64
_kshiftri_mask64
_kand_mask64
Geometric Mean of 5 Runs of Full Benchmark Suite New / Old: 0.958
(cherry picked from commit f2698954ff9c2f9626d4bcb5a30eb5729714e0b0)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4d4ad2a3686b5bc3..0e39e63ef6be6a86 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -126,6 +126,7 @@ sysdep_routines += \
strrchr-sse2 \
strspn-c \
strspn-sse2 \
+ strstr-avx512 \
strstr-sse2-unaligned \
varshift \
# sysdep_routines
@@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
endif
ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 6b75a7106e174bce..043821278fdb6d8f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -633,6 +633,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strstr.c. */
IFUNC_IMPL (i, name, strstr,
+ IFUNC_IMPL_ADD (array, i, strstr,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (AVX512DQ)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strstr_avx512)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
new file mode 100644
index 0000000000000000..e44c1a05dc0007e5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
@@ -0,0 +1,218 @@
+/* strstr optimized with 512-bit AVX-512 instructions
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define FULL_MMASK64 0xffffffffffffffff
+#define ONE_64BIT 0x1ull
+#define ZMM_SIZE_IN_BYTES 64
+#define PAGESIZE 4096
+
+#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
+#define kshiftri_mask64(x, y) ((x) >> (y))
+#define kand_mask64(x, y) ((x) & (y))
+
+/*
+ Returns the index of the first edge within the needle, returns 0 if no edge
+ is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
+ */
+static inline size_t
+find_edge_in_needle (const char *ned)
+{
+ size_t ind = 0;
+ while (ned[ind + 1] != '\0')
+ {
+ if (ned[ind] != ned[ind + 1])
+ return ind;
+ else
+ ind = ind + 1;
+ }
+ return 0;
+}
+
+/*
+ Compare needle with haystack byte by byte at specified location
+ */
+static inline bool
+verify_string_match (const char *hay, const size_t hay_index, const char *ned,
+ size_t ind)
+{
+ while (ned[ind] != '\0')
+ {
+ if (ned[ind] != hay[hay_index + ind])
+ return false;
+ ind = ind + 1;
+ }
+ return true;
+}
+
+/*
+ Compare needle with haystack at specified location. The first 64 bytes are
+ compared using a ZMM register.
+ */
+static inline bool
+verify_string_match_avx512 (const char *hay, const size_t hay_index,
+ const char *ned, const __mmask64 ned_mask,
+ const __m512i ned_zmm)
+{
+ /* check first 64 bytes using zmm and then scalar */
+ __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
+ __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
+ if (match != 0x0) // failed the first few chars
+ return false;
+ else if (ned_mask == FULL_MMASK64)
+ return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
+ return true;
+}
+
+char *
+__strstr_avx512 (const char *haystack, const char *ned)
+{
+ char first = ned[0];
+ if (first == '\0')
+ return (char *)haystack;
+ if (ned[1] == '\0')
+ return (char *)strchr (haystack, ned[0]);
+
+ size_t edge = find_edge_in_needle (ned);
+
+ /* ensure haystack is as long as the pos of edge in needle */
+ for (int ii = 0; ii < edge; ++ii)
+ {
+ if (haystack[ii] == '\0')
+ return NULL;
+ }
+
+ /*
+ Load 64 bytes of the needle and save it to a zmm register
+ Read one cache line at a time to avoid loading across a page boundary
+ */
+ __mmask64 ned_load_mask = _bzhi_u64 (
+ FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
+ __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
+ __mmask64 ned_nullmask
+ = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
+
+ if (__glibc_unlikely (ned_nullmask == 0x0))
+ {
+ ned_zmm = _mm512_loadu_si512 (ned);
+ ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
+ ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+ if (ned_nullmask != 0x0)
+ ned_load_mask = ned_load_mask >> 1;
+ }
+ else
+ {
+ ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+ ned_load_mask = ned_load_mask >> 1;
+ }
+ const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
+ const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
+
+ /*
+ Read the bytes of haystack in the current cache line
+ */
+ size_t hay_index = edge;
+ __mmask64 loadmask = _bzhi_u64 (
+ FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
+ /* First load is a partial cache line */
+ __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
+ /* Search for NULL and compare only till null char */
+ uint64_t nullmask
+ = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
+ uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+ cmpmask = cmpmask & cvtmask64_u64 (loadmask);
+ /* Search for the 2 charaters of needle */
+ __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+ __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
+ k1 = kshiftri_mask64 (k1, 1);
+ /* k2 masks tell us if both chars from needle match */
+ uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
+ /* For every match, search for the entire needle for a full match */
+ while (k2)
+ {
+ uint64_t bitcount = _tzcnt_u64 (k2);
+ k2 = _blsr_u64 (k2);
+ size_t match_pos = hay_index + bitcount - edge;
+ if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
+ < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
+ {
+ /*
+ * Use vector compare as long as you are not crossing a page
+ */
+ if (verify_string_match_avx512 (haystack, match_pos, ned,
+ ned_load_mask, ned_zmm))
+ return (char *)haystack + match_pos;
+ }
+ else
+ {
+ if (verify_string_match (haystack, match_pos, ned, 0))
+ return (char *)haystack + match_pos;
+ }
+ }
+ /* We haven't checked for potential match at the last char yet */
+ haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
+ hay_index = 0;
+
+ /*
+ Loop over one cache line at a time to prevent reading over page
+ boundary
+ */
+ __m512i hay1;
+ while (nullmask == 0)
+ {
+ hay0 = _mm512_loadu_si512 (haystack + hay_index);
+ hay1 = _mm512_load_si512 (haystack + hay_index
+ + 1); // Always 64 byte aligned
+ nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
+ /* Compare only till null char */
+ cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+ k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+ k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
+ /* k2 masks tell us if both chars from needle match */
+ k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
+ /* For every match, compare full strings for potential match */
+ while (k2)
+ {
+ uint64_t bitcount = _tzcnt_u64 (k2);
+ k2 = _blsr_u64 (k2);
+ size_t match_pos = hay_index + bitcount - edge;
+ if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
+ < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
+ {
+ /*
+ * Use vector compare as long as you are not crossing a page
+ */
+ if (verify_string_match_avx512 (haystack, match_pos, ned,
+ ned_load_mask, ned_zmm))
+ return (char *)haystack + match_pos;
+ }
+ else
+ {
+ /* Compare byte by byte */
+ if (verify_string_match (haystack, match_pos, ned, 0))
+ return (char *)haystack + match_pos;
+ }
+ }
+ hay_index += ZMM_SIZE_IN_BYTES;
+ }
+ return NULL;
+}
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 848601bde7583ca3..9474d6234e9b62d3 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -35,16 +35,32 @@
extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
#include "init-arch.h"
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- ? __strstr_sse2_unaligned
- : __strstr_sse2)
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features *cpu_features = __get_cpu_features ();
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ return __strstr_avx512;
+
+ if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+ return __strstr_sse2_unaligned;
+
+ return __strstr_sse2;
+}
+
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
#undef strstr
strong_alias (__libc_strstr, strstr)

@@ -0,0 +1,385 @@
commit 70be93d1c58916d289a5e6e7c7d9b989707a9e41
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:27 2022 -0700
x86: Create header for VEC classes in x86 strings library
This patch does not touch any existing code and is only meant to be a
tool for future patches so that simple source files can more easily be
maintained to target multiple VEC classes.
There is no difference in the objdump of libc.so before and after this
patch.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 8a780a6b910023e71f3173f37f0793834c047554)
diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
new file mode 100644
index 0000000000000000..3f531dd47fceefe9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
@@ -0,0 +1,34 @@
+/* Common config for AVX-RTM VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _AVX_RTM_VECS_H
+#define _AVX_RTM_VECS_H 1
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define USE_WITH_RTM 1
+#include "avx-vecs.h"
+
+#undef SECTION
+#define SECTION(p) p##.avx.rtm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
new file mode 100644
index 0000000000000000..89680f5db827c332
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/avx-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for AVX VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _AVX_VECS_H
+#define _AVX_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 32
+#include "vec-macros.h"
+
+#define USE_WITH_AVX 1
+#define SECTION(p) p##.avx
+
+/* 4-byte mov instructions with AVX2. */
+#define MOV_SIZE 4
+/* 1 (ret) + 3 (vzeroupper). */
+#define RET_SIZE 4
+#define VZEROUPPER vzeroupper
+
+#define VMOVU vmovdqu
+#define VMOVA vmovdqa
+#define VMOVNT vmovntdq
+
+/* Often need to access xmm portion. */
+#define VEC_xmm VEC_any_xmm
+#define VEC VEC_any_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h
new file mode 100644
index 0000000000000000..99806ebcd7bde53d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h
@@ -0,0 +1,39 @@
+/* Common config for EVEX256 and EVEX512 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX_VECS_COMMON_H
+#define _EVEX_VECS_COMMON_H 1
+
+#include "vec-macros.h"
+
+/* 6-byte mov instructions with EVEX. */
+#define MOV_SIZE 6
+/* No vzeroupper needed. */
+#define RET_SIZE 1
+#define VZEROUPPER
+
+#define VMOVU vmovdqu64
+#define VMOVA vmovdqa64
+#define VMOVNT vmovntdq
+
+#define VEC_xmm VEC_hi_xmm
+#define VEC_ymm VEC_hi_ymm
+#define VEC_zmm VEC_hi_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
new file mode 100644
index 0000000000000000..222ba46dc74cfcbd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex256-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for EVEX256 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX256_VECS_H
+#define _EVEX256_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 32
+#include "evex-vecs-common.h"
+
+#define USE_WITH_EVEX256 1
+#define SECTION(p) p##.evex
+
+#define VEC VEC_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
new file mode 100644
index 0000000000000000..d1784d5368d8cebe
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex512-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for EVEX512 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX512_VECS_H
+#define _EVEX512_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 64
+#include "evex-vecs-common.h"
+
+#define USE_WITH_EVEX512 1
+#define SECTION(p) p##.evex512
+
+#define VEC VEC_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
new file mode 100644
index 0000000000000000..2b77a59d56ff2660
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/sse2-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for SSE2 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _SSE2_VECS_H
+#define _SSE2_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 16
+#include "vec-macros.h"
+
+#define USE_WITH_SSE2 1
+#define SECTION(p) p
+
+/* 3-byte mov instructions with SSE2. */
+#define MOV_SIZE 3
+/* No vzeroupper needed. */
+#define RET_SIZE 1
+#define VZEROUPPER
+
+#define VMOVU movups
+#define VMOVA movaps
+#define VMOVNT movntdq
+
+#define VEC_xmm VEC_any_xmm
+#define VEC VEC_any_xmm
+
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
new file mode 100644
index 0000000000000000..9f3ffecede9feb26
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/vec-macros.h
@@ -0,0 +1,90 @@
+/* Macro helpers for VEC_{type}({vec_num})
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _VEC_MACROS_H
+#define _VEC_MACROS_H 1
+
+#ifndef VEC_SIZE
+# error "Never include this file directly. Always include a vector config."
+#endif
+
+/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
+ VEC(N) values. */
+#define VEC_hi_xmm0 xmm16
+#define VEC_hi_xmm1 xmm17
+#define VEC_hi_xmm2 xmm18
+#define VEC_hi_xmm3 xmm19
+#define VEC_hi_xmm4 xmm20
+#define VEC_hi_xmm5 xmm21
+#define VEC_hi_xmm6 xmm22
+#define VEC_hi_xmm7 xmm23
+#define VEC_hi_xmm8 xmm24
+#define VEC_hi_xmm9 xmm25
+#define VEC_hi_xmm10 xmm26
+#define VEC_hi_xmm11 xmm27
+#define VEC_hi_xmm12 xmm28
+#define VEC_hi_xmm13 xmm29
+#define VEC_hi_xmm14 xmm30
+#define VEC_hi_xmm15 xmm31
+
+#define VEC_hi_ymm0 ymm16
+#define VEC_hi_ymm1 ymm17
+#define VEC_hi_ymm2 ymm18
+#define VEC_hi_ymm3 ymm19
+#define VEC_hi_ymm4 ymm20
+#define VEC_hi_ymm5 ymm21
+#define VEC_hi_ymm6 ymm22
+#define VEC_hi_ymm7 ymm23
+#define VEC_hi_ymm8 ymm24
+#define VEC_hi_ymm9 ymm25
+#define VEC_hi_ymm10 ymm26
+#define VEC_hi_ymm11 ymm27
+#define VEC_hi_ymm12 ymm28
+#define VEC_hi_ymm13 ymm29
+#define VEC_hi_ymm14 ymm30
+#define VEC_hi_ymm15 ymm31
+
+#define VEC_hi_zmm0 zmm16
+#define VEC_hi_zmm1 zmm17
+#define VEC_hi_zmm2 zmm18
+#define VEC_hi_zmm3 zmm19
+#define VEC_hi_zmm4 zmm20
+#define VEC_hi_zmm5 zmm21
+#define VEC_hi_zmm6 zmm22
+#define VEC_hi_zmm7 zmm23
+#define VEC_hi_zmm8 zmm24
+#define VEC_hi_zmm9 zmm25
+#define VEC_hi_zmm10 zmm26
+#define VEC_hi_zmm11 zmm27
+#define VEC_hi_zmm12 zmm28
+#define VEC_hi_zmm13 zmm29
+#define VEC_hi_zmm14 zmm30
+#define VEC_hi_zmm15 zmm31
+
+#define PRIMITIVE_VEC(vec, num) vec##num
+
+#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i)
+#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i)
+#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i)
+
+#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i)
+#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i)
+#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i)
+
+#endif

@@ -0,0 +1,90 @@
commit e805606193e1a39956ca5ef73cb44a8796730686
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:28 2022 -0700
x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret`
The RTM vzeroupper mitigation has no way of replacing an inline
vzeroupper that is not immediately before a return.
This can be useful when hoisting a vzeroupper to save code size
for example:
```
L(foo):
cmpl %eax, %edx
jz L(bar)
tzcntl %eax, %eax
addq %rdi, %rax
VZEROUPPER_RETURN
L(bar):
xorl %eax, %eax
VZEROUPPER_RETURN
```
Can become:
```
L(foo):
COND_VZEROUPPER
cmpl %eax, %edx
jz L(bar)
tzcntl %eax, %eax
addq %rdi, %rax
ret
L(bar):
xorl %eax, %eax
ret
```
This code does not change any existing functionality.
There is no difference in the objdump of libc.so before and after this
patch.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit dd5c483b2598f411428df4d8864c15c4b8a3cd68)
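A hedged C-intrinsics rendering of what COND_VZEROUPPER_XTEST does (the real macro added below is assembler text in sysdep.h; this sketch only mirrors its control flow and needs -mavx -mrtm):
```
#include <immintrin.h>

/* Sketch of the COND_VZEROUPPER_XTEST control flow, not the glibc macro.  */
static inline void
cond_vzeroupper_xtest (void)
{
  if (_xtest ())           /* Inside an RTM transaction?  */
    _mm256_zeroall ();     /* vzeroall avoids the vzeroupper-in-RTM abort.  */
  else
    _mm256_zeroupper ();   /* Common case: plain vzeroupper.  */
}
```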
diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
index 3f531dd47fceefe9..6ca9f5e6bae7ba72 100644
--- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
+++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
@@ -20,6 +20,7 @@
#ifndef _AVX_RTM_VECS_H
#define _AVX_RTM_VECS_H 1
+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 7bebdeb21095eda0..93e44be22e2275f1 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -106,6 +106,24 @@ lose: \
vzeroupper; \
ret
+/* Can be used to replace vzeroupper that is not directly before a
+ return. This is useful when hoisting a vzeroupper from multiple
+ return paths to decrease the total number of vzerouppers and code
+ size. */
+#define COND_VZEROUPPER_XTEST \
+ xtest; \
+ jz 1f; \
+ vzeroall; \
+ jmp 2f; \
+1: \
+ vzeroupper; \
+2:
+
+/* In RTM define this as COND_VZEROUPPER_XTEST. */
+#ifndef COND_VZEROUPPER
+# define COND_VZEROUPPER vzeroupper
+#endif
+
/* Zero upper vector registers and return. */
#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
# define ZERO_UPPER_VEC_REGISTERS_RETURN \

commit 4901009dad8b3ab141ac6e0caebe99e03a67f5eb
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:30 2022 -0700
x86: Optimize memrchr-sse2.S
The new code:
1. prioritizes smaller lengths more.
2. optimizes target placement more carefully.
3. reuses logic more.
4. fixes up various inefficiencies in the logic.
The total code size saving is: 394 bytes
Geometric Mean of all benchmarks New / Old: 0.874
Regressions:
1. The page cross case is now colder, especially re-entry from the
page cross case if a match is not found in the first VEC
(roughly 50%). My general opinion with this patch is this is
acceptable given the "coldness" of this case (less than 4%) and
the general performance improvement in the other far more common
cases.
2. There are some regressions 5-15% for medium/large user-arg
lengths that have a match in the first VEC. This is because the
logic was rewritten to optimize finds in the first VEC if the
user-arg length is shorter (where we see roughly 20-50%
performance improvements). It is not always the case this is a
regression. My intuition is some frontend quirk is partially
explaining the data although I haven't been able to find the
root cause.
Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 731feee3869550e93177e604604c1765d81de571)
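A quick plain-C illustration of the terminology above (not glibc internals): memrchr scans backwards from buf + len, so "a match in the first VEC" means a match within the last 16 bytes of the range.
```
#define _GNU_SOURCE
#include <string.h>
#include <assert.h>

int
main (void)
{
  char buf[64] = { 0 };

  buf[60] = 'x';                         /* In the last 16 bytes: hot path.  */
  assert (memrchr (buf, 'x', sizeof buf) == buf + 60);

  buf[60] = 0;
  buf[3] = 'x';                          /* Far from the end: loop path.  */
  assert (memrchr (buf, 'x', sizeof buf) == buf + 3);
  return 0;
}
```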
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index cc2001167d77c83c..c2a5902bf9385c67 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -19,362 +19,333 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
+#define VEC_SIZE 16
+#define PAGE_SIZE 4096
.text
-ENTRY (__memrchr)
- movd %esi, %xmm1
-
- sub $16, %RDX_LP
- jbe L(length_less16)
-
- punpcklbw %xmm1, %xmm1
- punpcklbw %xmm1, %xmm1
-
- add %RDX_LP, %RDI_LP
- pshufd $0, %xmm1, %xmm1
-
- movdqu (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
-
-/* Check if there is a match. */
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches0)
-
- sub $64, %rdi
- mov %edi, %ecx
- and $15, %ecx
- jz L(loop_prolog)
-
- add $16, %rdi
- add $16, %rdx
- and $-16, %rdi
- sub %rcx, %rdx
-
- .p2align 4
-L(loop_prolog):
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48)
-
- movdqa 32(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa (%rdi), %xmm4
- pcmpeqb %xmm1, %xmm4
- pmovmskb %xmm4, %eax
- test %eax, %eax
- jnz L(matches0)
-
- sub $64, %rdi
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48)
-
- movdqa 32(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa (%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches0)
-
- mov %edi, %ecx
- and $63, %ecx
- jz L(align64_loop)
-
- add $64, %rdi
- add $64, %rdx
- and $-64, %rdi
- sub %rcx, %rdx
-
- .p2align 4
-L(align64_loop):
- sub $64, %rdi
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm2
- movdqa 32(%rdi), %xmm3
- movdqa 48(%rdi), %xmm4
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm1, %xmm2
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm1, %xmm4
-
- pmaxub %xmm3, %xmm0
- pmaxub %xmm4, %xmm2
- pmaxub %xmm0, %xmm2
- pmovmskb %xmm2, %eax
-
- test %eax, %eax
- jz L(align64_loop)
-
- pmovmskb %xmm4, %eax
- test %eax, %eax
- jnz L(matches48)
-
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm2
-
- pcmpeqb %xmm1, %xmm2
- pcmpeqb (%rdi), %xmm1
-
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- pmovmskb %xmm1, %eax
- bsr %eax, %eax
-
- add %rdi, %rax
+ENTRY_P2ALIGN(__memrchr, 6)
+#ifdef __ILP32__
+ /* Clear upper bits. */
+ mov %RDX_LP, %RDX_LP
+#endif
+ movd %esi, %xmm0
+
+ /* Get end pointer. */
+ leaq (%rdx, %rdi), %rcx
+
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0
+
+ /* Check if we can load 1x VEC without cross a page. */
+ testl $(PAGE_SIZE - VEC_SIZE), %ecx
+ jz L(page_cross)
+
+ /* NB: This load happens regardless of whether rdx (len) is zero. Since
+ it doesn't cross a page and the standard guarantees any valid pointer
+ has at least one valid byte this load must be safe. For the entire
+ history of the x86 memrchr implementation this has been possible so
+ no code "should" be relying on a zero-length check before this load.
+ The zero-length check is moved to the page cross case because it is
+ 1) pretty cold and 2) including it pushes the hot case len <= VEC_SIZE
+ into 2 cache lines. */
+ movups -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+L(ret_vec_x0_test):
+ /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+ zero. */
+ bsrl %eax, %eax
+ jz L(ret_0)
+ /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+ if out of bounds. */
+ addl %edx, %eax
+ jl L(zero_0)
+ /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+ ptr. */
+ addq %rdi, %rax
+L(ret_0):
ret
- .p2align 4
-L(exit_loop):
- add $64, %edx
- cmp $32, %edx
- jbe L(exit_loop_32)
-
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48)
-
- movdqa 32(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches16_1)
- cmp $48, %edx
- jbe L(return_null)
-
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %eax
- test %eax, %eax
- jnz L(matches0_1)
- xor %eax, %eax
+ .p2align 4,, 5
+L(ret_vec_x0):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE)(%rcx, %rax), %rax
ret
- .p2align 4
-L(exit_loop_32):
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48_1)
- cmp $16, %edx
- jbe L(return_null)
-
- pcmpeqb 32(%rdi), %xmm1
- pmovmskb %xmm1, %eax
- test %eax, %eax
- jnz L(matches32_1)
- xor %eax, %eax
+ .p2align 4,, 2
+L(zero_0):
+ xorl %eax, %eax
ret
- .p2align 4
-L(matches0):
- bsr %eax, %eax
- add %rdi, %rax
- ret
-
- .p2align 4
-L(matches16):
- bsr %eax, %eax
- lea 16(%rax, %rdi), %rax
- ret
- .p2align 4
-L(matches32):
- bsr %eax, %eax
- lea 32(%rax, %rdi), %rax
+ .p2align 4,, 8
+L(more_1x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ /* Align rcx (pointer to string). */
+ decq %rcx
+ andq $-VEC_SIZE, %rcx
+
+ movq %rcx, %rdx
+ /* NB: We could consistently save 1 byte in this pattern with `movaps
+ %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+ it adds more frontend uops (even if the moves can be eliminated) and
+ some percentage of the time actual backend uops. */
+ movaps -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ subq %rdi, %rdx
+ pmovmskb %xmm1, %eax
+
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+L(last_2x_vec):
+ subl $VEC_SIZE, %edx
+ jbe L(ret_vec_x0_test)
+
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $VEC_SIZE, %edx
+ bsrl %eax, %eax
+ jz L(ret_1)
+ addl %edx, %eax
+ jl L(zero_0)
+ addq %rdi, %rax
+L(ret_1):
ret
- .p2align 4
-L(matches48):
- bsr %eax, %eax
- lea 48(%rax, %rdi), %rax
+ /* Don't align. Otherwise we lose the 2-byte encoding in the jump to
+ L(page_cross), which causes the hot path (length <= VEC_SIZE) to span
+ multiple cache lines. Naturally aligned % 16 to 8 bytes. */
+L(page_cross):
+ /* Zero length check. */
+ testq %rdx, %rdx
+ jz L(zero_0)
+
+ leaq -1(%rcx), %r8
+ andq $-(VEC_SIZE), %r8
+
+ movaps (%r8), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ negl %ecx
+ /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+ explicitly. */
+ andl $(VEC_SIZE - 1), %ecx
+ shl %cl, %esi
+ movzwl %si, %eax
+ leaq (%rdi, %rdx), %rcx
+ cmpq %rdi, %r8
+ ja L(more_1x_vec)
+ subl $VEC_SIZE, %edx
+ bsrl %eax, %eax
+ jz L(ret_2)
+ addl %edx, %eax
+ jl L(zero_1)
+ addq %rdi, %rax
+L(ret_2):
ret
- .p2align 4
-L(matches0_1):
- bsr %eax, %eax
- sub $64, %rdx
- add %rax, %rdx
- jl L(return_null)
- add %rdi, %rax
+ /* Fits in aligning bytes. */
+L(zero_1):
+ xorl %eax, %eax
ret
- .p2align 4
-L(matches16_1):
- bsr %eax, %eax
- sub $48, %rdx
- add %rax, %rdx
- jl L(return_null)
- lea 16(%rdi, %rax), %rax
+ .p2align 4,, 5
+L(ret_vec_x1):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
ret
- .p2align 4
-L(matches32_1):
- bsr %eax, %eax
- sub $32, %rdx
- add %rax, %rdx
- jl L(return_null)
- lea 32(%rdi, %rax), %rax
- ret
+ .p2align 4,, 8
+L(more_2x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
- .p2align 4
-L(matches48_1):
- bsr %eax, %eax
- sub $16, %rdx
- add %rax, %rdx
- jl L(return_null)
- lea 48(%rdi, %rax), %rax
- ret
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jnz L(ret_vec_x1)
- .p2align 4
-L(return_null):
- xor %eax, %eax
- ret
- .p2align 4
-L(length_less16_offset0):
- test %edx, %edx
- jz L(return_null)
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
- mov %dl, %cl
- pcmpeqb (%rdi), %xmm1
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
- mov $1, %edx
- sal %cl, %edx
- sub $1, %edx
+ addl $(VEC_SIZE), %edx
+ jle L(ret_vec_x2_test)
- pmovmskb %xmm1, %eax
+L(last_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x2)
- and %edx, %eax
- test %eax, %eax
- jz L(return_null)
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
- bsr %eax, %eax
- add %rdi, %rax
+ subl $(VEC_SIZE), %edx
+ bsrl %eax, %eax
+ jz L(ret_3)
+ addl %edx, %eax
+ jl L(zero_2)
+ addq %rdi, %rax
+L(ret_3):
ret
- .p2align 4
-L(length_less16):
- punpcklbw %xmm1, %xmm1
- punpcklbw %xmm1, %xmm1
-
- add $16, %edx
-
- pshufd $0, %xmm1, %xmm1
-
- mov %edi, %ecx
- and $15, %ecx
- jz L(length_less16_offset0)
-
- mov %cl, %dh
- mov %ecx, %esi
- add %dl, %dh
- and $-16, %rdi
-
- sub $16, %dh
- ja L(length_less16_part2)
-
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %eax
-
- sar %cl, %eax
- mov %dl, %cl
-
- mov $1, %edx
- sal %cl, %edx
- sub $1, %edx
-
- and %edx, %eax
- test %eax, %eax
- jz L(return_null)
-
- bsr %eax, %eax
- add %rdi, %rax
- add %rsi, %rax
+ .p2align 4,, 6
+L(ret_vec_x2_test):
+ bsrl %eax, %eax
+ jz L(zero_2)
+ addl %edx, %eax
+ jl L(zero_2)
+ addq %rdi, %rax
ret
- .p2align 4
-L(length_less16_part2):
- movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
-
- mov %dh, %cl
- mov $1, %edx
- sal %cl, %edx
- sub $1, %edx
-
- and %edx, %eax
+L(zero_2):
+ xorl %eax, %eax
+ ret
- test %eax, %eax
- jnz L(length_less16_part2_return)
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %eax
+ .p2align 4,, 5
+L(ret_vec_x2):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
+ ret
- mov %esi, %ecx
- sar %cl, %eax
- test %eax, %eax
- jz L(return_null)
+ .p2align 4,, 5
+L(ret_vec_x3):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
+ ret
- bsr %eax, %eax
- add %rdi, %rax
- add %rsi, %rax
+ .p2align 4,, 8
+L(more_4x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x2)
+
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ testl %eax, %eax
+ jnz L(ret_vec_x3)
+
+ addq $-(VEC_SIZE * 4), %rcx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)
+
+ /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+ keeping the code from spilling to the next cache line. */
+ addq $(VEC_SIZE * 4 - 1), %rcx
+ andq $-(VEC_SIZE * 4), %rcx
+ leaq (VEC_SIZE * 4)(%rdi), %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+ movaps (VEC_SIZE * -1)(%rcx), %xmm1
+ movaps (VEC_SIZE * -2)(%rcx), %xmm2
+ movaps (VEC_SIZE * -3)(%rcx), %xmm3
+ movaps (VEC_SIZE * -4)(%rcx), %xmm4
+ pcmpeqb %xmm0, %xmm1
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm0, %xmm4
+
+ por %xmm1, %xmm2
+ por %xmm3, %xmm4
+ por %xmm2, %xmm4
+
+ pmovmskb %xmm4, %esi
+ testl %esi, %esi
+ jnz L(loop_end)
+
+ addq $-(VEC_SIZE * 4), %rcx
+ cmpq %rdx, %rcx
+ jne L(loop_4x_vec)
+
+ subl %edi, %edx
+
+ /* Ends up being 1-byte nop. */
+ .p2align 4,, 2
+L(last_4x_vec):
+ movaps -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
+
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ testl %eax, %eax
+ jnz L(ret_vec_end)
+
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)
+ bsrl %eax, %eax
+ jz L(ret_4)
+ addl %edx, %eax
+ jl L(zero_3)
+ addq %rdi, %rax
+L(ret_4):
ret
- .p2align 4
-L(length_less16_part2_return):
- bsr %eax, %eax
- lea 16(%rax, %rdi), %rax
+ /* Ends up being 1-byte nop. */
+ .p2align 4,, 3
+L(loop_end):
+ pmovmskb %xmm1, %eax
+ sall $16, %eax
+ jnz L(ret_vec_end)
+
+ pmovmskb %xmm2, %eax
+ testl %eax, %eax
+ jnz L(ret_vec_end)
+
+ pmovmskb %xmm3, %eax
+ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+ then it won't affect the result in esi (VEC4). If ecx is non-zero
+ then CHAR is in VEC3 and bsrq will use that position. */
+ sall $16, %eax
+ orl %esi, %eax
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
ret
-END (__memrchr)
+L(ret_vec_end):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
+ ret
+ /* Used in L(last_4x_vec). In the same cache line. These are just spare
+ aligning bytes. */
+L(zero_3):
+ xorl %eax, %eax
+ ret
+ /* 2-bytes from next cache line. */
+END(__memrchr)
weak_alias (__memrchr, memrchr)

commit 83a986e9fbc301e6056dbc9d9ec6888621b60f67
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:31 2022 -0700
x86: Optimize memrchr-evex.S
The new code:
1. prioritizes smaller user-arg lengths more.
2. optimizes target placement more carefully
3. reuses logic more
4. fixes up various inefficiencies in the logic. The biggest
case here is the `lzcnt` logic for checking returns which
saves either a branch or multiple instructions.
The total code size saving is: 263 bytes
Geometric Mean of all benchmarks New / Old: 0.755
Regressions:
There are some regressions. Particularly where the length (user arg
length) is large but the position of the match char is near the
beginning of the string (in first VEC). This case has roughly a
20% regression.
This is because the new logic gives the hot path for immediate matches
to shorter lengths (the more common input). This case has roughly
a 35% speedup.
Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit b4209615a06b01c974f47b4998b00e4c7b1aa5d9)
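A hedged C sketch (not the glibc code) of the lzcnt return trick mentioned above: with an end pointer in hand, the last match in a 32-byte block is at end - 1 - lzcnt(mask), and a zero mask makes lzcnt produce 32, so a single length comparison also covers the no-match case without a separate branch.
```
#include <stdint.h>
#include <stddef.h>

/* Sketch only: end points one past the scanned 32-byte block, bit 31 of
   match_mask corresponds to end[-1], and len is at most 32 here.  */
static inline const char *
last_match_in_tail (const char *end, uint32_t match_mask, size_t len)
{
  unsigned lz = match_mask ? (unsigned) __builtin_clz (match_mask) : 32;
  if (lz >= len)           /* No match, or the match is before the start.  */
    return NULL;
  return end - 1 - lz;
}
```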
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 16bf8e02b1e80c84..bddc89c3754894ed 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -19,319 +19,316 @@
#if IS_IN (libc)
# include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
+# ifndef MEMRCHR
+# define MEMRCHR __memrchr_evex
+# endif
+
+# define PAGE_SIZE 4096
+# define VECMATCH VEC(0)
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
+# endif
+ jz L(zero_0)
+
+ /* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+ correct page cross check and 2) it correctly sets up the end ptr so
+ that lzcnt can be subtracted from it directly. */
+ leaq -1(%rdi, %rdx), %rax
+ vpbroadcastb %esi, %VECMATCH
+
+ /* Check if we can load 1x VEC without cross a page. */
+ testl $(PAGE_SIZE - VEC_SIZE), %eax
+ jz L(page_cross)
+
+ /* Don't use rax for pointer here because EVEX has better encoding with
+ offset % VEC_SIZE == 0. */
+ vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+ kmovd %k0, %ecx
+
+ /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */
+ cmpq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+L(ret_vec_x0_test):
+
+ /* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE) which
+ will guarantee edx (len) is less than it. */
+ lzcntl %ecx, %ecx
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret
-# define VMOVA vmovdqa64
-
-# define YMMMATCH ymm16
-
-# define VEC_SIZE 32
-
- .section .text.evex,"ax",@progbits
-ENTRY (__memrchr_evex)
- /* Broadcast CHAR to YMMMATCH. */
- vpbroadcastb %esi, %YMMMATCH
-
- sub $VEC_SIZE, %RDX_LP
- jbe L(last_vec_or_less)
-
- add %RDX_LP, %RDI_LP
-
- /* Check the last VEC_SIZE bytes. */
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
-
- subq $(VEC_SIZE * 4), %rdi
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(aligned_more)
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rdx
- andq $-VEC_SIZE, %rdi
- subq %rcx, %rdx
-
- .p2align 4
-L(aligned_more):
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- vpcmpb $0, (%rdi), %YMMMATCH, %k4
- kmovd %k4, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
-
- /* Align data to 4 * VEC_SIZE for loop with fewer branches.
- There are some overlaps with above if data isn't aligned
- to 4 * VEC_SIZE. */
- movl %edi, %ecx
- andl $(VEC_SIZE * 4 - 1), %ecx
- jz L(loop_4x_vec)
-
- addq $(VEC_SIZE * 4), %rdi
- addq $(VEC_SIZE * 4), %rdx
- andq $-(VEC_SIZE * 4), %rdi
- subq %rcx, %rdx
+ /* Fits in aligning bytes of first cache line. */
+L(zero_0):
+ xorl %eax, %eax
+ ret
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- subq $(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
- kord %k1, %k2, %k5
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
- kord %k3, %k4, %k6
- kortestd %k5, %k6
- jz L(loop_4x_vec)
-
- /* There is a match. */
- kmovd %k4, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- kmovd %k1, %eax
- bsrl %eax, %eax
- addq %rdi, %rax
+ .p2align 4,, 9
+L(ret_vec_x0_dec):
+ decq %rax
+L(ret_vec_x0):
+ lzcntl %ecx, %ecx
+ subq %rcx, %rax
ret
- .p2align 4
-L(last_4x_vec_or_less):
- addl $(VEC_SIZE * 4), %edx
- cmpl $(VEC_SIZE * 2), %edx
- jbe L(last_2x_vec)
+ .p2align 4,, 10
+L(more_1x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
+ /* Align rax (pointer to string). */
+ andq $-VEC_SIZE, %rax
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
+ /* Recompute length after aligning. */
+ movq %rax, %rdx
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1_check)
- cmpl $(VEC_SIZE * 3), %edx
- jbe L(zero)
+ /* Need no matter what. */
+ vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- vpcmpb $0, (%rdi), %YMMMATCH, %k4
- kmovd %k4, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 4), %rdx
- addq %rax, %rdx
- jl L(zero)
- addq %rdi, %rax
- ret
+ subq %rdi, %rdx
- .p2align 4
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
L(last_2x_vec):
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3_check)
+
+ /* Must dec rax because L(ret_vec_x0_test) expects it. */
+ decq %rax
cmpl $VEC_SIZE, %edx
- jbe L(zero)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 2), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
+ jbe L(ret_vec_x0_test)
+
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
+
+ /* Don't use rax for pointer here because EVEX has better encoding with
+ offset % VEC_SIZE == 0. */
+ vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+ kmovd %k0, %ecx
+ /* NB: 64-bit lzcnt. This will naturally add 32 to position. */
+ lzcntq %rcx, %rcx
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
ret
- .p2align 4
-L(last_vec_x0):
- bsrl %eax, %eax
- addq %rdi, %rax
+ /* Inexpensive place to put this regarding code size / target alignments
+ / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+ case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+ in first cache line. */
+L(page_cross):
+ movq %rax, %rsi
+ andq $-VEC_SIZE, %rsi
+ vpcmpb $0, (%rsi), %VECMATCH, %k0
+ kmovd %k0, %r8d
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ movl %eax, %ecx
+ /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */
+ notl %ecx
+ shlxl %ecx, %r8d, %ecx
+ cmpq %rdi, %rsi
+ ja L(more_1x_vec)
+ lzcntl %ecx, %ecx
+ cmpl %ecx, %edx
+ jle L(zero_1)
+ subq %rcx, %rax
ret
- .p2align 4
-L(last_vec_x1):
- bsrl %eax, %eax
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
+ /* Continue creating zero labels that fit in aligning bytes and get
+ 2-byte encoding / are in the same cache line as condition. */
+L(zero_1):
+ xorl %eax, %eax
ret
- .p2align 4
-L(last_vec_x2):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
+ .p2align 4,, 8
+L(ret_vec_x1):
+ /* This will naturally add 32 to position. */
+ bsrl %ecx, %ecx
+ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
ret
- .p2align 4
-L(last_vec_x3):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- ret
+ .p2align 4,, 8
+L(more_2x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_dec)
- .p2align 4
-L(last_vec_x1_check):
- bsrl %eax, %eax
- subq $(VEC_SIZE * 3), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
- ret
+ vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)
- .p2align 4
-L(last_vec_x3_check):
- bsrl %eax, %eax
- subq $VEC_SIZE, %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- ret
+ /* Need no matter what. */
+ vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- .p2align 4
-L(zero):
- xorl %eax, %eax
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
+
+ cmpl $(VEC_SIZE * -1), %edx
+ jle L(ret_vec_x2_test)
+L(last_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
+
+
+ /* Need no matter what. */
+ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 3 + 1), %rax
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_1)
ret
- .p2align 4
-L(last_vec_or_less_aligned):
- movl %edx, %ecx
-
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
-
- movl $1, %edx
- /* Support rdx << 32. */
- salq %cl, %rdx
- subq $1, %rdx
-
- kmovd %k1, %eax
-
- /* Remove the trailing bytes. */
- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
-
- bsrl %eax, %eax
- addq %rdi, %rax
+ .p2align 4,, 8
+L(ret_vec_x2_test):
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2 + 1), %rax
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_1)
ret
- .p2align 4
-L(last_vec_or_less):
- addl $VEC_SIZE, %edx
-
- /* Check for zero length. */
- testl %edx, %edx
- jz L(zero)
-
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(last_vec_or_less_aligned)
-
- movl %ecx, %esi
- movl %ecx, %r8d
- addl %edx, %esi
- andq $-VEC_SIZE, %rdi
+ .p2align 4,, 8
+L(ret_vec_x2):
+ bsrl %ecx, %ecx
+ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
+ ret
- subl $VEC_SIZE, %esi
- ja L(last_vec_2x_aligned)
+ .p2align 4,, 8
+L(ret_vec_x3):
+ bsrl %ecx, %ecx
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
+ ret
- /* Check the last VEC. */
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ .p2align 4,, 8
+L(more_4x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
- /* Remove the leading and trailing bytes. */
- sarl %cl, %eax
- movl %edx, %ecx
+ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x3)
- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
+ /* Check if near end before re-aligning (otherwise might do an
+ unnecessary loop iteration). */
+ addq $-(VEC_SIZE * 4), %rax
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)
- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
- ret
+ decq %rax
+ andq $-(VEC_SIZE * 4), %rax
+ movq %rdi, %rdx
+ /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+ lengths that overflow can be valid and break the comparison. */
+ andq $-(VEC_SIZE * 4), %rdx
.p2align 4
-L(last_vec_2x_aligned):
- movl %esi, %ecx
-
- /* Check the last VEC. */
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+L(loop_4x_vec):
+ /* Store 1 where not-equal and 0 where equal in k1 (used to mask later
+ on). */
+ vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+
+ /* VEC(2/3) will have zero-byte where we found a CHAR. */
+ vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
+ vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
+ vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+
+ /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
+ CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */
+ vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z}
+ vptestnmb %VEC(3), %VEC(3), %k2
+
+ /* Any 1s and we found CHAR. */
+ kortestd %k2, %k4
+ jnz L(loop_end)
+
+ addq $-(VEC_SIZE * 4), %rax
+ cmpq %rdx, %rax
+ jne L(loop_4x_vec)
+
+ /* Need to re-adjust rdx / rax for L(last_4x_vec). */
+ subq $-(VEC_SIZE * 4), %rdx
+ movq %rdx, %rax
+ subl %edi, %edx
+L(last_4x_vec):
+
+ /* Used no matter what. */
+ vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
- kmovd %k1, %eax
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_dec)
- /* Remove the trailing bytes. */
- andl %edx, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
+ vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- /* Check the second last VEC. */
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)
- movl %r8d, %ecx
+ /* Used no matter what. */
+ vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- kmovd %k1, %eax
+ cmpl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)
- /* Remove the leading bytes. Must use unsigned right shift for
- bsrl below. */
- shrl %cl, %eax
- testl %eax, %eax
- jz L(zero)
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2 + 1), %rax
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ jbe L(ret_1)
+ xorl %eax, %eax
+L(ret_1):
+ ret
- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
+ .p2align 4,, 6
+L(loop_end):
+ kmovd %k1, %ecx
+ notl %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_end)
+
+ vptestnmb %VEC(2), %VEC(2), %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1_end)
+
+ kmovd %k2, %ecx
+ kmovd %k4, %esi
+ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+ then it won't affect the result in esi (VEC4). If ecx is non-zero
+ then CHAR is in VEC3 and bsrq will use that position. */
+ salq $32, %rcx
+ orq %rsi, %rcx
+ bsrq %rcx, %rcx
+ addq %rcx, %rax
+ ret
+ .p2align 4,, 4
+L(ret_vec_x0_end):
+ addq $(VEC_SIZE), %rax
+L(ret_vec_x1_end):
+ bsrl %ecx, %ecx
+ leaq (VEC_SIZE * 2)(%rax, %rcx), %rax
ret
-END (__memrchr_evex)
+
+END(MEMRCHR)
#endif

commit b05bd59823bcedee281d3fd5bd4928698ea9d69d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:32 2022 -0700
x86: Optimize memrchr-avx2.S
The new code:
1. prioritizes smaller user-arg lengths more.
2. optimizes target placement more carefully
3. reuses logic more
4. fixes up various inefficiencies in the logic. The biggest
case here is the `lzcnt` logic for checking returns which
saves either a branch or multiple instructions.
The total code size saving is: 306 bytes
Geometric Mean of all benchmarks New / Old: 0.760
Regressions:
There are some regressions. Particularly where the length (user arg
length) is large but the position of the match char is near the
beginning of the string (in first VEC). This case has roughly a
10-20% regression.
This is because the new logic gives the hot path for immediate matches
to shorter lengths (the more common input). This case has roughly
a 15-45% speedup.
Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit af5306a735eb0966fdc2f8ccdafa8888e2df0c87)
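A small aside on the "64-bit lzcnt naturally adds 32" idiom that the new code in this and the EVEX version comments on (illustrative C using the compiler builtins rather than the instruction): a 32-bit match mask zero-extended to 64 bits has at least 32 leading zeros, so one 64-bit count folds the extra VEC_SIZE offset in for free.
```
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint32_t mask = 1u << 5;                        /* match at bit 5 */
  printf ("clz32=%d clz64=%d\n",
          __builtin_clz (mask),                   /* 26 */
          __builtin_clzll ((uint64_t) mask));     /* 58 == 26 + 32 */
  return 0;
}
```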
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
index cea2d2a72db7406a..5e9beeeef2677c9f 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -2,6 +2,7 @@
# define MEMRCHR __memrchr_avx2_rtm
#endif
+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ac7370cb06e9a0fd..5f8e0be18cfe4fad 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -21,340 +21,318 @@
# include <sysdep.h>
# ifndef MEMRCHR
-# define MEMRCHR __memrchr_avx2
+# define MEMRCHR __memrchr_avx2
# endif
# ifndef VZEROUPPER
-# define VZEROUPPER vzeroupper
+# define VZEROUPPER vzeroupper
# endif
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
-# define VEC_SIZE 32
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ .section SECTION(.text), "ax", @progbits
+ENTRY(MEMRCHR)
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
+# endif
+ jz L(zero_0)
- .section SECTION(.text),"ax",@progbits
-ENTRY (MEMRCHR)
- /* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
- vpbroadcastb %xmm0, %ymm0
-
- sub $VEC_SIZE, %RDX_LP
- jbe L(last_vec_or_less)
-
- add %RDX_LP, %RDI_LP
-
- /* Check the last VEC_SIZE bytes. */
- vpcmpeqb (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
+ /* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+ correct page cross check and 2) it correctly sets up the end ptr so
+ that lzcnt can be subtracted from it directly. */
+ leaq -1(%rdx, %rdi), %rax
- subq $(VEC_SIZE * 4), %rdi
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(aligned_more)
+ vpbroadcastb %xmm0, %ymm0
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rdx
- andq $-VEC_SIZE, %rdi
- subq %rcx, %rdx
+ /* Check if we can load 1x VEC without cross a page. */
+ testl $(PAGE_SIZE - VEC_SIZE), %eax
+ jz L(page_cross)
+
+ vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ cmpq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+
+L(ret_vec_x0_test):
+ /* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE) which
+ will guarantee edx (len) is less than it. */
+ lzcntl %ecx, %ecx
+
+ /* Hoist vzeroupper (not great for RTM) to save code size. This allows
+ all logic for edx (len) <= VEC_SIZE to fit in first cache line. */
+ COND_VZEROUPPER
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret
- .p2align 4
-L(aligned_more):
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- vpcmpeqb (%rdi), %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
-
- /* Align data to 4 * VEC_SIZE for loop with fewer branches.
- There are some overlaps with above if data isn't aligned
- to 4 * VEC_SIZE. */
- movl %edi, %ecx
- andl $(VEC_SIZE * 4 - 1), %ecx
- jz L(loop_4x_vec)
-
- addq $(VEC_SIZE * 4), %rdi
- addq $(VEC_SIZE * 4), %rdx
- andq $-(VEC_SIZE * 4), %rdi
- subq %rcx, %rdx
+ /* Fits in aligning bytes of first cache line. */
+L(zero_0):
+ xorl %eax, %eax
+ ret
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- subq $(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- vmovdqa (%rdi), %ymm1
- vmovdqa VEC_SIZE(%rdi), %ymm2
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
-
- vpcmpeqb %ymm1, %ymm0, %ymm1
- vpcmpeqb %ymm2, %ymm0, %ymm2
- vpcmpeqb %ymm3, %ymm0, %ymm3
- vpcmpeqb %ymm4, %ymm0, %ymm4
-
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
- vpor %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jz L(loop_4x_vec)
-
- /* There is a match. */
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- vpmovmskb %ymm1, %eax
- bsrl %eax, %eax
- addq %rdi, %rax
+ .p2align 4,, 9
+L(ret_vec_x0):
+ lzcntl %ecx, %ecx
+ subq %rcx, %rax
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
- .p2align 4
-L(last_4x_vec_or_less):
- addl $(VEC_SIZE * 4), %edx
- cmpl $(VEC_SIZE * 2), %edx
- jbe L(last_2x_vec)
-
- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1_check)
- cmpl $(VEC_SIZE * 3), %edx
- jbe L(zero)
-
- vpcmpeqb (%rdi), %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 4), %rdx
- addq %rax, %rdx
- jl L(zero)
- addq %rdi, %rax
- VZEROUPPER_RETURN
-
- .p2align 4
+ .p2align 4,, 10
+L(more_1x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
+
+ /* Align rax (string pointer). */
+ andq $-VEC_SIZE, %rax
+
+ /* Recompute remaining length after aligning. */
+ movq %rax, %rdx
+ /* Need this comparison next no matter what. */
+ vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
+ subq %rdi, %rdx
+ decq %rax
+ vpmovmskb %ymm1, %ecx
+ /* Fall through for short (hotter than length). */
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
L(last_2x_vec):
- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3_check)
cmpl $VEC_SIZE, %edx
- jbe L(zero)
-
- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 2), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
-
- .p2align 4
-L(last_vec_x0):
- bsrl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ jbe L(ret_vec_x0_test)
+
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
+
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ /* 64-bit lzcnt. This will naturally add 32 to position. */
+ lzcntq %rcx, %rcx
+ COND_VZEROUPPER
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret
- .p2align 4
-L(last_vec_x1):
- bsrl %eax, %eax
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
- .p2align 4
-L(last_vec_x2):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
+ /* Inexpensive place to put this regarding code size / target alignments
+ / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+ case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+ in first cache line. */
+L(page_cross):
+ movq %rax, %rsi
+ andq $-VEC_SIZE, %rsi
+ vpcmpeqb (%rsi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ movl %eax, %r8d
+ /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */
+ notl %r8d
+ shlxl %r8d, %ecx, %ecx
+ cmpq %rdi, %rsi
+ ja L(more_1x_vec)
+ lzcntl %ecx, %ecx
+ COND_VZEROUPPER
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret
+ .p2align 4,, 11
+L(ret_vec_x1):
+ /* This will naturally add 32 to position. */
+ lzcntq %rcx, %rcx
+ subq %rcx, %rax
VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(more_2x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
- .p2align 4
-L(last_vec_x3):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- ret
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)
- .p2align 4
-L(last_vec_x1_check):
- bsrl %eax, %eax
- subq $(VEC_SIZE * 3), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
- .p2align 4
-L(last_vec_x3_check):
- bsrl %eax, %eax
- subq $VEC_SIZE, %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ /* Needed no matter what. */
+ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
- .p2align 4
-L(zero):
- xorl %eax, %eax
- VZEROUPPER_RETURN
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
+
+ cmpl $(VEC_SIZE * -1), %edx
+ jle L(ret_vec_x2_test)
+
+L(last_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
+
+ /* Needed no matter what. */
+ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 3), %rax
+ COND_VZEROUPPER
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_2)
+ ret
- .p2align 4
-L(null):
+ /* First in aligning bytes. */
+L(zero_2):
xorl %eax, %eax
ret
- .p2align 4
-L(last_vec_or_less_aligned):
- movl %edx, %ecx
+ .p2align 4,, 4
+L(ret_vec_x2_test):
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2), %rax
+ COND_VZEROUPPER
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_2)
+ ret
- vpcmpeqb (%rdi), %ymm0, %ymm1
- movl $1, %edx
- /* Support rdx << 32. */
- salq %cl, %rdx
- subq $1, %rdx
+ .p2align 4,, 11
+L(ret_vec_x2):
+ /* ecx must be non-zero. */
+ bsrl %ecx, %ecx
+ leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+ VZEROUPPER_RETURN
- vpmovmskb %ymm1, %eax
+ .p2align 4,, 14
+L(ret_vec_x3):
+ /* ecx must be non-zero. */
+ bsrl %ecx, %ecx
+ leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+ VZEROUPPER_RETURN
- /* Remove the trailing bytes. */
- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
.p2align 4
-L(last_vec_or_less):
- addl $VEC_SIZE, %edx
+L(more_4x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
- /* Check for zero length. */
- testl %edx, %edx
- jz L(null)
+ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(last_vec_or_less_aligned)
+ testl %ecx, %ecx
+ jnz L(ret_vec_x3)
- movl %ecx, %esi
- movl %ecx, %r8d
- addl %edx, %esi
- andq $-VEC_SIZE, %rdi
+ /* Check if near end before re-aligning (otherwise might do an
+ unnecessary loop iteration). */
+ addq $-(VEC_SIZE * 4), %rax
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)
- subl $VEC_SIZE, %esi
- ja L(last_vec_2x_aligned)
+ /* Align rax to (VEC_SIZE - 1). */
+ orq $(VEC_SIZE * 4 - 1), %rax
+ movq %rdi, %rdx
+ /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+ lengths that overflow can be valid and break the comparison. */
+ orq $(VEC_SIZE * 4 - 1), %rdx
- /* Check the last VEC. */
- vpcmpeqb (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
-
- /* Remove the leading and trailing bytes. */
- sarl %cl, %eax
- movl %edx, %ecx
+ .p2align 4
+L(loop_4x_vec):
+ /* Need this comparison next no matter what. */
+ vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ vpor %ymm1, %ymm2, %ymm2
+ vpor %ymm3, %ymm4, %ymm4
+ vpor %ymm2, %ymm4, %ymm4
+ vpmovmskb %ymm4, %esi
- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
+ testl %esi, %esi
+ jnz L(loop_end)
- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
- VZEROUPPER_RETURN
+ addq $(VEC_SIZE * -4), %rax
+ cmpq %rdx, %rax
+ jne L(loop_4x_vec)
- .p2align 4
-L(last_vec_2x_aligned):
- movl %esi, %ecx
+ subl %edi, %edx
+ incl %edx
- /* Check the last VEC. */
- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+ /* Used no matter what. */
+ vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
- vpmovmskb %ymm1, %eax
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_end)
- /* Remove the trailing bytes. */
- andl %edx, %eax
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1_end)
- testl %eax, %eax
- jnz L(last_vec_x1)
+ /* Used no matter what. */
+ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
- /* Check the second last VEC. */
- vpcmpeqb (%rdi), %ymm0, %ymm1
+ cmpl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)
+
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2), %rax
+ COND_VZEROUPPER
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ jbe L(ret0)
+ xorl %eax, %eax
+L(ret0):
+ ret
- movl %r8d, %ecx
- vpmovmskb %ymm1, %eax
+ .p2align 4
+L(loop_end):
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_end)
+
+ vpmovmskb %ymm2, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1_end)
+
+ vpmovmskb %ymm3, %ecx
+ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+ then it won't affect the result in esi (VEC4). If ecx is non-zero
+ then CHAR is in VEC3 and bsrq will use that position. */
+ salq $32, %rcx
+ orq %rsi, %rcx
+ bsrq %rcx, %rcx
+ leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+ VZEROUPPER_RETURN
- /* Remove the leading bytes. Must use unsigned right shift for
- bsrl below. */
- shrl %cl, %eax
- testl %eax, %eax
- jz L(zero)
+ .p2align 4,, 4
+L(ret_vec_x1_end):
+ /* 64-bit version will automatically add 32 (VEC_SIZE). */
+ lzcntq %rcx, %rcx
+ subq %rcx, %rax
+ VZEROUPPER_RETURN
- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
+ .p2align 4,, 4
+L(ret_vec_x0_end):
+ lzcntl %ecx, %ecx
+ subq %rcx, %rax
VZEROUPPER_RETURN
-END (MEMRCHR)
+
+ /* 2 bytes until next cache line. */
+END(MEMRCHR)
#endif

commit a910d7e164f1d9b8e77bbea35a2d2ab89a5e26cc
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:33 2022 -0700
x86: Shrink code size of memchr-avx2.S
This is not meant as a performance optimization. The previous code was
far too liberal in aligning targets and wasted code size unnecessarily.
The total code size saving is: 59 bytes
There are no major changes in the benchmarks.
Geometric Mean of all benchmarks New / Old: 0.967
Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35)
x86: Fix page cross case in rawmemchr-avx2 [BZ #29234]
commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:33 2022 -0700
x86: Shrink code size of memchr-avx2.S
Changed how the page cross case aligned string (rdi) in
rawmemchr. This was incompatible with how
`L(cross_page_continue)` expected the pointer to be aligned and
would cause rawmemchr to read data starting before the
beginning of the string. What it would read was in valid memory
but could count CHAR matches resulting in an incorrect return
value.
This commit fixes that issue by essentially reverting the changes to
the L(page_cross) case as they didn't really matter.
Test cases added and all pass with the new code (and were confirmed
to fail with the old code).
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 2c9af8421d2b4a7fcce163e7bc81a118d22fd346)
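A hedged sketch (names illustrative, not the glibc code) of the page-cross handling the fix restores: the pointer is aligned down so the vector load stays on one page, and the mask bits for bytes before the string are then shifted out; skipping that shift is exactly how a stray match before the string could be reported (BZ #29234).
```
#include <stdint.h>

/* Sketch only: drop match bits for the bytes the aligned load reads
   before the start of the string.  */
static inline uint32_t
drop_matches_before_start (uint32_t match_mask, const char *s,
                           unsigned vec_size)
{
  unsigned lead = (unsigned) ((uintptr_t) s % vec_size); /* bytes before s */
  return match_mask >> lead;      /* bit 0 now corresponds to s[0] */
}
```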
diff --git a/string/test-rawmemchr.c b/string/test-rawmemchr.c
index 085098aba8fdbc13..327c0654e69e7669 100644
--- a/string/test-rawmemchr.c
+++ b/string/test-rawmemchr.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
#include <assert.h>
+#include <support/xunistd.h>
#define TEST_MAIN
#define TEST_NAME "rawmemchr"
@@ -51,13 +52,45 @@ do_one_test (impl_t *impl, const char *s, int c, char *exp_res)
}
}
+static void
+do_test_bz29234 (void)
+{
+ size_t i, j;
+ char *ptr_start;
+ char *buf = xmmap (0, 8192, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1);
+
+ memset (buf, -1, 8192);
+
+ ptr_start = buf + 4096 - 8;
+
+ /* Out of range matches before the start of a page. */
+ memset (ptr_start - 8, 0x1, 8);
+
+ for (j = 0; j < 8; ++j)
+ {
+ for (i = 0; i < 128; ++i)
+ {
+ ptr_start[i + j] = 0x1;
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, (char *) (ptr_start + j), 0x1,
+ ptr_start + i + j);
+
+ ptr_start[i + j] = 0xff;
+ }
+ }
+
+ xmunmap (buf, 8192);
+}
+
static void
do_test (size_t align, size_t pos, size_t len, int seek_char)
{
size_t i;
char *result;
- align &= 7;
+ align &= getpagesize () - 1;
if (align + len >= page_size)
return;
@@ -115,6 +148,13 @@ do_random_tests (void)
}
}
+ if (align)
+ {
+ p[align - 1] = seek_char;
+ if (align > 4)
+ p[align - 4] = seek_char;
+ }
+
assert (pos < len);
size_t r = random ();
if ((r & 31) == 0)
@@ -130,6 +170,13 @@ do_random_tests (void)
result, p);
ret = 1;
}
+
+ if (align)
+ {
+ p[align - 1] = seek_char;
+ if (align > 4)
+ p[align - 4] = seek_char;
+ }
}
}
@@ -151,14 +198,22 @@ test_main (void)
do_test (i, 64, 256, 23);
do_test (0, 16 << i, 2048, 0);
do_test (i, 64, 256, 0);
+
+ do_test (getpagesize () - i, 64, 256, 23);
+ do_test (getpagesize () - i, 64, 256, 0);
}
for (i = 1; i < 32; ++i)
{
do_test (0, i, i + 1, 23);
do_test (0, i, i + 1, 0);
+
+ do_test (getpagesize () - 7, i, i + 1, 23);
+ do_test (getpagesize () - i / 2, i, i + 1, 23);
+ do_test (getpagesize () - i, i, i + 1, 23);
}
do_random_tests ();
+ do_test_bz29234 ();
return ret;
}
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
index 87b076c7c403ba85..c4d71938c5a3ed24 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
@@ -2,6 +2,7 @@
# define MEMCHR __memchr_avx2_rtm
#endif
+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index afdb95650232fdac..9e0b7dd1f4fe9909 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -57,7 +57,7 @@
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 5)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
# ifdef __ILP32__
@@ -87,12 +87,14 @@ ENTRY (MEMCHR)
# endif
testl %eax, %eax
jz L(aligned_more)
- tzcntl %eax, %eax
+ bsfl %eax, %eax
addq %rdi, %rax
- VZEROUPPER_RETURN
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
# ifndef USE_AS_RAWMEMCHR
- .p2align 5
+ .p2align 4
L(first_vec_x0):
/* Check if first match was before length. */
tzcntl %eax, %eax
@@ -100,58 +102,31 @@ L(first_vec_x0):
/* NB: Multiply length by 4 to get byte count. */
sall $2, %edx
# endif
- xorl %ecx, %ecx
+ COND_VZEROUPPER
+ /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch
+ block. A branch here as opposed to cmovcc is not that costly. Common
+ usage of memchr is to check if the return was NULL (if string was
+ known to contain CHAR user would use rawmemchr). This branch will be
+ highly correlated with the user branch and can be used by most
+ modern branch predictors to predict the user branch. */
cmpl %eax, %edx
- leaq (%rdi, %rax), %rax
- cmovle %rcx, %rax
- VZEROUPPER_RETURN
-
-L(null):
- xorl %eax, %eax
- ret
-# endif
- .p2align 4
-L(cross_page_boundary):
- /* Save pointer before aligning as its original value is
- necessary for computer return address if byte is found or
- adjusting length if it is not and this is memchr. */
- movq %rdi, %rcx
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
- and rdi for rawmemchr. */
- orq $(VEC_SIZE - 1), %ALGN_PTR_REG
- VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate length until end of page (length checked for a
- match). */
- leaq 1(%ALGN_PTR_REG), %rsi
- subq %RRAW_PTR_REG, %rsi
-# ifdef USE_AS_WMEMCHR
- /* NB: Divide bytes by 4 to get wchar_t count. */
- shrl $2, %esi
-# endif
-# endif
- /* Remove the leading bytes. */
- sarxl %ERAW_PTR_REG, %eax, %eax
-# ifndef USE_AS_RAWMEMCHR
- /* Check the end of data. */
- cmpq %rsi, %rdx
- jbe L(first_vec_x0)
+ jle L(null)
+ addq %rdi, %rax
+ ret
# endif
- testl %eax, %eax
- jz L(cross_page_continue)
- tzcntl %eax, %eax
- addq %RRAW_PTR_REG, %rax
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x1):
- tzcntl %eax, %eax
+ bsfl %eax, %eax
incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
-
+# ifndef USE_AS_RAWMEMCHR
+ /* First in aligning bytes here. */
+L(null):
+ xorl %eax, %eax
+ ret
+# endif
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
@@ -340,7 +315,7 @@ L(first_vec_x1_check):
incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 6
L(set_zero_end):
xorl %eax, %eax
VZEROUPPER_RETURN
@@ -428,5 +403,39 @@ L(last_vec_x3):
VZEROUPPER_RETURN
# endif
+ .p2align 4
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is necessary for
+ computing the return address if byte is found or adjusting length if it
+ is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length checked for a match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
+# endif
+ /* Remove the leading bytes. */
+ sarxl %ERAW_PTR_REG, %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Check the end of data. */
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ bsfl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
+ VZEROUPPER_RETURN
+
+
END (MEMCHR)
#endif

commit 3c87383a20daff9a230439e31b778716bfed4d8b
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:34 2022 -0700
x86: Shrink code size of memchr-evex.S
This is not meant as a performance optimization. The previous code was
far too liberal in aligning targets and wasted code size unnecessarily.
The total code size saving is: 64 bytes
There are no non-negligible changes in the benchmarks.
Geometric Mean of all benchmarks New / Old: 1.000
Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 56da3fe1dd075285fa8186d44b3c28e68c687e62)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 4d0ed6d136f099e1..68381c99a4948134 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -88,7 +88,7 @@
# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 6)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
test %RDX_LP, %RDX_LP
@@ -131,22 +131,24 @@ L(zero):
xorl %eax, %eax
ret
- .p2align 5
+ .p2align 4
L(first_vec_x0):
- /* Check if first match was before length. */
- tzcntl %eax, %eax
- xorl %ecx, %ecx
- cmpl %eax, %edx
- leaq (%rdi, %rax, CHAR_SIZE), %rax
- cmovle %rcx, %rax
+ /* Check if first match was before length. NB: tzcnt has false data-
+ dependency on destination. eax already had a data-dependency on esi
+ so this should have no effect here. */
+ tzcntl %eax, %esi
+# ifdef USE_AS_WMEMCHR
+ leaq (%rdi, %rsi, CHAR_SIZE), %rdi
+# else
+ addq %rsi, %rdi
+# endif
+ xorl %eax, %eax
+ cmpl %esi, %edx
+ cmovg %rdi, %rax
ret
-# else
- /* NB: first_vec_x0 is 17 bytes which will leave
- cross_page_boundary (which is relatively cold) close enough
- to ideal alignment. So only realign L(cross_page_boundary) if
- rawmemchr. */
- .p2align 4
# endif
+
+ .p2align 4
L(cross_page_boundary):
/* Save pointer before aligning as its original value is
necessary for computer return address if byte is found or
@@ -400,10 +402,14 @@ L(last_2x_vec):
L(zero_end):
ret
+L(set_zero_end):
+ xorl %eax, %eax
+ ret
.p2align 4
L(first_vec_x1_check):
- tzcntl %eax, %eax
+ /* eax must be non-zero. Use bsfl to save code size. */
+ bsfl %eax, %eax
/* Adjust length. */
subl $-(CHAR_PER_VEC * 4), %edx
/* Check if match within remaining length. */
@@ -412,9 +418,6 @@ L(first_vec_x1_check):
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
ret
-L(set_zero_end):
- xorl %eax, %eax
- ret
.p2align 4
L(loop_4x_vec_end):
@@ -464,7 +467,7 @@ L(loop_4x_vec_end):
# endif
ret
- .p2align 4
+ .p2align 4,, 10
L(last_vec_x1_return):
tzcntl %eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
@@ -496,6 +499,7 @@ L(last_vec_x3_return):
# endif
# ifndef USE_AS_RAWMEMCHR
+ .p2align 4,, 5
L(last_4x_vec_or_less_cmpeq):
VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
kmovd %k0, %eax
@@ -546,7 +550,7 @@ L(last_4x_vec):
# endif
andl %ecx, %eax
jz L(zero_end2)
- tzcntl %eax, %eax
+ bsfl %eax, %eax
leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
ret
@@ -562,6 +566,6 @@ L(last_vec_x3):
leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
# endif
-
+ /* 7 bytes from next cache line. */
END (MEMCHR)
#endif

commit 820504e3edd7276bf869d543ad5b57187ff9c9b6
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Jun 3 18:52:37 2022 -0500
x86: ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST expect no transactions
Give fall-through path to `vzeroupper` and taken-path to `vzeroall`.
Generally even on machines with RTM the expectation is the
string-library functions will not be called in transactions.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit c28db9cb29a7d6cf3ce08fd8445e6b7dea03f35b)
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 93e44be22e2275f1..04478b097cdffe20 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -99,11 +99,11 @@ lose: \
to avoid RTM abort triggered by VZEROUPPER inside transactionally. */
#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
xtest; \
- jz 1f; \
- vzeroall; \
+ jnz 1f; \
+ vzeroupper; \
ret; \
1: \
- vzeroupper; \
+ vzeroall; \
ret
/* Can be used to replace vzeroupper that is not directly before a

commit fc54e1fae854e6ee6361cd4ddf900c36fce8158e
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 8 21:16:51 2022 -0700
x86: Align varshift table to 32-bytes
This ensures the load will never split a cache line.
(cherry picked from commit 0f91811333f23b61cf681cab2704b35a0a073b97)
diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c
index 45267b0a6823459a..1f563542666bc4f1 100644
--- a/sysdeps/x86_64/multiarch/varshift.c
+++ b/sysdeps/x86_64/multiarch/varshift.c
@@ -16,9 +16,10 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "varshift.h"
+#include <stdint.h>
-const int8_t ___m128i_shift_right[31] attribute_hidden =
+const int8_t ___m128i_shift_right[31] attribute_hidden
+ __attribute__((aligned(32))) =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
index 32f2173dd2a95b3a..745d48fa7c775136 100644
--- a/sysdeps/x86_64/multiarch/varshift.h
+++ b/sysdeps/x86_64/multiarch/varshift.h
@@ -19,7 +19,8 @@
#include <stdint.h>
#include <tmmintrin.h>
-extern const int8_t ___m128i_shift_right[31] attribute_hidden;
+extern const int8_t ___m128i_shift_right[31] attribute_hidden
+ __attribute__ ((aligned (32)));
static __inline__ __m128i
__m128i_shift_right (__m128i value, unsigned long int offset)

@@ -0,0 +1,56 @@
commit 6e008c884dad5a25f91085c68d044bb5e2d63761
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue Jun 14 13:50:11 2022 -0700
x86: Fix misordered logic for setting `rep_movsb_stop_threshold`
Move the setting of `rep_movsb_stop_threshold` to after the tunables
have been collected so that the `rep_movsb_stop_threshold` (which
is used to redirect control flow to the non_temporal case) will
use any user value for `non_temporal_threshold` (set using
glibc.cpu.x86_non_temporal_threshold)
(cherry picked from commit 035591551400cfc810b07244a015c9411e8bff7c)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 2e43e67e4f4037d3..560bf260e8fbd7bf 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -898,18 +898,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- unsigned long int rep_movsb_stop_threshold;
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
- performing poorly for data above L2 cache size. Henceforth, adding
- an upper bound threshold parameter to limit the usage of Enhanced
- REP MOVSB operations and setting its value to L2 cache size. */
- if (cpu_features->basic.kind == arch_kind_amd)
- rep_movsb_stop_threshold = core;
- /* Setting the upper bound of ERMS to the computed value of
- non-temporal threshold for architectures other than AMD. */
- else
- rep_movsb_stop_threshold = non_temporal_threshold;
-
/* The default threshold to use Enhanced REP STOSB. */
unsigned long int rep_stosb_threshold = 2048;
@@ -951,6 +939,18 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
SIZE_MAX);
#endif
+ unsigned long int rep_movsb_stop_threshold;
+ /* ERMS feature is implemented from AMD Zen3 architecture and it is
+ performing poorly for data above L2 cache size. Henceforth, adding
+ an upper bound threshold parameter to limit the usage of Enhanced
+ REP MOVSB operations and setting its value to L2 cache size. */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_stop_threshold = core;
+ /* Setting the upper bound of ERMS to the computed value of
+ non-temporal threshold for architectures other than AMD. */
+ else
+ rep_movsb_stop_threshold = non_temporal_threshold;
+
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
cpu_features->non_temporal_threshold = non_temporal_threshold;
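
A hedged C sketch of the ordering problem (hypothetical names, not the actual tunable machinery): a limit derived from non_temporal_threshold must be computed after the tunables are read, otherwise it stays frozen at the built-in default.

#include <stdio.h>

static unsigned long non_temporal_threshold = 0xc0000;  /* built-in default */
static unsigned long rep_movsb_stop_threshold;

/* Stand-in for reading glibc.cpu.x86_non_temporal_threshold.  */
static void
read_tunables (unsigned long user_value)
{
  if (user_value != 0)
    non_temporal_threshold = user_value;
}

int
main (void)
{
  unsigned long user_value = 0x200000;

  /* Misordered (before the fix): the derived limit is computed first and
     never sees the user-supplied value.  */
  rep_movsb_stop_threshold = non_temporal_threshold;
  read_tunables (user_value);
  printf ("misordered: rep_movsb_stop_threshold = %#lx\n",
          rep_movsb_stop_threshold);

  /* Fixed ordering: tunables first, derived limit afterwards.  */
  non_temporal_threshold = 0xc0000;
  read_tunables (user_value);
  rep_movsb_stop_threshold = non_temporal_threshold;
  printf ("fixed:      rep_movsb_stop_threshold = %#lx\n",
          rep_movsb_stop_threshold);
  return 0;
}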

@@ -0,0 +1,38 @@
commit 9d50e162eef88e1f870a941b0a973060e984e7ca
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue Jun 14 15:37:28 2022 -0700
x86: Add sse42 implementation to strcmp's ifunc
This has been missing since the ifuncs were added.
The performance of SSE4.2 is preferable to SSE2.
Measured on Tigerlake with N = 20 runs.
Geometric Mean of all benchmarks SSE4.2 / SSE2: 0.906
(cherry picked from commit ff439c47173565fbff4f0f78d07b0f14e4a7db05)
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 7c2901bf44456259..b457fb4c150e4407 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -29,6 +29,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -53,6 +54,10 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (avx2);
}
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
+ return OPTIMIZE (sse42);
+
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
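
For readers unfamiliar with the mechanism, below is a self-contained sketch of the same ifunc pattern built on GCC's generic builtins. It is assumption-laden and illustrative only — glibc uses its own CPU_FEATURE_USABLE_P checks and a hand-written __strcmp_sse42 — but it shows how a resolver picks an implementation once at load time on ELF/glibc targets.

#include <stdio.h>
#include <string.h>

/* Toy implementations; the real selector returns __strcmp_sse42 etc.  */
static int
strcmp_generic (const char *a, const char *b)
{
  return strcmp (a, b);
}

static int
strcmp_sse42_stub (const char *a, const char *b)
{
  return strcmp (a, b);   /* stand-in for the SSE4.2 routine */
}

/* The resolver runs once, early, and its return value becomes the target
   of every later call through my_strcmp.  */
static int (*resolve_my_strcmp (void)) (const char *, const char *)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("sse4.2"))
    return strcmp_sse42_stub;
  return strcmp_generic;
}

int my_strcmp (const char *, const char *)
  __attribute__ ((ifunc ("resolve_my_strcmp")));

int
main (void)
{
  printf ("my_strcmp: %d\n", my_strcmp ("abc", "abd"));
  return 0;
}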

@@ -0,0 +1,54 @@
commit 94b0dc9419bd038cb85b364a7556569386c31741
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 15 10:41:29 2022 -0700
x86: Add bounds `x86_non_temporal_threshold`
The lower-bound (16448) and upper-bound (SIZE_MAX / 16) are assumed
by memmove-vec-unaligned-erms.
The lower-bound is needed because memmove-vec-unaligned-erms unrolls
the loop aggressively in the L(large_memset_4x) case.
The upper-bound is needed because memmove-vec-unaligned-erms
right-shifts the value of `x86_non_temporal_threshold` by
LOG_4X_MEMCPY_THRESH (4) which without a bound may overflow.
The lack of lower-bound can be a correctness issue. The lack of
upper-bound cannot.
(cherry picked from commit b446822b6ae4e8149902a78cdd4a886634ad6321)
diff --git a/manual/tunables.texi b/manual/tunables.texi
index 28ff502990c2a10f..5ab3212f34e3dc37 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -47,7 +47,7 @@ glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.elision.skip_lock_busy: 3 (min: -2147483648, max: 2147483647)
glibc.malloc.top_pad: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
-glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x0, max: 0xffffffffffffffff)
+glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0x0fffffffffffffff)
glibc.cpu.x86_shstk:
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
glibc.malloc.mmap_max: 0 (min: -2147483648, max: 2147483647)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 560bf260e8fbd7bf..8f85f70858413ebe 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -931,8 +931,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
+ /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
+ 'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
+ if that operation cannot overflow. Minimum of 0x4040 (16448) because the
+ L(large_memset_4x) loops need 64-byte to cache align and enough space for
+ at least 1 iteration of 4x PAGE_SIZE unrolled loop. Both values are
+ reflected in the manual. */
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
- 0, SIZE_MAX);
+ 0x4040, SIZE_MAX >> 4);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
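
The shift-overflow argument can be spelled out in a few lines of C (assuming the same LOG_4X_MEMCPY_THRESH of 4 used by memmove-vec-unaligned-erms): a tunable above SIZE_MAX >> 4 would wrap once scaled for the large_memcpy_4x comparison, while the new upper bound cannot.

#include <stdint.h>
#include <stdio.h>

#define LOG_4X_MEMCPY_THRESH 4

int
main (void)
{
  /* New upper bound: scaling it by 2^4 stays representable.  */
  size_t bounded = SIZE_MAX >> LOG_4X_MEMCPY_THRESH;
  /* A value the old bound (SIZE_MAX) permitted: scaling wraps around and
     the large_memcpy_4x threshold comparison becomes meaningless.  */
  size_t unbounded = SIZE_MAX - 1;

  printf ("bounded   << 4 = %#zx\n", bounded << LOG_4X_MEMCPY_THRESH);
  printf ("unbounded << 4 = %#zx (wrapped)\n",
          unbounded << LOG_4X_MEMCPY_THRESH);

  /* The lower bound 0x4040 (16448) leaves room for 64-byte cache alignment
     plus one iteration of the 4x PAGE_SIZE unrolled loop.  */
  return 0;
}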

@@ -0,0 +1,88 @@
commit ba1c3f23d9ba63c38333116eec6043c471c378c4
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 15 10:41:28 2022 -0700
x86: Cleanup bounds checking in large memcpy case
1. Fix incorrect lower-bound threshold in L(large_memcpy_2x).
Previously was using `__x86_rep_movsb_threshold` and should
have been using `__x86_shared_non_temporal_threshold`.
2. Avoid reloading __x86_shared_non_temporal_threshold before
the L(large_memcpy_4x) bounds check.
3. Document the second bounds check for L(large_memcpy_4x)
more clearly.
(cherry picked from commit 89a25c6f64746732b87eaf433af0964b564d4a92)
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 7b27cbdda5fb99f7..618d46d8ce28828c 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -118,7 +118,13 @@
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif
-/* Amount to shift rdx by to compare for memcpy_large_4x. */
+/* Amount to shift __x86_shared_non_temporal_threshold by for
+ bound for memcpy_large_4x. This is essentially used to
+ indicate that the copy is far beyond the scope of L3
+ (assuming no user config x86_non_temporal_threshold) and to
+ use a more aggressively unrolled loop. NB: before
+ increasing the value also update initialization of
+ x86_non_temporal_threshold. */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif
@@ -724,9 +730,14 @@ L(skip_short_movsb_check):
.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
- cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
- jb L(more_8x_vec_check)
+ /* Entry from L(large_memcpy_2x) has a redundant load of
+ __x86_shared_non_temporal_threshold(%rip). L(large_memcpy_2x)
+ is only used for the non-erms memmove which is generally less
+ common. */
L(large_memcpy_2x):
+ mov __x86_shared_non_temporal_threshold(%rip), %R11_LP
+ cmp %R11_LP, %RDX_LP
+ jb L(more_8x_vec_check)
/* To reach this point it is impossible for dst > src and
overlap. Remaining to check is src > dst and overlap. rcx
already contains dst - src. Negate rcx to get src - dst. If
@@ -774,18 +785,21 @@ L(large_memcpy_2x):
/* ecx contains -(dst - src). not ecx will return dst - src - 1
which works for testing aliasing. */
notl %ecx
+ movq %rdx, %r10
testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
jz L(large_memcpy_4x)
- movq %rdx, %r10
- shrq $LOG_4X_MEMCPY_THRESH, %r10
- cmp __x86_shared_non_temporal_threshold(%rip), %r10
+ /* r11 has __x86_shared_non_temporal_threshold. Shift it left
+ by LOG_4X_MEMCPY_THRESH to get L(large_memcpy_4x) threshold.
+ */
+ shlq $LOG_4X_MEMCPY_THRESH, %r11
+ cmp %r11, %rdx
jae L(large_memcpy_4x)
/* edx will store remainder size for copying tail. */
andl $(PAGE_SIZE * 2 - 1), %edx
/* r10 stores outer loop counter. */
- shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+ shrq $(LOG_PAGE_SIZE + 1), %r10
/* Copy 4x VEC at a time from 2 pages. */
.p2align 4
L(loop_large_memcpy_2x_outer):
@@ -850,7 +864,6 @@ L(large_memcpy_2x_end):
.p2align 4
L(large_memcpy_4x):
- movq %rdx, %r10
/* edx will store remainder size for copying tail. */
andl $(PAGE_SIZE * 4 - 1), %edx
/* r10 stores outer loop counter. */

@@ -0,0 +1,27 @@
commit c51d8d383cfb92142b86d8d1822159f3bea10d16
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu Jun 16 15:01:08 2022 -0700
x86: Add BMI1/BMI2 checks for ISA_V3 check
BMI1/BMI2 are part of the ISA V3 requirements:
https://en.wikipedia.org/wiki/X86-64
They are defined by GCC when building with `-march=x86-64-v3`.
(cherry picked from commit 8da9f346cb2051844348785b8a932ec44489e0b7)
diff --git a/sysdeps/x86/isa-level.c b/sysdeps/x86/isa-level.c
index 49ef4aa6122072cf..07815381122c94c3 100644
--- a/sysdeps/x86/isa-level.c
+++ b/sysdeps/x86/isa-level.c
@@ -47,7 +47,8 @@
# endif
# if ISA_V2 && defined __AVX__ && defined __AVX2__ && defined __F16C__ \
- && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE
+ && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE \
+ && defined __BMI__ && defined __BMI2__
/* NB: ISAs in x86-64 ISA level v3 are used. */
# define ISA_V3 GNU_PROPERTY_X86_ISA_1_V3
# else
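
As a hedged illustration of the same compile-time probe (standard GCC predefined feature macros, nothing glibc-specific), a translation unit can test for the v3 subset like this:

#include <stdio.h>

int
main (void)
{
  /* GCC predefines these when the corresponding extensions are enabled,
     e.g. by -march=x86-64-v3.  */
#if defined __AVX__ && defined __AVX2__ && defined __F16C__ \
    && defined __FMA__ && defined __LZCNT__ \
    && defined __BMI__ && defined __BMI2__
  puts ("built with the x86-64-v3 subset, including BMI1/BMI2");
#else
  puts ("built below x86-64-v3");
#endif
  return 0;
}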

@@ -0,0 +1,28 @@
commit d201c59177b98946d7f80145e7b4d02991d04805
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Jun 24 09:42:12 2022 -0700
x86: Align entry for memrchr to 64-bytes.
The function was tuned around 64-byte entry alignment and performs
better for all sizes with it.
As well, different code paths were explicitly written to touch the
minimum number of cache lines, i.e. sizes <= 32 touch only the entry
cache line.
(cherry picked from commit 227afaa67213efcdce6a870ef5086200f1076438)
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index 5f8e0be18cfe4fad..edd8180ba1ede9a5 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -35,7 +35,7 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
.section SECTION(.text), "ax", @progbits
-ENTRY(MEMRCHR)
+ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
/* Clear upper bits. */
and %RDX_LP, %RDX_LP

@@ -0,0 +1,56 @@
commit aadd0a1c7c89d016e1186c81c0efcafa36bf84fc
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Jun 24 09:42:15 2022 -0700
x86: Put wcs{n}len-sse4.1 in the sse4.1 text section
This was previously missing, but the two implementations shouldn't end up
in the sse2 (generic) text section.
(cherry picked from commit afc6e4328ff80973bde50d5401691b4c4b2e522c)
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index 031753a91763b351..762f4755020c35f9 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -28,6 +28,10 @@
# define SHIFT_RETURN
#endif
+#ifndef SECTION
+# define SECTION(p) p
+#endif
+
/* Long lived register in strlen(s), strnlen(s, n) are:
%xmm3 - zero
@@ -37,7 +41,7 @@
*/
-.text
+ .section SECTION(.text),"ax",@progbits
ENTRY(strlen)
/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
index 7e62621afc729492..e306a77f51e650d1 100644
--- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -1,4 +1,5 @@
#define AS_WCSLEN
#define strlen __wcslen_sse4_1
+#define SECTION(p) p##.sse4.1
#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index 5fa51fe07cbbdf5c..d2f7dd6e2254736c 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -1,5 +1,6 @@
#define AS_WCSLEN
#define AS_STRNLEN
#define strlen __wcsnlen_sse4_1
+#define SECTION(p) p##.sse4.1
#include "strlen-vec.S"

@@ -0,0 +1,25 @@
commit f4598f0351559f1a4176d7ce0154423d98bcfb0d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 29 16:07:04 2022 -0700
x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list
This was simply missing and meant we weren't testing it properly.
(cherry picked from commit 2a1099020cdc1e4c9c928156aa85c8cf9d540291)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 043821278fdb6d8f..8d649e263eb24b8a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -1032,6 +1032,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
CPU_FEATURE_USABLE (AVX2),
__wmemset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __wmemset_chk_avx2_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)

@@ -0,0 +1,124 @@
commit 7079931c51547854323fe2ed6fdccf2a1b8b04d7
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 29 16:07:05 2022 -0700
x86: Move and slightly improve memset_erms
Implementation wise:
1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
use the L(stosb) label that was previously defined.
2. Don't give the hotpath (fallthrough) to zero size.
Code positioning wise:
Move memset_{chk}_erms to its own file. Leaving it in between the
memset_{impl}_unaligned both adds unnecessary complexity to the
file and wastes space in a relatively hot cache section.
(cherry picked from commit 4a3f29e7e475dd4e7cce2a24c187e6fb7b5b0a05)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 0e39e63ef6be6a86..da9f16286a763556 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -29,6 +29,7 @@ sysdep_routines += \
memset-avx2-unaligned-erms-rtm \
memset-avx512-no-vzeroupper \
memset-avx512-unaligned-erms \
+ memset-erms \
memset-evex-unaligned-erms \
memset-sse2-unaligned-erms \
rawmemchr-avx2 \
diff --git a/sysdeps/x86_64/multiarch/memset-erms.S b/sysdeps/x86_64/multiarch/memset-erms.S
new file mode 100644
index 0000000000000000..e83cccc731f0a7ea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-erms.S
@@ -0,0 +1,44 @@
+/* memset implemented with rep stosb
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+ .text
+ENTRY (__memset_chk_erms)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+
+/* Only used to measure performance of REP STOSB. */
+ENTRY (__memset_erms)
+ /* Skip zero length. */
+ test %RDX_LP, %RDX_LP
+ jz L(stosb_return_zero)
+ mov %RDX_LP, %RCX_LP
+ movzbl %sil, %eax
+ mov %RDI_LP, %RDX_LP
+ rep stosb
+ mov %RDX_LP, %RAX_LP
+ ret
+L(stosb_return_zero):
+ movq %rdi, %rax
+ ret
+END (__memset_erms)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index abc12d9cda1b3843..905d0fa4643d5768 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -156,37 +156,6 @@ L(entry_from_wmemset):
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))
-# if VEC_SIZE == 16
-ENTRY (__memset_chk_erms)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk_erms)
-
-/* Only used to measure performance of REP STOSB. */
-ENTRY (__memset_erms)
- /* Skip zero length. */
- test %RDX_LP, %RDX_LP
- jnz L(stosb)
- movq %rdi, %rax
- ret
-# else
-/* Provide a hidden symbol to debugger. */
- .hidden MEMSET_SYMBOL (__memset, erms)
-ENTRY (MEMSET_SYMBOL (__memset, erms))
-# endif
-L(stosb):
- mov %RDX_LP, %RCX_LP
- movzbl %sil, %eax
- mov %RDI_LP, %RDX_LP
- rep stosb
- mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
-# if VEC_SIZE == 16
-END (__memset_erms)
-# else
-END (MEMSET_SYMBOL (__memset, erms))
-# endif
-
# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP

@@ -0,0 +1,163 @@
commit 35f9c72c8bd7bc30deb412e966e2f548241b15d2
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 29 16:07:15 2022 -0700
x86: Move mem{p}{mov|cpy}_{chk_}erms to its own file
The primary memmove_{impl}_unaligned_erms implementations don't
interact with this function. Putting them in the same file both
wastes space and unnecessarily bloats a hot code section.
(cherry picked from commit 21925f64730d52eb7d8b2fb62b412f8ab92b0caf)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index da9f16286a763556..b9ea5b60c2be1b0a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,6 +17,7 @@ sysdep_routines += \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
memmove-avx512-unaligned-erms \
+ memmove-erms \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
memmove-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/memmove-erms.S b/sysdeps/x86_64/multiarch/memmove-erms.S
new file mode 100644
index 0000000000000000..2d3a6ccb76d77052
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-erms.S
@@ -0,0 +1,72 @@
+/* memcpy/mempcpy/memmove implemented with rep movsb
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+ .text
+ENTRY (__mempcpy_chk_erms)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_erms)
+
+/* Only used to measure performance of REP MOVSB. */
+ENTRY (__mempcpy_erms)
+ mov %RDI_LP, %RAX_LP
+ /* Skip zero length. */
+ test %RDX_LP, %RDX_LP
+ jz 2f
+ add %RDX_LP, %RAX_LP
+ jmp L(start_movsb)
+END (__mempcpy_erms)
+
+ENTRY (__memmove_chk_erms)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memmove_chk_erms)
+
+ENTRY (__memmove_erms)
+ movq %rdi, %rax
+ /* Skip zero length. */
+ test %RDX_LP, %RDX_LP
+ jz 2f
+L(start_movsb):
+ mov %RDX_LP, %RCX_LP
+ cmp %RSI_LP, %RDI_LP
+ jb 1f
+ /* Source == destination is less common. */
+ je 2f
+ lea (%rsi,%rcx), %RDX_LP
+ cmp %RDX_LP, %RDI_LP
+ jb L(movsb_backward)
+1:
+ rep movsb
+2:
+ ret
+L(movsb_backward):
+ leaq -1(%rdi,%rcx), %rdi
+ leaq -1(%rsi,%rcx), %rsi
+ std
+ rep movsb
+ cld
+ ret
+END (__memmove_erms)
+strong_alias (__memmove_erms, __memcpy_erms)
+strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 618d46d8ce28828c..93c7e6883a254434 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -239,56 +239,6 @@ L(start):
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
-# if VEC_SIZE == 16
-ENTRY (__mempcpy_chk_erms)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__mempcpy_chk_erms)
-
-/* Only used to measure performance of REP MOVSB. */
-ENTRY (__mempcpy_erms)
- mov %RDI_LP, %RAX_LP
- /* Skip zero length. */
- test %RDX_LP, %RDX_LP
- jz 2f
- add %RDX_LP, %RAX_LP
- jmp L(start_movsb)
-END (__mempcpy_erms)
-
-ENTRY (__memmove_chk_erms)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memmove_chk_erms)
-
-ENTRY (__memmove_erms)
- movq %rdi, %rax
- /* Skip zero length. */
- test %RDX_LP, %RDX_LP
- jz 2f
-L(start_movsb):
- mov %RDX_LP, %RCX_LP
- cmp %RSI_LP, %RDI_LP
- jb 1f
- /* Source == destination is less common. */
- je 2f
- lea (%rsi,%rcx), %RDX_LP
- cmp %RDX_LP, %RDI_LP
- jb L(movsb_backward)
-1:
- rep movsb
-2:
- ret
-L(movsb_backward):
- leaq -1(%rdi,%rcx), %rdi
- leaq -1(%rsi,%rcx), %rsi
- std
- rep movsb
- cld
- ret
-END (__memmove_erms)
-strong_alias (__memmove_erms, __memcpy_erms)
-strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
-# endif
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))

@@ -0,0 +1,38 @@
commit ccc54bd61c768b6a27f9305a0831b76a7b6d706f
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 29 18:56:18 2022 -0700
x86: Add missing IS_IN (libc) check to strncmp-sse4_2.S
The check was missing, so for the multiarch build rtld-strncmp-sse4_2.os was
being built and exporting symbols:
build/glibc/string/rtld-strncmp-sse4_2.os:
0000000000000000 T __strncmp_sse42
Introduced in:
commit 11ffcacb64a939c10cfc713746b8ec88837f5c4a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Jun 21 12:10:50 2017 -0700
x86-64: Implement strcmp family IFUNC selectors in C
(cherry picked from commit 96ac447d915ea5ecef3f9168cc13f4e731349a3b)
diff --git a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
index 22f51a0dfd2770c9..85dc363bf9d6273d 100644
--- a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
+++ b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
@@ -16,6 +16,8 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#define STRCMP_SSE42 __strncmp_sse42
-#define USE_AS_STRNCMP
-#include "strcmp-sse42.S"
+#if IS_IN (libc)
+# define STRCMP_SSE42 __strncmp_sse42
+# define USE_AS_STRNCMP
+# include "strcmp-sse42.S"
+#endif

@@ -0,0 +1,28 @@
commit b991af50632a2da262b00b2375d9120f23b25525
Author: Joseph Myers <joseph@codesourcery.com>
Date: Wed May 25 14:37:28 2022 +0000
Update syscall-names.list for Linux 5.18
Linux 5.18 has no new syscalls. Update the version number in
syscall-names.list to reflect that it is still current for 5.18.
Tested with build-many-glibcs.py.
(cherry picked from commit 3d9926663cba19f40d26d8a8ab3b2a7cc09ffb13)
diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
index e2743c649586d97a..95370e2ec5dbc4a7 100644
--- a/sysdeps/unix/sysv/linux/syscall-names.list
+++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
# This file can list all potential system calls. The names are only
# used if the installed kernel headers also provide them.
-# The list of system calls is current as of Linux 5.17.
-kernel 5.17
+# The list of system calls is current as of Linux 5.18.
+kernel 5.18
FAST_atomic_update
FAST_cmpxchg

@@ -0,0 +1,44 @@
commit b2f32e746492615a6eb3e66fac1e766e32e8deb1
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu Jul 21 12:12:08 2022 +0200
malloc: Simplify implementation of __malloc_assert
It is prudent not to run too much code after detecting heap
corruption, and __fxprintf is really complex. The line number
and file name do not carry much information, so it is not included
in the error message. (__libc_message only supports %s formatting.)
The function name and assertion should provide some context.
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
(cherry picked from commit ac8047cdf326504f652f7db97ec96c0e0cee052f)
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 7882c70f0a0312d1..d31e985ecce968fe 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -294,19 +294,14 @@
# define __assert_fail(assertion, file, line, function) \
__malloc_assert(assertion, file, line, function)
-extern const char *__progname;
-
-static void
+_Noreturn static void
__malloc_assert (const char *assertion, const char *file, unsigned int line,
const char *function)
{
- (void) __fxprintf (NULL, "%s%s%s:%u: %s%sAssertion `%s' failed.\n",
- __progname, __progname[0] ? ": " : "",
- file, line,
- function ? function : "", function ? ": " : "",
- assertion);
- fflush (stderr);
- abort ();
+ __libc_message (do_abort, "\
+Fatal glibc error: malloc assertion failure in %s: %s\n",
+ function, assertion);
+ __builtin_unreachable ();
}
#endif
#endif
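
A minimal sketch of the same principle outside glibc (hypothetical helper, not the real __libc_message): once corruption is detected, keep the reporting path to raw write(2) calls and abort immediately.

#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Hedged illustration: avoid stdio after detecting heap corruption; emit a
   short message with plain write(2) and never return.  */
_Noreturn static void
report_malloc_assert (const char *function, const char *assertion)
{
  const char *prefix = "Fatal error: malloc assertion failure in ";

  write (STDERR_FILENO, prefix, strlen (prefix));
  write (STDERR_FILENO, function, strlen (function));
  write (STDERR_FILENO, ": ", 2);
  write (STDERR_FILENO, assertion, strlen (assertion));
  write (STDERR_FILENO, "\n", 1);
  abort ();
}

int
main (void)
{
  report_malloc_assert ("sysmalloc", "old_size == 0 || chunk_is_mmapped (old_top)");
}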

@@ -148,7 +148,7 @@ end \
Summary: The GNU libc libraries
Name: glibc
Version: %{glibcversion}
Release: 39%{?dist}
Release: 40%{?dist}
# In general, GPLv2+ is used by programs, LGPLv2+ is used for
# libraries.
@@ -550,6 +550,35 @@ Patch342: glibc-upstream-2.34-272.patch
Patch343: glibc-upstream-2.34-273.patch
Patch344: glibc-rh2096191-1.patch
Patch345: glibc-rh2096191-2.patch
Patch346: glibc-upstream-2.34-274.patch
Patch347: glibc-upstream-2.34-275.patch
Patch348: glibc-upstream-2.34-276.patch
Patch349: glibc-upstream-2.34-277.patch
Patch350: glibc-upstream-2.34-278.patch
Patch351: glibc-upstream-2.34-279.patch
Patch352: glibc-upstream-2.34-280.patch
Patch353: glibc-upstream-2.34-281.patch
Patch354: glibc-upstream-2.34-282.patch
Patch355: glibc-upstream-2.34-283.patch
Patch356: glibc-upstream-2.34-284.patch
Patch357: glibc-upstream-2.34-285.patch
Patch358: glibc-upstream-2.34-286.patch
Patch359: glibc-upstream-2.34-287.patch
Patch360: glibc-upstream-2.34-288.patch
Patch361: glibc-upstream-2.34-289.patch
Patch362: glibc-upstream-2.34-290.patch
Patch363: glibc-upstream-2.34-291.patch
Patch364: glibc-upstream-2.34-292.patch
Patch365: glibc-upstream-2.34-293.patch
Patch366: glibc-upstream-2.34-294.patch
Patch367: glibc-upstream-2.34-295.patch
Patch368: glibc-upstream-2.34-296.patch
Patch369: glibc-upstream-2.34-297.patch
Patch370: glibc-upstream-2.34-298.patch
Patch371: glibc-upstream-2.34-299.patch
Patch372: glibc-upstream-2.34-300.patch
Patch373: glibc-upstream-2.34-301.patch
Patch374: glibc-upstream-2.34-302.patch
##############################################################################
# Continued list of core "glibc" package information:
@@ -2606,6 +2635,39 @@ fi
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
%changelog
* Fri Jul 22 2022 Arjun Shankar <arjun@redhat.com> - 2.34-40
- Sync with upstream branch release/2.34/master,
commit b2f32e746492615a6eb3e66fac1e766e32e8deb1:
- malloc: Simplify implementation of __malloc_assert
- Update syscall-names.list for Linux 5.18
- x86: Add missing IS_IN (libc) check to strncmp-sse4_2.S
- x86: Move mem{p}{mov|cpy}_{chk_}erms to its own file
- x86: Move and slightly improve memset_erms
- x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list
- x86: Put wcs{n}len-sse4.1 in the sse4.1 text section
- x86: Align entry for memrchr to 64-bytes.
- x86: Add BMI1/BMI2 checks for ISA_V3 check
- x86: Cleanup bounds checking in large memcpy case
- x86: Add bounds `x86_non_temporal_threshold`
- x86: Add sse42 implementation to strcmp's ifunc
- x86: Fix misordered logic for setting `rep_movsb_stop_threshold`
- x86: Align varshift table to 32-bytes
- x86: ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST expect no transactions
- x86: Shrink code size of memchr-evex.S
- x86: Shrink code size of memchr-avx2.S
- x86: Optimize memrchr-avx2.S
- x86: Optimize memrchr-evex.S
- x86: Optimize memrchr-sse2.S
- x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret`
- x86: Create header for VEC classes in x86 strings library
- x86_64: Add strstr function with 512-bit EVEX
- x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT
- x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
- x86_64: Remove bzero optimization
- x86_64: Remove end of line trailing spaces
- nptl: Fix ___pthread_unregister_cancel_restore asynchronous restore
- linux: Fix mq_timereceive check for 32 bit fallback code (BZ 29304)
* Fri Jun 24 2022 Florian Weimer <fweimer@redhat.com> - 2.34-39
- Add the no-aaaa DNS stub resolver option (#2096191)